In [None]:
from __future__ import division, absolute_import
import pandas as pd
import numpy as np
import datetime as dt
import forecastio
import requests
import os
import csv
import urllib
import json
from generate_data import *
from helpers import *

# set variables
# SECURITY: API credentials are read from the environment instead of being
# hard-coded, so they are never saved or shared with the notebook. Any key
# that was previously committed in this cell should be revoked and reissued.
ds_api_key = os.environ['DS_API_KEY']      # key passed to get_weather_data
maps_api_key = os.environ['MAPS_API_KEY']  # key passed to get_geolocation_data
source = 'locations.txt'
target = 'geocoded_locations.txt'

# generate geocoded locations file
get_geolocation_data(maps_api_key, source, target)

# generate weather data for each location and save as CSV
start_date = dt.datetime(2017, 1, 1)
end_date = dt.datetime(2017, 12, 31)
cols = 'historical_columns.txt'
# NOTE(review): the meaning/unit of `offset` is defined by get_weather_data,
# which is not visible here -- confirm before changing it
offset = 12
get_weather_data(ds_api_key, target, cols, start_date, end_date, offset)

In [42]:
from __future__ import division, absolute_import
import numpy as np
import pandas as pd
from generate_data import *
from helpers import *


class WeatherGenerator(object):
    """Randomly generates a dataset of artificial but realistic weather
    observations, including condition, temperature, humidity and pressure.

    The methods are meant to be called in this order:
    ``initialize_output``, ``generate_position_data``,
    ``generate_time_data``, ``generate_weather_data``. The generated
    observations are then available in the ``output`` attribute.

    NOTE(review): the 'Conditions' column is created by
    ``initialize_output`` but no method of this class fills it in, so it
    keeps its initial value of 0.0.
    """

    def __init__(self, obs, start_date, end_date, histdata, geodata):
        """Loads the input data and stores the generation parameters.

        Args:
            obs: number of observations (rows) to generate.
            start_date: lower bound handed to ``random_date``.
            end_date: upper bound handed to ``random_date``.
            histdata: handed to ``aggregate_data``; the result must provide
                a 'Location' column and the monthly range columns used by
                ``generate_weather_data``.
            geodata: handed to ``get_filepath``; the resulting path is read
                with ``pandas.read_csv`` and must provide the columns
                'Location', 'Lat', 'Lng' and 'Elevation'.
        """
        self.start_date = start_date
        self.end_date = end_date
        self.obs = obs
        self.geodata = pd.read_csv(get_filepath(geodata))
        self.histdata = aggregate_data(histdata)
        self.locations = self.histdata['Location'].unique().tolist()
        self.output = None

    def initialize_output(self):
        """Initializes a zero-filled data frame with ``obs`` rows to store
        the randomly generated weather observations.
        """
        cols = ['Location', 'Position', 'Local Time',
                'Conditions', 'Temperature', 'Pressure', 'Humidity']

        # set dims and store as 'output' instance attribute
        rows = self.obs
        dims = len(cols)
        self.output = pd.DataFrame(np.zeros((rows, dims)), columns=cols)

    def generate_position_data(self):
        """Populates the 'Location' and 'Position' attributes of the output
        data frame.

        'Location' is drawn at random (with replacement) from the known
        locations. 'Position' is the string "lat,lng,elevation" of that
        location, with latitude/longitude rounded to two decimals and the
        elevation truncated to an integer.

        Side effect: on the first call the 'Lat', 'Lng' and 'Elevation'
        columns of ``geodata`` are replaced by a single 'Position' column.
        Later calls reuse that column, so the method can be called again.
        """
        # populate 'Location' field randomly
        self.output['Location'] = np.random.choice(self.locations, self.obs)

        # clean up geodata data frame and create 'Position' attribute; this
        # is skipped once the coordinates have already been collapsed, as
        # the source columns no longer exist on a repeated call
        coord_cols = ['Lat', 'Lng', 'Elevation']
        geo_cols = self.geodata.columns
        already_built = ('Position' in geo_cols and
                         not any(c in geo_cols for c in coord_cols))
        if not already_built:
            coords = self.geodata[coord_cols].round(2)
            coords['Elevation'] = coords['Elevation'].astype(int)
            position = coords.astype(str).apply(','.join, axis=1)
            self.geodata = self.geodata.assign(
                Position=position).drop(columns=coord_cols)

        # per-location lookup table (the index must be unique per location)
        lookup = self.geodata.set_index('Location')

        # 'Location' first, then the sorted union of both column sets.
        # reindex() is used because .loc raises KeyError for labels that
        # are missing from the frame on current pandas versions.
        value_cols = self.output.columns.drop('Location').union(lookup.columns)
        merged = self.output.reset_index(drop=True).reindex(
            columns=['Location'] + value_cols.tolist())

        # overwrite each geodata column with the value of the row's
        # location; rows without a usable geodata value keep what they had.
        # Whole columns are replaced, so string positions never have to be
        # written into the zero-initialised float column.
        for col in lookup.columns:
            found = merged['Location'].map(lookup[col])
            missing = found.isna()
            if missing.any():
                found = found.astype(object).where(~missing, merged[col])
            merged[col] = found

        self.output = merged

    def generate_time_data(self):
        """Populates the 'Local Time' field with one independently drawn
        random timestamp per observation, in ISO 8601 format.

        The timestamps come from ``random_date`` called with the instance's
        ``start_date`` and ``end_date``.
        """
        # draw one random date per observation; the count comes from the
        # instance, not from a notebook-level global
        dates = [random_date(start=self.start_date, end=self.end_date)
                 for _ in range(self.obs)]

        # convert to ISO 8601 format and update "Local Time" field; a real
        # list is built because map() is a lazy iterator on Python 3
        self.output['Local Time'] = [d.isoformat() for d in dates]

    def generate_weather_data(self):
        """Populates the 'Temperature', 'Humidity', and 'Pressure' attributes
        using historical values calculated on a monthly level.

        Each observation is matched to the historical row of its location
        and month, and a value is picked between the historical low and high
        of every measure. Requires 'Location' and 'Local Time' to be
        populated already.
        """
        # month of each observation, used as join key (not kept in output)
        months = pd.to_datetime(self.output['Local Time']).dt.month
        keyed = self.output.assign(Month=months)

        # merge output data frame with historical data to get ranges
        keys = ['Location', 'Month']
        m = pd.merge(keyed, self.histdata, how='left', on=keys)

        # use vectorization to uniformly select random pressure, temperature
        # and humidity values between the historical min and max ranges
        # NOTE(review): the same random fraction is used for all three
        # measures of a row, so they are perfectly correlated within their
        # ranges -- confirm this is intended.
        r = np.random.rand(m.shape[0])
        m['Temperature'] = ((m['Tmean_high'] - m['Tmean_low']
                             ) * r + m['Tmean_low']).round(1)
        m['Pressure'] = ((m['Pmax'] - m['Pmin']) * r + m['Pmin']).round(1)
        # astype(int) truncates, and fails if a row found no historical match
        m['Humidity'] = ((m['Hmax'] - m['Hmin']) * r + m['Hmin']).astype(int)

        # drop redundant columns and assign to output
        dcols = ['Month', 'Timezone', 'Pmax', 'Pmin',
                 'Hmax', 'Hmin', 'Tmean_high', 'Tmean_low']
        self.output = m.drop(columns=dcols)


In [43]:
# get grouped dataset
start_date = dt.datetime(2012, 1, 1)
end_date = dt.datetime(2015, 12, 31)
# the input file names get their own variables: `histdata` and `geodata` are
# bound to data frames at the end of this cell, so reusing those names for
# the file names would pass data frames to the constructor on a re-run
histdata_file = 'historical_data.csv'
geodata_file = 'geocoded_locations.txt'
obs = 100  # number of observations to generate

weatherGenerator = WeatherGenerator(
    obs, start_date, end_date, histdata_file, geodata_file)
weatherGenerator.initialize_output()
weatherGenerator.generate_position_data()
weatherGenerator.generate_time_data()
weatherGenerator.generate_weather_data()

# expose the results under short names for the cells below
output = weatherGenerator.output
histdata = weatherGenerator.histdata
geodata = weatherGenerator.geodata

In [46]:
# preview the first rows only, ordered by location and time, instead of
# rendering the whole frame
output.sort_values(['Location', 'Local Time']).head(10)

Unnamed: 0,Location,Conditions,Humidity,Local Time,Position,Pressure,Temperature
37,Anchorage,0.0,54,2013-03-23T10:02:29,"61.22,-149.9,33",1009.2,-7.1
6,Anchorage,0.0,49,2013-04-10T01:02:23,"61.22,-149.9,33",1000.2,0.3
64,Anchorage,0.0,64,2013-05-08T22:55:01,"61.22,-149.9,33",1007.9,6.4
42,Anchorage,0.0,67,2013-06-26T10:39:25,"61.22,-149.9,33",1006.7,15.5
49,Anchorage,0.0,78,2013-08-06T16:12:35,"61.22,-149.9,33",997.3,11.7
32,Anchorage,0.0,84,2013-11-14T01:23:15,"61.22,-149.9,33",1009.8,-8.6
10,Anchorage,0.0,87,2013-11-21T08:31:57,"61.22,-149.9,33",1013.7,-7.6
9,Anchorage,0.0,80,2013-12-29T02:27:55,"61.22,-149.9,33",994.7,-5.5
60,Anchorage,0.0,86,2014-02-01T03:00:48,"61.22,-149.9,33",1010.9,-9.0
3,Anchorage,0.0,63,2014-04-01T03:52:10,"61.22,-149.9,33",1014.8,5.9
