In [1]:
import os
import pandas as pd 
import numpy as np
from datetime import datetime
from functools import reduce
import time
import csv

import json
from pandas.io.json import json_normalize

from pprint import pprint


In [2]:
def k_to_c(k):
    """Convert a temperature from kelvin to degrees Celsius.

    Args:
        k: temperature in kelvin.

    Returns:
        The same temperature in degrees Celsius (``k - 273.15``).
    """
    kelvin_offset = 273.15  # 0 degrees Celsius expressed in kelvin
    return k - kelvin_offset

# function to calculate temperature dew point
#  equation ==> Td = T - ((100 - RH) / 5)

def calculate_dp(T, H):
    """Estimate the dew point with the linear rule ``Td = T - (100 - RH) / 5``.

    Args:
        T: air temperature; the notebook passes values in degrees Celsius.
        H: relative humidity on a 0-100 scale (implied by the ``100 - H`` term).

    Returns:
        The estimated dew point, in the same unit as ``T``.

    NOTE(review): this rule of thumb is usually quoted as reliable only for
    relative humidity above roughly 50% -- confirm it is adequate for this data.
    """
    humidity_deficit = 100 - H
    correction = humidity_deficit / 5
    return T - correction

# function to add one new column holding a feature's value from N rows (days) earlier
def new_features(df, feature, N):
    """Add a lagged copy of one column to ``df``, in place.

    The new column is named ``"<feature>_<N>"``. Row ``i`` of it holds the
    value ``feature`` had ``N`` rows earlier; the first ``N`` rows have no
    earlier value and are left missing (NaN).

    Values are shifted by row *position*, so the result does not depend on
    what the index labels are (date strings, non-zero-based integers, ...).
    An ``N`` larger than the number of rows yields an all-missing column
    instead of a length-mismatch error.

    Args:
        df: DataFrame to modify; it gains one column.
        feature: name of an existing column in ``df``.
        N: number of rows to look back.

    Returns:
        None. ``df`` is mutated.
    """
    # make a new column name of feature_N and add to DataFrame
    col_name = "{}_{}".format(feature, N)
    # shift() pads the front with N missing values and keeps the row count,
    # which is what the manual "[None]*N + earlier values" list was building.
    df[col_name] = df[feature].shift(N)

In [32]:
# Read the saved weather records for one city; the file name is built from city_.
city_ = 'kyoto'
with open('json_files/' + city_ + '_weather.json') as f:
    city = json.load(f)

# One accumulator list per field pulled out of each record.
date, city_temp, city_max, city_min = [], [], [], []
city_humidity, city_pressure, city_wind = [], [], []
city_clouds, city_desc = [], []

for measure in city:
    date.append(measure['dt_iso'])

    # Temperature, pressure and humidity all sit under the record's 'main' key.
    main_readings = measure['main']
    city_temp.append(main_readings['temp'])
    city_max.append(main_readings['temp_max'])
    city_min.append(main_readings['temp_min'])
    city_pressure.append(main_readings['pressure'])
    city_humidity.append(main_readings['humidity'])

    city_wind.append(measure['wind']['speed'])
    city_clouds.append(measure['clouds']['all'])
    # Only the first entry of the 'weather' list is kept.
    city_desc.append(measure['weather'][0]['main'])


In [33]:
# Convert each temperature series from Kelvin to Celsius, rounded to 2 decimals.
temp_c = [round(k_to_c(kelvin), 2) for kelvin in city_temp]
temp_max_c = [round(k_to_c(kelvin), 2) for kelvin in city_max]
temp_min_c = [round(k_to_c(kelvin), 2) for kelvin in city_min]



In [26]:
# Dew point per observation: each Celsius temperature series is paired with
# the humidity reading of the same observation.
city_dp = [calculate_dp(temp, hum) for temp, hum in zip(temp_c, city_humidity)]
city_max_dp = [calculate_dp(temp, hum) for temp, hum in zip(temp_max_c, city_humidity)]
city_min_dp = [calculate_dp(temp, hum) for temp, hum in zip(temp_min_c, city_humidity)]

In [34]:
# Reduce every timestamp string to its calendar date (YYYY-MM-DD), dropping the
# time of day.
# NOTE(review): the parse format expects a '+0000 UTC' suffix, so these look
# like UTC calendar days -- confirm that is the intended day boundary.
city_date = [
    datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S +0000 UTC').strftime('%Y-%m-%d')
    for stamp in date
]

In [35]:
# Assemble one row per observation from the per-field lists.
city_dict = {
    "Date": city_date,
    "Avg_temp": temp_c,
    "Temp_max": temp_max_c,
    "Temp_min": temp_min_c,
    "Avg_dwp": city_dp,
    "Max_dwp": city_max_dp,
    "Min_dwp": city_min_dp,
    "Pressure": city_pressure,
    "Humidity": city_humidity,
    "Wind": city_wind,
    "Clouds": city_clouds,
    "Description": city_desc
}

city_df = pd.DataFrame(city_dict)

# Collapse the observations into one row per calendar date.
grouped_city = city_df.groupby('Date')
city_mean = grouped_city[['Avg_temp', 'Avg_dwp']].mean()
# Deliberately NOT named city_max / city_min: those names already hold the raw
# Kelvin lists filled while parsing the JSON. Rebinding them to DataFrames
# makes the Kelvin-to-Celsius cell fail if it is executed again.
city_daily_max = grouped_city[['Temp_max', 'Max_dwp']].max()
city_daily_min = grouped_city[['Temp_min', 'Min_dwp']].min()

dfs = [city_mean, city_daily_max, city_daily_min]

# Join the three daily summaries on their shared 'Date' index.
df_final = reduce(lambda left, right: pd.merge(left, right, on='Date'), dfs)
df_final

Unnamed: 0_level_0,Avg_temp,Avg_dwp,Temp_max,Max_dwp,Temp_min,Min_dwp
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-01,5.609583,0.584583,13.00,9.40,-2.67,-5.27
2017-01-02,6.253750,1.670417,12.00,6.60,-0.06,-2.46
2017-01-03,6.304167,0.795833,14.00,5.00,-1.56,-3.31
2017-01-04,6.980000,0.438333,14.00,4.00,0.20,-4.40
2017-01-05,6.185000,-3.490000,10.00,-1.28,2.08,-5.40
2017-01-06,3.526667,-3.940000,10.00,1.00,-5.17,-7.17
2017-01-07,5.455417,-1.427917,10.00,6.40,0.16,-4.60
2017-01-08,5.946250,3.912917,8.00,8.00,2.67,0.00
2017-01-09,7.057083,2.932083,13.00,6.60,-0.33,-1.53
2017-01-10,7.095000,0.470000,13.00,6.80,0.99,-4.00


In [36]:
# Lag features: for every daily summary column, add its value from 1, 2 and 3
# rows (days) earlier as extra columns on df_final.
features_city = ['Avg_temp', 'Avg_dwp', 'Temp_max', 'Max_dwp', 'Temp_min', 'Min_dwp']

for feature in features_city:
    if feature == 'Date':
        continue  # never lag the date itself
    # N is the number of days prior to the prediction, 3 days for this model
    for N in (1, 2, 3):
        new_features(df_final, feature, N)

# The earliest rows have no complete set of lagged values; drop any row with a
# missing entry.
clean_df = df_final.dropna()
clean_df.columns

Index(['Avg_temp', 'Avg_dwp', 'Temp_max', 'Max_dwp', 'Temp_min', 'Min_dwp',
       'Avg_temp_1', 'Avg_temp_2', 'Avg_temp_3', 'Avg_dwp_1', 'Avg_dwp_2',
       'Avg_dwp_3', 'Temp_max_1', 'Temp_max_2', 'Temp_max_3', 'Max_dwp_1',
       'Max_dwp_2', 'Max_dwp_3', 'Temp_min_1', 'Temp_min_2', 'Temp_min_3',
       'Min_dwp_1', 'Min_dwp_2', 'Min_dwp_3'],
      dtype='object')

In [38]:
# List the dtype of each column in clean_df.
clean_df.dtypes

Avg_temp      float64
Avg_dwp       float64
Temp_max      float64
Max_dwp       float64
Temp_min      float64
Min_dwp       float64
Avg_temp_1    float64
Avg_temp_2    float64
Avg_temp_3    float64
Avg_dwp_1     float64
Avg_dwp_2     float64
Avg_dwp_3     float64
Temp_max_1    float64
Temp_max_2    float64
Temp_max_3    float64
Max_dwp_1     float64
Max_dwp_2     float64
Max_dwp_3     float64
Temp_min_1    float64
Temp_min_2    float64
Temp_min_3    float64
Min_dwp_1     float64
Min_dwp_2     float64
Min_dwp_3     float64
dtype: object