### Header

In [None]:
# import libraries

# maths
import numpy as np
import pandas as pd
import scipy.stats as stats
from pandas.api.types import is_numeric_dtype

# visual
#from matplotlib_venn import venn2
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon

# others
import os
import re
import time
import datetime as datetime
from datetime import timedelta

In [None]:
# file paths
# Relative directories for the notebook's data and image folders. Each value
# ends with a trailing slash because file names are appended with plain
# string concatenation (e.g. clean_path + 'weather_clean.csv').

input_path = '../data/2_input/'
clean_path = '../data/3_clean/'
output_path = '../data/4_output/'

image_path = '../images/'

### Functions

### Import Data

In [None]:
# import clean data
# Only the cleaned weather table is loaded here; the train / test / spray
# tables and the Chicago boundary shapefile are left commented out.

#df_train = pd.read_csv(clean_path + 'train_clean.csv')
#df_test = pd.read_csv(clean_path + 'test_clean.csv')
df_weather = pd.read_csv(clean_path + 'weather_clean.csv')
#df_spray = pd.read_csv(clean_path + 'spray_clean.csv')

#street_map = gpd.read_file('../data/2_input/Chicago Boundaries/geo_export_d41dc94d-31ce-478c-902c-864695385885.shp')

### Explore Data

In [None]:
# quick look at the weather table: its (rows, columns) shape, then the first rows

weather_shape = df_weather.shape
print(weather_shape)
df_weather.head(5)

In [None]:
# date information
# for each calendar column, print a label, its distinct values and a blank line

for date_part in ('year', 'month', 'day'):
    print(f'{date_part}:')
    print(df_weather[date_part].unique())
    print('')

In [None]:
# mean of tmax / tmin / tavg for every (year, month) pair
# aggfunc is passed by name: handing pandas the np.mean callable makes recent
# versions emit a FutureWarning and recommend the string 'mean' instead
table = pd.pivot_table(
    df_weather,
    values=['tmax', 'tmin', 'tavg'],
    index=['year', 'month'],
    aggfunc='mean',
)

# show the result so the cell actually displays what it computes
table

- We have weather data for 8 years from 2007 to 2014.
- Data is available for only 6 of the 12 months (late spring, summer, early autumn).

In [None]:
# mean tmax / tmin / tavg for every (year, month) pair
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# names (['tmax','tmin','tavg']) was deprecated and is rejected by pandas 2.x.
# The aggregation already returns a DataFrame, so no extra wrapping is needed.
temp_cols = ['tmax', 'tmin', 'tavg']
df_yr_mth = df_weather.groupby(['year', 'month'])[temp_cols].mean()
df_yr_mth

In [None]:
# split the (year, month) summary into one frame per year,
# keeping the years in the order they first appear in df_weather
year_list = df_weather['year'].unique()
df_list = [df_yr_mth.loc[yr] for yr in year_list]

In [None]:
sns.set_style('whitegrid')

# One panel per entry of df_list, two panels per row. The grid is sized from
# the data instead of being fixed at 4x2, so a different number of years does
# not index past the axes array.
n_cols = 2
n_rows = max(1, (len(df_list) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 6 * n_rows), squeeze=False)

# column -> line colour
series_colors = {'tmax': 'red', 'tmin': 'blue', 'tavg': 'orange'}

for idx, df in enumerate(df_list):

    ax = axes[idx // n_cols, idx % n_cols]

    year = year_list[idx]
    ax.set_title(str(year))
    ax.set_ylim(40, 100)

    for col, color in series_colors.items():
        # marker='o' is forwarded to matplotlib; seaborn's markers=True only
        # takes effect together with a `style` variable, so it drew nothing.
        sns.lineplot(data=df, x=df.index, y=col, ax=ax, color=color, marker='o', label=col)

    # each lineplot call overwrites the y label with its own column name;
    # set a neutral label and rely on the legend to tell the lines apart
    ax.set_ylabel('temperature')
    ax.legend()

# hide any panel of the grid that received no data
for spare_ax in axes.flat[len(df_list):]:
    spare_ax.set_visible(False)

In [None]:
# tavg per month, one bar per year within each month group
tavg_ax = plt.figure(figsize=(20, 8)).gca()

sns.barplot(x='month', y='tavg', hue='year', data=df_weather, ax=tavg_ax)

In [None]:
# clean incorrect sunset time
# example: 1860 -> 1900
#
# A value whose last two digits are 60 is rolled over to the next full hour.
# This is done with integer arithmetic on the whole column rather than row by
# row, so the column keeps a single integer dtype (no strings mixed in) and
# the cell no longer prints one line per row.
# NOTE(review): a value of 2360 would become 2400; not handled, as in the
# original loop.

sunset_hhmm = df_weather['sunset'].astype(int)
has_minute_60 = sunset_hhmm % 100 == 60

# HH60 + 40 == (HH + 1)00
sunset_hhmm.loc[has_minute_60] += 40
df_weather['sunset'] = sunset_hhmm

print(f'corrected {int(has_minute_60.sum())} sunset value(s)')

In [None]:
# parse the HHMM sunrise / sunset columns into datetimes
hhmm_format = '%H%M'
for raw_col, parsed_col in (('sunrise', 'sunrise_2'), ('sunset', 'sunset_2')):
    df_weather[parsed_col] = pd.to_datetime(df_weather[raw_col], format=hhmm_format)

# daylight = time elapsed between sunrise and sunset
df_weather['daylight'] = df_weather['sunset_2'].sub(df_weather['sunrise_2'])

In [None]:
# daylight duration expressed in decimal hours
# computed on the whole column at once instead of one row at a time
df_weather['daylight_2'] = df_weather['daylight'].dt.total_seconds() / 3600

In [None]:
# mean daylight_2 for every (year, month) pair, as a one-column frame
# NOTE(review): this rebinds df_yr_mth, which earlier in the notebook held the
# tmax / tmin / tavg means; cells that read df_yr_mth give different results
# if they are re-run after this one.
df_yr_mth = pd.DataFrame(
    df_weather.groupby(['year', 'month'])['daylight_2'].mean()
)

In [None]:
# mean daylight_2 per (year, month); selecting the column with a list
# makes the aggregation return a one-column DataFrame directly
daylight_by_month = df_weather.groupby(['year', 'month'])
df_daylight = daylight_by_month[['daylight_2']].mean()

In [None]:
# daylight_2 per month, one bar per year within each month group
daylight_ax = plt.figure(figsize=(20, 8)).gca()

sns.barplot(x='month', y='daylight_2', hue='year', data=df_weather, ax=daylight_ax)