In [274]:
from datetime import datetime
import matplotlib.pyplot as plt
import meteostat
from meteostat import units
import pandas as pd
import numpy as np
from time import time
pd.set_option('display.max_columns', 150)
import geopy.distance

### Import zipcode to latlon to closest WBAN info mapping

In [294]:
# data comes from PrepareZipLatLonData.ipnb
ziplatlonwban = pd.read_csv('Datasets/Weather/zip_lat_lon_wban_data.csv')
ziplatlonwban['WBAN'] = ziplatlonwban['closestWBAN'].str.split(' ').str[0]
ziplatlonwban['wban_distance_km'] = ziplatlonwban['closestWBAN'].str.split(' ').str[1]
ziplatlonwban.drop(['Unnamed: 0', 'closestWBAN'], axis=1, inplace=True)
def zip_str(row):
    zip_str = str(int(row['ZIP']))
    while len(zip_str) < 5:
        zip_str = '0' + zip_str
    return zip_str
ziplatlonwban['zip_str'] = ziplatlonwban.apply(lambda row: zip_str(row), axis=1)
ziplatlonwban['wban_distance_km'] = ziplatlonwban['wban_distance_km'].astype(float)
ziplatlonwban['LAT'] = ziplatlonwban['LAT'].astype(float)
ziplatlonwban['LNG'] = ziplatlonwban['LNG'].astype(float)
ziplatlonwban.head()

Unnamed: 0,ZIP,LAT,LNG,WBAN,wban_distance_km,zip_str
0,601,18.180555,-66.749961,11641,83.956324,601
1,602,18.361945,-67.175597,11641,124.448695,602
2,603,18.455183,-67.119887,11641,118.334861,603
3,606,18.158345,-66.932911,11641,103.115722,606
4,610,18.295366,-67.125135,11641,119.841322,610


## Getting monthly cloudy and sunny data

### Cloudy Data

In [295]:
# Cloudiness: Mean Number of Days (Clear, Partly Cloudy, Cloudy): The categories are determined for daylight hours only.
# CL = Clear 0/10 to 3/10 average sky cover
# PC = Partially Cloudy 4/10 to 7/10 average sky cover 
# CD = Cloudy 8/10 to 10/10 average sky cover
cloudiness = pd.read_csv('DataSets/Weather/cloudyness.csv')
cloudiness.columns = [x.strip() for x in cloudiness.columns]
for column in cloudiness.columns[1:]:
    cloudiness[column] = cloudiness[column].astype(str)
    cloudiness[column] = cloudiness[column].str.strip()
    cloudiness[column] = cloudiness[column].replace('*', '0')
    cloudiness[column] = cloudiness[column].astype(int)
    
cloudiness['WBAN'] = cloudiness['Location'].str[:5]
cloudiness['State'] = cloudiness['Location'].str.split(',').str[1].str.strip()
cloudiness['City'] = cloudiness['Location'].str.split(',').str[0].str[5:].str.strip()
cloudiness.drop('Location', axis=1, inplace=True)

cloudiness = cloudiness.merge(ziplatlonwban, how='right', on='WBAN')
cloudiness = cloudiness[~cloudiness['LAT'].isna()]
cloudiness.columns = [column.strip() for column in cloudiness.columns]
# #Calculating cloudy day stats
# CL_columns = cloudiness.columns[['CL' in x for x in cloudiness.columns]]
# PC_columns = cloudiness.columns[['PC' in x for x in cloudiness.columns]]
# CD_columns = cloudiness.columns[['CD' in x for x in cloudiness.columns]]
# cloudiness['min_CL_days_per_month'] = cloudiness[CL_columns].min(axis=1)
# cloudiness['min_PC_days_per_month'] = cloudiness[PC_columns].min(axis=1)
# cloudiness['min_CD_days_per_month'] = cloudiness[CD_columns].min(axis=1)
# cloudiness['avg_CL_days'] = cloudiness['Ann  CL'] / 12
# cloudiness['avg_PC_days'] = cloudiness['Ann  PC'] / 12
# cloudiness['avg_CD_days'] = cloudiness['Ann  CD'] / 12
# cloudiness = cloudiness[['min_CL_days_per_month', 'min_PC_days_per_month', 'min_CD_days_per_month', 'Ann  CL', 'Ann  PC', 'Ann  CD', 'avg_CL_days', 'avg_PC_days', 'avg_CD_days', 'WBAN', 'State', 'City', 'zip_str', 'LAT', 'LNG', 'wban_distance_km']]

### Sunny Data

In [296]:
#Sunshine - Average Percent of Possible: The total time that sunshine reaches the surface of Earth is expressed as the percentage of the maximum amount possible from sunrise to sunset with clear sky conditions.
sunny_perc = pd.read_csv('DataSets/Weather/sunny_perc.csv')
sunny_perc['WBAN'] = sunny_perc['Location'].str[:5]
sunny_perc['State'] = sunny_perc['Location'].str.split(',').str[1].str.strip()
sunny_perc['City'] = sunny_perc['Location'].str.split(',').str[0].str[5:].str.strip()
sunny_perc.drop('Location', axis=1, inplace=True)

sunny_perc = sunny_perc.merge(ziplatlonwban, how='right', on='WBAN')

# Missing stations are pacific islands. Considered unnecessary so we can drop them.
sunny_perc = sunny_perc[~sunny_perc['LAT'].isna()]
sunny_perc.columns = [column.strip() for column in sunny_perc.columns]

#Calculating min and max sunny percentage and isolating applicable columns
# sunny_perc['min_sunny_perc'] = sunny_perc[['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']].min(axis=1)
# sunny_perc['max_sunny_perc'] = sunny_perc[['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']].max(axis=1)
# sunny_perc = sunny_perc[['min_sunny_perc', 'max_sunny_perc', 'ANN', 'State', 'City', 'WBAN', 'ZIP', 'LAT', 'LNG', 'wban_distance_km']].copy()
sunny_perc['zip_str'] = sunny_perc.apply(lambda row: zip_str(row), axis=1)

In [303]:
sunny_perc = pd.read_csv('DataSets/Weather/sunny_perc.csv')

In [305]:
sunny_perc = pd.read_csv('DataSets/Weather/sunny_perc.csv')
sunny_perc['WBAN'] = sunny_perc['Location'].str[:5]
sunny_perc['State'] = sunny_perc['Location'].str.split(',').str[1].str.strip()
sunny_perc['City'] = sunny_perc['Location'].str.split(',').str[0].str[5:].str.strip()
sunny_perc.drop('Location', axis=1, inplace=True)

In [307]:
sunny_perc[sunny_perc['State'] == "WA"]

Unnamed: 0,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,ANN,WBAN,State,City
141,20.0,28.0,31.0,36.0,37.0,35.0,42,42,44,34,21,16.0,32.0,94240,WA,QUILLAYUTE
142,23.0,38.0,50.0,52.0,55.0,55.0,64,62,57,43,29,20.0,45.0,24233,WA,SEATTLE
143,25.0,37.0,53.0,57.0,63.0,65.0,78,76,70,54,26,22.0,52.0,24157,WA,SPOKANE


In [299]:
ziplatlonwban[ziplatlonwban['zip_str'] == '98037']

Unnamed: 0,ZIP,LAT,LNG,WBAN,wban_distance_km,zip_str
32336,98037,47.839223,-122.285464,24222,7.632564,98037


In [278]:
# for zip_code in ziplatlon['ZIP'].iloc[4000]:
zip_code = '98037'
#     try:
lat = ziplatlonwban[ziplatlonwban['zip_str'] == zip_code]['LAT'].values[0]
lng = ziplatlonwban[ziplatlonwban['zip_str'] == zip_code]['LNG'].values[0]
wban_zip = ziplatlonwban[ziplatlonwban['zip_str'] == zip_code]
#Create point for specfied zip_code
location = meteostat.Point(lat, lng)

#Set time period
start = datetime(2019, 1, 1)
end = datetime(2020, 1, 1)

start_time = time()

#Get monthly data
data = meteostat.Hourly(location, start, end)
data = data.convert(units.imperial)
data = data.fetch()

season_dict = {1:'winter',
              2: 'winter',
              3: 'spring',
              4: 'spring',
              5: 'spring',
              6: 'summer',
              7: 'summer',
              8: 'summer',
              9: 'fall',
              10:'fall',
              11:'fall',
              12:'winter'}

#Weather condition codes
# https://dev.meteostat.net/formats.html#weather-condition-codes
data['sunny'] = data['coco'] < 3
data['rainy'] = (data['coco'] > 6) | (data['prcp'] > 0)

#Scoping to afternoon data for dwpt calculation
afternoon = data[(data.index.hour > 11) & (data.index.hour < 19)]
afternoon_rhum = afternoon.groupby([afternoon.index.month, afternoon.index.day]).mean()['rhum']
afternoon_rhum.index = afternoon_rhum.index.set_names(['Month', 'Day'])
afternoon_rhum = afternoon_rhum.reset_index()
afternoon_rhum_mthly = pd.DataFrame(afternoon_rhum.groupby('Month').mean()['rhum'])
afternoon_rhum_mthly.columns = ['mthly_afternoon_rhum']

grouped_by_day = data.groupby([data.index.month, 
                               data.index.day]).agg({'temp' : ['mean', 'min', 'max', 'count'], 
                                                                'rhum' : ['mean', 'min', 'max'], 
                                                                'prcp' : ['sum', 'min', 'count'], 
                                                                'rainy' : np.max,
                                                                'sunny' : np.max})

grouped_by_day.index = grouped_by_day.index.set_names(['Month', 'Day'])
grouped_by_day = grouped_by_day.reset_index()
grouped_by_day.columns = ['month', 'day', 'tempmean', 'tempmin', 'tempmax', 'tempcount', 'rhummean', 'rhummin', 'rhummax', 'prcpsum', 'prcpmin', 'prcpcount', 'rainymax', 'sunnymax']
grouped_by_day['over90'] = grouped_by_day['tempmax'] > 90

grouped_by_month = grouped_by_day.groupby('month').agg({'tempmean' : 'mean',
                                                       'tempmin' : 'mean',
                                                       'tempmax' : 'mean',
                                                       'tempcount' : 'sum',
                                                       'rhummean' : 'mean',
                                                       'rhummin' : 'mean',
                                                       'rhummax' : 'mean',
                                                       'prcpsum' : 'sum',
                                                       'prcpmin' : 'mean',
                                                        'prcpcount' : 'sum',
                                                       'rainymax' : 'sum',
                                                        'sunnymax' : 'sum',
                                                        'over90' : 'sum'
                                                       })
grouped_by_month['season'] = grouped_by_month.index.map(season_dict)

grouped_by_month = grouped_by_month.merge(afternoon_rhum_mthly, left_index=True, right_index=True, how='left')

#calculating temp-humidity index (High is good)
grouped_by_month['THI'] = grouped_by_month['tempmean'] - 0.55 * (1 - grouped_by_month['mthly_afternoon_rhum']/100) * (grouped_by_month['tempmean'] - 58)
thi_bins = [-1, 75, 85, 95, 105, 1000]
grouped_by_month['THI_score'] = pd.cut(grouped_by_month['THI'], thi_bins, labels = [len(thi_bins) - 2 - i for i in range(0, len(thi_bins) - 1)]).astype(int)

#calculating temp-humidity index (High is good)
temp_min_bins = [-1, 15, 25, 35, 45, 1000]
grouped_by_month['tempmin_score'] = pd.cut(grouped_by_month['tempmin'], temp_min_bins, labels = [i for i in range(0, len(temp_min_bins) - 1)]).astype(int)

#calculating over90 index (High is good)
over90_bins = [-1, 3, 6, 9, 12, 15, 20, 1000]
grouped_by_month['over90_score'] = pd.cut(grouped_by_month['over90'], over90_bins, labels = [len(thi_bins) - i for i in range(0, len(over90_bins) - 1)]).astype(int)

#calculating rainy index (High is good)
rainy_bins = [-1, 3, 6, 10, 15, 20, 25, 1000]
grouped_by_month['rainy_score'] = pd.cut(grouped_by_month['rainymax'], rainy_bins, labels = [len(rainy_bins) - 2 - i for i in range(0, len(rainy_bins) - 1)]).astype(int)

#calculating total rain index (High is good)
# Not sure if we should include this one...
rainy_sum_bins = [0.00001, 1, 2, 3, 4, 5, 6, 1000]
grouped_by_month['rainy_sum_score'] = pd.cut(grouped_by_month['prcpsum'], rainy_sum_bins, labels = [len(rainy_sum_bins) - 2 - i for i in range(0, len(rainy_sum_bins) - 1)])
# Filling months without rain with slightly penalized score as no one likes a complete desert.
grouped_by_month['rainy_sum_score'] = grouped_by_month['rainy_sum_score'].fillna(len(rainy_sum_bins) - 3).astype(int)

#Adding in cloudy data
cloud_score_bins = [0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 70]
CL_columns = cloudiness.columns[['CL' in x for x in cloudiness.columns]]
PC_columns = cloudiness.columns[['PC' in x for x in cloudiness.columns]]
CD_columns = cloudiness.columns[['CD' in x for x in cloudiness.columns]]
cloudiness_temp = cloudiness[cloudiness['zip_str'] == zip_code][CL_columns].T.reset_index().iloc[:12]
cloudiness_temp['PC_days'] = cloudiness[cloudiness['zip_str'] == zip_code][PC_columns].T.reset_index().iloc[:12].iloc[:,1:].values
cloudiness_temp['CD_days'] = cloudiness[cloudiness['zip_str'] == zip_code][CD_columns].T.reset_index().iloc[:12].iloc[:,1:].values
cloudiness_temp = cloudiness_temp.iloc[:,1:]
cloudiness_temp.index = [x+1 for x in range(12)]
cloudiness_temp.columns = ['CL_days', 'PC_days', 'CD_days']
#Creating cloud score
cloudiness_temp['CloudScore'] = cloudiness_temp['CL_days'] * 2 + cloudiness_temp['PC_days']
cloudiness_temp['cloud_score_binned'] = pd.cut(cloudiness_temp['CloudScore'], cloud_score_bins, labels = [i for i in range(0, len(cloud_score_bins) - 1)]).astype(int)

grouped_by_month = grouped_by_month.merge(cloudiness_temp, how='left', left_index=True, right_index=True)

#Adding in sunny data
sunny_bins = [0, 20, 30, 40, 50, 60, 70, 80, 90, 101]
sunny_temp = sunny_perc[sunny_perc['zip_str'] == zip_code].T.reset_index().iloc[:12]
sunny_temp = sunny_temp.iloc[:,1:].astype(int)
sunny_temp.index = [x+1 for x in range(12)]
sunny_temp.columns = ['Sunny_perc']
sunny_temp['sunny_score_binned'] = pd.cut(sunny_temp['Sunny_perc'], sunny_bins, labels = [i for i in range(0, len(sunny_bins) - 1)]).astype(int)

grouped_by_month = grouped_by_month.merge(sunny_temp, how='left', left_index=True, right_index=True)

#Calculating weather index
thi_weight = 25
mintemp_weight = 5
over90_weight = 20
rainy_day_weight = 5
rainy_sum_weight = 5
cloud_score_weight = 20
sunny_perc_weight = 20

max_thi_score = len(thi_bins) - 2
max_temp_min_score = len(temp_min_bins) - 2
max_over_90_score = len(over90_bins) - 2
max_rainy_score = len(rainy_bins) - 2
max_rainy_sum_score = len(rainy_sum_bins) - 2
max_cloud_score = len(cloud_score_bins) - 2
max_sunny_score = len(sunny_bins) - 2

grouped_by_month['WeatherIndex'] = (grouped_by_month['THI_score'] / max_thi_score * thi_weight) \
                                   + (grouped_by_month['tempmin_score'] / max_temp_min_score * mintemp_weight) \
                                   + (grouped_by_month['over90_score'] / max_over_90_score * over90_weight) \
                                   + (grouped_by_month['rainy_score'] / max_rainy_score * rainy_day_weight) \
                                   + (grouped_by_month['rainy_sum_score'] / max_rainy_sum_score * rainy_sum_weight) \
                                   + (grouped_by_month['cloud_score_binned'] / max_cloud_score * cloud_score_weight) \
                                   + (grouped_by_month['sunny_score_binned'] / max_sunny_score * sunny_perc_weight)

#Aggregating by season
grouped_by_season = grouped_by_month.groupby('season').agg({'tempmean' : 'mean',
                                                        'tempmin' : 'mean',
                                                        'tempmax' : 'mean',
                                                        'tempcount' : 'sum',
                                                        'rhummean' : 'mean',
                                                        'rhummin' : 'mean',
                                                        'rhummax' : 'mean',
                                                        'prcpsum' : 'sum',
                                                        'prcpmin' : 'mean',
                                                        'prcpcount' : 'sum',
                                                        'rainymax' : 'sum',
                                                        'sunnymax' : 'sum',
                                                        'over90' : 'sum',
                                                        'mthly_afternoon_rhum' : 'mean',
                                                        'THI' : 'mean',
                                                        'THI_score' : 'mean',
                                                        'tempmin_score' : 'mean',
                                                        'over90_score' : 'mean', 
                                                        'rainy_score' : 'mean',
                                                        'rainy_sum_score' : 'mean',
                                                        'WeatherIndex' : 'mean'
                                                       })
#         grouped_by_day.to_csv('DataSets/Weather/Daily/{}_2019daily.csv'.format(zip_code))
#         grouped_by_month.to_csv('DataSets/Weather/Monthly/{}_2019monthly.csv'.format(zip_code))
#         grouped_by_season.to_csv('DataSets/Weather/Seasonal/{}_2019seasonal.csv'.format(zip_code))

#         end_time = time()
#         print('Output data for zip: {} in {}'.format(zip_code, end_time - start_time))
#     except:
#         pass

ValueError: Wrong number of items passed 0, placement implies 1

In [266]:
grouped_by_month

Unnamed: 0_level_0,tempmean,tempmin,tempmax,tempcount,rhummean,rhummin,rhummax,prcpsum,prcpmin,prcpcount,rainymax,sunnymax,over90,season,mthly_afternoon_rhum,THI,THI_score,tempmin_score,over90_score,rainy_score,rainy_sum_score,CL_days,PC_days,CD_days,CloudScore,cloud_score_binned,Sunny_perc,sunny_score_binned,WeatherIndex
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
1,77.051774,72.635484,82.793548,745,75.643387,60.806452,87.290323,2.578,0.0,745,25,31,0,winter,70.861751,73.99853,4,4,6,1,4,9,18,4,36,6,66,5,78.666667
2,77.653274,73.117857,83.15,672,75.291667,61.357143,87.642857,2.39,0.0,672,24,28,0,winter,70.75,74.491553,4,4,6,1,4,7,16,4,30,4,69,5,74.666667
3,78.655376,73.612903,85.106452,744,71.994624,55.967742,84.83871,1.389,0.0,744,26,31,1,spring,65.677419,74.756174,4,4,6,0,5,9,17,4,35,5,75,6,79.166667
4,79.868472,75.02,85.933333,720,71.095833,56.433333,82.8,3.263,0.0,720,26,30,0,spring,63.819048,75.51675,3,4,6,0,3,7,17,6,31,5,69,5,68.75
5,81.076613,77.041935,86.609677,744,75.247312,61.16129,85.129032,3.043,0.0,744,31,31,2,spring,69.129032,77.158427,3,4,6,0,3,4,16,12,24,3,58,4,62.25
6,82.884722,78.476667,88.383333,720,76.927778,63.333333,88.233333,3.621,0.0,720,21,29,1,summer,70.585714,78.858907,3,4,6,1,3,4,16,10,24,3,63,5,65.583333
7,83.505376,79.141935,88.722581,744,76.133065,63.290323,87.0,6.804,0.0,744,25,31,3,summer,70.041475,79.302807,3,4,6,1,0,5,17,9,27,4,68,5,65.083333
8,83.677554,79.270968,89.432258,744,76.920699,62.225806,88.322581,5.94,0.0,744,26,31,8,summer,70.898618,79.567666,3,4,4,0,1,5,17,8,27,4,67,5,58.416667
9,83.869583,79.64,89.813333,720,77.526389,62.866667,88.2,10.073,0.0,720,28,29,12,fall,71.357143,79.7942,3,4,3,0,0,4,17,9,25,3,65,5,52.25
10,83.084543,78.687097,88.374194,744,76.810484,62.806452,88.774194,3.417,0.0,744,29,31,2,fall,71.119816,79.100089,3,4,6,0,3,5,17,9,27,4,60,4,64.25


In [267]:
grouped_by_season['WeatherIndex'].mean()

66.35416666666666