In [1]:
from datetime import datetime
import matplotlib.pyplot as plt
import meteostat
from meteostat import units
import pandas as pd
import numpy as np
from time import time
pd.set_option('display.max_columns', 150)
import geopy.distance

# Adding the closest WBAN station with cloudy/sunny-day data to every US ZIP code

## Getting cloudcover data from applicable WBAN stations

In [2]:
# Load the per-station cloudiness table and strip stray whitespace from its header labels.
cloudiness = pd.read_csv('DataSets/Weather/cloudyness.csv')
cloudiness = cloudiness.rename(columns=str.strip)

# Every column after the first is converted to int: values are stringified and
# trimmed, and a bare '*' entry is replaced by '0' before the cast.
for value_column in cloudiness.columns[1:]:
    as_text = cloudiness[value_column].astype(str).str.strip()
    cloudiness[value_column] = as_text.replace('*', '0').astype(int)

# 'Location' is sliced as '<5-char WBAN><city>, <state>'; break it into its parts.
location = cloudiness['Location']
location_parts = location.str.split(',')
cloudiness['WBAN'] = location.str[:5]
cloudiness['State'] = location_parts.str[1].str.strip()
cloudiness['City'] = location_parts.str[0].str[5:].str.strip()
cloudiness = cloudiness.drop(columns='Location')

# sunny_perc data is narrower than cloudiness and redundant
# sunny_perc = pd.read_csv('DataSets/Weather/sunny_perc.csv')
# sunny_perc['WBAN'] = sunny_perc['Location'].str[:5]
# sunny_perc['State'] = sunny_perc['Location'].str.split(',').str[1].str.strip()
# sunny_perc['City'] = sunny_perc['Location'].str.split(',').str[0].str[5:].str.strip()
# sunny_perc.drop('Location', axis=1, inplace=True)
# sunny_perc.columns = [x.strip() for x in sunny_perc.columns]
# sunny_perc = sunny_perc[~sunny_perc['ANN'].isna()]

## Getting latitude/longitude for major WBAN stations

In [3]:
# Load the WBAN station table, decoding the file as Windows-1252 (cp1252).
# Later statements in this cell read its WBAN_ID, LAT and LON columns.
wbanlatlng = pd.read_csv('DataSets/Weather/wbanlatlng.csv', encoding='cp1252')
def correct_wban(wban):
    """Return ``wban`` as a string left-padded with zeros to at least 5 characters.

    Parameters
    ----------
    wban : any
        Station identifier; it is converted with ``str()`` first, so integers
        such as ``3856`` are accepted.

    Returns
    -------
    str
        The zero-padded identifier (e.g. ``'03856'``). Values whose string form
        is already 5 or more characters long are returned unchanged.
    """
    # str.zfill replaces the manual "prepend '0' until long enough" loop.
    return str(wban).zfill(5)
# Normalise the station id to a zero-padded 5-character string. Mapping over the
# single WBAN_ID column avoids building one mixed-dtype row Series per station,
# which a row-wise apply(axis=1) does (and which may upcast integer ids when a
# row is coerced to a common dtype).
wbanlatlng['WBAN'] = wbanlatlng['WBAN_ID'].map(correct_wban)
# Keep only the join key and the coordinates, renaming LON -> LNG so the column
# name matches the one used by the ZIP-code table. rename() returns a new frame,
# so no labels are assigned onto a slice of wbanlatlng.
wbanlatlng_filtered = wbanlatlng[['WBAN', 'LAT', 'LON']].rename(columns={'LON': 'LNG'})

## Adding latlng information to cloudy WBAN stations

In [4]:
# Attach LAT/LNG to each cloudiness station via the shared 'WBAN' key.
# No `how` is given, so this is pandas' default inner join: stations whose WBAN
# is missing from either table are dropped from the result.
cloudiness_wbans = cloudiness.merge(wbanlatlng_filtered, on='WBAN')

## Getting zipcode list with latlng for all zip codes in the USA

In [5]:
# Load the ZIP-code table; the rest of the notebook uses its ZIP, LAT and LNG columns.
ziplatlon = pd.read_csv('DataSets/ziptolat.csv')

def zip_str(row):
    """Return the row's ZIP code as a zero-padded string of at least 5 characters.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a ``'ZIP'`` entry that ``int()`` accepts. The ``int()``
        conversion drops a trailing ``.0`` when the value arrives as a float.

    Returns
    -------
    str
        The ZIP code left-padded with zeros, e.g. ``601`` -> ``'00601'``.
    """
    # The original used a local also named `zip_str`, shadowing this function
    # inside its own body, and padded with a manual loop; zfill does both jobs.
    return str(int(row['ZIP'])).zfill(5)

# Rewrite ZIP as a zero-padded string; zip_str receives each row directly.
ziplatlon['ZIP'] = ziplatlon.apply(zip_str, axis=1)

In [6]:
# Preview the first two stations after the coordinate join.
cloudiness_wbans.head(2)

Unnamed: 0,Yrs,Jan CL,Jan PC,Jan CD,Feb CL,Feb PC,Feb CD,Mar CL,Mar PC,Mar CD,Apr CL,Apr PC,Apr CD,May CL,May PC,May CD,Jun CL,Jun PC,Jun CD,Jul CL,Jul PC,Jul CD,Aug CL,Aug PC,Aug CD,Sept CL,Sept PC,Sept CD,Oct CL,Oct PC,Oct CD,Nov CL,Nov PC,Nov CD,Dec CL,Dec PC,Dec CD,Ann CL,Ann PC,Ann CD,WBAN,State,City,LAT,LNG
0,37,7,6,18,7,6,15,7,8,16,9,8,13,8,11,12,7,13,10,5,14,12,7,15,10,9,9,11,14,8,9,10,7,13,8,7,16,99,111,155,13876,AL,BIRMINGHAM AP,33.56556,-86.745
1,27,7,6,18,7,6,16,7,8,17,9,7,14,8,10,14,8,11,11,7,13,11,9,13,10,9,9,12,12,7,12,9,7,14,8,6,18,100,101,164,3856,AL,HUNTSVILLE,34.64389,-86.78611


In [7]:
# Preview the first two rows of the ZIP-code table.
ziplatlon.head(2)

Unnamed: 0,ZIP,LAT,LNG
0,601,18.180555,-66.749961
1,602,18.361945,-67.175597


## Adding closest WBAN stations that have cloud cover info to all US zipcodes

In [None]:
# Work on a copy so the columns added below do not modify `ziplatlon`.
ziplatlonwban = ziplatlon.copy()
def get_closest_wban(input_row):
    """Find the station in ``cloudiness_wbans`` nearest to one ZIP-code row.

    Parameters
    ----------
    input_row : pandas.Series (or mapping)
        Must provide ``'LAT'`` and ``'LNG'``; ``'ZIP'`` is only used in the
        failure message.

    Returns
    -------
    str or None
        ``'<WBAN> <distance_km>'`` for the station with the smallest geodesic
        distance (the first one wins on ties), or ``None`` when the lookup
        fails, in which case a message naming the ZIP code is printed.
    """
    # Read the ZIP outside the try block so the failure message can always be
    # formatted; previously an early error left `zipcode` unbound in the handler.
    zipcode = input_row.get('ZIP')
    try:
        coord = (input_row['LAT'], input_row['LNG'])

        # Compute distances straight from the coordinate columns rather than
        # copying the whole station frame and row-applying over it per ZIP code.
        distances_km = [
            geopy.distance.geodesic(coord, (station_lat, station_lng)).km
            for station_lat, station_lng in zip(cloudiness_wbans['LAT'],
                                                cloudiness_wbans['LNG'])
        ]

        # argmin returns the first minimum and raises on an empty station list,
        # which is reported through the handler below.
        nearest = int(np.argmin(distances_km))
        closest_wban = cloudiness_wbans['WBAN'].values[nearest]

        return closest_wban + ' ' + str(distances_km[nearest])
    except Exception:
        # `except Exception` (not a bare `except`) keeps the best-effort
        # behaviour but lets KeyboardInterrupt stop this long-running loop.
        print('Didnt work for zip_code: {}'.format(zipcode))
        return None
    
# Nearest-station lookup for every ZIP code. This is the slow step: each row
# triggers one geodesic computation per station.
ziplatlonwban['closestWBAN'] = ziplatlonwban.apply(get_closest_wban, axis=1)

# get_closest_wban returns '<WBAN> <distance_km>' (or None on failure);
# split once and reuse the pieces.
closest_parts = ziplatlonwban['closestWBAN'].str.split(' ')
ziplatlonwban['WBAN'] = closest_parts.str[0]
ziplatlonwban['wban_distance_km'] = closest_parts.str[1].astype(float)
ziplatlonwban['LAT'] = ziplatlonwban['LAT'].astype(float)
ziplatlonwban['LNG'] = ziplatlonwban['LNG'].astype(float)

# Left join keeps every ZIP code even when its lookup failed. LAT/LNG exist on
# both sides, so they are disambiguated with the _zipcode / _wban suffixes.
ziplatlon_wban_cloudiness = ziplatlonwban.merge(
    cloudiness_wbans, on='WBAN', how='left', suffixes=('_zipcode', '_wban')
)
# Keep the previous name bound to the merged frame for any code that expects it.
ziplatlonwban = ziplatlon_wban_cloudiness

# The original saved `ziplatlon_wban_cloudiness` without ever defining it
# (NameError on a fresh kernel); it is now the merge result above.
ziplatlon_wban_cloudiness.to_csv('DataSets/Weather/zip_lat_lon_wban_cloudiness_data.csv')