In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# pandas display options: allow up to 500 characters of width and 100 columns
# before truncating, and render DataFrames as HTML tables in the notebook.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# seaborn look-and-feel applied to every plot drawn after this cell.
sns.set_style("whitegrid")
sns.set_context("poster")

In [54]:
# Load the raw PDSI table: an ID column plus one column per month (Jan..Dec).
# NOTE(review): the rows shown by tail() hold -99.99 in every month; that looks
# like a missing-value sentinel -- confirm, and consider na_values=[-99.99].
pdsi_df = pd.read_csv('../data/pdsi.csv')

# Store every ID as text so the column has a single type, then put a leading
# '0' back on the 9-character ids (presumably lost when the ids were parsed as
# integers -- verify against the raw file). Ids of any other length are left
# unchanged apart from the conversion to text.
pdsi_df['ID'] = pdsi_df['ID'].astype(str)
pdsi_df['ID'] = pdsi_df['ID'].where(pdsi_df['ID'].str.len() != 9, '0' + pdsi_df['ID'])
pdsi_df.tail()

Unnamed: 0,ID,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
45491,9104052011,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
45492,9104052012,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
45493,9104052013,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
45494,9104052014,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99
45495,9104052015,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99,-99.99


In [55]:
 # Two-character ID prefix ('01'..'48') -> full state name. The names are listed
 # in code order, so the position in the list (starting at 1) is the code.
 state_digits = dict(
     ('%02d' % code, name)
     for code, name in enumerate([
         'Alabama', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
         'Delaware', 'Florida', 'Georgia', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
         'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',
         'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
         'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina',
         'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island',
         'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
         'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
     ], start=1)
 )

In [56]:
# Two-letter code -> full name. Besides the states this also carries DC, the
# territories and a 'NA' -> 'National' entry. Used later to translate between
# abbreviations and full names.
states_abbrev = dict(
    AK='Alaska', AL='Alabama', AR='Arkansas', AS='American Samoa', AZ='Arizona',
    CA='California', CO='Colorado', CT='Connecticut',
    DC='District of Columbia', DE='Delaware',
    FL='Florida',
    GA='Georgia', GU='Guam',
    HI='Hawaii',
    IA='Iowa', ID='Idaho', IL='Illinois', IN='Indiana',
    KS='Kansas', KY='Kentucky',
    LA='Louisiana',
    MA='Massachusetts', MD='Maryland', ME='Maine', MI='Michigan', MN='Minnesota',
    MO='Missouri', MP='Northern Mariana Islands', MS='Mississippi', MT='Montana',
    NA='National', NC='North Carolina', ND='North Dakota', NE='Nebraska',
    NH='New Hampshire', NJ='New Jersey', NM='New Mexico', NV='Nevada', NY='New York',
    OH='Ohio', OK='Oklahoma', OR='Oregon',
    PA='Pennsylvania', PR='Puerto Rico',
    RI='Rhode Island',
    SC='South Carolina', SD='South Dakota',
    TN='Tennessee', TX='Texas',
    UT='Utah',
    VA='Virginia', VI='Virgin Islands', VT='Vermont',
    WA='Washington', WI='Wisconsin', WV='West Virginia', WY='Wyoming',
)
# Reverse lookup: full name -> two-letter code.
rev_states = dict((full_name, code) for code, full_name in states_abbrev.items())

In [57]:
# Decode each ID: its first two characters select a state through
# state_digits, its last four characters are the year.
id_text = pdsi_df['ID'].astype(str)
state_names = id_text.str[:2].map(state_digits)  # NaN when the prefix has no entry (e.g. above '48')
is_state = state_names.notnull()

# Row-aligned lists kept under their original names. Rows that do not belong
# to a state carry the same placeholders as before: 'ZZ' and the year 2000.
states = state_names.fillna('ZZ').tolist()
year = id_text.str[-4:].where(is_state, '2000').astype(int).tolist()

# Work on a new frame (drop returns one), so the column assignments below never
# write into a filtered slice of the raw table.
pdsi_df = pdsi_df.drop('ID', axis=1)
pdsi_df['State'] = state_names.map(rev_states)  # full name -> two-letter code
pdsi_df['Year'] = year

# Keep state rows for 1995..2010 only; the set_index/reset_index round trip
# moves Year to the first column and renumbers the rows from 0.
pdsi_df = pdsi_df[is_state & (pdsi_df['Year'] > 1994) & (pdsi_df['Year'] < 2011)]
pdsi_df = pdsi_df.set_index(['Year']).reset_index()
pdsi_df.head()

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,State
0,1995,-0.49,-0.54,-0.97,-0.67,-0.84,0.11,0.04,0.02,0.12,1.1,1.44,1.05,AL
1,1996,1.33,-0.7,-0.35,-0.01,-0.8,-1.0,0.64,0.77,1.64,1.65,1.91,1.99,AL
2,1997,2.03,1.69,1.32,1.13,1.92,4.13,3.2,3.23,3.33,4.02,3.34,2.73,AL
3,1998,2.91,2.91,-0.27,0.35,-0.39,-0.75,-0.47,-0.47,-1.31,-1.62,-1.98,0.43,AL
4,1999,1.49,-0.61,-0.47,-0.82,0.15,1.15,-0.2,-1.19,-1.89,-2.27,-2.09,-2.43,AL


In [58]:
# Average every month column over all rows that share a (State, Year) pair;
# as_index=False keeps State and Year as ordinary columns on a 0..n-1 index.
pdsi_station_df = pdsi_df.groupby(['State', 'Year'], as_index=False).mean()
pdsi_station_df.head()

Unnamed: 0,State,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,AL,1995,-0.4325,-0.29,-0.57125,-0.4975,-0.75375,-0.71625,-1.25,-1.30875,-0.8675,1.98375,2.57375,2.3525
1,AL,1996,2.76375,1.75875,2.19125,1.385,-0.0975,-0.375,0.26,0.48375,1.16625,1.065,0.96125,0.98875
2,AL,1997,1.23375,1.285,0.06875,0.825,1.28875,2.6325,2.37625,1.89625,1.645,2.28875,2.555,2.25
3,AL,1998,2.8475,3.0725,1.6775,0.95,-0.71,-1.15125,-1.165,-1.46375,-0.23375,-1.36375,-1.49375,-0.94875
4,AL,1999,-0.1925,-1.2925,-1.27375,-1.8375,-1.0125,0.785,-0.1975,-1.1575,-1.665,-1.445,-1.75875,-2.1125


In [None]:
# Save the per-(State, Year) averages. The row index is written as the first,
# unnamed CSV column (to_csv's default, spelled out here).
# NOTE(review): this path is relative to the working directory, while the input
# was read from '../data' -- confirm 'cleaned_data/' is the intended location.
pdsi_station_df.to_csv('cleaned_data/pdsi_station_df.csv', index=True)

In [59]:
# Month label <-> month number, used to order rows chronologically.
months={'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}
rev_months={v: k for k, v in months.items()}
def format_climate_data(dataset,letter):
    """Pivot a per-state table into one row per (Year, Month).

    Parameters
    ----------
    dataset : DataFrame with 'Year' and 'State' columns; every other column
        must be named after a key of ``months`` ('Jan'..'Dec').
    letter : str suffix appended to each state to form the output column
        name, e.g. 'AL' with letter 'D' becomes 'AL_D'.

    Returns
    -------
    DataFrame with columns 'Year', 'Month' (label such as 'Jan') followed by
    one '<State>_<letter>' column per state in sorted order, with rows ordered
    by year and then calendar month. A state that has no row for a given year
    gets NaN in that year's rows. An empty ``dataset`` yields an empty frame
    that has only the 'Year' and 'Month' columns.

    Raises
    ------
    KeyError if a column other than 'Year'/'State' is not a key of ``months``.
    """
    month_cols = [col for col in dataset.columns if col not in ('Year', 'State')]

    records = []
    # One pass over the table, grouped by year, instead of re-filtering the
    # whole frame for every year.
    for yr, rows in dataset.groupby('Year'):
        labels = [state + '_' + letter for state in rows['State']]
        for col in month_cols:
            # If a state occurs twice within a year the later row wins.
            record = dict(zip(labels, rows[col]))
            record['Month'] = months[col]
            record['Year'] = yr
            records.append(record)

    if not records:
        return pd.DataFrame(columns=['Year', 'Month'])

    new_data = pd.DataFrame(records)
    # Fix the column order explicitly so it does not depend on the row order
    # of the input or on the pandas version.
    state_cols = sorted(col for col in new_data.columns if col not in ('Year', 'Month'))
    new_data = new_data[['Year', 'Month'] + state_cols]
    # DataFrame.sort no longer exists in pandas; sort_values is its replacement.
    new_data = new_data.sort_values(['Year', 'Month']).reset_index(drop=True)
    new_data['Month'] = new_data['Month'].map(rev_months)
    return new_data

In [60]:
# Reshape the per-state averages to one row per (Year, Month), with one
# '<State>_D' column per state.
pdsi_df = format_climate_data(dataset=pdsi_station_df, letter='D')
# NOTE(review): despite the "_2012" suffix this keeps the rows with
# Year == 2010 -- confirm which year is intended before renaming.
pdsi_df_2012 = pdsi_df.loc[pdsi_df['Year'] == 2010]
pdsi_df.tail()

Unnamed: 0,Year,Month,AL_D,AR_D,AZ_D,CA_D,CO_D,CT_D,DE_D,FL_D,GA_D,IA_D,ID_D,IL_D,IN_D,KS_D,KY_D,LA_D,MA_D,MD_D,ME_D,MI_D,MN_D,MO_D,MS_D,MT_D,NC_D,ND_D,NE_D,NH_D,NJ_D,NM_D,NV_D,NY_D,OH_D,OK_D,OR_D,PA_D,RI_D,SC_D,SD_D,TN_D,TX_D,UT_D,VA_D,VT_D,WA_D,WI_D,WV_D,WY_D
187,2010,Aug,-1.4,-1.944444,-0.611429,1.557143,1.28,-2.096667,-2.75,-0.997143,-1.464444,5.784444,1.629,2.717778,-0.713333,2.692222,-0.5575,-1.95,-0.853333,-2.345,0.373333,0.285,2.382222,2.013333,-1.397,3.26,-2.42,5.174444,6.26875,-0.305,-2.766667,0.02,-0.5175,0.773,-0.718,0.01,2.135556,-1.578,2.27,-1.82,6.646667,-0.9875,1.169,0.667143,-2.5,0.51,1.795,2.976667,-1.186667,2.874
188,2010,Sep,-2.0925,-1.851111,-1.14,1.118571,-0.268,-2.493333,-1.61,-1.23,-1.67,6.055556,1.311,2.592222,-1.423333,2.054444,-1.4025,-2.793333,-1.43,-1.96875,1.22,0.782,4.191111,2.781667,-2.233,3.321429,-1.5975,6.445556,5.82,-0.655,-3.006667,-0.67125,-0.9,0.559,-1.077,-0.192222,2.17,-1.255,1.62,-1.795714,7.204444,-1.4425,1.49,-0.095714,-2.015,0.163333,2.312,3.642222,-1.456667,1.616
189,2010,Oct,-2.0,-2.383333,-0.367143,2.052857,-0.32,-0.973333,-0.735,-2.07,-1.955556,5.042222,1.591,1.791111,-2.148889,-0.357778,-1.84,-3.082222,1.366667,-1.38,2.03,-0.593,4.067778,0.991667,-2.419,2.637143,-1.64,6.017778,4.71125,2.13,-1.606667,-0.93125,1.165,2.141,-1.293,-0.855556,2.345556,0.754,1.57,-2.084286,6.454444,-1.245,-0.98,1.25,-1.765,3.006667,2.207,3.321111,-1.635,1.318
190,2010,Nov,-1.64125,-2.332222,-0.778571,1.514286,-0.516,-1.27,-1.06,-2.144286,-2.107778,4.663333,1.727,1.6,-1.787778,-0.245556,-1.625,-2.647778,1.073333,-1.7275,2.273333,-0.948,4.095556,0.918333,-1.97,2.995714,-2.075,5.663333,4.22,1.68,-2.223333,-1.3,1.2125,1.756,-0.836,-0.935556,2.174444,0.64,1.09,-2.434286,5.748889,-0.78,-1.333,1.438571,-1.996667,2.443333,2.001,2.975556,-1.778333,1.79
191,2010,Dec,-2.14125,-2.84,-0.571429,2.762857,-0.366,0.476667,-1.27,-2.26,-2.424444,4.428889,2.279,1.26,-2.155556,-0.528889,-1.7525,-3.204444,1.093333,-1.78,3.083333,-1.294,4.635556,0.416667,-2.622,3.084286,-2.1025,6.13,3.68875,1.84,-1.993333,-1.44125,2.5725,1.653,-1.033,-1.286667,2.642222,0.788,1.08,-2.824286,5.747778,-1.285,-1.732,3.084286,-1.778333,2.53,2.142,2.995556,-1.485,2.129


## To be put in part 2

In [None]:
def make_map_pdsi(year, start_month, end_month):
    """Map the mean PDSI per state for one year over a span of months.

    Parameters
    ----------
    year : value matched against the 'Year' column of ``pdsi_station_df``.
    start_month, end_month : column labels (e.g. 'Jan', 'Mar') bounding the
        months to average; both ends are included.

    Raises
    ------
    KeyError if a state code has no entry in ``states_abbrev``.

    The averaged series, indexed by full state name, is passed to ``make_map``
    together with a title. NOTE(review): ``make_map`` is not defined in this
    section -- presumably it is provided by the part-2 notebook; confirm.
    """
    # Select the year from pdsi_station_df itself (the previous `temp` name is
    # not defined anywhere in this section) and copy it, so that the columns
    # added below do not write into a slice of the shared table.
    input_df = pdsi_station_df.loc[pdsi_station_df['Year'] == year].copy()
    # Label-based, end-inclusive column slice (.ix no longer exists in pandas).
    input_df['mean_pdsi'] = input_df.loc[:, start_month:end_month].mean(axis=1)
    # Replace two-letter codes with full state names in one assignment.
    input_df['State'] = [states_abbrev[code] for code in input_df['State']]
    make_map(input_df.set_index('State').mean_pdsi,"Mean Palmer Drought Severity Index by State from "+str(start_month)+" to "+str(end_month)+" "+str(year))