# Nenana Ice Classic Data Processing
This notebook was used to process the data gathered for this project.

In [1]:
# imports

# data
import numpy as np
import pandas as pd

# date processing
import datetime

# filter warnings
import warnings
warnings.filterwarnings("ignore")

# garbage collection
import gc

## Helper Functions

In [2]:
def slope(y):
    x = range(len(y))
    # y = mx + b
    m, b = np.polyfit(x, y, 1)
    return m

def accel(y):
    t = range(len(y))
    # y = 1/2 g t^2 + v t + y0
    a, v, y0 = np.polyfit(t, y, 2)
    return 5*a

## Reading data from files

In [3]:
ice_df = pd.read_csv('../data/raw_ice_thickness_1989-2019.csv')
ice_df.head()

Unnamed: 0,Date,Date.1,Thickness
0,0,2019-01-16,16.0
1,1,2019-02-07,16.0
2,2,2019-02-26,23.5
3,3,2019-03-04,32.5
4,4,2019-03-13,25.7


In [4]:
weather_df = pd.read_csv('../data/raw_weather_1989-2020.csv')
weather_df.head()

Unnamed: 0,Date,time,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,windGust,windGustTime,icon,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,precipAccumulation,summary,ozone
0,1989-03-01,604746000.0,604775100.0,604811700.0,0.8,29.65,604814400.0,6.68,604861200.0,20.64,...,,,,,,,,,,
1,1989-03-02,604832400.0,604861260.0,604898280.0,0.83,11.69,604857600.0,-15.29,604925640.0,-0.54,...,,,,,,,,,,
2,1989-03-03,604918800.0,604947480.0,604984920.0,0.87,1.84,604970400.0,-26.23,605034000.0,-5.72,...,,,,,,,,,,
3,1989-03-04,605005200.0,605033640.0,605071500.0,0.9,4.69,605062800.0,-38.53,605118840.0,4.47,...,,,,,,,,,,
4,1989-03-05,605091600.0,605119860.0,605158080.0,0.94,0.74,605149320.0,-27.43,605189280.0,0.23,...,,,,,,,,,,


In [5]:
winners_df = pd.read_csv('../data/ice_classic_winning_times.csv')
winners_df.head()

Unnamed: 0,Date,Time
0,1917-04-30,11:30 AM
1,1918-05-11,9:33 AM
2,1919-05-03,2:33 PM
3,1920-05-11,10:46 AM
4,1921-05-11,6:42 AM


## Merge the DataFrames into one

In [6]:
merged_df = weather_df.merge(winners_df, how = 'left', on = 'Date')
merged_df.head()

Unnamed: 0,Date,time,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,windGustTime,icon,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,precipAccumulation,summary,ozone,Time
0,1989-03-01,604746000.0,604775100.0,604811700.0,0.8,29.65,604814400.0,6.68,604861200.0,20.64,...,,,,,,,,,,
1,1989-03-02,604832400.0,604861260.0,604898280.0,0.83,11.69,604857600.0,-15.29,604925640.0,-0.54,...,,,,,,,,,,
2,1989-03-03,604918800.0,604947480.0,604984920.0,0.87,1.84,604970400.0,-26.23,605034000.0,-5.72,...,,,,,,,,,,
3,1989-03-04,605005200.0,605033640.0,605071500.0,0.9,4.69,605062800.0,-38.53,605118840.0,4.47,...,,,,,,,,,,
4,1989-03-05,605091600.0,605119860.0,605158080.0,0.94,0.74,605149320.0,-27.43,605189280.0,0.23,...,,,,,,,,,,


### Rename a couple of columns to avoid potential confusion
The column 'time' contains the time that weather measurements were taken; 'Time' contains the winning time for a given year. This could be confusing, so I renamed 'time' to 'readingTime' and 'Time' to 'winningTime.'

In [7]:
merged_df.rename(columns = {'time' : 'readingTime', 'Time' : 'winningTime'}, inplace = True)

In [8]:
merged_df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,windGustTime,icon,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,precipAccumulation,summary,ozone,winningTime
0,1989-03-01,604746000.0,604775100.0,604811700.0,0.8,29.65,604814400.0,6.68,604861200.0,20.64,...,,,,,,,,,,
1,1989-03-02,604832400.0,604861260.0,604898280.0,0.83,11.69,604857600.0,-15.29,604925640.0,-0.54,...,,,,,,,,,,
2,1989-03-03,604918800.0,604947480.0,604984920.0,0.87,1.84,604970400.0,-26.23,605034000.0,-5.72,...,,,,,,,,,,
3,1989-03-04,605005200.0,605033640.0,605071500.0,0.9,4.69,605062800.0,-38.53,605118840.0,4.47,...,,,,,,,,,,
4,1989-03-05,605091600.0,605119860.0,605158080.0,0.94,0.74,605149320.0,-27.43,605189280.0,0.23,...,,,,,,,,,,


### Calculate the number of hours of daylight
Daylight hours fluctuate a lot in Alaska. Articles I had read about the NIC mention that things like how much snow covers the ice, and therefore how much direct sunlight the ice receives, affect the rate the river ice melts.

I proxied snow depth with precipitation measurements. I theorized that the number of sunlight hours in a day may be a proxy for how much sunlight the river ice receives. I also had information regarding cloud cover which may help in that regard.

I calculated the number of daylight hours as a percentage of the day. That way its value is already scaled for use in modeling later.

In [9]:
def calc_daylight(sunrise, sunset):
    daylight_hours = (sunset - sunrise) / 60 / 60 /24
    return daylight_hours

In [10]:
merged_df['daylightHours'] = calc_daylight(merged_df['sunriseTime'], merged_df['sunsetTime'])

In [11]:
merged_df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,icon,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,precipAccumulation,summary,ozone,winningTime,daylightHours
0,1989-03-01,604746000.0,604775100.0,604811700.0,0.8,29.65,604814400.0,6.68,604861200.0,20.64,...,,,,,,,,,,0.423611
1,1989-03-02,604832400.0,604861260.0,604898280.0,0.83,11.69,604857600.0,-15.29,604925640.0,-0.54,...,,,,,,,,,,0.428472
2,1989-03-03,604918800.0,604947480.0,604984920.0,0.87,1.84,604970400.0,-26.23,605034000.0,-5.72,...,,,,,,,,,,0.433333
3,1989-03-04,605005200.0,605033640.0,605071500.0,0.9,4.69,605062800.0,-38.53,605118840.0,4.47,...,,,,,,,,,,0.438194
4,1989-03-05,605091600.0,605119860.0,605158080.0,0.94,0.74,605149320.0,-27.43,605189280.0,0.23,...,,,,,,,,,,0.442361


In [12]:
merged_df['daylightHours'].value_counts()

0.842361    10
0.840972    10
0.845139    10
0.472917    10
0.829167    10
            ..
0.800000     1
0.757639     1
0.652778     1
0.804167     1
0.806944     1
Name: daylightHours, Length: 580, dtype: int64

In [13]:
ice_df.head()

Unnamed: 0,Date,Date.1,Thickness
0,0,2019-01-16,16.0
1,1,2019-02-07,16.0
2,2,2019-02-26,23.5
3,3,2019-03-04,32.5
4,4,2019-03-13,25.7


In [14]:
ice_df.drop(columns = 'Date', inplace = True)

In [15]:
ice_df.rename(columns = {'Date.1' : 'Date'}, inplace = True)

In [16]:
ice_df.head()

Unnamed: 0,Date,Thickness
0,2019-01-16,16.0
1,2019-02-07,16.0
2,2019-02-26,23.5
3,2019-03-04,32.5
4,2019-03-13,25.7


In [17]:
df = merged_df.merge(ice_df, how = 'left', on = 'Date')

In [18]:
df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,precipAccumulation,summary,ozone,winningTime,daylightHours,Thickness
0,1989-03-01,604746000.0,604775100.0,604811700.0,0.8,29.65,604814400.0,6.68,604861200.0,20.64,...,,,,,,,,,0.423611,
1,1989-03-02,604832400.0,604861260.0,604898280.0,0.83,11.69,604857600.0,-15.29,604925640.0,-0.54,...,,,,,,,,,0.428472,
2,1989-03-03,604918800.0,604947480.0,604984920.0,0.87,1.84,604970400.0,-26.23,605034000.0,-5.72,...,,,,,,,,,0.433333,
3,1989-03-04,605005200.0,605033640.0,605071500.0,0.9,4.69,605062800.0,-38.53,605118840.0,4.47,...,,,,,,,,,0.438194,
4,1989-03-05,605091600.0,605119860.0,605158080.0,0.94,0.74,605149320.0,-27.43,605189280.0,0.23,...,,,,,,,,,0.442361,


### Rename other columns
There are a couple of other columns that I decided to rename to make them a little more descriptive of the values they contain.

In [19]:
df.columns

Index(['Date', 'readingTime', 'sunriseTime', 'sunsetTime', 'moonPhase',
       'temperatureHigh', 'temperatureHighTime', 'temperatureLow',
       'temperatureLowTime', 'apparentTemperatureHigh',
       'apparentTemperatureHighTime', 'apparentTemperatureLow',
       'apparentTemperatureLowTime', 'dewPoint', 'humidity', 'windSpeed',
       'windBearing', 'cloudCover', 'uvIndex', 'uvIndexTime', 'visibility',
       'temperatureMin', 'temperatureMinTime', 'temperatureMax',
       'temperatureMaxTime', 'apparentTemperatureMin',
       'apparentTemperatureMinTime', 'apparentTemperatureMax',
       'apparentTemperatureMaxTime', 'precipType', 'pressure', 'windGust',
       'windGustTime', 'icon', 'precipIntensity', 'precipIntensityMax',
       'precipIntensityMaxTime', 'precipProbability', 'precipAccumulation',
       'summary', 'ozone', 'winningTime', 'daylightHours', 'Thickness'],
      dtype='object')

In [20]:
df.rename(columns = {'summary':'weatherSummary', 'Thickness':'iceThickness', 'pressure':'atmoPressure'}, inplace = True)

In [21]:
df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,precipAccumulation,weatherSummary,ozone,winningTime,daylightHours,iceThickness
0,1989-03-01,604746000.0,604775100.0,604811700.0,0.8,29.65,604814400.0,6.68,604861200.0,20.64,...,,,,,,,,,0.423611,
1,1989-03-02,604832400.0,604861260.0,604898280.0,0.83,11.69,604857600.0,-15.29,604925640.0,-0.54,...,,,,,,,,,0.428472,
2,1989-03-03,604918800.0,604947480.0,604984920.0,0.87,1.84,604970400.0,-26.23,605034000.0,-5.72,...,,,,,,,,,0.433333,
3,1989-03-04,605005200.0,605033640.0,605071500.0,0.9,4.69,605062800.0,-38.53,605118840.0,4.47,...,,,,,,,,,0.438194,
4,1989-03-05,605091600.0,605119860.0,605158080.0,0.94,0.74,605149320.0,-27.43,605189280.0,0.23,...,,,,,,,,,0.442361,


In [22]:
# delete unneeded variables
del merged_df, ice_df, winners_df, weather_df
gc.collect()

0

### Dealing With Missing Values

---
#### Forward fill ice thickness values until next observed value
Since ice thickness is only measured periodically, I chose to make the naive assumption that it remains constant until the next measurement.

In [23]:
df['iceThickness'].fillna(method = 'pad', inplace = True)

In [24]:
df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,precipAccumulation,weatherSummary,ozone,winningTime,daylightHours,iceThickness
0,1989-03-01,604746000.0,604775100.0,604811700.0,0.8,29.65,604814400.0,6.68,604861200.0,20.64,...,,,,,,,,,0.423611,
1,1989-03-02,604832400.0,604861260.0,604898280.0,0.83,11.69,604857600.0,-15.29,604925640.0,-0.54,...,,,,,,,,,0.428472,
2,1989-03-03,604918800.0,604947480.0,604984920.0,0.87,1.84,604970400.0,-26.23,605034000.0,-5.72,...,,,,,,,,,0.433333,
3,1989-03-04,605005200.0,605033640.0,605071500.0,0.9,4.69,605062800.0,-38.53,605118840.0,4.47,...,,,,,,,,,0.438194,
4,1989-03-05,605091600.0,605119860.0,605158080.0,0.94,0.74,605149320.0,-27.43,605189280.0,0.23,...,,,,,,,,,0.442361,


In [25]:
df['iceThickness'].isna().sum()

15

In [26]:
df.loc[df['iceThickness'].isna() == False]

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,precipAccumulation,weatherSummary,ozone,winningTime,daylightHours,iceThickness
15,1989-03-16,6.060420e+08,6.060679e+08,6.061105e+08,0.34,24.80,6.060996e+08,-10.25,6.061500e+08,18.16,...,,,,,,,,,0.493056,37.5
16,1989-03-17,6.061284e+08,6.061540e+08,6.061970e+08,0.37,20.65,6.061788e+08,1.70,6.062436e+08,18.19,...,,,,,,,,,0.497917,37.5
17,1989-03-18,6.062148e+08,6.062402e+08,6.062836e+08,0.40,11.75,6.062791e+08,-5.48,6.063184e+08,2.00,...,,,,,,,,,0.502083,37.5
18,1989-03-19,6.063012e+08,6.063264e+08,6.063702e+08,0.43,14.80,6.063557e+08,-0.26,6.064056e+08,5.22,...,,,,,,,,,0.506944,37.5
19,1989-03-20,6.063876e+08,6.064126e+08,6.064568e+08,0.46,11.69,6.064416e+08,-14.70,6.064978e+08,1.71,...,,,,,,,,,0.511806,37.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2674,2019-05-27,1.558944e+09,1.558958e+09,1.559030e+09,0.80,68.04,1.558998e+09,49.06,1.559052e+09,67.54,...,0.0003,0.0016,1.559013e+09,0.12,,Partly cloudy throughout the day.,359.8,,0.827778,25.3
2675,2019-05-28,1.559030e+09,1.559045e+09,1.559116e+09,0.83,67.04,1.559093e+09,46.89,1.559132e+09,66.54,...,0.0008,0.0107,1.559056e+09,0.73,,Drizzle in the morning.,359.0,,0.832639,25.3
2676,2019-05-29,1.559117e+09,1.559131e+09,1.559203e+09,0.86,70.52,1.559172e+09,52.30,1.559221e+09,70.02,...,0.0001,0.0002,1.559120e+09,0.07,,Partly cloudy throughout the day.,347.8,,0.836806,25.3
2677,2019-05-30,1.559203e+09,1.559217e+09,1.559290e+09,0.89,75.52,1.559267e+09,57.20,1.559312e+09,75.02,...,0.0001,0.0001,1.559265e+09,0.09,,Partly cloudy throughout the day.,349.5,,0.840972,25.3


The first ~2 weeks' values were missed by the first fill operation; I backfilled them to eliminate NaN values in the iceThickness column.

In [27]:
df['iceThickness'].fillna(method = 'bfill', inplace = True)

In [28]:
df['iceThickness'].isna().sum()

0

In [29]:
df['iceThickness'].value_counts()

36.0    194
38.0    148
35.5    114
33.5    111
25.0    110
       ... 
31.9      3
46.4      3
58.0      3
24.0      2
27.0      1
Name: iceThickness, Length: 126, dtype: int64

---
#### Display DataFrame information
I wanted an idea of what other missing data I needed to deal with.

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2679 entries, 0 to 2678
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         2679 non-null   object 
 1   readingTime                  2679 non-null   float64
 2   sunriseTime                  2679 non-null   float64
 3   sunsetTime                   2679 non-null   float64
 4   moonPhase                    2679 non-null   float64
 5   temperatureHigh              2662 non-null   float64
 6   temperatureHighTime          2662 non-null   float64
 7   temperatureLow               2609 non-null   float64
 8   temperatureLowTime           2609 non-null   float64
 9   apparentTemperatureHigh      2662 non-null   float64
 10  apparentTemperatureHighTime  2662 non-null   float64
 11  apparentTemperatureLow       2609 non-null   float64
 12  apparentTemperatureLowTime   2609 non-null   float64
 13  dewPoint          

---
#### temperatureMin, temperatureMax
There were 56 missing values in these columns. I decided on the naive approach of setting the missing values to the overnight low and daytime high, respectively. There were few enough missing that I didn't think this approach would overly bias a model.

In [31]:
df['temperatureMin'].fillna(value = df['temperatureLow'], inplace = True)
df['temperatureMax'].fillna(value = df['temperatureHigh'], inplace = True)

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2679 entries, 0 to 2678
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         2679 non-null   object 
 1   readingTime                  2679 non-null   float64
 2   sunriseTime                  2679 non-null   float64
 3   sunsetTime                   2679 non-null   float64
 4   moonPhase                    2679 non-null   float64
 5   temperatureHigh              2662 non-null   float64
 6   temperatureHighTime          2662 non-null   float64
 7   temperatureLow               2609 non-null   float64
 8   temperatureLowTime           2609 non-null   float64
 9   apparentTemperatureHigh      2662 non-null   float64
 10  apparentTemperatureHighTime  2662 non-null   float64
 11  apparentTemperatureLow       2609 non-null   float64
 12  apparentTemperatureLowTime   2609 non-null   float64
 13  dewPoint          

There were still missing values after that treatment. I decided to replace the remaining missing values with the median value for the year the missing value occurred in.

In [33]:
# convert Date from string to datetime object
df['Date'] = pd.to_datetime(df['Date'])

In [34]:
# what years are missing?
min_missing_years = df.loc[df['temperatureMin'].isna() == True]['Date'].dt.year.unique().tolist()

In [35]:
min_missing_years

[1989, 1991, 1995, 1996, 1999, 2002, 2003]

In [36]:
# what is the median temperatureMin for each year?
min_median = df.groupby(df['Date'].dt.year)['temperatureMin'].median()

In [37]:
# show median temperatureMin for each year
min_median.values

array([ 21.64 ,  25.085,  27.75 ,  20.39 ,  24.505,  23.78 , -14.66 ,
        32.41 ,  19.805,  27.87 ,  15.39 ,  22.505,  20.76 ,  22.35 ,
        27.74 ,  28.495,  17.785,  20.495,  28.415,  24.415,  24.285,
        27.54 ,  20.96 ,  26.405,  12.54 ,  26.815,  28.26 ,  28.36 ,
        26.035,  22.895,  28.075])

In [38]:
# validate that the index is each year
min_median.index

Int64Index([1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
            2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
            2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019],
           dtype='int64', name='Date')

In [39]:
# make a dict of the missing years and the medians of temperatureMin of those years
missing_year_medians = {}
for n, val in enumerate(min_median):
    if min_median.index[n] in min_missing_years:
        missing_year_medians[min_median.index[n]] = val
    else:
        pass
missing_year_medians

{1989: 21.64,
 1991: 27.75,
 1995: -14.66,
 1996: 32.41,
 1999: 15.39,
 2002: 22.35,
 2003: 27.74}

In [40]:
# set the missing values to the median for the year
for year in missing_year_medians:
    df['temperatureMin'].loc[(df['Date'].dt.year == year) & (df['temperatureMin'].isna() == True)] = missing_year_medians[year]

In [41]:
# what years are missing?
max_missing_years = df.loc[df['temperatureMax'].isna() == True]['Date'].dt.year.unique().tolist()

In [42]:
max_missing_years

[1993, 1994, 1999, 2003]

In [43]:
# what is the median temperatureMax for each year?
max_median = df.groupby(df['Date'].dt.year)['temperatureMax'].median()

In [44]:
# show median temperatureMax for each year
max_median.values

array([40.56 , 47.1  , 49.51 , 38.66 , 47.8  , 46.955, 22.125, 55.4  ,
       45.185, 48.75 , 36.14 , 42.77 , 39.05 , 38.01 , 50.54 , 48.77 ,
       41.54 , 38.975, 50.225, 41.22 , 43.445, 47.975, 42.54 , 46.755,
       33.105, 49.62 , 48.8  , 49.665, 46.59 , 43.5  , 48.06 ])

In [45]:
# validate that the index is each year
max_median.index

Int64Index([1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
            2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
            2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019],
           dtype='int64', name='Date')

In [46]:
# make a dict of the missing years and the medians of temperatureMax of those years
missing_year_medians = {}
for n, val in enumerate(max_median):
    if max_median.index[n] in max_missing_years:
        missing_year_medians[max_median.index[n]] = val
    else:
        pass
missing_year_medians

{1993: 47.8, 1994: 46.955, 1999: 36.14, 2003: 50.54}

In [47]:
# set the missing values to the median for the year
for year in missing_year_medians:
    df['temperatureMax'].loc[(df['Date'].dt.year == year) & (df['temperatureMax'].isna() == True)] = missing_year_medians[year]

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2679 entries, 0 to 2678
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Date                         2679 non-null   datetime64[ns]
 1   readingTime                  2679 non-null   float64       
 2   sunriseTime                  2679 non-null   float64       
 3   sunsetTime                   2679 non-null   float64       
 4   moonPhase                    2679 non-null   float64       
 5   temperatureHigh              2662 non-null   float64       
 6   temperatureHighTime          2662 non-null   float64       
 7   temperatureLow               2609 non-null   float64       
 8   temperatureLowTime           2609 non-null   float64       
 9   apparentTemperatureHigh      2662 non-null   float64       
 10  apparentTemperatureHighTime  2662 non-null   float64       
 11  apparentTemperatureLow       2609 non-null 

---
#### (event)Time, apparentTemperature(category), sunriseTime, sunsetTime, weatherSummary, precipProbability, precipIntensityMax, visibility, temperatureHigh, temperatureLow, dewPoint
I didn't think any of these would help with this prediction task.

My reasoning:
* Most "(event)Time" features dropped were because they only captured a single, specific event (the time the daytime high temperature was recorded, for example)
  * I may go back and get hourly data; something like length of time that the temperature was high or low for a day might have an impact. For now, I passed this by due to time constraints.
* "apparentTemperature(category)" features were dropped because they're a representation of what conditions feel like, not what they actually are.
* Sunrise and sunset times were dropped because that information was captured in 'daylightHours'
* "weatherSummary"  was dropped because it's a text summary of information captured by other features.
* "precipProbability" was dropped because this is past data and the presence/amount of precipitation is known.
* "precipIntensityMax" was dropped because the precipitation rate is already captured by "precipIntensity" and I don't think knowing the maximum precipitation rate adds anything.
* "visibility" was dropped because atmospheric conditions are already captured by other features (e.g., "uvIndex," "precipIntensity")
* "temperatureHigh" and "temperatureLow" were dropped because the first is the daytime high and the second is the nighttime low; the 24 hour maximum and minimum are captured by "temperatureMax" and "temperatureMin" respectively
* "dewPoint" was dropped because it's correlated with temperature and humidity

In [49]:
df.drop(columns = ['readingTime', 'weatherSummary', 'sunriseTime', 'sunsetTime', 'precipProbability',
                   'apparentTemperatureHigh', 'apparentTemperatureHighTime', 'apparentTemperatureLow',
                   'apparentTemperatureLowTime', 'uvIndexTime', 'apparentTemperatureMin',
                   'apparentTemperatureMinTime', 'apparentTemperatureMax', 'apparentTemperatureMaxTime',
                   'visibility', 'precipIntensityMax', 'dewPoint',
                   'temperatureHigh', 'temperatureHighTime', 'temperatureLow', 'temperatureLowTime',
                   'temperatureMinTime', 'temperatureMaxTime'],
        inplace = True)

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2679 entries, 0 to 2678
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    2679 non-null   datetime64[ns]
 1   moonPhase               2679 non-null   float64       
 2   humidity                2628 non-null   float64       
 3   windSpeed               2571 non-null   float64       
 4   windBearing             2565 non-null   float64       
 5   cloudCover              2536 non-null   float64       
 6   uvIndex                 2640 non-null   float64       
 7   temperatureMin          2679 non-null   float64       
 8   temperatureMax          2679 non-null   float64       
 9   precipType              442 non-null    object        
 10  atmoPressure            1956 non-null   float64       
 11  windGust                1289 non-null   float64       
 12  windGustTime            1289 non-null   float64 

---
#### precipAccumulation
Since precipAccumulation is defined as "The amount of snowfall accumulation expected to occur (over the hour or day, respectively), in inches. (If no snowfall is expected, this property will not be defined.)," I decided to fill missing values in that column with 0.

In [51]:
df['precipAccumulation'].isna().sum()

2501

In [52]:
df['precipAccumulation'].fillna(value = 0, downcast = 'infer', inplace = True)

In [53]:
df['precipAccumulation'].isna().sum()

0

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2679 entries, 0 to 2678
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    2679 non-null   datetime64[ns]
 1   moonPhase               2679 non-null   float64       
 2   humidity                2628 non-null   float64       
 3   windSpeed               2571 non-null   float64       
 4   windBearing             2565 non-null   float64       
 5   cloudCover              2536 non-null   float64       
 6   uvIndex                 2640 non-null   float64       
 7   temperatureMin          2679 non-null   float64       
 8   temperatureMax          2679 non-null   float64       
 9   precipType              442 non-null    object        
 10  atmoPressure            1956 non-null   float64       
 11  windGust                1289 non-null   float64       
 12  windGustTime            1289 non-null   float64 

---
#### precipType
I wanted to keep this column since rain or snow could impact the river ice, so I filled NaNs with 'None'.

In [55]:
df['precipType'].value_counts()

snow    241
rain    201
Name: precipType, dtype: int64

In [56]:
df['precipType'].fillna(value = 'None', inplace = True)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2679 entries, 0 to 2678
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    2679 non-null   datetime64[ns]
 1   moonPhase               2679 non-null   float64       
 2   humidity                2628 non-null   float64       
 3   windSpeed               2571 non-null   float64       
 4   windBearing             2565 non-null   float64       
 5   cloudCover              2536 non-null   float64       
 6   uvIndex                 2640 non-null   float64       
 7   temperatureMin          2679 non-null   float64       
 8   temperatureMax          2679 non-null   float64       
 9   precipType              2679 non-null   object        
 10  atmoPressure            1956 non-null   float64       
 11  windGust                1289 non-null   float64       
 12  windGustTime            1289 non-null   float64 

---
#### icon
The 'icon' column was missing values in just under half of the columns. It didn't appear to capture any unique information that wasn't already captured in other columns, so I dropped it.

In [58]:
df['icon'].value_counts()

clear-day            654
partly-cloudy-day    307
snow                 162
rain                 125
fog                   74
cloudy                44
wind                   1
Name: icon, dtype: int64

In [59]:
df.drop(columns = 'icon', inplace = True)

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2679 entries, 0 to 2678
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    2679 non-null   datetime64[ns]
 1   moonPhase               2679 non-null   float64       
 2   humidity                2628 non-null   float64       
 3   windSpeed               2571 non-null   float64       
 4   windBearing             2565 non-null   float64       
 5   cloudCover              2536 non-null   float64       
 6   uvIndex                 2640 non-null   float64       
 7   temperatureMin          2679 non-null   float64       
 8   temperatureMax          2679 non-null   float64       
 9   precipType              2679 non-null   object        
 10  atmoPressure            1956 non-null   float64       
 11  windGust                1289 non-null   float64       
 12  windGustTime            1289 non-null   float64 

---
#### winningTime
I filled missing values in 'winningTime' with zeroes, since only there will only be an entry in this column for the day that the ice broke.

In [61]:
df['winningTime'].fillna(value = 0, inplace = True)

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2679 entries, 0 to 2678
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    2679 non-null   datetime64[ns]
 1   moonPhase               2679 non-null   float64       
 2   humidity                2628 non-null   float64       
 3   windSpeed               2571 non-null   float64       
 4   windBearing             2565 non-null   float64       
 5   cloudCover              2536 non-null   float64       
 6   uvIndex                 2640 non-null   float64       
 7   temperatureMin          2679 non-null   float64       
 8   temperatureMax          2679 non-null   float64       
 9   precipType              2679 non-null   object        
 10  atmoPressure            1956 non-null   float64       
 11  windGust                1289 non-null   float64       
 12  windGustTime            1289 non-null   float64 

---
#### ozone, precipIntensityMaxTime, windGust, windGustTime
I decided to drop the 'ozone,' 'precipIntensityMaxTime,' 'windGust,' and 'windGustTime' columns because they were missing so much information, and I had no good strategy for imputing those values.

In [63]:
df.drop(columns = ['ozone', 'precipIntensityMaxTime', 'windGust', 'windGustTime'], inplace = True)

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2679 entries, 0 to 2678
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                2679 non-null   datetime64[ns]
 1   moonPhase           2679 non-null   float64       
 2   humidity            2628 non-null   float64       
 3   windSpeed           2571 non-null   float64       
 4   windBearing         2565 non-null   float64       
 5   cloudCover          2536 non-null   float64       
 6   uvIndex             2640 non-null   float64       
 7   temperatureMin      2679 non-null   float64       
 8   temperatureMax      2679 non-null   float64       
 9   precipType          2679 non-null   object        
 10  atmoPressure        1956 non-null   float64       
 11  precipIntensity     1406 non-null   float64       
 12  precipAccumulation  2679 non-null   float64       
 13  winningTime         2679 non-null   object      

---
#### atmoPressure
Barometric pressure is used as an aid in forecasting weather. Falling pressure indicates incoming inclement weather and rising pressure indicates incoming fair weather. With temperature, precipitation, and UV index information available, I chose to drop this column.

In [65]:
df.drop(columns = 'atmoPressure', inplace = True)

---
#### windSpeed, windBearing
I decided to replace missing values in these columns with zeroes.

In [66]:
df['windSpeed'].fillna(value = 0, inplace = True)
df['windBearing'].fillna(value = 0, inplace = True)

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2679 entries, 0 to 2678
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                2679 non-null   datetime64[ns]
 1   moonPhase           2679 non-null   float64       
 2   humidity            2628 non-null   float64       
 3   windSpeed           2679 non-null   float64       
 4   windBearing         2679 non-null   float64       
 5   cloudCover          2536 non-null   float64       
 6   uvIndex             2640 non-null   float64       
 7   temperatureMin      2679 non-null   float64       
 8   temperatureMax      2679 non-null   float64       
 9   precipType          2679 non-null   object        
 10  precipIntensity     1406 non-null   float64       
 11  precipAccumulation  2679 non-null   float64       
 12  winningTime         2679 non-null   object        
 13  daylightHours       2679 non-null   float64     

---
#### humidity
There were only 59 values missing. This is a small percentage (under 2%), so I replaced the missing values with the median.

In [68]:
df['humidity'].fillna(value = df['humidity'].median(), inplace = True)

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2679 entries, 0 to 2678
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                2679 non-null   datetime64[ns]
 1   moonPhase           2679 non-null   float64       
 2   humidity            2679 non-null   float64       
 3   windSpeed           2679 non-null   float64       
 4   windBearing         2679 non-null   float64       
 5   cloudCover          2536 non-null   float64       
 6   uvIndex             2640 non-null   float64       
 7   temperatureMin      2679 non-null   float64       
 8   temperatureMax      2679 non-null   float64       
 9   precipType          2679 non-null   object        
 10  precipIntensity     1406 non-null   float64       
 11  precipAccumulation  2679 non-null   float64       
 12  winningTime         2679 non-null   object        
 13  daylightHours       2679 non-null   float64     

---
#### cloudCover
I decided to drop the 'cloudCover' column, because 'uvIndex' seemed like a better proxy for the amount/strength of sunlight on a given day.

In [70]:
df['cloudCover'].isna().sum()

143

In [71]:
df.drop(columns = ['cloudCover'], inplace = True)

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2679 entries, 0 to 2678
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                2679 non-null   datetime64[ns]
 1   moonPhase           2679 non-null   float64       
 2   humidity            2679 non-null   float64       
 3   windSpeed           2679 non-null   float64       
 4   windBearing         2679 non-null   float64       
 5   uvIndex             2640 non-null   float64       
 6   temperatureMin      2679 non-null   float64       
 7   temperatureMax      2679 non-null   float64       
 8   precipType          2679 non-null   object        
 9   precipIntensity     1406 non-null   float64       
 10  precipAccumulation  2679 non-null   float64       
 11  winningTime         2679 non-null   object        
 12  daylightHours       2679 non-null   float64       
 13  iceThickness        2679 non-null   float64     

---
#### precipIntensity
This is defined as "the intensity (in inches of liquid water per hour) of precipitation occurring at the given time. This value is conditional on probability (that is, assuming any precipitation occurs at all)," so I replaced missing values with zeroes.

In [73]:
df['precipIntensity'].fillna(value = 0, inplace = True)

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2679 entries, 0 to 2678
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                2679 non-null   datetime64[ns]
 1   moonPhase           2679 non-null   float64       
 2   humidity            2679 non-null   float64       
 3   windSpeed           2679 non-null   float64       
 4   windBearing         2679 non-null   float64       
 5   uvIndex             2640 non-null   float64       
 6   temperatureMin      2679 non-null   float64       
 7   temperatureMax      2679 non-null   float64       
 8   precipType          2679 non-null   object        
 9   precipIntensity     2679 non-null   float64       
 10  precipAccumulation  2679 non-null   float64       
 11  winningTime         2679 non-null   object        
 12  daylightHours       2679 non-null   float64       
 13  iceThickness        2679 non-null   float64     

---
#### uvIndex
This was another case where the number of missing values was small (40), comprising a little over 1% of the total values. I decided to replace the missing values with the median.

In [75]:
df['uvIndex'].fillna(value = df['uvIndex'].median(), inplace = True)

In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2679 entries, 0 to 2678
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                2679 non-null   datetime64[ns]
 1   moonPhase           2679 non-null   float64       
 2   humidity            2679 non-null   float64       
 3   windSpeed           2679 non-null   float64       
 4   windBearing         2679 non-null   float64       
 5   uvIndex             2679 non-null   float64       
 6   temperatureMin      2679 non-null   float64       
 7   temperatureMax      2679 non-null   float64       
 8   precipType          2679 non-null   object        
 9   precipIntensity     2679 non-null   float64       
 10  precipAccumulation  2679 non-null   float64       
 11  winningTime         2679 non-null   object        
 12  daylightHours       2679 non-null   float64       
 13  iceThickness        2679 non-null   float64     

In [None]:
df.to_csv('../data/cleaned_data.csv')

In [None]:
# read data
df = pd.read_csv('../data/cleaned_data.csv')

In [None]:
df.head()

In [None]:
df.drop(columns = 'Unnamed: 0', inplace = True)

In [None]:
df.shape

## Feature Engineering

In [77]:
df['Date'] = pd.to_datetime(df['Date'])

In [78]:
df['winningTime'] = pd.to_datetime(df['winningTime'], errors = 'ignore')

In [80]:
# create column for ordinal day of year
df['dayOfYear'] = df['Date'].dt.dayofyear

In [82]:
# # create binary 'winningDate' column
df['winningDate'] = 0
idx = df.loc[df['winningTime'] != '1970-01-01 00:00:00'].index
df['winningDate'].loc[idx] = 1

In [83]:
df['winningDate'].value_counts()

0    2650
1      29
Name: winningDate, dtype: int64

In [84]:
df.loc[df['winningDate'] == 1]

Unnamed: 0,Date,moonPhase,humidity,windSpeed,windBearing,uvIndex,temperatureMin,temperatureMax,precipType,precipIntensity,precipAccumulation,winningTime,daylightHours,iceThickness,dayOfYear,winningDate
49,1989-05-01,0.87,0.47,3.72,277.0,4.0,36.22,61.53,,0.0,0.0,2020-03-04 20:14:00,0.707639,40.0,121,1
134,1990-04-24,0.0,0.61,0.0,0.0,4.0,28.66,44.74,,0.0,0.0,2020-03-04 17:19:00,0.674306,36.0,114,1
228,1991-05-01,0.6,0.49,4.13,139.0,4.0,31.43,57.62,,0.0,0.0,2020-03-04 00:04:00,0.70625,38.0,121,1
332,1992-05-14,0.45,0.49,5.83,239.0,5.0,30.52,51.11,,0.0,0.0,2020-03-04 06:26:00,0.770833,47.0,135,1
401,1993-04-23,0.07,0.52,7.72,330.0,4.0,31.28,45.5,,0.0,0.0,2020-03-04 13:01:00,0.670139,25.0,113,1
499,1994-04-29,0.67,0.31,4.39,22.0,4.0,26.51,59.54,,0.0,0.0,2020-03-04 23:01:00,0.697222,51.0,119,1
585,1996-05-05,0.6,0.44,4.42,64.0,4.0,32.47,53.42,,0.0,0.0,2020-03-04 12:32:00,0.728472,45.0,126,1
672,1997-04-30,0.79,0.48,4.05,252.0,3.0,25.75,61.35,,0.0,0.0,2020-03-04 10:28:00,0.703472,36.0,120,1
754,1998-04-20,0.8,0.6,4.06,287.0,3.0,25.97,52.56,rain,0.0,0.0,2020-03-04 16:54:00,0.654861,38.0,110,1
921,2000-05-01,0.93,0.46,4.97,211.0,4.0,34.92,55.95,,0.0,0.0,2020-03-04 10:47:00,0.709722,36.0,122,1


The information for 1995 and 1999 are missing some dates, including the winning date. I decided to drop the data for those years, since there is no target as a result. Survival analysis would also falsely treat those years as censored.

In [85]:
drop_1995_1999 = df.loc[(df['Date'].dt.year == 1995)|(df['Date'].dt.year == 1999)].index

In [86]:
df.drop(index = drop_1995_1999, inplace = True)

Drop all records for a year that are after the ice broke.

In [87]:
year_list = sorted(list(set(df['Date'].dt.year)))

In [88]:
print(year_list)

[1989, 1990, 1991, 1992, 1993, 1994, 1996, 1997, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]


In [89]:
# get winning date indices
idx_w = df.loc[df['winningDate'] == 1].index
idx_w

Int64Index([  49,  134,  228,  332,  401,  499,  585,  672,  754,  921, 1020,
            1105, 1174, 1261, 1357, 1453, 1540, 1641, 1728, 1818, 1915, 1996,
            2115, 2182, 2273, 2364, 2464, 2556, 2631],
           dtype='int64')

In [90]:
# make a list of indices to drop
drop_index = []
for i, year in enumerate(year_list):
    idx_y = df.loc[df['Date'].dt.year == year].index
    for idx in idx_y:
        if idx > idx_w[i]:
            drop_index.append(idx)
        else:
            pass

In [91]:
# drop observations that occurred after the winning date in a year
for idx in drop_index:
    df.drop(index = idx, inplace = True)

In [92]:
df.shape

(1677, 16)

In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1677 entries, 0 to 2631
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                1677 non-null   datetime64[ns]
 1   moonPhase           1677 non-null   float64       
 2   humidity            1677 non-null   float64       
 3   windSpeed           1677 non-null   float64       
 4   windBearing         1677 non-null   float64       
 5   uvIndex             1677 non-null   float64       
 6   temperatureMin      1677 non-null   float64       
 7   temperatureMax      1677 non-null   float64       
 8   precipType          1677 non-null   object        
 9   precipIntensity     1677 non-null   float64       
 10  precipAccumulation  1677 non-null   float64       
 11  winningTime         1677 non-null   datetime64[ns]
 12  daylightHours       1677 non-null   float64       
 13  iceThickness        1677 non-null   float64     

In [94]:
df['past'] = (df['Date'] < '2015-01-01').astype(np.int)

In [95]:
df['future'] = 1 - df['past']

In [96]:
df.head(3).append(df.tail(3))

Unnamed: 0,Date,moonPhase,humidity,windSpeed,windBearing,uvIndex,temperatureMin,temperatureMax,precipType,precipIntensity,precipAccumulation,winningTime,daylightHours,iceThickness,dayOfYear,winningDate,past,future
0,1989-03-01,0.8,0.7,8.42,236.0,1.0,20.58,29.65,,0.0,0.0,1970-01-01 00:00:00,0.423611,37.5,60,0,1,0
1,1989-03-02,0.83,0.68,8.59,266.0,1.0,-7.38,29.07,,0.0,0.0,1970-01-01 00:00:00,0.428472,37.5,61,0,1,0
2,1989-03-03,0.87,0.5,5.84,344.0,1.0,-19.23,1.84,,0.0,0.0,1970-01-01 00:00:00,0.433333,37.5,62,0,1,0
2629,2019-04-12,0.27,0.61,2.73,2.0,2.0,32.86,46.83,,0.0,0.0,1970-01-01 00:00:00,0.616667,25.3,102,0,0,1
2630,2019-04-13,0.3,0.71,2.1,7.0,2.0,32.8,49.32,rain,0.0007,0.0,1970-01-01 00:00:00,0.622222,25.3,103,0,0,1
2631,2019-04-14,0.34,0.68,4.74,345.0,2.0,30.46,47.89,rain,0.0003,0.0,2020-03-04 00:21:00,0.626389,25.3,104,1,0,1


In [97]:
df['precipType'].value_counts()

None    1412
snow     215
rain      50
Name: precipType, dtype: int64

In [98]:
# encode precipType
df = df.merge(pd.get_dummies(data = df['precipType'], prefix = 'precip', drop_first = True, sparse = True),
              how = 'left', left_index = True, right_index = True)

In [100]:
# drop precipType after encoding
df.drop(columns = 'precipType', inplace = True)
gc.collect()

21

Create column for daily average temperature

In [101]:
df['temperatureAvg'] = (df['temperatureMin'] + df['temperatureMax']) / 2

Create columns for number of "hot days," "cold days," and snow accumulated since Apr 1 in a given year.

I defined a "hot day" as a day where: day_average_temp > median(year_avg_temp) + std_dev(year_avg_temp)

A "cold day" is a day where: day_average_temp < median(year_avg_temp) - std_dev(year_avg_temp)

In [103]:
hot_count = []
cold_count = []
daily_accumulation = []
for year in year_list:
    hot_temp_count = 0
    cold_temp_count = 0
    daily_accum = 0
    temp_df = df.loc[df['Date'].dt.year == year]
    hot_threshold = temp_df['temperatureAvg'].median() + temp_df['temperatureAvg'].std()
    cold_threshold = temp_df['temperatureAvg'].median() - temp_df['temperatureAvg'].std()
    for idx in temp_df.index:
        current_temp = temp_df['temperatureAvg'].loc[idx]
        if temp_df['precip_snow'].loc[idx] == 1:
            daily_accum += temp_df['precipAccumulation'].loc[idx]
        else:
            pass
        if current_temp >= hot_threshold:
            hot_temp_count += 1
        elif current_temp <= cold_threshold:
            cold_temp_count += 1
        else:
            pass
        hot_count.append(hot_temp_count)
        cold_count.append(cold_temp_count)
        daily_accumulation.append(daily_accum)

In [104]:
df['numHotDays'] = hot_count
df['numColdDays'] = cold_count
df['accumulatedSnow'] = daily_accumulation

#### Save data before adding rolling average features.

In [None]:
df.to_csv('../data/pre-moving-average_data.csv', index = False)

Create columns for moving average features.

In [105]:
ma_cols = ['humidity', 'windSpeed', 'windBearing', 'uvIndex', 'precipIntensity', 'iceThickness', 'temperatureAvg', 'numHotDays', 'numColdDays']
windows = [3, 5, 7, 10]

In [106]:
# # first add new columns with dummy info
for col in ma_cols:
    for window in windows:
        label_ma = col + '_MA' + str(window)
        df[label_ma] = 0
        
        label_slope = col + '_MA-slope' + str(window)
        df[label_slope] = 0
        
        label_accel = col + '_MA-accel' + str(window)
        df[label_accel] = 0
        
        label_std = col + '_MA-std_dev' + str(window)
        df[label_std] = 0

In [107]:
# Update each year with its rolling averages
for year in year_list:
    temp_df = df.loc[df['Date'].dt.year == year]
    for col in ma_cols:
        for window in windows:
            # assign labels
            label_ma = col + '_MA' + str(window)
            label_slope = col + '_MA-slope' + str(window)
            label_accel = col + '_MA-accel' + str(window)
            label_std = col + '_MA-std_dev' + str(window)
            # for each year, update row values in new columns
            for idx in temp_df.index:
                df[label_ma].loc[idx] = temp_df[col].rolling(window).mean().loc[idx]
                df[label_slope].loc[idx] = temp_df[col].rolling(window).apply(lambda x: slope(x)).loc[idx]
                df[label_accel].loc[idx] = temp_df[col].rolling(window).apply(lambda x: accel(x)).loc[idx]
                df[label_std].loc[idx] = temp_df[col].rolling(window).std().loc[idx]

In [108]:
df.sample(7)

Unnamed: 0,Date,moonPhase,humidity,windSpeed,windBearing,uvIndex,temperatureMin,temperatureMax,precipIntensity,precipAccumulation,...,numColdDays_MA-accel5,numColdDays_MA-std_dev5,numColdDays_MA7,numColdDays_MA-slope7,numColdDays_MA-accel7,numColdDays_MA-std_dev7,numColdDays_MA10,numColdDays_MA-slope10,numColdDays_MA-accel10,numColdDays_MA-std_dev10
2445,2017-04-12,0.56,0.52,1.42,36.0,3.0,17.39,51.72,0.0,0.0,...,9.408312e-15,0.0,21.0,-7.448512e-16,-4.315977e-15,2.254134e-07,21.0,8.417784e-16,2.604707e-15,0.0
2159,2014-04-02,0.12,0.52,2.48,324.0,2.0,7.18,34.55,0.0,0.0,...,1.960065e-15,1.096274e-07,10.0,-7.448512e-16,-3.082841e-16,5.625065e-08,10.0,5.261115e-16,-6.511766e-16,9.050403e-08
1609,2008-04-04,0.96,0.45,4.78,125.0,2.0,35.55,51.56,0.0,0.0,...,1.960065e-15,4.532011e-08,10.0,-7.448512e-16,-3.082841e-16,9.695302e-08,10.0,5.261115e-16,-6.511766e-16,9.138569e-08
1240,2004-04-03,0.46,0.66,4.76,317.0,3.0,-0.86,27.3,0.0,0.0,...,-0.3571429,0.83666,11.571429,0.5714286,-0.1190476,1.272418,10.7,0.5757576,-0.01893939,1.766981
2521,2018-03-27,0.38,0.3,4.44,84.0,2.0,15.18,34.38,0.0,0.0,...,-0.7142857,0.4472136,5.428571,0.3214286,-0.4166667,0.7867958,5.0,0.2909091,4.439841e-16,0.942809
1528,2007-04-15,0.95,0.42,3.95,75.0,3.0,33.29,51.6,0.0,0.0,...,1.960065e-15,7.450581e-09,10.0,-7.448512e-16,-3.082841e-16,0.0,10.0,5.261115e-16,-6.511766e-16,4.632959e-08
1876,2011-03-26,0.77,0.61,1.72,282.0,2.0,6.5,33.94,0.0,0.0,...,3.332111e-15,0.0,7.0,-5.586384e-16,-1.387278e-15,2.31648e-08,7.0,3.156669e-16,2.959894e-16,7.375698e-08


In [109]:
df.shape

(1677, 167)

#### Save results to file

In [110]:
df.to_csv('../data/features_added.csv', index = False)

#### Read data from file

In [111]:
df = pd.read_csv('../data/features_added.csv')

In [112]:
df.head()

### Drop columns that are highly correlated
* temperatureMin and temperatureMax information was captured in temperatureAvg
* precipAccumulation information was captured in accumulatedSnow

In [113]:
df.drop(columns = ['temperatureMin', 'temperatureMax', 'precipAccumulation'],
        inplace = True)

### Create training and testing DataFrames

In [114]:
train = df.loc[df['past'] == 1]
train.drop(columns = ['past', 'future'], inplace = True)

In [115]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1399 entries, 0 to 2182
Columns: 162 entries, Date to numColdDays_MA-std_dev10
dtypes: Sparse[int64, 0](2), datetime64[ns](2), float64(154), int64(4)
memory usage: 1.7 MB


In [116]:
test = df.loc[df['future'] == 1]
test.drop(columns = ['past', 'future'], inplace = True)

In [117]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 278 entries, 2219 to 2631
Columns: 162 entries, Date to numColdDays_MA-std_dev10
dtypes: Sparse[int64, 0](2), datetime64[ns](2), float64(154), int64(4)
memory usage: 350.4 KB


In [118]:
train.dropna(inplace = True)

In [119]:
test.dropna(inplace = True)

In [120]:
df.drop(columns = ['past', 'future'], inplace = True)
gc.collect()

0

#### Save training and testing DataFrames to file

In [121]:
train.to_csv('../data/model_training_data.csv', index = False)

In [122]:
test.to_csv('../data/model_testing_data.csv', index = False)