# Nenana Ice Classic Data Processing
This notebook was used to process the data gathered for this project.

In [1]:
# imports

# data
import numpy as np
import pandas as pd

# date processing
import datetime

# filter warnings
import warnings
warnings.filterwarnings("ignore")

# garbage collection
import gc

## Helper Functions

In [2]:
def slope(y):
    x = range(len(y))
    # y = mx + b
    m, b = np.polyfit(x, y, 1)
    return m

def accel(y):
    t = range(len(y))
    # y = 1/2 g t^2 + v t + y0
    a, v, y0 = np.polyfit(t, y, 2)
    return 5*a

## Reading data from files

In [3]:
ice_df = pd.read_csv('../data/raw_ice_thickness_1989-2019.csv')
ice_df.head()

Unnamed: 0,Date,Date.1,Thickness
0,0,2019-01-16,16.0
1,1,2019-02-07,16.0
2,2,2019-02-26,23.5
3,3,2019-03-04,32.5
4,4,2019-03-13,25.7


In [4]:
weather_df = pd.read_csv('../data/raw_weather_1989-2020.csv')
weather_df.head()

Unnamed: 0,Date,time,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipAccumulation,windGust,windGustTime,icon,precipProbability,summary,ozone
0,1989-01-01,599648400.0,599687760.0,599702820.0,0.81,15.77,599673600.0,10.7,599716800.0,13.85,...,,,,,,,,,,
1,1989-01-02,599734800.0,599774040.0,599789340.0,0.84,17.8,599788620.0,-10.28,599842380.0,15.37,...,,,,,,,,,,
2,1989-01-03,599821200.0,599860380.0,599875920.0,0.88,4.69,599878800.0,-16.58,599912040.0,3.28,...,,,,,,,,,,
3,1989-01-04,599907600.0,599946720.0,599962440.0,0.91,7.67,599940000.0,2.57,600009240.0,0.52,...,,,,,,,,,,
4,1989-01-05,599994000.0,600033000.0,600048960.0,0.94,14.7,600058800.0,10.64,600062820.0,5.23,...,,,,,,,,,,


In [5]:
winners_df = pd.read_csv('../data/ice_classic_winning_times.csv')
winners_df.head()

Unnamed: 0,Date,Time
0,1917-04-30,11:30 AM
1,1918-05-11,9:33 AM
2,1919-05-03,2:33 PM
3,1920-05-11,10:46 AM
4,1921-05-11,6:42 AM


## Merge the DataFrames into one

In [6]:
merged_df = weather_df.merge(winners_df, how = 'left', on = 'Date')
merged_df.head()

Unnamed: 0,Date,time,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipIntensityMax,precipIntensityMaxTime,precipAccumulation,windGust,windGustTime,icon,precipProbability,summary,ozone,Time
0,1989-01-01,599648400.0,599687760.0,599702820.0,0.81,15.77,599673600.0,10.7,599716800.0,13.85,...,,,,,,,,,,
1,1989-01-02,599734800.0,599774040.0,599789340.0,0.84,17.8,599788620.0,-10.28,599842380.0,15.37,...,,,,,,,,,,
2,1989-01-03,599821200.0,599860380.0,599875920.0,0.88,4.69,599878800.0,-16.58,599912040.0,3.28,...,,,,,,,,,,
3,1989-01-04,599907600.0,599946720.0,599962440.0,0.91,7.67,599940000.0,2.57,600009240.0,0.52,...,,,,,,,,,,
4,1989-01-05,599994000.0,600033000.0,600048960.0,0.94,14.7,600058800.0,10.64,600062820.0,5.23,...,,,,,,,,,,


### Rename a couple of columns to avoid potential confusion
The column 'time' contains the time that weather measurements were taken; 'Time' contains the winning time for a given year. This could be confusing, so I renamed 'time' to 'readingTime' and 'Time' to 'winningTime.'

In [7]:
merged_df.rename(columns = {'time' : 'readingTime', 'Time' : 'winningTime'}, inplace = True)

In [8]:
merged_df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipIntensityMax,precipIntensityMaxTime,precipAccumulation,windGust,windGustTime,icon,precipProbability,summary,ozone,winningTime
0,1989-01-01,599648400.0,599687760.0,599702820.0,0.81,15.77,599673600.0,10.7,599716800.0,13.85,...,,,,,,,,,,
1,1989-01-02,599734800.0,599774040.0,599789340.0,0.84,17.8,599788620.0,-10.28,599842380.0,15.37,...,,,,,,,,,,
2,1989-01-03,599821200.0,599860380.0,599875920.0,0.88,4.69,599878800.0,-16.58,599912040.0,3.28,...,,,,,,,,,,
3,1989-01-04,599907600.0,599946720.0,599962440.0,0.91,7.67,599940000.0,2.57,600009240.0,0.52,...,,,,,,,,,,
4,1989-01-05,599994000.0,600033000.0,600048960.0,0.94,14.7,600058800.0,10.64,600062820.0,5.23,...,,,,,,,,,,


### Calculate the number of hours of daylight
Daylight hours fluctuate a lot in Alaska. Articles I had read about the NIC mention that things like how much snow covers the ice, and therefore how much direct sunlight the ice receives, affect the rate the river ice melts.

I proxied snow depth with precipitation measurements. I theorized that the number of sunlight hours in a day may be a proxy for how much sunlight the river ice receives. I also had information regarding cloud cover which may help in that regard.

I calculated the number of daylight hours as a percentage of the day. That way its value is already scaled for use in modeling later.

In [9]:
def calc_daylight(sunrise, sunset):
    daylight_hours = (sunset - sunrise) / 60 / 60 /24
    return daylight_hours

In [10]:
merged_df['daylightHours'] = calc_daylight(merged_df['sunriseTime'], merged_df['sunsetTime'])

In [11]:
merged_df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipIntensityMaxTime,precipAccumulation,windGust,windGustTime,icon,precipProbability,summary,ozone,winningTime,daylightHours
0,1989-01-01,599648400.0,599687760.0,599702820.0,0.81,15.77,599673600.0,10.7,599716800.0,13.85,...,,,,,,,,,,0.174306
1,1989-01-02,599734800.0,599774040.0,599789340.0,0.84,17.8,599788620.0,-10.28,599842380.0,15.37,...,,,,,,,,,,0.177083
2,1989-01-03,599821200.0,599860380.0,599875920.0,0.88,4.69,599878800.0,-16.58,599912040.0,3.28,...,,,,,,,,,,0.179861
3,1989-01-04,599907600.0,599946720.0,599962440.0,0.91,7.67,599940000.0,2.57,600009240.0,0.52,...,,,,,,,,,,0.181944
4,1989-01-05,599994000.0,600033000.0,600048960.0,0.94,14.7,600058800.0,10.64,600062820.0,5.23,...,,,,,,,,,,0.184722


In [12]:
merged_df['daylightHours'].value_counts()

0.173611    13
0.188194    13
0.184722    13
0.181250    13
0.175694    12
            ..
0.511806     1
0.412500     1
0.263194     1
0.270833     1
0.446528     1
Name: daylightHours, Length: 910, dtype: int64

In [13]:
ice_df.head()

Unnamed: 0,Date,Date.1,Thickness
0,0,2019-01-16,16.0
1,1,2019-02-07,16.0
2,2,2019-02-26,23.5
3,3,2019-03-04,32.5
4,4,2019-03-13,25.7


In [14]:
ice_df.drop(columns = 'Date', inplace = True)

In [15]:
ice_df.rename(columns = {'Date.1' : 'Date'}, inplace = True)

In [16]:
ice_df.head()

Unnamed: 0,Date,Thickness
0,2019-01-16,16.0
1,2019-02-07,16.0
2,2019-02-26,23.5
3,2019-03-04,32.5
4,2019-03-13,25.7


In [17]:
df = merged_df.merge(ice_df, how = 'left', on = 'Date')

In [18]:
df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipAccumulation,windGust,windGustTime,icon,precipProbability,summary,ozone,winningTime,daylightHours,Thickness
0,1989-01-01,599648400.0,599687760.0,599702820.0,0.81,15.77,599673600.0,10.7,599716800.0,13.85,...,,,,,,,,,0.174306,
1,1989-01-02,599734800.0,599774040.0,599789340.0,0.84,17.8,599788620.0,-10.28,599842380.0,15.37,...,,,,,,,,,0.177083,
2,1989-01-03,599821200.0,599860380.0,599875920.0,0.88,4.69,599878800.0,-16.58,599912040.0,3.28,...,,,,,,,,,0.179861,
3,1989-01-04,599907600.0,599946720.0,599962440.0,0.91,7.67,599940000.0,2.57,600009240.0,0.52,...,,,,,,,,,0.181944,
4,1989-01-05,599994000.0,600033000.0,600048960.0,0.94,14.7,600058800.0,10.64,600062820.0,5.23,...,,,,,,,,,0.184722,


### Rename other columns
There are a couple of other columns that I decided to rename to make them a little more descriptive of the values they contain.

In [19]:
df.columns

Index(['Date', 'readingTime', 'sunriseTime', 'sunsetTime', 'moonPhase',
       'temperatureHigh', 'temperatureHighTime', 'temperatureLow',
       'temperatureLowTime', 'apparentTemperatureHigh',
       'apparentTemperatureHighTime', 'apparentTemperatureLow',
       'apparentTemperatureLowTime', 'dewPoint', 'humidity', 'windSpeed',
       'windBearing', 'cloudCover', 'uvIndex', 'uvIndexTime', 'visibility',
       'temperatureMin', 'temperatureMinTime', 'temperatureMax',
       'temperatureMaxTime', 'apparentTemperatureMin',
       'apparentTemperatureMinTime', 'apparentTemperatureMax',
       'apparentTemperatureMaxTime', 'pressure', 'precipType',
       'precipIntensity', 'precipIntensityMax', 'precipIntensityMaxTime',
       'precipAccumulation', 'windGust', 'windGustTime', 'icon',
       'precipProbability', 'summary', 'ozone', 'winningTime', 'daylightHours',
       'Thickness'],
      dtype='object')

In [20]:
df.rename(columns = {'summary':'weatherSummary', 'Thickness':'iceThickness', 'pressure':'atmoPressure'}, inplace = True)

In [21]:
df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipAccumulation,windGust,windGustTime,icon,precipProbability,weatherSummary,ozone,winningTime,daylightHours,iceThickness
0,1989-01-01,599648400.0,599687760.0,599702820.0,0.81,15.77,599673600.0,10.7,599716800.0,13.85,...,,,,,,,,,0.174306,
1,1989-01-02,599734800.0,599774040.0,599789340.0,0.84,17.8,599788620.0,-10.28,599842380.0,15.37,...,,,,,,,,,0.177083,
2,1989-01-03,599821200.0,599860380.0,599875920.0,0.88,4.69,599878800.0,-16.58,599912040.0,3.28,...,,,,,,,,,0.179861,
3,1989-01-04,599907600.0,599946720.0,599962440.0,0.91,7.67,599940000.0,2.57,600009240.0,0.52,...,,,,,,,,,0.181944,
4,1989-01-05,599994000.0,600033000.0,600048960.0,0.94,14.7,600058800.0,10.64,600062820.0,5.23,...,,,,,,,,,0.184722,


In [22]:
# delete unneeded variables
del merged_df, ice_df, winners_df, weather_df
gc.collect()

0

### Dealing With Missing Values

---
#### Forward fill ice thickness values until next observed value
Since ice thickness is only measured periodically, I chose to make the naive assumption that it remains constant until the next measurement.

In [23]:
df['iceThickness'].fillna(method = 'pad', inplace = True)

In [24]:
df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipAccumulation,windGust,windGustTime,icon,precipProbability,weatherSummary,ozone,winningTime,daylightHours,iceThickness
0,1989-01-01,599648400.0,599687760.0,599702820.0,0.81,15.77,599673600.0,10.7,599716800.0,13.85,...,,,,,,,,,0.174306,
1,1989-01-02,599734800.0,599774040.0,599789340.0,0.84,17.8,599788620.0,-10.28,599842380.0,15.37,...,,,,,,,,,0.177083,
2,1989-01-03,599821200.0,599860380.0,599875920.0,0.88,4.69,599878800.0,-16.58,599912040.0,3.28,...,,,,,,,,,0.179861,
3,1989-01-04,599907600.0,599946720.0,599962440.0,0.91,7.67,599940000.0,2.57,600009240.0,0.52,...,,,,,,,,,0.181944,
4,1989-01-05,599994000.0,600033000.0,600048960.0,0.94,14.7,600058800.0,10.64,600062820.0,5.23,...,,,,,,,,,0.184722,


In [25]:
df['iceThickness'].isna().sum()

56

In [26]:
df.loc[df['iceThickness'].isna() == False]

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipAccumulation,windGust,windGustTime,icon,precipProbability,weatherSummary,ozone,winningTime,daylightHours,iceThickness
56,1989-02-26,6.044868e+08,6.045165e+08,6.045520e+08,0.70,22.66,6.045478e+08,-15.24,6.046020e+08,21.26,...,,,,,,,,,0.410417,42.0
57,1989-02-27,6.045732e+08,6.046027e+08,6.046385e+08,0.73,9.71,6.046308e+08,1.70,6.046416e+08,-0.71,...,,,,,,,,,0.414583,42.0
58,1989-02-28,6.046596e+08,6.046889e+08,6.047251e+08,0.76,27.37,6.047221e+08,14.52,6.047396e+08,21.31,...,,,,,,,,,0.419444,42.0
59,1989-03-01,6.047460e+08,6.047751e+08,6.048117e+08,0.80,29.65,6.048144e+08,6.68,6.048612e+08,20.64,...,,,,,,,,,0.423611,42.0
60,1989-03-02,6.048324e+08,6.048613e+08,6.048983e+08,0.83,11.69,6.048576e+08,-15.29,6.049256e+08,-0.54,...,,,,,,,,,0.428472,42.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4451,2019-05-27,1.558944e+09,1.558958e+09,1.559030e+09,0.80,68.43,1.558999e+09,49.42,1.559052e+09,67.93,...,,10.80,1.558951e+09,clear-day,0.12,Partly cloudy throughout the day.,359.8,,0.827778,25.3
4452,2019-05-28,1.559030e+09,1.559045e+09,1.559116e+09,0.83,67.16,1.559093e+09,47.52,1.559132e+09,66.66,...,,11.80,1.559099e+09,rain,0.73,Drizzle in the morning.,359.0,,0.832639,25.3
4453,2019-05-29,1.559117e+09,1.559131e+09,1.559203e+09,0.86,70.55,1.559175e+09,52.98,1.559220e+09,70.05,...,,6.05,1.559175e+09,partly-cloudy-day,0.07,Partly cloudy throughout the day.,347.8,,0.836806,25.3
4454,2019-05-30,1.559203e+09,1.559217e+09,1.559290e+09,0.89,73.62,1.559272e+09,56.56,1.559301e+09,73.12,...,,6.83,1.559278e+09,partly-cloudy-day,0.09,Partly cloudy throughout the day.,349.5,,0.840972,25.3


The first ~2 weeks' values were missed by the first fill operation; I backfilled them to eliminate NaN values in the iceThickness column.

In [27]:
df['iceThickness'].fillna(method = 'bfill', inplace = True)

In [28]:
df['iceThickness'].isna().sum()

0

In [29]:
df['iceThickness'].value_counts()

36.0    276
38.0    216
42.0    215
25.0    173
40.0    166
       ... 
31.9      3
49.3      3
49.7      3
55.0      3
27.0      1
Name: iceThickness, Length: 138, dtype: int64

---
#### Display DataFrame information
I wanted an idea of what other missing data I needed to deal with.

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         4456 non-null   object 
 1   readingTime                  4456 non-null   float64
 2   sunriseTime                  4456 non-null   float64
 3   sunsetTime                   4456 non-null   float64
 4   moonPhase                    4456 non-null   float64
 5   temperatureHigh              4420 non-null   float64
 6   temperatureHighTime          4420 non-null   float64
 7   temperatureLow               4310 non-null   float64
 8   temperatureLowTime           4310 non-null   float64
 9   apparentTemperatureHigh      4420 non-null   float64
 10  apparentTemperatureHighTime  4420 non-null   float64
 11  apparentTemperatureLow       4310 non-null   float64
 12  apparentTemperatureLowTime   4310 non-null   float64
 13  dewPoint          

---
#### temperatureMin, temperatureMax
There were 112 missing values in these columns. I decided on the naive approach of setting the missing values to the overnight low and daytime high, respectively. There were few enough missing that I didn't think this approach would overly bias a model.

In [31]:
df['temperatureMin'].fillna(value = df['temperatureLow'], inplace = True)
df['temperatureMax'].fillna(value = df['temperatureHigh'], inplace = True)

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         4456 non-null   object 
 1   readingTime                  4456 non-null   float64
 2   sunriseTime                  4456 non-null   float64
 3   sunsetTime                   4456 non-null   float64
 4   moonPhase                    4456 non-null   float64
 5   temperatureHigh              4420 non-null   float64
 6   temperatureHighTime          4420 non-null   float64
 7   temperatureLow               4310 non-null   float64
 8   temperatureLowTime           4310 non-null   float64
 9   apparentTemperatureHigh      4420 non-null   float64
 10  apparentTemperatureHighTime  4420 non-null   float64
 11  apparentTemperatureLow       4310 non-null   float64
 12  apparentTemperatureLowTime   4310 non-null   float64
 13  dewPoint          

There were still missing values after that treatment. I decided to replace the remaining missing values with the median value for the year the missing value occurred in.

In [33]:
# convert Date from string to datetime object
df['Date'] = pd.to_datetime(df['Date'])

In [34]:
# what years are missing?
min_missing_years = df.loc[df['temperatureMin'].isna() == True]['Date'].dt.year.unique().tolist()

In [35]:
min_missing_years

[1989, 1991, 1992, 1995, 1996, 1998, 1999, 2000, 2002, 2003]

In [36]:
# what is the median temperatureMin for each year?
min_median = df.groupby(df['Date'].dt.year)['temperatureMin'].median()

In [37]:
# show median temperatureMin for each year
min_median.values

array([  0.35 ,   7.55 ,   9.575,  12.69 ,  12.46 ,   3.395, -15.57 ,
        31.24 ,   6.23 ,  16.94 ,  -6.67 ,   9.83 ,  11.69 ,  10.24 ,
         8.13 ,   0.165,   9.96 ,   9.88 ,   2.63 ,   8.09 ,   8.1  ,
        10.1  ,   7.83 ,   6.355,   1.38 ,  11.62 ,  17.95 ,  19.265,
         4.28 ,  14.61 ,  21.39 ])

In [38]:
# validate that the index is each year
min_median.index

Int64Index([1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
            2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
            2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019],
           dtype='int64', name='Date')

In [39]:
# make a dict of the missing years and the medians of temperatureMin of those years
missing_year_medians = {}
for n, val in enumerate(min_median):
    if min_median.index[n] in min_missing_years:
        missing_year_medians[min_median.index[n]] = val
    else:
        pass
missing_year_medians

{1989: 0.35,
 1991: 9.575,
 1992: 12.69,
 1995: -15.57,
 1996: 31.24,
 1998: 16.94,
 1999: -6.67,
 2000: 9.83,
 2002: 10.24,
 2003: 8.13}

In [40]:
# set the missing values to the median for the year
for year in missing_year_medians:
    df['temperatureMin'].loc[(df['Date'].dt.year == year) & (df['temperatureMin'].isna() == True)] = missing_year_medians[year]

In [41]:
# what years are missing?
max_missing_years = df.loc[df['temperatureMax'].isna() == True]['Date'].dt.year.unique().tolist()

In [42]:
max_missing_years

[1989, 1993, 1994, 1998, 1999, 2000, 2002, 2003]

In [43]:
# what is the median temperatureMax for each year?
max_median = df.groupby(df['Date'].dt.year)['temperatureMax'].median()

In [44]:
# show median temperatureMax for each year
max_median.values

array([25.68 , 29.92 , 32.57 , 29.76 , 33.195, 23.56 , 12.65 , 43.54 ,
       32.65 , 38.17 , 17.46 , 31.075, 29.09 , 30.17 , 30.13 , 22.885,
       29.3  , 27.32 , 20.97 , 28.37 , 26.33 , 31.84 , 27.39 , 28.36 ,
       23.56 , 35.05 , 37.01 , 36.71 , 24.71 , 30.94 , 36.77 ])

In [45]:
# validate that the index is each year
max_median.index

Int64Index([1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
            2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
            2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019],
           dtype='int64', name='Date')

In [46]:
# make a dict of the missing years and the medians of temperatureMax of those years
missing_year_medians = {}
for n, val in enumerate(max_median):
    if max_median.index[n] in max_missing_years:
        missing_year_medians[max_median.index[n]] = val
    else:
        pass
missing_year_medians

{1989: 25.68,
 1993: 33.195,
 1994: 23.56,
 1998: 38.17,
 1999: 17.46,
 2000: 31.075000000000003,
 2002: 30.17,
 2003: 30.130000000000003}

In [47]:
# set the missing values to the median for the year
for year in missing_year_medians:
    df['temperatureMax'].loc[(df['Date'].dt.year == year) & (df['temperatureMax'].isna() == True)] = missing_year_medians[year]

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Date                         4456 non-null   datetime64[ns]
 1   readingTime                  4456 non-null   float64       
 2   sunriseTime                  4456 non-null   float64       
 3   sunsetTime                   4456 non-null   float64       
 4   moonPhase                    4456 non-null   float64       
 5   temperatureHigh              4420 non-null   float64       
 6   temperatureHighTime          4420 non-null   float64       
 7   temperatureLow               4310 non-null   float64       
 8   temperatureLowTime           4310 non-null   float64       
 9   apparentTemperatureHigh      4420 non-null   float64       
 10  apparentTemperatureHighTime  4420 non-null   float64       
 11  apparentTemperatureLow       4310 non-null 

---
#### (event)Time, apparentTemperature(category), sunriseTime, sunsetTime, weatherSummary, precipProbability, precipIntensityMax, visibility, temperatureHigh, temperatureLow, dewPoint
I didn't think any of these would help with this prediction task.

My reasoning:
* Most "(event)Time" features dropped were because they only captured a single, specific event (the time the daytime high temperature was recorded, for example)
  * I may go back and get hourly data; something like length of time that the temperature was high or low for a day might have an impact. For now, I passed this by due to time constraints.
* "apparentTemperature(category)" features were dropped because they're a representation of what conditions feel like, not what they actually are.
* Sunrise and sunset times were dropped because that information was captured in 'daylightHours'
* "weatherSummary"  was dropped because it's a text summary of information captured by other features.
* "precipProbability" was dropped because this is past data and the presence/amount of precipitation is known.
* "precipIntensityMax" was dropped because the precipitation rate is already captured by "precipIntensity" and I don't think knowing the maximum precipitation rate adds anything.
* "visibility" was dropped because atmospheric conditions are already captured by other features (e.g., "uvIndex," "precipIntensity")
* "temperatureHigh" and "temperatureLow" were dropped because the first is the daytime high and the second is the nighttime low; the 24 hour maximum and minimum are captured by "temperatureMax" and "temperatureMin" respectively
* "dewPoint" was dropped because it's correlated with temperature and humidity

In [49]:
df.drop(columns = ['readingTime', 'weatherSummary', 'sunriseTime', 'sunsetTime', 'precipProbability',
                   'apparentTemperatureHigh', 'apparentTemperatureHighTime', 'apparentTemperatureLow',
                   'apparentTemperatureLowTime', 'uvIndexTime', 'apparentTemperatureMin',
                   'apparentTemperatureMinTime', 'apparentTemperatureMax', 'apparentTemperatureMaxTime',
                   'visibility', 'precipIntensityMax', 'dewPoint',
                   'temperatureHigh', 'temperatureHighTime', 'temperatureLow', 'temperatureLowTime',
                   'temperatureMinTime', 'temperatureMaxTime'],
        inplace = True)

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    4456 non-null   datetime64[ns]
 1   moonPhase               4456 non-null   float64       
 2   humidity                4285 non-null   float64       
 3   windSpeed               4207 non-null   float64       
 4   windBearing             4171 non-null   float64       
 5   cloudCover              4196 non-null   float64       
 6   uvIndex                 4417 non-null   float64       
 7   temperatureMin          4456 non-null   float64       
 8   temperatureMax          4456 non-null   float64       
 9   atmoPressure            3233 non-null   float64       
 10  precipType              870 non-null    object        
 11  precipIntensity         2327 non-null   float64       
 12  precipIntensityMaxTime  846 non-null    float64 

---
#### precipAccumulation
Since precipAccumulation is defined as "The amount of snowfall accumulation expected to occur (over the hour or day, respectively), in inches. (If no snowfall is expected, this property will not be defined.)," I decided to fill missing values in that column with 0.

In [51]:
df['precipAccumulation'].isna().sum()

3941

In [52]:
df['precipAccumulation'].fillna(value = 0, downcast = 'infer', inplace = True)

In [53]:
df['precipAccumulation'].isna().sum()

0

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    4456 non-null   datetime64[ns]
 1   moonPhase               4456 non-null   float64       
 2   humidity                4285 non-null   float64       
 3   windSpeed               4207 non-null   float64       
 4   windBearing             4171 non-null   float64       
 5   cloudCover              4196 non-null   float64       
 6   uvIndex                 4417 non-null   float64       
 7   temperatureMin          4456 non-null   float64       
 8   temperatureMax          4456 non-null   float64       
 9   atmoPressure            3233 non-null   float64       
 10  precipType              870 non-null    object        
 11  precipIntensity         2327 non-null   float64       
 12  precipIntensityMaxTime  846 non-null    float64 

---
#### precipType
I wanted to keep this column since rain or snow could impact the river ice, so I filled NaNs with 'None'.

In [55]:
df['precipType'].value_counts()

snow    664
rain    206
Name: precipType, dtype: int64

In [56]:
df['precipType'].fillna(value = 'None', inplace = True)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    4456 non-null   datetime64[ns]
 1   moonPhase               4456 non-null   float64       
 2   humidity                4285 non-null   float64       
 3   windSpeed               4207 non-null   float64       
 4   windBearing             4171 non-null   float64       
 5   cloudCover              4196 non-null   float64       
 6   uvIndex                 4417 non-null   float64       
 7   temperatureMin          4456 non-null   float64       
 8   temperatureMax          4456 non-null   float64       
 9   atmoPressure            3233 non-null   float64       
 10  precipType              4456 non-null   object        
 11  precipIntensity         2327 non-null   float64       
 12  precipIntensityMaxTime  846 non-null    float64 

---
#### icon
The 'icon' column was missing values in just under half of the columns. It didn't appear to capture any unique information that wasn't already captured in other columns, so I dropped it.

In [58]:
df['icon'].value_counts()

clear-day            1061
partly-cloudy-day     486
snow                  471
rain                  128
cloudy                 85
wind                    9
fog                     5
Name: icon, dtype: int64

In [59]:
df.drop(columns = 'icon', inplace = True)

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    4456 non-null   datetime64[ns]
 1   moonPhase               4456 non-null   float64       
 2   humidity                4285 non-null   float64       
 3   windSpeed               4207 non-null   float64       
 4   windBearing             4171 non-null   float64       
 5   cloudCover              4196 non-null   float64       
 6   uvIndex                 4417 non-null   float64       
 7   temperatureMin          4456 non-null   float64       
 8   temperatureMax          4456 non-null   float64       
 9   atmoPressure            3233 non-null   float64       
 10  precipType              4456 non-null   object        
 11  precipIntensity         2327 non-null   float64       
 12  precipIntensityMaxTime  846 non-null    float64 

---
#### winningTime
I filled missing values in 'winningTime' with zeroes, since there will only be an entry in this column for the day that the ice broke.

In [61]:
df['winningTime'].fillna(value = 0, inplace = True)

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    4456 non-null   datetime64[ns]
 1   moonPhase               4456 non-null   float64       
 2   humidity                4285 non-null   float64       
 3   windSpeed               4207 non-null   float64       
 4   windBearing             4171 non-null   float64       
 5   cloudCover              4196 non-null   float64       
 6   uvIndex                 4417 non-null   float64       
 7   temperatureMin          4456 non-null   float64       
 8   temperatureMax          4456 non-null   float64       
 9   atmoPressure            3233 non-null   float64       
 10  precipType              4456 non-null   object        
 11  precipIntensity         2327 non-null   float64       
 12  precipIntensityMaxTime  846 non-null    float64 

---
#### ozone, precipIntensityMaxTime, windGust, windGustTime
I decided to drop the 'ozone,' 'precipIntensityMaxTime,' 'windGust,' and 'windGustTime' columns because they were missing so much information, and I had no good strategy for imputing those values.

In [63]:
df.drop(columns = ['ozone', 'precipIntensityMaxTime', 'windGust', 'windGustTime'], inplace = True)

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   moonPhase           4456 non-null   float64       
 2   humidity            4285 non-null   float64       
 3   windSpeed           4207 non-null   float64       
 4   windBearing         4171 non-null   float64       
 5   cloudCover          4196 non-null   float64       
 6   uvIndex             4417 non-null   float64       
 7   temperatureMin      4456 non-null   float64       
 8   temperatureMax      4456 non-null   float64       
 9   atmoPressure        3233 non-null   float64       
 10  precipType          4456 non-null   object        
 11  precipIntensity     2327 non-null   float64       
 12  precipAccumulation  4456 non-null   float64       
 13  winningTime         4456 non-null   object      

---
#### atmoPressure
Barometric pressure is used as an aid in forecasting weather. Falling pressure indicates incoming inclement weather and rising pressure indicates incoming fair weather. With temperature, precipitation, and UV index information available, I chose to drop this column.

In [65]:
df.drop(columns = 'atmoPressure', inplace = True)

---
#### windSpeed, windBearing
I decided to replace missing values in these columns with zeroes.

In [66]:
df['windSpeed'].fillna(value = 0, inplace = True)
df['windBearing'].fillna(value = 0, inplace = True)

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   moonPhase           4456 non-null   float64       
 2   humidity            4285 non-null   float64       
 3   windSpeed           4456 non-null   float64       
 4   windBearing         4456 non-null   float64       
 5   cloudCover          4196 non-null   float64       
 6   uvIndex             4417 non-null   float64       
 7   temperatureMin      4456 non-null   float64       
 8   temperatureMax      4456 non-null   float64       
 9   precipType          4456 non-null   object        
 10  precipIntensity     2327 non-null   float64       
 11  precipAccumulation  4456 non-null   float64       
 12  winningTime         4456 non-null   object        
 13  daylightHours       4456 non-null   float64     

---
#### humidity
There were only 171 values missing. This is a small percentage (3.8%), so I replaced the missing values with the median.

In [68]:
df['humidity'].fillna(value = df['humidity'].median(), inplace = True)

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   moonPhase           4456 non-null   float64       
 2   humidity            4456 non-null   float64       
 3   windSpeed           4456 non-null   float64       
 4   windBearing         4456 non-null   float64       
 5   cloudCover          4196 non-null   float64       
 6   uvIndex             4417 non-null   float64       
 7   temperatureMin      4456 non-null   float64       
 8   temperatureMax      4456 non-null   float64       
 9   precipType          4456 non-null   object        
 10  precipIntensity     2327 non-null   float64       
 11  precipAccumulation  4456 non-null   float64       
 12  winningTime         4456 non-null   object        
 13  daylightHours       4456 non-null   float64     

---
#### cloudCover
After looking at the records surrounding the missing values, I decided to take the known value before a gap, and the known value after a gap, and fill between with the mean of the two values.

In [71]:
# how many missing values?
df['cloudCover'].isna().sum()

260

In [70]:
# check value counts
df['cloudCover'].value_counts()

0.00    774
1.00    134
0.01     93
0.02     83
0.03     70
       ... 
0.73     15
0.70     15
0.76     15
0.83     15
0.71     12
Name: cloudCover, Length: 101, dtype: int64

In [89]:
# print the value for the index before, the index, and the index after for inspection
# and for cross-checking the next step
cc_idx = df.loc[df['cloudCover'].isna() == True].index
for idx in cc_idx:
    print(df['cloudCover'].loc[[idx-1, idx, idx+1]])

20    0.0
21    NaN
22    NaN
Name: cloudCover, dtype: float64
21    NaN
22    NaN
23    0.0
Name: cloudCover, dtype: float64
25    0.0
26    NaN
27    0.0
Name: cloudCover, dtype: float64
95    0.3
96    NaN
97    NaN
Name: cloudCover, dtype: float64
96   NaN
97   NaN
98   NaN
Name: cloudCover, dtype: float64
97     NaN
98     NaN
99    0.26
Name: cloudCover, dtype: float64
101    0.0
102    NaN
103    NaN
Name: cloudCover, dtype: float64
102     NaN
103     NaN
104    0.24
Name: cloudCover, dtype: float64
207    0.02
208     NaN
209    0.20
Name: cloudCover, dtype: float64
291    0.00
292     NaN
293    0.19
Name: cloudCover, dtype: float64
293    0.19
294     NaN
295     NaN
Name: cloudCover, dtype: float64
294   NaN
295   NaN
296   NaN
Name: cloudCover, dtype: float64
295     NaN
296     NaN
297    0.27
Name: cloudCover, dtype: float64
302    0.31
303     NaN
304     NaN
Name: cloudCover, dtype: float64
303     NaN
304     NaN
305    0.44
Name: cloudCover, dtype: float64
329    0.3

In [115]:
# loop to find the indices before and after missing values, calculate mean of before and after gap
# values, then set missing values to the mean
start_idx, end_idx, mid_val = 0, 0, 0
found_start, found_end = False, False
for idx in cc_idx:
    if found_start and found_end:
        mid_val = (df['cloudCover'].loc[start_idx] + df['cloudCover'].loc[end_idx]) / 2
        df['cloudCover'].loc[range(start_idx, end_idx + 1)] = mid_val
        found_start, found_end = False, False
        start_idx, end_idx, mid_val = 0, 0, 0
    if not found_start:
        if np.isnan(df['cloudCover'].loc[idx - 1]) == False:
            start_idx = idx - 1
            found_start = True
        else:
            pass
    if found_start:
        if np.isnan(df['cloudCover'].loc[idx + 1]) == False:
            end_idx = idx + 1
            found_end = True
        else:
            pass
            

The above loop worked for all but 1 observation. I manually fixed that one in the same manner.

In [121]:
# manually correct observation missed by loop. previous value = 0.33, next value = 0.51, mean = 0.42
df['cloudCover'].loc[4293] = 0.42

In [122]:
df['cloudCover'].isna().sum()

0

In [123]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   moonPhase           4456 non-null   float64       
 2   humidity            4456 non-null   float64       
 3   windSpeed           4456 non-null   float64       
 4   windBearing         4456 non-null   float64       
 5   cloudCover          4456 non-null   float64       
 6   uvIndex             4417 non-null   float64       
 7   temperatureMin      4456 non-null   float64       
 8   temperatureMax      4456 non-null   float64       
 9   precipType          4456 non-null   object        
 10  precipIntensity     2327 non-null   float64       
 11  precipAccumulation  4456 non-null   float64       
 12  winningTime         4456 non-null   object        
 13  daylightHours       4456 non-null   float64     

---
#### precipIntensity
This is defined as "the intensity (in inches of liquid water per hour) of precipitation occurring at the given time. This value is conditional on probability (that is, assuming any precipitation occurs at all)," so I replaced missing values with zeroes.

In [124]:
df['precipIntensity'].fillna(value = 0, inplace = True)

In [125]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   moonPhase           4456 non-null   float64       
 2   humidity            4456 non-null   float64       
 3   windSpeed           4456 non-null   float64       
 4   windBearing         4456 non-null   float64       
 5   cloudCover          4456 non-null   float64       
 6   uvIndex             4417 non-null   float64       
 7   temperatureMin      4456 non-null   float64       
 8   temperatureMax      4456 non-null   float64       
 9   precipType          4456 non-null   object        
 10  precipIntensity     4456 non-null   float64       
 11  precipAccumulation  4456 non-null   float64       
 12  winningTime         4456 non-null   object        
 13  daylightHours       4456 non-null   float64     

---
#### uvIndex
This was another case where the number of missing values was small (39), comprising 0.88% of the total values. I decided to replace the missing values with the median.

In [126]:
df['uvIndex'].isna().sum()

39

In [127]:
df['uvIndex'].fillna(value = df['uvIndex'].median(), inplace = True)

In [128]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   moonPhase           4456 non-null   float64       
 2   humidity            4456 non-null   float64       
 3   windSpeed           4456 non-null   float64       
 4   windBearing         4456 non-null   float64       
 5   cloudCover          4456 non-null   float64       
 6   uvIndex             4456 non-null   float64       
 7   temperatureMin      4456 non-null   float64       
 8   temperatureMax      4456 non-null   float64       
 9   precipType          4456 non-null   object        
 10  precipIntensity     4456 non-null   float64       
 11  precipAccumulation  4456 non-null   float64       
 12  winningTime         4456 non-null   object        
 13  daylightHours       4456 non-null   float64     

### Save cleaned data to file

In [129]:
df.to_csv('../data/cleaned_data.csv', index_label = 'Date')

### Read data from file

In [130]:
# read data
df = pd.read_csv('../data/cleaned_data.csv')

In [131]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date,moonPhase,humidity,windSpeed,windBearing,cloudCover,uvIndex,temperatureMin,temperatureMax,precipType,precipIntensity,precipAccumulation,winningTime,daylightHours,iceThickness
0,0,1989-01-01,0.81,0.83,4.67,148.0,0.18,0.0,3.68,25.8,,0.0,0.0,0,0.174306,42.0
1,1,1989-01-02,0.84,0.8,4.3,131.0,0.16,0.0,-8.27,24.93,,0.0,0.0,0,0.177083,42.0
2,2,1989-01-03,0.88,0.69,2.26,104.0,0.22,0.0,-10.28,4.69,,0.0,0.0,0,0.179861,42.0
3,3,1989-01-04,0.91,0.6,7.51,69.0,0.17,0.0,-16.58,7.67,,0.0,0.0,0,0.181944,42.0
4,4,1989-01-05,0.94,0.71,7.3,78.0,0.13,0.0,2.57,14.7,,0.0,0.0,0,0.184722,42.0


In [132]:
df.drop(columns = 'Unnamed: 0', inplace = True)

In [133]:
df.shape

(4456, 15)

In [134]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4456 entries, 0 to 4455
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                4456 non-null   object 
 1   moonPhase           4456 non-null   float64
 2   humidity            4456 non-null   float64
 3   windSpeed           4456 non-null   float64
 4   windBearing         4456 non-null   float64
 5   cloudCover          4456 non-null   float64
 6   uvIndex             4456 non-null   float64
 7   temperatureMin      4456 non-null   float64
 8   temperatureMax      4456 non-null   float64
 9   precipType          4456 non-null   object 
 10  precipIntensity     4456 non-null   float64
 11  precipAccumulation  4456 non-null   float64
 12  winningTime         4456 non-null   object 
 13  daylightHours       4456 non-null   float64
 14  iceThickness        4456 non-null   float64
dtypes: float64(12), object(3)
memory usage: 522.3+ KB


## Feature Engineering

In [135]:
df['Date'] = pd.to_datetime(df['Date'])

In [144]:
df['winningTime'] = pd.to_datetime(df['winningTime'], errors = 'ignore')

In [150]:
df.groupby('winningTime')['winningDate'].unique()

winningTime
0           [1]
10:28 AM    [1]
10:47 AM    [1]
10:53 PM    [1]
11:01 PM    [1]
12:00 PM    [1]
12:01 PM    [1]
12:04 AM    [1]
12:21 AM    [1]
12:32 PM    [1]
1:00 PM     [1]
1:01 PM     [1]
1:18 PM     [1]
2:16 PM     [1]
2:25 PM     [1]
2:41 PM     [1]
3:39 PM     [1]
3:47 PM     [1]
3:48 PM     [1]
4:24 PM     [1]
4:54 PM     [1]
5:19 PM     [1]
5:29 PM     [1]
6:22 PM     [1]
6:26 AM     [1]
7:39 PM     [1]
8:14 PM     [1]
8:41 PM     [1]
9:06 AM     [1]
9:27 PM     [1]
Name: winningDate, dtype: object

In [137]:
# create column for ordinal day of year
df['dayOfYear'] = df['Date'].dt.dayofyear

In [151]:
# # create binary 'winningDate' column
df['winningDate'] = 0
idx = df.loc[df['winningTime'] != '0'].index
df['winningDate'].loc[idx] = 1

In [152]:
df['winningDate'].value_counts()

0    4427
1      29
Name: winningDate, dtype: int64

In [153]:
df.loc[df['winningDate'] == 1]

Unnamed: 0,Date,moonPhase,humidity,windSpeed,windBearing,cloudCover,uvIndex,temperatureMin,temperatureMax,precipType,precipIntensity,precipAccumulation,winningTime,daylightHours,iceThickness,dayOfYear,winningDate
108,1989-05-01,0.87,0.47,3.72,277.0,0.14,4.0,36.22,61.53,,0.0,0.0,8:14 PM,0.707639,40.0,121,1
252,1990-04-24,0.0,0.61,0.0,0.0,0.16,4.0,28.66,44.74,,0.0,0.0,5:19 PM,0.674306,36.0,114,1
400,1991-05-01,0.6,0.49,4.13,139.0,0.1,4.0,31.43,57.62,,0.0,0.0,12:04 AM,0.70625,38.0,121,1
556,1992-05-14,0.45,0.49,5.83,239.0,0.32,5.0,30.52,51.11,,0.0,0.0,6:26 AM,0.770833,47.0,135,1
684,1993-04-23,0.07,0.52,7.72,330.0,0.02,4.0,31.28,45.5,,0.0,0.0,1:01 PM,0.670139,25.0,113,1
839,1994-04-29,0.67,0.31,4.39,22.0,0.13,4.0,26.51,59.54,,0.0,0.0,11:01 PM,0.697222,51.0,119,1
1007,1996-05-05,0.6,0.44,4.42,64.0,0.38,4.0,32.47,53.42,,0.0,0.0,12:32 PM,0.728472,45.0,126,1
1153,1997-04-30,0.79,0.48,4.05,252.0,0.565,1.0,25.75,61.35,,0.0,0.0,10:28 AM,0.703472,36.0,120,1
1291,1998-04-20,0.8,0.6,4.06,287.0,0.35,3.0,25.97,52.56,rain,0.0,0.0,4:54 PM,0.654861,38.0,110,1
1577,2000-05-01,0.93,0.46,4.97,211.0,0.88,4.0,34.92,55.95,,0.0,0.0,10:47 AM,0.709722,36.0,122,1


The information for 1995 and 1999 are missing some dates, including the winning date. I decided to drop the data for those years, since there is no target as a result. Survival analysis would also falsely treat those years as censored.

In [154]:
drop_1995_1999 = df.loc[(df['Date'].dt.year == 1995)|(df['Date'].dt.year == 1999)].index

In [155]:
df.drop(index = drop_1995_1999, inplace = True)

Drop all records for a year that are after the ice broke.

In [156]:
year_list = sorted(list(set(df['Date'].dt.year)))

In [157]:
print(year_list)

[1989, 1990, 1991, 1992, 1993, 1994, 1996, 1997, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]


In [158]:
# get winning date indices
idx_w = df.loc[df['winningDate'] == 1].index
idx_w

Int64Index([ 108,  252,  400,  556,  684,  839, 1007, 1153, 1291, 1577, 1735,
            1875, 2003, 2150, 2305, 2460, 2606, 2767, 2913, 3062, 3218, 3359,
            3537, 3663, 3813, 3964, 4123, 4274, 4408],
           dtype='int64')

In [159]:
# make a list of indices to drop
drop_index = []
for i, year in enumerate(year_list):
    idx_y = df.loc[df['Date'].dt.year == year].index
    for idx in idx_y:
        if idx > idx_w[i]:
            drop_index.append(idx)
        else:
            pass

In [160]:
# drop observations that occurred after the winning date in a year
for idx in drop_index:
    df.drop(index = idx, inplace = True)

In [161]:
df.shape

(3336, 17)

In [162]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3336 entries, 0 to 4408
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                3336 non-null   datetime64[ns]
 1   moonPhase           3336 non-null   float64       
 2   humidity            3336 non-null   float64       
 3   windSpeed           3336 non-null   float64       
 4   windBearing         3336 non-null   float64       
 5   cloudCover          3336 non-null   float64       
 6   uvIndex             3336 non-null   float64       
 7   temperatureMin      3336 non-null   float64       
 8   temperatureMax      3336 non-null   float64       
 9   precipType          3336 non-null   object        
 10  precipIntensity     3336 non-null   float64       
 11  precipAccumulation  3336 non-null   float64       
 12  winningTime         3336 non-null   object        
 13  daylightHours       3336 non-null   float64     

In [163]:
df['past'] = (df['Date'] < '2015-01-01').astype(np.int)

In [164]:
df['future'] = 1 - df['past']

In [165]:
df.head(3).append(df.tail(3))

Unnamed: 0,Date,moonPhase,humidity,windSpeed,windBearing,cloudCover,uvIndex,temperatureMin,temperatureMax,precipType,precipIntensity,precipAccumulation,winningTime,daylightHours,iceThickness,dayOfYear,winningDate,past,future
0,1989-01-01,0.81,0.83,4.67,148.0,0.18,0.0,3.68,25.8,,0.0,0.0,0,0.174306,42.0,1,0,1,0
1,1989-01-02,0.84,0.8,4.3,131.0,0.16,0.0,-8.27,24.93,,0.0,0.0,0,0.177083,42.0,2,0,1,0
2,1989-01-03,0.88,0.69,2.26,104.0,0.22,0.0,-10.28,4.69,,0.0,0.0,0,0.179861,42.0,3,0,1,0
4406,2019-04-12,0.27,0.6,2.5,322.0,0.84,2.0,33.09,46.88,,0.0,0.0,0,0.616667,25.3,102,0,0,1
4407,2019-04-13,0.3,0.7,2.14,16.0,0.9,2.0,33.09,49.34,rain,0.0007,0.0,0,0.622222,25.3,103,0,0,1
4408,2019-04-14,0.34,0.67,4.9,338.0,0.54,2.0,32.54,47.91,rain,0.0003,0.0,12:21 AM,0.626389,25.3,104,1,0,1


In [166]:
df['precipType'].value_counts()

None    2657
snow     627
rain      52
Name: precipType, dtype: int64

In [167]:
# encode precipType
df = df.merge(pd.get_dummies(data = df['precipType'], prefix = 'precip', drop_first = True, sparse = True),
              how = 'left', left_index = True, right_index = True)

In [168]:
# drop precipType after encoding
df.drop(columns = 'precipType', inplace = True)
gc.collect()

1205

Create column for daily average temperature

In [169]:
df['temperatureAvg'] = (df['temperatureMin'] + df['temperatureMax']) / 2

Create columns for number of "hot days," "cold days," and snow accumulated since Apr 1 in a given year.

I defined a "hot day" as a day where: day_average_temp > median(year_avg_temp) + std_dev(year_avg_temp)

A "cold day" is a day where: day_average_temp < median(year_avg_temp) - std_dev(year_avg_temp)

In [170]:
hot_count = []
cold_count = []
daily_accumulation = []
for year in year_list:
    hot_temp_count = 0
    cold_temp_count = 0
    daily_accum = 0
    temp_df = df.loc[df['Date'].dt.year == year]
    hot_threshold = temp_df['temperatureAvg'].median() + temp_df['temperatureAvg'].std()
    cold_threshold = temp_df['temperatureAvg'].median() - temp_df['temperatureAvg'].std()
    for idx in temp_df.index:
        current_temp = temp_df['temperatureAvg'].loc[idx]
        if temp_df['precip_snow'].loc[idx] == 1:
            daily_accum += temp_df['precipAccumulation'].loc[idx]
        else:
            pass
        if current_temp >= hot_threshold:
            hot_temp_count += 1
        elif current_temp <= cold_threshold:
            cold_temp_count += 1
        else:
            pass
        hot_count.append(hot_temp_count)
        cold_count.append(cold_temp_count)
        daily_accumulation.append(daily_accum)

In [171]:
df['numHotDays'] = hot_count
df['numColdDays'] = cold_count
df['accumulatedSnow'] = daily_accumulation

#### Save data before adding rolling average features.

In [172]:
df.to_csv('../data/pre-moving-average_data.csv', index = False)

Create columns for moving average features.

In [173]:
ma_cols = ['humidity', 'windSpeed', 'windBearing', 'uvIndex', 'precipIntensity', 'iceThickness', 'temperatureAvg', 'numHotDays', 'numColdDays']
windows = [3, 5, 7, 10]

In [174]:
# # first add new columns with dummy info
for col in ma_cols:
    for window in windows:
        label_ma = col + '_MA' + str(window)
        df[label_ma] = 0
        
        label_slope = col + '_MA-slope' + str(window)
        df[label_slope] = 0
        
        label_accel = col + '_MA-accel' + str(window)
        df[label_accel] = 0
        
        label_std = col + '_MA-std_dev' + str(window)
        df[label_std] = 0

started at 8:55

In [175]:
# Update each year with its rolling averages
for year in year_list:
    temp_df = df.loc[df['Date'].dt.year == year]
    for col in ma_cols:
        for window in windows:
            # assign labels
            label_ma = col + '_MA' + str(window)
            label_slope = col + '_MA-slope' + str(window)
            label_accel = col + '_MA-accel' + str(window)
            label_std = col + '_MA-std_dev' + str(window)
            # for each year, update row values in new columns
            for idx in temp_df.index:
                df[label_ma].loc[idx] = temp_df[col].rolling(window).mean().loc[idx]
                df[label_slope].loc[idx] = temp_df[col].rolling(window).apply(lambda x: slope(x)).loc[idx]
                df[label_accel].loc[idx] = temp_df[col].rolling(window).apply(lambda x: accel(x)).loc[idx]
                df[label_std].loc[idx] = temp_df[col].rolling(window).std().loc[idx]

completed at 11:09

In [176]:
df.sample(7)

Unnamed: 0,Date,moonPhase,humidity,windSpeed,windBearing,cloudCover,uvIndex,temperatureMin,temperatureMax,precipIntensity,...,numColdDays_MA-accel5,numColdDays_MA-std_dev5,numColdDays_MA7,numColdDays_MA-slope7,numColdDays_MA-accel7,numColdDays_MA-std_dev7,numColdDays_MA10,numColdDays_MA-slope10,numColdDays_MA-accel10,numColdDays_MA-std_dev10
155,1990-01-17,0.73,0.83,0.0,0.0,0.38,0.0,2.79,19.74,0.0,...,1.568052e-15,7.450581e-09,3.0,-4.65532e-17,-3.853551e-16,4.10346e-08,2.7,0.1515152,-0.2651515,0.6749486
1046,1997-01-13,0.19,0.58,4.59,104.0,0.565,0.0,5.64,41.74,0.0,...,5.880195e-16,0.0,7.857143,0.1071429,-0.297619,0.3779645,7.0,0.4242424,-0.4734848,1.490712
3448,2013-02-20,0.35,0.61,2.51,331.0,0.4,0.0,-18.29,3.65,0.0002,...,0.7142857,1.30384,6.571429,0.6785714,0.4166667,1.511858,6.1,0.430303,0.3977273,1.449138
102,1989-04-17,0.41,0.59,0.0,0.0,0.12,2.0,0.35,33.61,0.0,...,3.92013e-15,0.0,20.0,-1.489702e-15,-6.165682e-16,0.0,20.0,1.052223e-15,-1.302353e-15,0.0
548,1992-05-06,0.16,0.57,5.28,52.0,0.12,5.0,14.75,48.56,0.0,...,3.92013e-16,0.0,13.0,-1.117277e-15,3.699409e-15,0.0,13.0,4.208892e-16,-1.065562e-15,1.835459e-07
1790,2002-02-01,0.67,0.73,0.0,0.0,0.41,0.0,-24.55,-4.84,0.0,...,1.071429,0.8944272,9.0,0.5714286,-2.620415e-15,1.290994,7.8,0.7272727,-0.2083333,2.250926
59,1989-03-01,0.8,0.7,8.42,236.0,0.29,1.0,20.58,29.65,0.0,...,9.408312e-15,0.0,19.0,-1.489702e-15,-2.466273e-15,0.0,19.0,1.262668e-15,1.775936e-15,0.0


In [177]:
df.shape

(3336, 168)

#### Save results to file

In [178]:
df.to_csv('../data/ma_features_added.csv', index = False)

#### Read data from file

In [111]:
df = pd.read_csv('../data/ma_features_added.csv')

In [112]:
df.head()

### Drop columns that are highly correlated
* temperatureMin and temperatureMax information was captured in temperatureAvg
* precipAccumulation information was captured in accumulatedSnow

In [179]:
df.drop(columns = ['temperatureMin', 'temperatureMax', 'precipAccumulation'],
        inplace = True)

### Create training and testing DataFrames

In [180]:
train = df.loc[df['past'] == 1]
train.drop(columns = ['past', 'future'], inplace = True)

In [181]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2762 entries, 0 to 3663
Columns: 163 entries, Date to numColdDays_MA-std_dev10
dtypes: Sparse[int64, 0](2), datetime64[ns](1), float64(155), int64(4), object(1)
memory usage: 3.4+ MB


In [182]:
test = df.loc[df['future'] == 1]
test.drop(columns = ['past', 'future'], inplace = True)

In [183]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 574 entries, 3700 to 4408
Columns: 163 entries, Date to numColdDays_MA-std_dev10
dtypes: Sparse[int64, 0](2), datetime64[ns](1), float64(155), int64(4), object(1)
memory usage: 728.4+ KB


In [184]:
train.dropna(inplace = True)

In [185]:
test.dropna(inplace = True)

In [186]:
df.drop(columns = ['past', 'future'], inplace = True)
gc.collect()

0

#### Save training and testing DataFrames to file

In [187]:
train.to_csv('../data/model_training_data.csv', index = False)

In [188]:
test.to_csv('../data/model_testing_data.csv', index = False)