# Nenana Ice Classic Data Processing
This notebook was used to process the data gathered for this project.

In [1]:
# imports

# data
import numpy as np
import pandas as pd

# date processing
import datetime

# filter warnings
import warnings
warnings.filterwarnings("ignore")

# garbage collection
import gc

## Helper Functions

In [2]:
def slope(y):
    x = range(len(y))
    # y = mx + b
    m, b = np.polyfit(x, y, 1)
    return m

def accel(y):
    t = range(len(y))
    # y = 1/2 g t^2 + v t + y0
    a, v, y0 = np.polyfit(t, y, 2)
    return 5*a

In [44]:
def gap_fill_mean(data, missing_val_field):
    '''
    This function fills in missing values in a DataFrame column with the mean of the values before
    and after a missing value.
    '''
    missing_idx = []
    start_idx, mean_val, end_idx = 0, 0, 0
    found_start, found_end = False, False
    for idx in data.index:
        if np.isnan(data[missing_val_field].loc[idx]) == True:
            missing_idx.append(idx)
            # if missing value at index, check if index is first or last entry in data
            # if not first or last, check previous and next records for values
            if idx != data.index[0] and idx != data.index[-1]:
                if np.isnan(data[missing_val_field].loc[idx - 1]) == False:
                    start_idx = idx - 1
                    found_start = True
                if np.isnan(data[missing_val_field].loc[idx + 1]) == False:
                    end_idx = idx + 1
                    found_end = True
            else:
                # if first entry is missing
                if idx == data.index[0]:
                    # check 2nd entry; if 2nd exists, make that index the end
                    if np.isnan(data[missing_val_field].loc[idx + 1]) == False:
                        end_idx = idx + 1
                        found_end = True
                    else:
                        # if 2nd entry also missing, end index will be found but not start
                        pass # handle later
                else:
                    # if last entry is missing, check next to last; if exists, make that start & end
                    if np.isnan(data[missing_val_field].loc[idx - 1]) == False:
                        start_idx = idx - 1
                        found_start = True
                    else:
                        # if next to last also missing, start index will be found but not end
                        pass # handle later
        # normal process, start and end indices are found
        if found_start and found_end:
            mean_val = np.mean([data[missing_val_field].loc[start_idx], data[missing_val_field].loc[end_idx]])
            for upd_idx in range(start_idx + 1, end_idx):
                data[missing_val_field].loc[upd_idx] = mean_val
            found_start, found_end = False, False
            start_idx, mean_val, end_idx = 0, 0, 0
        # values missing at start of data, end index was found but not start
        elif found_end and not found_start:
            # update values from start of data until end index
            for upd_idx in range(data.index[0], end_idx):
                data[missing_val_field].loc[upd_idx] = data[missing_val_field].loc[end_idx]
            found_end = False
            end_idx = 0
        # value(s) missing at end of data
        elif idx == data.index[-1] and found_start and not found_end:
            # update values from start index until end of data
            for upd_idx in range(start_idx + 1, data.index[-1]):
                data[missing_val_field].loc[upd_idx] = data[missing_val_field].loc[start_idx]
            # set last entry to most recent known value
            data[missing_val_field].loc[data.index[-1]] = data[missing_val_field].loc[start_idx]
            found_start = False
            start_idx = 0

## Reading data from files

In [3]:
ice_df = pd.read_csv('../data/raw_ice_thickness_1989-2019.csv')
ice_df.head()

Unnamed: 0,Date,Date.1,Thickness
0,0,2019-01-16,16.0
1,1,2019-02-07,16.0
2,2,2019-02-26,23.5
3,3,2019-03-04,32.5
4,4,2019-03-13,25.7


In [4]:
ice_df.drop(columns = 'Date', inplace = True)

In [5]:
ice_df.rename(columns = {'Date.1' : 'Date', 'Thickness' : 'iceThickness'}, inplace = True)

In [6]:
ice_df.shape

(412, 2)

In [7]:
ice_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 412 entries, 0 to 411
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          412 non-null    object 
 1   iceThickness  412 non-null    float64
dtypes: float64(1), object(1)
memory usage: 6.6+ KB


In [8]:
weather_df = pd.read_csv('../data/raw_weather_1989-2020.csv')
weather_df.head()

Unnamed: 0,Date,time,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipAccumulation,windGust,windGustTime,icon,precipProbability,summary,ozone
0,1989-01-01,599648400.0,599687760.0,599702820.0,0.81,15.77,599673600.0,10.7,599716800.0,13.85,...,,,,,,,,,,
1,1989-01-02,599734800.0,599774040.0,599789340.0,0.84,17.8,599788620.0,-10.28,599842380.0,15.37,...,,,,,,,,,,
2,1989-01-03,599821200.0,599860380.0,599875920.0,0.88,4.69,599878800.0,-16.58,599912040.0,3.28,...,,,,,,,,,,
3,1989-01-04,599907600.0,599946720.0,599962440.0,0.91,7.67,599940000.0,2.57,600009240.0,0.52,...,,,,,,,,,,
4,1989-01-05,599994000.0,600033000.0,600048960.0,0.94,14.7,600058800.0,10.64,600062820.0,5.23,...,,,,,,,,,,


In [9]:
weather_df.rename(columns = {'time' : 'readingTime',
                             'summary' : 'weatherSummary',
                             'pressure' : 'atmoPressure'}, inplace = True)

In [10]:
weather_df.shape

(4456, 41)

In [11]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4456 entries, 0 to 4455
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         4456 non-null   object 
 1   readingTime                  4456 non-null   float64
 2   sunriseTime                  4456 non-null   float64
 3   sunsetTime                   4456 non-null   float64
 4   moonPhase                    4456 non-null   float64
 5   temperatureHigh              4420 non-null   float64
 6   temperatureHighTime          4420 non-null   float64
 7   temperatureLow               4310 non-null   float64
 8   temperatureLowTime           4310 non-null   float64
 9   apparentTemperatureHigh      4420 non-null   float64
 10  apparentTemperatureHighTime  4420 non-null   float64
 11  apparentTemperatureLow       4310 non-null   float64
 12  apparentTemperatureLowTime   4310 non-null   float64
 13  dewPoint          

In [12]:
winners_df = pd.read_csv('../data/ice_classic_winning_times.csv')
winners_df.head()

Unnamed: 0,Date,Time
0,1917-04-30,11:30 AM
1,1918-05-11,9:33 AM
2,1919-05-03,2:33 PM
3,1920-05-11,10:46 AM
4,1921-05-11,6:42 AM


In [13]:
winners_df.rename(columns = {'Time' : 'winningTime'}, inplace = True)

In [14]:
winners_df.shape

(103, 2)

In [15]:
winners_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Date         103 non-null    object
 1   winningTime  103 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


In [16]:
river_df = pd.read_csv('../data/river_flow_data.csv')
river_df.head()

Unnamed: 0,dateTime,value,qualifiers,dateTime.1
0,0,7600,A,1989-01-01
1,1,7600,A,1989-01-02
2,2,7600,A,1989-01-03
3,3,7600,A,1989-01-04
4,4,7600,A,1989-01-05


In [17]:
river_df.drop(columns = ['dateTime', 'qualifiers'], inplace = True)

In [18]:
river_df.rename(columns = {'dateTime.1' : 'Date', 'value' : 'flowVolume'}, inplace = True)

In [19]:
river_df.shape

(4688, 2)

In [20]:
river_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4688 entries, 0 to 4687
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   flowVolume  4688 non-null   int64 
 1   Date        4688 non-null   object
dtypes: int64(1), object(1)
memory usage: 73.4+ KB


## Merge the DataFrames into one

In [21]:
merged_df = weather_df.merge(winners_df, how = 'left', on = 'Date')
merged_df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipIntensityMax,precipIntensityMaxTime,precipAccumulation,windGust,windGustTime,icon,precipProbability,weatherSummary,ozone,winningTime
0,1989-01-01,599648400.0,599687760.0,599702820.0,0.81,15.77,599673600.0,10.7,599716800.0,13.85,...,,,,,,,,,,
1,1989-01-02,599734800.0,599774040.0,599789340.0,0.84,17.8,599788620.0,-10.28,599842380.0,15.37,...,,,,,,,,,,
2,1989-01-03,599821200.0,599860380.0,599875920.0,0.88,4.69,599878800.0,-16.58,599912040.0,3.28,...,,,,,,,,,,
3,1989-01-04,599907600.0,599946720.0,599962440.0,0.91,7.67,599940000.0,2.57,600009240.0,0.52,...,,,,,,,,,,
4,1989-01-05,599994000.0,600033000.0,600048960.0,0.94,14.7,600058800.0,10.64,600062820.0,5.23,...,,,,,,,,,,


In [22]:
merged_df.shape

(4456, 42)

In [23]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         4456 non-null   object 
 1   readingTime                  4456 non-null   float64
 2   sunriseTime                  4456 non-null   float64
 3   sunsetTime                   4456 non-null   float64
 4   moonPhase                    4456 non-null   float64
 5   temperatureHigh              4420 non-null   float64
 6   temperatureHighTime          4420 non-null   float64
 7   temperatureLow               4310 non-null   float64
 8   temperatureLowTime           4310 non-null   float64
 9   apparentTemperatureHigh      4420 non-null   float64
 10  apparentTemperatureHighTime  4420 non-null   float64
 11  apparentTemperatureLow       4310 non-null   float64
 12  apparentTemperatureLowTime   4310 non-null   float64
 13  dewPoint          

In [24]:
merged_df = merged_df.merge(ice_df, how = 'left', on = 'Date')
merged_df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipIntensityMaxTime,precipAccumulation,windGust,windGustTime,icon,precipProbability,weatherSummary,ozone,winningTime,iceThickness
0,1989-01-01,599648400.0,599687760.0,599702820.0,0.81,15.77,599673600.0,10.7,599716800.0,13.85,...,,,,,,,,,,
1,1989-01-02,599734800.0,599774040.0,599789340.0,0.84,17.8,599788620.0,-10.28,599842380.0,15.37,...,,,,,,,,,,
2,1989-01-03,599821200.0,599860380.0,599875920.0,0.88,4.69,599878800.0,-16.58,599912040.0,3.28,...,,,,,,,,,,
3,1989-01-04,599907600.0,599946720.0,599962440.0,0.91,7.67,599940000.0,2.57,600009240.0,0.52,...,,,,,,,,,,
4,1989-01-05,599994000.0,600033000.0,600048960.0,0.94,14.7,600058800.0,10.64,600062820.0,5.23,...,,,,,,,,,,


In [25]:
merged_df.shape

(4456, 43)

In [26]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 43 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         4456 non-null   object 
 1   readingTime                  4456 non-null   float64
 2   sunriseTime                  4456 non-null   float64
 3   sunsetTime                   4456 non-null   float64
 4   moonPhase                    4456 non-null   float64
 5   temperatureHigh              4420 non-null   float64
 6   temperatureHighTime          4420 non-null   float64
 7   temperatureLow               4310 non-null   float64
 8   temperatureLowTime           4310 non-null   float64
 9   apparentTemperatureHigh      4420 non-null   float64
 10  apparentTemperatureHighTime  4420 non-null   float64
 11  apparentTemperatureLow       4310 non-null   float64
 12  apparentTemperatureLowTime   4310 non-null   float64
 13  dewPoint          

In [27]:
df = merged_df.merge(river_df, how = 'left', on = 'Date')
df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipAccumulation,windGust,windGustTime,icon,precipProbability,weatherSummary,ozone,winningTime,iceThickness,flowVolume
0,1989-01-01,599648400.0,599687760.0,599702820.0,0.81,15.77,599673600.0,10.7,599716800.0,13.85,...,,,,,,,,,,7600
1,1989-01-02,599734800.0,599774040.0,599789340.0,0.84,17.8,599788620.0,-10.28,599842380.0,15.37,...,,,,,,,,,,7600
2,1989-01-03,599821200.0,599860380.0,599875920.0,0.88,4.69,599878800.0,-16.58,599912040.0,3.28,...,,,,,,,,,,7600
3,1989-01-04,599907600.0,599946720.0,599962440.0,0.91,7.67,599940000.0,2.57,600009240.0,0.52,...,,,,,,,,,,7600
4,1989-01-05,599994000.0,600033000.0,600048960.0,0.94,14.7,600058800.0,10.64,600062820.0,5.23,...,,,,,,,,,,7600


In [28]:
df.shape

(4456, 44)

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         4456 non-null   object 
 1   readingTime                  4456 non-null   float64
 2   sunriseTime                  4456 non-null   float64
 3   sunsetTime                   4456 non-null   float64
 4   moonPhase                    4456 non-null   float64
 5   temperatureHigh              4420 non-null   float64
 6   temperatureHighTime          4420 non-null   float64
 7   temperatureLow               4310 non-null   float64
 8   temperatureLowTime           4310 non-null   float64
 9   apparentTemperatureHigh      4420 non-null   float64
 10  apparentTemperatureHighTime  4420 non-null   float64
 11  apparentTemperatureLow       4310 non-null   float64
 12  apparentTemperatureLowTime   4310 non-null   float64
 13  dewPoint          

In [30]:
df.columns

Index(['Date', 'readingTime', 'sunriseTime', 'sunsetTime', 'moonPhase',
       'temperatureHigh', 'temperatureHighTime', 'temperatureLow',
       'temperatureLowTime', 'apparentTemperatureHigh',
       'apparentTemperatureHighTime', 'apparentTemperatureLow',
       'apparentTemperatureLowTime', 'dewPoint', 'humidity', 'windSpeed',
       'windBearing', 'cloudCover', 'uvIndex', 'uvIndexTime', 'visibility',
       'temperatureMin', 'temperatureMinTime', 'temperatureMax',
       'temperatureMaxTime', 'apparentTemperatureMin',
       'apparentTemperatureMinTime', 'apparentTemperatureMax',
       'apparentTemperatureMaxTime', 'atmoPressure', 'precipType',
       'precipIntensity', 'precipIntensityMax', 'precipIntensityMaxTime',
       'precipAccumulation', 'windGust', 'windGustTime', 'icon',
       'precipProbability', 'weatherSummary', 'ozone', 'winningTime',
       'iceThickness', 'flowVolume'],
      dtype='object')

In [31]:
# delete unneeded variables
del merged_df, ice_df, winners_df, weather_df, river_df
gc.collect()

0

#### Convert Date And Time Columns To Datetime Object

In [32]:
# convert Date from string to datetime object
df['Date'] = pd.to_datetime(df['Date'], yearfirst = True, infer_datetime_format = True)
df['winningTime'] = pd.to_datetime(df['winningTime'], infer_datetime_format = True, errors = 'ignore')

### Dealing With Missing Values

---
#### (event)Time, apparentTemperature(category), weatherSummary, precipProbability, precipIntensityMax, visibility, temperatureHigh, temperatureLow, dewPoint, icon, ozone, windGust
I didn't think any of these would help with this prediction task.

My reasoning:
* Most "(event)Time" features dropped were because they only captured a single, specific event (the time the daytime high temperature was recorded, for example)
  * I may go back and get hourly data; something like length of time that the temperature was high or low for a day might have an impact. For now, I passed this by due to time constraints.
* "apparentTemperature(category)" features were dropped because they're a representation of what conditions feel like, not what they actually are.
* "weatherSummary"  was dropped because it's a text summary of information captured by other features.
* "precipProbability" was dropped because this is past data and the presence/amount of precipitation is known.
* "precipIntensityMax" was dropped because the precipitation rate is already captured by "precipIntensity" and I don't think knowing the maximum precipitation rate adds anything.
* "visibility" was dropped because atmospheric conditions are already captured by other features (e.g., "uvIndex," "precipIntensity")
* "temperatureHigh" and "temperatureLow" were dropped because the first is the daytime high and the second is the nighttime low; the 24 hour maximum and minimum are captured by "temperatureMax" and "temperatureMin" respectively
* "dewPoint" was dropped because it's correlated with temperature and humidity
* "icon" was dropped because it was missing values in just under half of the columns and it didn't appear to capture any unique information that wasn't already captured in other columns.
* "ozone" and "windGust" columns were dropped because they were missing so much information, and I had no good strategy for imputing those missing values.

In [33]:
df.drop(columns = ['readingTime', 'weatherSummary', 'precipProbability', 'apparentTemperatureHigh',
                   'apparentTemperatureHighTime', 'apparentTemperatureLow', 'apparentTemperatureLowTime',
                   'uvIndexTime', 'apparentTemperatureMin', 'apparentTemperatureMinTime',
                   'apparentTemperatureMax', 'apparentTemperatureMaxTime', 'visibility', 'precipIntensityMax',
                   'dewPoint', 'temperatureHigh', 'temperatureHighTime', 'temperatureLow',
                   'temperatureLowTime', 'temperatureMinTime', 'temperatureMaxTime', 'icon', 'ozone',
                   'precipIntensityMaxTime', 'windGust', 'windGustTime'],
        inplace = True)

In [34]:
gc.collect()

0

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   sunriseTime         4456 non-null   float64       
 2   sunsetTime          4456 non-null   float64       
 3   moonPhase           4456 non-null   float64       
 4   humidity            4285 non-null   float64       
 5   windSpeed           4207 non-null   float64       
 6   windBearing         4171 non-null   float64       
 7   cloudCover          4196 non-null   float64       
 8   uvIndex             4417 non-null   float64       
 9   temperatureMin      4344 non-null   float64       
 10  temperatureMax      4344 non-null   float64       
 11  atmoPressure        3233 non-null   float64       
 12  precipType          870 non-null    object        
 13  precipIntensity     2327 non-null   float64     

---
#### Forward fill ice thickness values until next observed value
Since ice thickness is only measured periodically, I chose to make the naive assumption that it remains constant until the next measurement.

In [36]:
df['iceThickness'].fillna(method = 'pad', inplace = True)

In [37]:
df.head()

Unnamed: 0,Date,sunriseTime,sunsetTime,moonPhase,humidity,windSpeed,windBearing,cloudCover,uvIndex,temperatureMin,temperatureMax,atmoPressure,precipType,precipIntensity,precipAccumulation,winningTime,iceThickness,flowVolume
0,1989-01-01,599687760.0,599702820.0,0.81,0.83,4.67,148.0,0.18,0.0,3.68,25.8,,,,,NaT,,7600
1,1989-01-02,599774040.0,599789340.0,0.84,0.8,4.3,131.0,0.16,0.0,-8.27,24.93,,,,,NaT,,7600
2,1989-01-03,599860380.0,599875920.0,0.88,0.69,2.26,104.0,0.22,0.0,-10.28,4.69,,,,,NaT,,7600
3,1989-01-04,599946720.0,599962440.0,0.91,0.6,7.51,69.0,0.17,0.0,-16.58,7.67,,,,,NaT,,7600
4,1989-01-05,600033000.0,600048960.0,0.94,0.71,7.3,78.0,0.13,0.0,2.57,14.7,,,,,NaT,,7600


In [38]:
df['iceThickness'].isna().sum()

56

In [39]:
df.loc[df['iceThickness'].isna() == False]

Unnamed: 0,Date,sunriseTime,sunsetTime,moonPhase,humidity,windSpeed,windBearing,cloudCover,uvIndex,temperatureMin,temperatureMax,atmoPressure,precipType,precipIntensity,precipAccumulation,winningTime,iceThickness,flowVolume
56,1989-02-26,6.045165e+08,6.045520e+08,0.70,0.71,7.81,248.0,0.06,1.0,-4.22,22.67,,,,,NaT,42.0,5800
57,1989-02-27,6.046027e+08,6.046385e+08,0.73,0.49,4.61,86.0,0.00,1.0,-15.30,9.71,,,,,NaT,42.0,5800
58,1989-02-28,6.046889e+08,6.047251e+08,0.76,0.43,4.84,85.0,0.22,1.0,3.59,27.37,,,,,NaT,42.0,5800
59,1989-03-01,6.047751e+08,6.048117e+08,0.80,0.70,8.42,236.0,0.29,1.0,20.58,29.65,,,,,NaT,42.0,6000
60,1989-03-02,6.048613e+08,6.048983e+08,0.83,0.68,8.59,266.0,0.10,1.0,-7.38,29.07,,,,,NaT,42.0,6000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4451,2019-05-27,1.558958e+09,1.559030e+09,0.80,0.45,3.16,204.0,0.29,4.0,52.98,68.43,1006.8,rain,0.0003,,NaT,25.3,28200
4452,2019-05-28,1.559045e+09,1.559116e+09,0.83,0.57,2.26,346.0,0.62,4.0,49.42,67.16,1013.8,rain,0.0008,,NaT,25.3,30100
4453,2019-05-29,1.559131e+09,1.559203e+09,0.86,0.54,2.13,119.0,0.44,4.0,47.52,70.55,1014.4,rain,0.0001,,NaT,25.3,30100
4454,2019-05-30,1.559217e+09,1.559290e+09,0.89,0.43,3.24,87.0,0.41,5.0,52.98,73.62,1011.0,rain,0.0001,,NaT,25.3,29200


The first 56 days' values were missed by the first fill operation; I backfilled them to eliminate NaN values in the iceThickness column.

In [40]:
df['iceThickness'].fillna(method = 'bfill', inplace = True)

In [41]:
df['iceThickness'].isna().sum()

0

In [42]:
df['iceThickness'].value_counts()

36.0    276
38.0    216
42.0    215
25.0    173
40.0    166
       ... 
31.9      3
49.3      3
49.7      3
55.0      3
27.0      1
Name: iceThickness, Length: 138, dtype: int64

---
#### Display DataFrame information
I wanted an idea of what other missing data I needed to deal with.

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   sunriseTime         4456 non-null   float64       
 2   sunsetTime          4456 non-null   float64       
 3   moonPhase           4456 non-null   float64       
 4   humidity            4285 non-null   float64       
 5   windSpeed           4207 non-null   float64       
 6   windBearing         4171 non-null   float64       
 7   cloudCover          4196 non-null   float64       
 8   uvIndex             4417 non-null   float64       
 9   temperatureMin      4344 non-null   float64       
 10  temperatureMax      4344 non-null   float64       
 11  atmoPressure        3233 non-null   float64       
 12  precipType          870 non-null    object        
 13  precipIntensity     2327 non-null   float64     

---
#### temperatureMin, temperatureMax
There were 112 missing values in these columns. I decided to fill in missing values with the mean of the values before and after a missing value.

In [45]:
# for each year in the data, fill missing values with mean for previous known and next known values
year_list = [year for year in df['Date'].dt.year.unique()]
for year in year_list:
    temp_df = df.loc[df['Date'].dt.year == year]
    gap_fill_mean(temp_df, 'temperatureMin')
    gap_fill_mean(temp_df, 'temperatureMax')
    for idx in temp_df.index:
        df['temperatureMin'].loc[idx] = temp_df['temperatureMin'].loc[idx]
        df['temperatureMax'].loc[idx] = temp_df['temperatureMax'].loc[idx]

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   sunriseTime         4456 non-null   float64       
 2   sunsetTime          4456 non-null   float64       
 3   moonPhase           4456 non-null   float64       
 4   humidity            4285 non-null   float64       
 5   windSpeed           4207 non-null   float64       
 6   windBearing         4171 non-null   float64       
 7   cloudCover          4196 non-null   float64       
 8   uvIndex             4417 non-null   float64       
 9   temperatureMin      4456 non-null   float64       
 10  temperatureMax      4456 non-null   float64       
 11  atmoPressure        3233 non-null   float64       
 12  precipType          870 non-null    object        
 13  precipIntensity     2327 non-null   float64     

---
#### precipAccumulation
Since precipAccumulation is defined as "The amount of snowfall accumulation expected to occur (over the hour or day, respectively), in inches. (If no snowfall is expected, this property will not be defined.)," I decided to fill missing values in that column with 0.

In [48]:
df['precipAccumulation'].isna().sum()

3941

In [49]:
df['precipAccumulation'].fillna(value = 0, downcast = 'infer', inplace = True)

In [50]:
df['precipAccumulation'].isna().sum()

0

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   sunriseTime         4456 non-null   float64       
 2   sunsetTime          4456 non-null   float64       
 3   moonPhase           4456 non-null   float64       
 4   humidity            4285 non-null   float64       
 5   windSpeed           4207 non-null   float64       
 6   windBearing         4171 non-null   float64       
 7   cloudCover          4196 non-null   float64       
 8   uvIndex             4417 non-null   float64       
 9   temperatureMin      4456 non-null   float64       
 10  temperatureMax      4456 non-null   float64       
 11  atmoPressure        3233 non-null   float64       
 12  precipType          870 non-null    object        
 13  precipIntensity     2327 non-null   float64     

---
#### precipType
I wanted to keep this column since rain or snow could impact the river ice, so I filled NaNs with 'None'.

In [52]:
df['precipType'].value_counts()

snow    664
rain    206
Name: precipType, dtype: int64

In [53]:
df['precipType'].fillna(value = 'None', inplace = True)

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   sunriseTime         4456 non-null   float64       
 2   sunsetTime          4456 non-null   float64       
 3   moonPhase           4456 non-null   float64       
 4   humidity            4285 non-null   float64       
 5   windSpeed           4207 non-null   float64       
 6   windBearing         4171 non-null   float64       
 7   cloudCover          4196 non-null   float64       
 8   uvIndex             4417 non-null   float64       
 9   temperatureMin      4456 non-null   float64       
 10  temperatureMax      4456 non-null   float64       
 11  atmoPressure        3233 non-null   float64       
 12  precipType          4456 non-null   object        
 13  precipIntensity     2327 non-null   float64     

---
#### winningTime
I filled missing values in 'winningTime' with zeroes, since there will only be an entry in this column for the day that the ice broke.

In [55]:
df['winningTime'].fillna(value = 0, inplace = True)

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   sunriseTime         4456 non-null   float64       
 2   sunsetTime          4456 non-null   float64       
 3   moonPhase           4456 non-null   float64       
 4   humidity            4285 non-null   float64       
 5   windSpeed           4207 non-null   float64       
 6   windBearing         4171 non-null   float64       
 7   cloudCover          4196 non-null   float64       
 8   uvIndex             4417 non-null   float64       
 9   temperatureMin      4456 non-null   float64       
 10  temperatureMax      4456 non-null   float64       
 11  atmoPressure        3233 non-null   float64       
 12  precipType          4456 non-null   object        
 13  precipIntensity     2327 non-null   float64     

---
#### atmoPressure
Barometric pressure is used as an aid in forecasting weather. Falling pressure indicates incoming inclement weather and rising pressure indicates incoming fair weather. With temperature, precipitation, and UV index information available, I chose to drop this column.

In [57]:
df.drop(columns = 'atmoPressure', inplace = True)

---
#### windSpeed, windBearing
I decided to replace missing values in these columns with zeroes.

In [58]:
df['windSpeed'].fillna(value = 0, inplace = True)
df['windBearing'].fillna(value = 0, inplace = True)

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   sunriseTime         4456 non-null   float64       
 2   sunsetTime          4456 non-null   float64       
 3   moonPhase           4456 non-null   float64       
 4   humidity            4285 non-null   float64       
 5   windSpeed           4456 non-null   float64       
 6   windBearing         4456 non-null   float64       
 7   cloudCover          4196 non-null   float64       
 8   uvIndex             4417 non-null   float64       
 9   temperatureMin      4456 non-null   float64       
 10  temperatureMax      4456 non-null   float64       
 11  precipType          4456 non-null   object        
 12  precipIntensity     2327 non-null   float64       
 13  precipAccumulation  4456 non-null   float64     

---
#### humidity
I decided to take the known value before a gap, and the known value after a gap, and fill between with the mean of the two values.

In [60]:
df['humidity'].isna().sum()

171

In [61]:
gap_fill_mean(df, 'humidity')

In [62]:
df['humidity'].isna().sum()

0

In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   sunriseTime         4456 non-null   float64       
 2   sunsetTime          4456 non-null   float64       
 3   moonPhase           4456 non-null   float64       
 4   humidity            4456 non-null   float64       
 5   windSpeed           4456 non-null   float64       
 6   windBearing         4456 non-null   float64       
 7   cloudCover          4196 non-null   float64       
 8   uvIndex             4417 non-null   float64       
 9   temperatureMin      4456 non-null   float64       
 10  temperatureMax      4456 non-null   float64       
 11  precipType          4456 non-null   object        
 12  precipIntensity     2327 non-null   float64       
 13  precipAccumulation  4456 non-null   float64     

---
#### cloudCover
After looking at the records surrounding the missing values, I decided to take the known value before a gap, and the known value after a gap, and fill between with the mean of the two values.

In [64]:
# how many missing values?
df['cloudCover'].isna().sum()

260

In [65]:
gap_fill_mean(df, 'cloudCover')

In [67]:
df['cloudCover'].isna().sum()

0

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   sunriseTime         4456 non-null   float64       
 2   sunsetTime          4456 non-null   float64       
 3   moonPhase           4456 non-null   float64       
 4   humidity            4456 non-null   float64       
 5   windSpeed           4456 non-null   float64       
 6   windBearing         4456 non-null   float64       
 7   cloudCover          4456 non-null   float64       
 8   uvIndex             4417 non-null   float64       
 9   temperatureMin      4456 non-null   float64       
 10  temperatureMax      4456 non-null   float64       
 11  precipType          4456 non-null   object        
 12  precipIntensity     2327 non-null   float64       
 13  precipAccumulation  4456 non-null   float64     

---
#### precipIntensity
This is defined as "the intensity (in inches of liquid water per hour) of precipitation occurring at the given time. This value is conditional on probability (that is, assuming any precipitation occurs at all)," so I replaced missing values with zeroes.

In [68]:
df['precipIntensity'].fillna(value = 0, inplace = True)

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   sunriseTime         4456 non-null   float64       
 2   sunsetTime          4456 non-null   float64       
 3   moonPhase           4456 non-null   float64       
 4   humidity            4456 non-null   float64       
 5   windSpeed           4456 non-null   float64       
 6   windBearing         4456 non-null   float64       
 7   cloudCover          4456 non-null   float64       
 8   uvIndex             4417 non-null   float64       
 9   temperatureMin      4456 non-null   float64       
 10  temperatureMax      4456 non-null   float64       
 11  precipType          4456 non-null   object        
 12  precipIntensity     4456 non-null   float64       
 13  precipAccumulation  4456 non-null   float64     

---
#### uvIndex
This was another case where the number of missing values was small (39), comprising 0.88% of the total values. I decided to replace the missing values with the median.

In [70]:
df['uvIndex'].isna().sum()

39

In [71]:
gap_fill_mean(df, 'uvIndex')

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4456 entries, 0 to 4455
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4456 non-null   datetime64[ns]
 1   sunriseTime         4456 non-null   float64       
 2   sunsetTime          4456 non-null   float64       
 3   moonPhase           4456 non-null   float64       
 4   humidity            4456 non-null   float64       
 5   windSpeed           4456 non-null   float64       
 6   windBearing         4456 non-null   float64       
 7   cloudCover          4456 non-null   float64       
 8   uvIndex             4456 non-null   float64       
 9   temperatureMin      4456 non-null   float64       
 10  temperatureMax      4456 non-null   float64       
 11  precipType          4456 non-null   object        
 12  precipIntensity     4456 non-null   float64       
 13  precipAccumulation  4456 non-null   float64     

### Save cleaned data to file

In [73]:
df.to_csv('../data/cleaned_data.csv', index_label = 'Date')

### Read data from file

In [None]:
# read data
df = pd.read_csv('../data/cleaned_data.csv')

In [None]:
df.head()

In [None]:
df.drop(columns = 'Unnamed: 0', inplace = True)

In [None]:
df.shape

In [None]:
df.info()

## Feature Engineering

In [75]:
# create column for ordinal day of year
df['dayOfYear'] = df['Date'].dt.dayofyear

In [81]:
# # create binary 'winningDate' column
df['winningDate'] = 0
idx = df.loc[df['winningTime'] != 0].index
df['winningDate'].loc[idx] = 1


In [82]:
df['winningDate'].value_counts()

0    4427
1      29
Name: winningDate, dtype: int64

In [83]:
df.loc[df['winningDate'] == 1]

Unnamed: 0,Date,sunriseTime,sunsetTime,moonPhase,humidity,windSpeed,windBearing,cloudCover,uvIndex,temperatureMin,temperatureMax,precipType,precipIntensity,precipAccumulation,winningTime,iceThickness,flowVolume,dayOfYear,winningDate
108,1989-05-01,610032300.0,610093400.0,0.87,0.47,3.72,277.0,0.14,4.0,36.22,61.53,,0.0,0.0,2020-03-25 20:14:00,40.0,62000,121,1
252,1990-04-24,640965000.0,641023300.0,0.0,0.61,0.0,0.0,0.16,4.0,28.66,44.74,,0.0,0.0,2020-03-25 17:19:00,36.0,15000,114,1
400,1991-05-01,673104400.0,673165400.0,0.6,0.49,4.13,139.0,0.1,4.0,31.43,57.62,,0.0,0.0,2020-03-25 00:04:00,38.0,40000,121,1
556,1992-05-14,705847100.0,705913700.0,0.45,0.49,5.83,239.0,0.32,5.0,30.52,51.11,,0.0,0.0,2020-03-25 06:26:00,47.0,18000,135,1
684,1993-04-23,735573200.0,735631100.0,0.07,0.52,7.72,330.0,0.02,4.0,31.28,45.5,,0.0,0.0,2020-03-25 13:01:00,25.0,19000,113,1
839,1994-04-29,767626400.0,767686600.0,0.67,0.31,4.39,22.0,0.13,4.0,26.51,59.54,,0.0,0.0,2020-03-25 23:01:00,51.0,42000,119,1
1007,1996-05-05,831301800.0,831364700.0,0.6,0.44,4.42,64.0,0.38,4.0,32.47,53.42,,0.0,0.0,2020-03-25 12:32:00,45.0,27000,126,1
1153,1997-04-30,862406900.0,862467700.0,0.79,0.48,4.05,252.0,0.565,1.5,25.75,61.35,,0.0,0.0,2020-03-25 10:28:00,36.0,17000,120,1
1291,1998-04-20,893081100.0,893137700.0,0.8,0.6,4.06,287.0,0.35,3.0,25.97,52.56,rain,0.0,0.0,2020-03-25 16:54:00,38.0,15500,110,1
1577,2000-05-01,957187400.0,957248800.0,0.93,0.46,4.97,211.0,0.88,4.0,34.92,55.95,,0.0,0.0,2020-03-25 10:47:00,36.0,20000,122,1


The information for 1995 and 1999 are missing some dates, including the winning date. I decided to drop the data for those years, since there is no target as a result. Survival analysis would also falsely treat those years as censored.

In [84]:
drop_1995_1999 = df.loc[(df['Date'].dt.year == 1995)|(df['Date'].dt.year == 1999)].index

In [85]:
df.drop(index = drop_1995_1999, inplace = True)

Drop all records for a year that are after the ice broke.

In [87]:
year_list = sorted(list(set(df['Date'].dt.year)))

[1989,
 1990,
 1991,
 1992,
 1993,
 1994,
 1996,
 1997,
 1998,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019]

In [88]:
print(year_list)

[1989, 1990, 1991, 1992, 1993, 1994, 1996, 1997, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]


In [89]:
# get winning date indices
idx_w = df.loc[df['winningDate'] == 1].index
idx_w

Int64Index([ 108,  252,  400,  556,  684,  839, 1007, 1153, 1291, 1577, 1735,
            1875, 2003, 2150, 2305, 2460, 2606, 2767, 2913, 3062, 3218, 3359,
            3537, 3663, 3813, 3964, 4123, 4274, 4408],
           dtype='int64')

In [90]:
# make a list of indices to drop
drop_index = []
for i, year in enumerate(year_list):
    idx_y = df.loc[df['Date'].dt.year == year].index
    for idx in idx_y:
        if idx > idx_w[i]:
            drop_index.append(idx)
        else:
            pass

In [91]:
# drop observations that occurred after the winning date in a year
for idx in drop_index:
    df.drop(index = idx, inplace = True)

In [92]:
df.shape

(3336, 19)

In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3336 entries, 0 to 4408
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                3336 non-null   datetime64[ns]
 1   sunriseTime         3336 non-null   float64       
 2   sunsetTime          3336 non-null   float64       
 3   moonPhase           3336 non-null   float64       
 4   humidity            3336 non-null   float64       
 5   windSpeed           3336 non-null   float64       
 6   windBearing         3336 non-null   float64       
 7   cloudCover          3336 non-null   float64       
 8   uvIndex             3336 non-null   float64       
 9   temperatureMin      3336 non-null   float64       
 10  temperatureMax      3336 non-null   float64       
 11  precipType          3336 non-null   object        
 12  precipIntensity     3336 non-null   float64       
 13  precipAccumulation  3336 non-null   float64     

### Calculate the number of hours of daylight
Daylight hours fluctuate a lot in Alaska. Articles I had read about the NIC mention that things like how much snow covers the ice, and therefore how much direct sunlight the ice receives, affect the rate the river ice melts.

I proxied snow depth with precipitation measurements. I theorized that the number of sunlight hours in a day may be a proxy for how much sunlight the river ice receives. I also had information regarding cloud cover which may help in that regard.

I calculated the number of daylight hours as a percentage of the day. That way its value is already scaled for use in modeling later.

In [94]:
def calc_daylight(sunrise, sunset):
    daylight_hours = (sunset - sunrise) / 60 / 60 /24
    return daylight_hours

In [95]:
df['daylightHours'] = calc_daylight(df['sunriseTime'], df['sunsetTime'])

In [96]:
df.head()

Unnamed: 0,Date,sunriseTime,sunsetTime,moonPhase,humidity,windSpeed,windBearing,cloudCover,uvIndex,temperatureMin,temperatureMax,precipType,precipIntensity,precipAccumulation,winningTime,iceThickness,flowVolume,dayOfYear,winningDate,daylightHours
0,1989-01-01,599687760.0,599702820.0,0.81,0.83,4.67,148.0,0.18,0.0,3.68,25.8,,0.0,0.0,0,42.0,7600,1,0,0.174306
1,1989-01-02,599774040.0,599789340.0,0.84,0.8,4.3,131.0,0.16,0.0,-8.27,24.93,,0.0,0.0,0,42.0,7600,2,0,0.177083
2,1989-01-03,599860380.0,599875920.0,0.88,0.69,2.26,104.0,0.22,0.0,-10.28,4.69,,0.0,0.0,0,42.0,7600,3,0,0.179861
3,1989-01-04,599946720.0,599962440.0,0.91,0.6,7.51,69.0,0.17,0.0,-16.58,7.67,,0.0,0.0,0,42.0,7600,4,0,0.181944
4,1989-01-05,600033000.0,600048960.0,0.94,0.71,7.3,78.0,0.13,0.0,2.57,14.7,,0.0,0.0,0,42.0,7600,5,0,0.184722


In [97]:
df['daylightHours'].value_counts()

0.184722    13
0.188194    13
0.179167    12
0.174306    12
0.177778    11
            ..
0.443750     1
0.273611     1
0.726389     1
0.302778     1
0.364583     1
Name: daylightHours, Length: 748, dtype: int64

In [98]:
df.drop(columns = ['sunriseTime', 'sunsetTime'], inplace = True)

In [99]:
df['past'] = (df['Date'] < '2015-01-01').astype(np.int)

In [100]:
df['future'] = 1 - df['past']

In [101]:
df.head(3).append(df.tail(3))

Unnamed: 0,Date,moonPhase,humidity,windSpeed,windBearing,cloudCover,uvIndex,temperatureMin,temperatureMax,precipType,precipIntensity,precipAccumulation,winningTime,iceThickness,flowVolume,dayOfYear,winningDate,daylightHours,past,future
0,1989-01-01,0.81,0.83,4.67,148.0,0.18,0.0,3.68,25.8,,0.0,0.0,0,42.0,7600,1,0,0.174306,1,0
1,1989-01-02,0.84,0.8,4.3,131.0,0.16,0.0,-8.27,24.93,,0.0,0.0,0,42.0,7600,2,0,0.177083,1,0
2,1989-01-03,0.88,0.69,2.26,104.0,0.22,0.0,-10.28,4.69,,0.0,0.0,0,42.0,7600,3,0,0.179861,1,0
4406,2019-04-12,0.27,0.6,2.5,322.0,0.84,2.0,33.09,46.88,,0.0,0.0,0,25.3,11400,102,0,0.616667,0,1
4407,2019-04-13,0.3,0.7,2.14,16.0,0.9,2.0,33.09,49.34,rain,0.0007,0.0,0,25.3,12100,103,0,0.622222,0,1
4408,2019-04-14,0.34,0.67,4.9,338.0,0.54,2.0,32.54,47.91,rain,0.0003,0.0,2020-03-25 00:21:00,25.3,12700,104,1,0.626389,0,1


In [102]:
df['precipType'].value_counts()

None    2657
snow     627
rain      52
Name: precipType, dtype: int64

In [103]:
# encode precipType
df = df.merge(pd.get_dummies(data = df['precipType'], prefix = 'precip', drop_first = True, sparse = True),
              how = 'left', left_index = True, right_index = True)

In [104]:
# drop precipType after encoding
df.drop(columns = 'precipType', inplace = True)
gc.collect()

193

In [105]:
df.head()

Unnamed: 0,Date,moonPhase,humidity,windSpeed,windBearing,cloudCover,uvIndex,temperatureMin,temperatureMax,precipIntensity,...,winningTime,iceThickness,flowVolume,dayOfYear,winningDate,daylightHours,past,future,precip_rain,precip_snow
0,1989-01-01,0.81,0.83,4.67,148.0,0.18,0.0,3.68,25.8,0.0,...,0,42.0,7600,1,0,0.174306,1,0,0,0
1,1989-01-02,0.84,0.8,4.3,131.0,0.16,0.0,-8.27,24.93,0.0,...,0,42.0,7600,2,0,0.177083,1,0,0,0
2,1989-01-03,0.88,0.69,2.26,104.0,0.22,0.0,-10.28,4.69,0.0,...,0,42.0,7600,3,0,0.179861,1,0,0,0
3,1989-01-04,0.91,0.6,7.51,69.0,0.17,0.0,-16.58,7.67,0.0,...,0,42.0,7600,4,0,0.181944,1,0,0,0
4,1989-01-05,0.94,0.71,7.3,78.0,0.13,0.0,2.57,14.7,0.0,...,0,42.0,7600,5,0,0.184722,1,0,0,0


Create column for daily average temperature

In [108]:
df['temperatureAvg'] = (df['temperatureMin'] + df['temperatureMax']) / 2

Create columns for number of "hot days," "cold days," and snow accumulated since Apr 1 in a given year.

I defined a "hot day" as a day where: day_average_temp > median(year_avg_temp) + std_dev(year_avg_temp)

A "cold day" is a day where: day_average_temp < median(year_avg_temp) - std_dev(year_avg_temp)

In [110]:
hot_count = []
cold_count = []
daily_accumulation = []
for year in year_list:
    hot_temp_count = 0
    cold_temp_count = 0
    daily_accum = 0
    temp_df = df.loc[df['Date'].dt.year == year]
    hot_threshold = temp_df['temperatureAvg'].median() + temp_df['temperatureAvg'].std()
    cold_threshold = temp_df['temperatureAvg'].median() - temp_df['temperatureAvg'].std()
    for idx in temp_df.index:
        current_temp = temp_df['temperatureAvg'].loc[idx]
        if temp_df['precip_snow'].loc[idx] == 1:
            daily_accum += temp_df['precipAccumulation'].loc[idx]
        else:
            pass
        if current_temp >= hot_threshold:
            hot_temp_count += 1
        elif current_temp <= cold_threshold:
            cold_temp_count += 1
        else:
            pass
        hot_count.append(hot_temp_count)
        cold_count.append(cold_temp_count)
        daily_accumulation.append(daily_accum)

In [111]:
df['numHotDays'] = hot_count
df['numColdDays'] = cold_count
df['accumulatedSnow'] = daily_accumulation

#### Save data before adding rolling average features.

In [112]:
df.to_csv('../data/pre-moving-average_data.csv', index = False)

Create columns for moving average features.

In [113]:
ma_cols = ['humidity', 'windSpeed', 'windBearing', 'cloudCover','uvIndex', 'precipIntensity',
           'iceThickness', 'temperatureAvg', 'numHotDays', 'numColdDays', 'flowVolume']
windows = [3, 5, 7, 10]

In [114]:
# # first add new columns with dummy info
for col in ma_cols:
    for window in windows:
        label_ma = col + '_MA' + str(window)
        df[label_ma] = 0
        
        label_slope = col + '_MA-slope' + str(window)
        df[label_slope] = 0
        
        label_accel = col + '_MA-accel' + str(window)
        df[label_accel] = 0
        
        label_std = col + '_MA-std_dev' + str(window)
        df[label_std] = 0

In [115]:
df.shape

(3336, 185)

In [116]:
# Update each year with its rolling averages
# this takes ~2.5 hours to run on my laptop
for year in year_list:
    temp_df = df.loc[df['Date'].dt.year == year]
    for col in ma_cols:
        for window in windows:
            # assign labels
            label_ma = col + '_MA_' + str(window)
            label_slope = col + '_MA-slope_' + str(window)
            label_accel = col + '_MA-accel_' + str(window)
            label_std = col + '_MA-std_dev_' + str(window)
            # for each year, update row values in new columns
            for idx in temp_df.index:
                df[label_ma].loc[idx] = temp_df[col].rolling(window).mean().loc[idx]
                df[label_slope].loc[idx] = temp_df[col].rolling(window).apply(lambda x: slope(x)).loc[idx]
                df[label_accel].loc[idx] = temp_df[col].rolling(window).apply(lambda x: accel(x)).loc[idx]
                df[label_std].loc[idx] = temp_df[col].rolling(window).std().loc[idx]

In [117]:
df.sample(7)

Unnamed: 0,Date,moonPhase,humidity,windSpeed,windBearing,cloudCover,uvIndex,temperatureMin,temperatureMax,precipIntensity,...,numColdDays_MA-accel5,numColdDays_MA-std_dev5,numColdDays_MA7,numColdDays_MA-slope7,numColdDays_MA-accel7,numColdDays_MA-std_dev7,numColdDays_MA10,numColdDays_MA-slope10,numColdDays_MA-accel10,numColdDays_MA-std_dev10
222,1990-03-25,0.98,0.66,0.0,0.0,0.12,2.0,-0.42,35.61,0.0,...,9.408312e-15,0.0,21.0,-7.448512e-16,-4.315977e-15,4.173877e-07,21.0,8.417784e-16,2.604707e-15,0.0
1698,2001-04-01,0.28,0.51,2.95,273.0,0.14,2.0,-12.78,18.29,0.0,...,0.7142857,0.4472136,23.142857,0.1071429,0.297619,0.3779645,23.1,0.05454545,0.1136364,0.3162278
3595,2014-02-16,0.57,0.67,1.36,315.0,0.53,0.0,-15.1,2.94,0.0,...,-0.3571429,1.140175,10.714286,0.8214286,-0.297619,1.799471,9.3,0.9030303,-0.1515152,2.750757
3583,2014-02-04,0.2,0.49,10.71,65.0,0.0,0.0,10.22,23.68,0.0,...,9.800325e-16,0.0,5.0,-3.724256e-16,-1.54142e-16,0.0,5.0,2.630557e-16,-3.255883e-16,6.083374e-09
3630,2014-03-23,0.75,0.29,9.42,63.0,0.0,2.0,20.94,42.15,0.0,...,9.408312e-15,0.0,21.0,-7.448512e-16,-4.315977e-15,0.0,21.0,8.417784e-16,2.604707e-15,0.0
2088,2004-02-22,0.1,0.72,3.02,301.0,0.13,1.0,-2.65,16.65,0.0,...,5.880195e-16,0.0,8.0,-1.862128e-16,-4.624261e-16,5.458112e-08,8.0,6.313338e-16,4.143851e-16,0.0
1224,1998-02-12,0.56,0.81,2.33,272.0,0.19,0.0,-7.55,9.12,0.0,...,1.176039e-15,2.005438e-07,16.0,-3.724256e-16,-9.248523e-16,1.960414e-07,16.0,1.262668e-15,8.287703e-16,9.612244e-08


In [118]:
df.shape

(3336, 185)

#### Save results to file

In [119]:
df.to_csv('../data/ma_features_added.csv', index = False)

#### Read data from file

In [None]:
df = pd.read_csv('../data/ma_features_added.csv')

In [None]:
df.head()

### Drop columns that are highly correlated
* temperatureMin and temperatureMax information was captured in temperatureAvg
* precipAccumulation information was captured in accumulatedSnow

In [120]:
df.drop(columns = ['temperatureMin', 'temperatureMax', 'precipAccumulation'],
        inplace = True)

In [121]:
gc.collect()

21

### Create training and testing DataFrames

In [122]:
train = df.loc[df['past'] == 1]
train.drop(columns = ['past', 'future'], inplace = True)

In [123]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2762 entries, 0 to 3663
Columns: 180 entries, Date to numColdDays_MA-std_dev10
dtypes: Sparse[int64, 0](2), datetime64[ns](1), float64(171), int64(5), object(1)
memory usage: 3.8+ MB


In [124]:
test = df.loc[df['future'] == 1]
test.drop(columns = ['past', 'future'], inplace = True)

In [125]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 574 entries, 3700 to 4408
Columns: 180 entries, Date to numColdDays_MA-std_dev10
dtypes: Sparse[int64, 0](2), datetime64[ns](1), float64(171), int64(5), object(1)
memory usage: 804.6+ KB


In [126]:
train.dropna(inplace = True)

In [127]:
test.dropna(inplace = True)

In [128]:
df.drop(columns = ['past', 'future'], inplace = True)
gc.collect()

0

#### Save training and testing DataFrames to file

In [129]:
train.to_csv('../data/model_training_data.csv', index = False)

In [130]:
test.to_csv('../data/model_testing_data.csv', index = False)