# Nenana Ice Classic Data Processing
This notebook was used to process the data gathered for this project.

In [1]:
# imports

# data
import numpy as np
import pandas as pd

# date processing
import datetime

# filter warnings
import warnings
warnings.filterwarnings("ignore")

# garbage collection
import gc

## Reading data from files

In [2]:
ice_df = pd.read_csv('../data/raw_ice_thickness_1989-2019.csv')
ice_df.head()

Unnamed: 0,Date,Date.1,Thickness
0,0,2019-01-16,16.0
1,1,2019-02-07,16.0
2,2,2019-02-26,23.5
3,3,2019-03-04,32.5
4,4,2019-03-13,25.7


In [3]:
weather_df = pd.read_csv('../data/raw_weather_1989-2020.csv')
weather_df.head()

Unnamed: 0,Date,time,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,windGust,windGustTime,icon,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,precipAccumulation,summary,ozone
0,1989-03-01,604746000.0,604775100.0,604811700.0,0.8,29.65,604814400.0,6.68,604861200.0,20.64,...,,,,,,,,,,
1,1989-03-02,604832400.0,604861260.0,604898280.0,0.83,11.69,604857600.0,-15.29,604925640.0,-0.54,...,,,,,,,,,,
2,1989-03-03,604918800.0,604947480.0,604984920.0,0.87,1.84,604970400.0,-26.23,605034000.0,-5.72,...,,,,,,,,,,
3,1989-03-04,605005200.0,605033640.0,605071500.0,0.9,4.69,605062800.0,-38.53,605118840.0,4.47,...,,,,,,,,,,
4,1989-03-05,605091600.0,605119860.0,605158080.0,0.94,0.74,605149320.0,-27.43,605189280.0,0.23,...,,,,,,,,,,


In [4]:
winners_df = pd.read_csv('../data/ice_classic_winning_times.csv')
winners_df.head()

Unnamed: 0,Date,Time
0,1917-04-30,11:30 AM
1,1918-05-11,9:33 AM
2,1919-05-03,2:33 PM
3,1920-05-11,10:46 AM
4,1921-05-11,6:42 AM


## Merge the DataFrames into one

In [5]:
# left join on the 'Date' string: every weather row is kept, and winners_df's
# 'Time' column is attached only on dates that also appear in winners_df
# (NaN on all other rows, as the preview below shows)
merged_df = weather_df.merge(winners_df, how = 'left', on = 'Date')
merged_df.head()

Unnamed: 0,Date,time,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,windGustTime,icon,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,precipAccumulation,summary,ozone,Time
0,1989-03-01,604746000.0,604775100.0,604811700.0,0.8,29.65,604814400.0,6.68,604861200.0,20.64,...,,,,,,,,,,
1,1989-03-02,604832400.0,604861260.0,604898280.0,0.83,11.69,604857600.0,-15.29,604925640.0,-0.54,...,,,,,,,,,,
2,1989-03-03,604918800.0,604947480.0,604984920.0,0.87,1.84,604970400.0,-26.23,605034000.0,-5.72,...,,,,,,,,,,
3,1989-03-04,605005200.0,605033640.0,605071500.0,0.9,4.69,605062800.0,-38.53,605118840.0,4.47,...,,,,,,,,,,
4,1989-03-05,605091600.0,605119860.0,605158080.0,0.94,0.74,605149320.0,-27.43,605189280.0,0.23,...,,,,,,,,,,


### Rename a couple of columns to avoid potential confusion
The column 'time' contains the time that weather measurements were taken; 'Time' contains the winning time for a given year. This could be confusing, so I renamed 'time' to 'readingTime' and 'Time' to 'winningTime.'

In [6]:
merged_df.rename(columns = {'time' : 'readingTime', 'Time' : 'winningTime'}, inplace = True)

In [7]:
merged_df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,windGustTime,icon,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,precipAccumulation,summary,ozone,winningTime
0,1989-03-01,604746000.0,604775100.0,604811700.0,0.8,29.65,604814400.0,6.68,604861200.0,20.64,...,,,,,,,,,,
1,1989-03-02,604832400.0,604861260.0,604898280.0,0.83,11.69,604857600.0,-15.29,604925640.0,-0.54,...,,,,,,,,,,
2,1989-03-03,604918800.0,604947480.0,604984920.0,0.87,1.84,604970400.0,-26.23,605034000.0,-5.72,...,,,,,,,,,,
3,1989-03-04,605005200.0,605033640.0,605071500.0,0.9,4.69,605062800.0,-38.53,605118840.0,4.47,...,,,,,,,,,,
4,1989-03-05,605091600.0,605119860.0,605158080.0,0.94,0.74,605149320.0,-27.43,605189280.0,0.23,...,,,,,,,,,,


### Calculate the number of hours of daylight
Daylight hours fluctuate a lot in Alaska. Articles I had read about the NIC mention that things like how much snow covers the ice, and therefore how much direct sunlight the ice receives, affect the rate at which the river ice melts.

I may be able to proxy snow depth with precipitation measurements. I theorize that the number of sunlight hours in a day may be a proxy for how much sunlight the river ice receives. I also have information regarding cloud cover which may help in that regard.

I calculated daylight as a fraction of the 24-hour day (a value between 0 and 1) rather than as a count of hours. That way its value is already scaled for use in modeling later.

In [8]:
def calc_daylight(sunrise, sunset):
    """Return the share of a 24-hour day that lies between sunrise and sunset.

    Parameters
    ----------
    sunrise, sunset : number or array-like (e.g. a pandas Series)
        Timestamps expressed in seconds; only their difference is used.

    Returns
    -------
    ``(sunset - sunrise)`` divided by 60, 60 and 24 in turn, i.e. the elapsed
    time as a fraction of a day (0.5 means twelve hours of daylight).
    """
    seconds_of_light = sunset - sunrise
    minutes_of_light = seconds_of_light / 60
    hours_of_light = minutes_of_light / 60
    # despite the function's name, the result is in days, not hours
    return hours_of_light / 24

In [9]:
merged_df['daylightHours'] = calc_daylight(merged_df['sunriseTime'], merged_df['sunsetTime'])

In [10]:
merged_df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,icon,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,precipAccumulation,summary,ozone,winningTime,daylightHours
0,1989-03-01,604746000.0,604775100.0,604811700.0,0.8,29.65,604814400.0,6.68,604861200.0,20.64,...,,,,,,,,,,0.423611
1,1989-03-02,604832400.0,604861260.0,604898280.0,0.83,11.69,604857600.0,-15.29,604925640.0,-0.54,...,,,,,,,,,,0.428472
2,1989-03-03,604918800.0,604947480.0,604984920.0,0.87,1.84,604970400.0,-26.23,605034000.0,-5.72,...,,,,,,,,,,0.433333
3,1989-03-04,605005200.0,605033640.0,605071500.0,0.9,4.69,605062800.0,-38.53,605118840.0,4.47,...,,,,,,,,,,0.438194
4,1989-03-05,605091600.0,605119860.0,605158080.0,0.94,0.74,605149320.0,-27.43,605189280.0,0.23,...,,,,,,,,,,0.442361


In [11]:
merged_df['daylightHours'].value_counts()

0.842361    10
0.840972    10
0.845139    10
0.472917    10
0.829167    10
            ..
0.800000     1
0.757639     1
0.652778     1
0.804167     1
0.806944     1
Name: daylightHours, Length: 580, dtype: int64

In [12]:
ice_df.head()

Unnamed: 0,Date,Date.1,Thickness
0,0,2019-01-16,16.0
1,1,2019-02-07,16.0
2,2,2019-02-26,23.5
3,3,2019-03-04,32.5
4,4,2019-03-13,25.7


In [13]:
# the column read in as 'Date' holds row numbers (0, 1, 2, ...) rather than dates
# (see the preview above); the real dates are in 'Date.1', so drop this one
ice_df.drop(columns = 'Date', inplace = True)

In [14]:
# give the real date column the name 'Date' so it can serve as the merge key
ice_df.rename(columns = {'Date.1' : 'Date'}, inplace = True)

In [15]:
ice_df.head()

Unnamed: 0,Date,Thickness
0,2019-01-16,16.0
1,2019-02-07,16.0
2,2019-02-26,23.5
3,2019-03-04,32.5
4,2019-03-13,25.7


In [16]:
# left join: every row of merged_df is kept; 'Thickness' is NaN on dates that have
# no matching row in ice_df, and ice_df rows whose date is absent from merged_df are not carried over
df = merged_df.merge(ice_df, how = 'left', on = 'Date')

In [17]:
df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,precipAccumulation,summary,ozone,winningTime,daylightHours,Thickness
0,1989-03-01,604746000.0,604775100.0,604811700.0,0.8,29.65,604814400.0,6.68,604861200.0,20.64,...,,,,,,,,,0.423611,
1,1989-03-02,604832400.0,604861260.0,604898280.0,0.83,11.69,604857600.0,-15.29,604925640.0,-0.54,...,,,,,,,,,0.428472,
2,1989-03-03,604918800.0,604947480.0,604984920.0,0.87,1.84,604970400.0,-26.23,605034000.0,-5.72,...,,,,,,,,,0.433333,
3,1989-03-04,605005200.0,605033640.0,605071500.0,0.9,4.69,605062800.0,-38.53,605118840.0,4.47,...,,,,,,,,,0.438194,
4,1989-03-05,605091600.0,605119860.0,605158080.0,0.94,0.74,605149320.0,-27.43,605189280.0,0.23,...,,,,,,,,,0.442361,


### Rename other columns
There are a couple of other columns that I decided to rename to make them a little more descriptive of the values they contain.

In [18]:
df.columns

Index(['Date', 'readingTime', 'sunriseTime', 'sunsetTime', 'moonPhase',
       'temperatureHigh', 'temperatureHighTime', 'temperatureLow',
       'temperatureLowTime', 'apparentTemperatureHigh',
       'apparentTemperatureHighTime', 'apparentTemperatureLow',
       'apparentTemperatureLowTime', 'dewPoint', 'humidity', 'windSpeed',
       'windBearing', 'cloudCover', 'uvIndex', 'uvIndexTime', 'visibility',
       'temperatureMin', 'temperatureMinTime', 'temperatureMax',
       'temperatureMaxTime', 'apparentTemperatureMin',
       'apparentTemperatureMinTime', 'apparentTemperatureMax',
       'apparentTemperatureMaxTime', 'precipType', 'pressure', 'windGust',
       'windGustTime', 'icon', 'precipIntensity', 'precipIntensityMax',
       'precipIntensityMaxTime', 'precipProbability', 'precipAccumulation',
       'summary', 'ozone', 'winningTime', 'daylightHours', 'Thickness'],
      dtype='object')

In [19]:
df.rename(columns = {'summary':'weatherSummary', 'Thickness':'iceThickness', 'pressure':'atmoPressure'}, inplace = True)

In [20]:
df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,precipAccumulation,weatherSummary,ozone,winningTime,daylightHours,iceThickness
0,1989-03-01,604746000.0,604775100.0,604811700.0,0.8,29.65,604814400.0,6.68,604861200.0,20.64,...,,,,,,,,,0.423611,
1,1989-03-02,604832400.0,604861260.0,604898280.0,0.83,11.69,604857600.0,-15.29,604925640.0,-0.54,...,,,,,,,,,0.428472,
2,1989-03-03,604918800.0,604947480.0,604984920.0,0.87,1.84,604970400.0,-26.23,605034000.0,-5.72,...,,,,,,,,,0.433333,
3,1989-03-04,605005200.0,605033640.0,605071500.0,0.9,4.69,605062800.0,-38.53,605118840.0,4.47,...,,,,,,,,,0.438194,
4,1989-03-05,605091600.0,605119860.0,605158080.0,0.94,0.74,605149320.0,-27.43,605189280.0,0.23,...,,,,,,,,,0.442361,


In [21]:
# removed unneeded variables
del merged_df, ice_df, winners_df, weather_df
gc.collect()

0

### Dealing With Missing Values

---
#### Forward fill ice thickness values until next observed value
Since ice thickness is only measured periodically, I chose to make the naive assumption that it remains constant until the next measurement.

In [22]:
# Forward-fill: carry each measurement forward until the next observed value.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['iceThickness']: that chained in-place form
# leaves df unchanged under pandas Copy-on-Write, and fillna(method=...) is deprecated.
# NOTE(review): the fill runs over the whole frame, so it can carry one season's
# last reading into the start of the next season - confirm that is intended.
df['iceThickness'] = df['iceThickness'].ffill()

In [23]:
df.head()

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,precipAccumulation,weatherSummary,ozone,winningTime,daylightHours,iceThickness
0,1989-03-01,604746000.0,604775100.0,604811700.0,0.8,29.65,604814400.0,6.68,604861200.0,20.64,...,,,,,,,,,0.423611,
1,1989-03-02,604832400.0,604861260.0,604898280.0,0.83,11.69,604857600.0,-15.29,604925640.0,-0.54,...,,,,,,,,,0.428472,
2,1989-03-03,604918800.0,604947480.0,604984920.0,0.87,1.84,604970400.0,-26.23,605034000.0,-5.72,...,,,,,,,,,0.433333,
3,1989-03-04,605005200.0,605033640.0,605071500.0,0.9,4.69,605062800.0,-38.53,605118840.0,4.47,...,,,,,,,,,0.438194,
4,1989-03-05,605091600.0,605119860.0,605158080.0,0.94,0.74,605149320.0,-27.43,605189280.0,0.23,...,,,,,,,,,0.442361,


In [24]:
df['iceThickness'].isna().sum()

15

In [25]:
# show only the rows that already have an ice thickness value
df.loc[df['iceThickness'].notna()]

Unnamed: 0,Date,readingTime,sunriseTime,sunsetTime,moonPhase,temperatureHigh,temperatureHighTime,temperatureLow,temperatureLowTime,apparentTemperatureHigh,...,precipIntensity,precipIntensityMax,precipIntensityMaxTime,precipProbability,precipAccumulation,weatherSummary,ozone,winningTime,daylightHours,iceThickness
15,1989-03-16,6.060420e+08,6.060679e+08,6.061105e+08,0.34,24.80,6.060996e+08,-10.25,6.061500e+08,18.16,...,,,,,,,,,0.493056,37.5
16,1989-03-17,6.061284e+08,6.061540e+08,6.061970e+08,0.37,20.65,6.061788e+08,1.70,6.062436e+08,18.19,...,,,,,,,,,0.497917,37.5
17,1989-03-18,6.062148e+08,6.062402e+08,6.062836e+08,0.40,11.75,6.062791e+08,-5.48,6.063184e+08,2.00,...,,,,,,,,,0.502083,37.5
18,1989-03-19,6.063012e+08,6.063264e+08,6.063702e+08,0.43,14.80,6.063557e+08,-0.26,6.064056e+08,5.22,...,,,,,,,,,0.506944,37.5
19,1989-03-20,6.063876e+08,6.064126e+08,6.064568e+08,0.46,11.69,6.064416e+08,-14.70,6.064978e+08,1.71,...,,,,,,,,,0.511806,37.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3060,2019-05-27,1.558944e+09,1.558958e+09,1.559030e+09,0.80,68.04,1.558998e+09,49.06,1.559052e+09,67.54,...,0.0003,0.0016,1.559013e+09,0.12,,Partly cloudy throughout the day.,359.8,,0.827778,25.3
3061,2019-05-28,1.559030e+09,1.559045e+09,1.559116e+09,0.83,67.04,1.559093e+09,46.89,1.559132e+09,66.54,...,0.0008,0.0107,1.559056e+09,0.73,,Drizzle in the morning.,359.0,,0.832639,25.3
3062,2019-05-29,1.559117e+09,1.559131e+09,1.559203e+09,0.86,70.52,1.559172e+09,52.30,1.559221e+09,70.02,...,0.0001,0.0002,1.559120e+09,0.07,,Partly cloudy throughout the day.,347.8,,0.836806,25.3
3063,2019-05-30,1.559203e+09,1.559217e+09,1.559290e+09,0.89,75.52,1.559267e+09,57.20,1.559312e+09,75.02,...,0.0001,0.0001,1.559265e+09,0.09,,Partly cloudy throughout the day.,349.5,,0.840972,25.3


The first ~2 weeks' values were missed by the first fill operation; I backfilled them to eliminate NaN values in the iceThickness column.

In [26]:
# Back-fill the leading rows that come before the first observed measurement.
# Assigned back to the column rather than filled through a chained inplace=True
# call, which leaves df unchanged under pandas Copy-on-Write.
df['iceThickness'] = df['iceThickness'].bfill()

In [27]:
df['iceThickness'].isna().sum()

0

In [28]:
# frequency of each ice thickness value after filling
# NOTE(review): the output lists keys such as 36 and 35.0 side by side, and df.info()
# reports this column as object - this suggests mixed str/float values; consider
# converting with pd.to_numeric before modeling (confirm against the raw CSV).
df['iceThickness'].value_counts()

36      198
38      149
36.5    142
35.5    113
25      108
       ... 
35.0      4
49        4
55.0      4
24        3
27        2
Name: iceThickness, Length: 144, dtype: int64

---
#### Display DataFrame information
I wanted an idea of what other missing data I needed to deal with.

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3065 entries, 0 to 3064
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         3065 non-null   object 
 1   readingTime                  3065 non-null   float64
 2   sunriseTime                  3065 non-null   float64
 3   sunsetTime                   3065 non-null   float64
 4   moonPhase                    3065 non-null   float64
 5   temperatureHigh              3045 non-null   float64
 6   temperatureHighTime          3045 non-null   float64
 7   temperatureLow               2988 non-null   float64
 8   temperatureLowTime           2988 non-null   float64
 9   apparentTemperatureHigh      3045 non-null   float64
 10  apparentTemperatureHighTime  3045 non-null   float64
 11  apparentTemperatureLow       2988 non-null   float64
 12  apparentTemperatureLowTime   2988 non-null   float64
 13  dewPoint          

---
#### temperatureMin, temperatureMax
There were 56 missing values in these columns. I decided on the naive approach of setting the missing values to the overnight low and daytime high, respectively. There were few enough missing that I didn't think this approach would overly bias a model.

In [30]:
# Where the 24-hour min/max is missing, fall back to the same row's
# temperatureLow / temperatureHigh. The filled Series is assigned back to df:
# a chained fillna(..., inplace=True) on df[...] does not update df under
# pandas Copy-on-Write.
df['temperatureMin'] = df['temperatureMin'].fillna(df['temperatureLow'])
df['temperatureMax'] = df['temperatureMax'].fillna(df['temperatureHigh'])

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3065 entries, 0 to 3064
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         3065 non-null   object 
 1   readingTime                  3065 non-null   float64
 2   sunriseTime                  3065 non-null   float64
 3   sunsetTime                   3065 non-null   float64
 4   moonPhase                    3065 non-null   float64
 5   temperatureHigh              3045 non-null   float64
 6   temperatureHighTime          3045 non-null   float64
 7   temperatureLow               2988 non-null   float64
 8   temperatureLowTime           2988 non-null   float64
 9   apparentTemperatureHigh      3045 non-null   float64
 10  apparentTemperatureHighTime  3045 non-null   float64
 11  apparentTemperatureLow       2988 non-null   float64
 12  apparentTemperatureLowTime   2988 non-null   float64
 13  dewPoint          

There were still missing values after that treatment. I decided to replace the remaining missing values with the median value for the year the missing value occurred in.

In [32]:
# convert Date from string to datetime object
df['Date'] = pd.to_datetime(df['Date'])

In [33]:
# years that still contain at least one missing temperatureMin
min_missing_years = df.loc[df['temperatureMin'].isna(), 'Date'].dt.year.unique().tolist()

In [34]:
min_missing_years

[1989, 1991, 1995, 1996, 1999, 2002, 2003]

In [35]:
# what is the median temperatureMin for each year?
min_median = df.groupby(df['Date'].dt.year)['temperatureMin'].median()

In [36]:
# show median temperatureMin for each year
min_median.values

array([ 21.64 ,  24.56 ,  26.7  ,  18.75 ,  23.99 ,  22.57 , -15.115,
        32.41 ,  17.525,  27.83 ,   8.035,  20.55 ,  20.675,  20.76 ,
        27.74 ,  26.96 ,  17.035,  17.8  ,  27.695,  23.07 ,  21.45 ,
        27.48 ,  19.05 ,  24.98 ,  11.475,  21.56 ,  28.26 ,  26.085,
        23.82 ,  20.445,  27.53 ])

In [37]:
# validate that the index is each year
min_median.index

Int64Index([1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
            2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
            2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019],
           dtype='int64', name='Date')

In [38]:
# map each year that has missing temperatureMin values to that year's median;
# iterating (year, median) pairs avoids positional index lookups and the no-op else branch
missing_year_medians = {
    year: year_median
    for year, year_median in min_median.items()
    if year in min_missing_years
}
missing_year_medians

{1989: 21.64,
 1991: 26.7,
 1995: -15.115,
 1996: 32.41,
 1999: 8.035,
 2002: 20.76,
 2003: 27.74}

In [39]:
# set the missing values to the median for the year
# (one .loc[rows, column] assignment so the write lands on df itself; the chained
# df['temperatureMin'].loc[...] = ... form writes to an intermediate object and
# does not update df under pandas Copy-on-Write)
for year, year_median in missing_year_medians.items():
    needs_fill = (df['Date'].dt.year == year) & df['temperatureMin'].isna()
    df.loc[needs_fill, 'temperatureMin'] = year_median

In [40]:
# years that still contain at least one missing temperatureMax
max_missing_years = df.loc[df['temperatureMax'].isna(), 'Date'].dt.year.unique().tolist()

In [41]:
max_missing_years

[1993, 1994, 1999, 2003]

In [42]:
# what is the median temperatureMax for each year?
max_median = df.groupby(df['Date'].dt.year)['temperatureMax'].median()

In [43]:
# show median temperatureMax for each year
max_median.values

array([40.56 , 45.54 , 46.595, 38.54 , 47.51 , 43.83 , 22.125, 53.42 ,
       40.77 , 48.75 , 34.33 , 40.3  , 38.08 , 36.4  , 49.1  , 47.02 ,
       39.915, 36.19 , 46.285, 39.81 , 40.645, 45.58 , 40.64 , 45.55 ,
       32.08 , 40.73 , 48.8  , 44.125, 44.695, 40.34 , 46.69 ])

In [44]:
# validate that the index is each year
max_median.index

Int64Index([1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
            2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
            2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019],
           dtype='int64', name='Date')

In [45]:
# map each year that has missing temperatureMax values to that year's median;
# iterating (year, median) pairs avoids positional index lookups and the no-op else branch
missing_year_medians = {
    year: year_median
    for year, year_median in max_median.items()
    if year in max_missing_years
}
missing_year_medians

{1993: 47.51, 1994: 43.83, 1999: 34.33, 2003: 49.1}

In [46]:
# set the missing values to the median for the year
# (one .loc[rows, column] assignment so the write lands on df itself; the chained
# df['temperatureMax'].loc[...] = ... form writes to an intermediate object and
# does not update df under pandas Copy-on-Write)
for year, year_median in missing_year_medians.items():
    needs_fill = (df['Date'].dt.year == year) & df['temperatureMax'].isna()
    df.loc[needs_fill, 'temperatureMax'] = year_median

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3065 entries, 0 to 3064
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Date                         3065 non-null   datetime64[ns]
 1   readingTime                  3065 non-null   float64       
 2   sunriseTime                  3065 non-null   float64       
 3   sunsetTime                   3065 non-null   float64       
 4   moonPhase                    3065 non-null   float64       
 5   temperatureHigh              3045 non-null   float64       
 6   temperatureHighTime          3045 non-null   float64       
 7   temperatureLow               2988 non-null   float64       
 8   temperatureLowTime           2988 non-null   float64       
 9   apparentTemperatureHigh      3045 non-null   float64       
 10  apparentTemperatureHighTime  3045 non-null   float64       
 11  apparentTemperatureLow       2988 non-null 

---
#### (event)Time, apparentTemperature(category), sunriseTime, sunsetTime, weatherSummary, precipProbability, precipIntensityMax, visibility, temperatureHigh, temperatureLow, dewPoint
I didn't think any of these would help with this prediction task.

My reasoning:
* Most "(event)Time" features dropped were because they only captured a single, specific event (the time the daytime high temperature was recorded, for example)
  * I may go back and get hourly data; something like length of time that the temperature was high or low for a day might have an impact. For now, I passed this by due to time constraints.
* "apparentTemperature(category)" features were dropped because they're a representation of what conditions feel like, not what they actually are.
* Sunrise and sunset times were dropped because that information was captured in 'daylightHours'
* "weatherSummary"  was dropped because it's a text summary of information captured by other features.
* "precipProbability" was dropped because this is past data and the presence/amount of precipitation is known.
* "precipIntensityMax" was dropped because the precipitation rate is already captured by "precipIntensity" and I don't think knowing the maximum precipitation rate adds anything.
* "visibility" was dropped because atmospheric conditions are already captured by other features (e.g., "uvIndex," "precipIntensity")
* "temperatureHigh" and "temperatureLow" were dropped because the first is the daytime high and the second is the nighttime low; the 24 hour maximum and minimum are captured by "temperatureMax" and "temperatureMin" respectively
* "dewPoint" was dropped because it's correlated with temperature and humidity

In [48]:
df.drop(
    columns = [
        # timestamps of single events
        'readingTime', 'uvIndexTime',
        'temperatureHighTime', 'temperatureLowTime',
        'temperatureMinTime', 'temperatureMaxTime',
        'apparentTemperatureHighTime', 'apparentTemperatureLowTime',
        'apparentTemperatureMinTime', 'apparentTemperatureMaxTime',
        # "feels like" temperatures
        'apparentTemperatureHigh', 'apparentTemperatureLow',
        'apparentTemperatureMin', 'apparentTemperatureMax',
        # already represented by daylightHours
        'sunriseTime', 'sunsetTime',
        # daytime high / overnight low; temperatureMax / temperatureMin are kept
        'temperatureHigh', 'temperatureLow',
        # judged redundant with other features
        'weatherSummary', 'precipProbability', 'precipIntensityMax',
        'visibility', 'dewPoint',
    ],
    inplace = True,
)

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3065 entries, 0 to 3064
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    3065 non-null   datetime64[ns]
 1   moonPhase               3065 non-null   float64       
 2   humidity                3006 non-null   float64       
 3   windSpeed               2944 non-null   float64       
 4   windBearing             2938 non-null   float64       
 5   cloudCover              2908 non-null   float64       
 6   uvIndex                 3025 non-null   float64       
 7   temperatureMin          3065 non-null   float64       
 8   temperatureMax          3065 non-null   float64       
 9   precipType              512 non-null    object        
 10  atmoPressure            2284 non-null   float64       
 11  windGust                1541 non-null   float64       
 12  windGustTime            1541 non-null   float64 

---
#### precipAccumulation
Since precipAccumulation is defined as "The amount of snowfall accumulation expected to occur (over the hour or day, respectively), in inches. (If no snowfall is expected, this property will not be defined.)," I decided to fill missing values in that column with 0.

In [50]:
df['precipAccumulation'].isna().sum()

2842

In [51]:
# A missing accumulation is treated as no snowfall, so fill with 0.
# Assigned back to the column: a chained fillna(..., inplace=True) on df[...]
# does not update df under pandas Copy-on-Write. The deprecated downcast='infer'
# argument is dropped; the later df.info() output reports this column as float64,
# so the downcast had no effect on this data.
df['precipAccumulation'] = df['precipAccumulation'].fillna(0)

In [52]:
df['precipAccumulation'].isna().sum()

0

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3065 entries, 0 to 3064
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    3065 non-null   datetime64[ns]
 1   moonPhase               3065 non-null   float64       
 2   humidity                3006 non-null   float64       
 3   windSpeed               2944 non-null   float64       
 4   windBearing             2938 non-null   float64       
 5   cloudCover              2908 non-null   float64       
 6   uvIndex                 3025 non-null   float64       
 7   temperatureMin          3065 non-null   float64       
 8   temperatureMax          3065 non-null   float64       
 9   precipType              512 non-null    object        
 10  atmoPressure            2284 non-null   float64       
 11  windGust                1541 non-null   float64       
 12  windGustTime            1541 non-null   float64 

---
#### precipType
I wanted to keep this column since rain or snow could impact the river ice, so I filled NaNs with 'None'.

In [54]:
df['precipType'].value_counts()

snow    301
rain    211
Name: precipType, dtype: int64

In [55]:
# Label rows without a precipitation type with the string 'None'.
# Assigned back to the column: a chained fillna(..., inplace=True) on df[...]
# does not update df under pandas Copy-on-Write.
df['precipType'] = df['precipType'].fillna('None')

---
#### icon
The 'icon' column was missing values in just under half of the rows. It didn't appear to capture any unique information that wasn't already captured in other columns, so I dropped it.

In [56]:
df['icon'].value_counts()

clear-day            786
partly-cloudy-day    361
snow                 205
rain                 134
fog                   94
cloudy                51
wind                   1
Name: icon, dtype: int64

In [57]:
df.drop(columns = 'icon', inplace = True)

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3065 entries, 0 to 3064
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    3065 non-null   datetime64[ns]
 1   moonPhase               3065 non-null   float64       
 2   humidity                3006 non-null   float64       
 3   windSpeed               2944 non-null   float64       
 4   windBearing             2938 non-null   float64       
 5   cloudCover              2908 non-null   float64       
 6   uvIndex                 3025 non-null   float64       
 7   temperatureMin          3065 non-null   float64       
 8   temperatureMax          3065 non-null   float64       
 9   precipType              3065 non-null   object        
 10  atmoPressure            2284 non-null   float64       
 11  windGust                1541 non-null   float64       
 12  windGustTime            1541 non-null   float64 

---
#### winningTime
I filled missing values in 'winningTime' with zeroes, since there will only be an entry in this column for the day that the ice broke.

In [59]:
# Rows that are not a break-up day have no winning time; mark them with 0.
# Assigned back to the column: a chained fillna(..., inplace=True) on df[...]
# does not update df under pandas Copy-on-Write.
# NOTE(review): the column now mixes time strings (e.g. '11:30 AM') with the
# integer 0, so it stays object dtype - confirm downstream code expects that.
df['winningTime'] = df['winningTime'].fillna(0)

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3065 entries, 0 to 3064
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    3065 non-null   datetime64[ns]
 1   moonPhase               3065 non-null   float64       
 2   humidity                3006 non-null   float64       
 3   windSpeed               2944 non-null   float64       
 4   windBearing             2938 non-null   float64       
 5   cloudCover              2908 non-null   float64       
 6   uvIndex                 3025 non-null   float64       
 7   temperatureMin          3065 non-null   float64       
 8   temperatureMax          3065 non-null   float64       
 9   precipType              3065 non-null   object        
 10  atmoPressure            2284 non-null   float64       
 11  windGust                1541 non-null   float64       
 12  windGustTime            1541 non-null   float64 

---
#### ozone, precipIntensityMaxTime, windGust, windGustTime
I decided to drop the 'ozone,' 'precipIntensityMaxTime,' 'windGust,' and 'windGustTime' columns because they were missing so much information, and I had no good strategy for imputing those values.

In [61]:
df.drop(columns = ['ozone', 'precipIntensityMaxTime', 'windGust', 'windGustTime'], inplace = True)

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3065 entries, 0 to 3064
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                3065 non-null   datetime64[ns]
 1   moonPhase           3065 non-null   float64       
 2   humidity            3006 non-null   float64       
 3   windSpeed           2944 non-null   float64       
 4   windBearing         2938 non-null   float64       
 5   cloudCover          2908 non-null   float64       
 6   uvIndex             3025 non-null   float64       
 7   temperatureMin      3065 non-null   float64       
 8   temperatureMax      3065 non-null   float64       
 9   precipType          3065 non-null   object        
 10  atmoPressure        2284 non-null   float64       
 11  precipIntensity     1676 non-null   float64       
 12  precipAccumulation  3065 non-null   float64       
 13  winningTime         3065 non-null   object      

---
#### atmoPressure
Barometric pressure is used as an aid in forecasting weather. Falling pressure indicates incoming inclement weather and rising pressure indicates incoming fair weather. With temperature, precipitation, and UV index information available, I chose to drop this column.

In [63]:
df.drop(columns = 'atmoPressure', inplace = True)

---
#### windSpeed, windBearing
I decided to replace missing values in these columns with zeroes.

In [64]:
# Replace missing wind readings with 0.
# Assigned back to the columns: a chained fillna(..., inplace=True) on df[...]
# does not update df under pandas Copy-on-Write.
# NOTE(review): 0 may also be a legitimate windBearing reading, which would make
# filled rows indistinguishable from real ones - confirm this is acceptable.
df['windSpeed'] = df['windSpeed'].fillna(0)
df['windBearing'] = df['windBearing'].fillna(0)

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3065 entries, 0 to 3064
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                3065 non-null   datetime64[ns]
 1   moonPhase           3065 non-null   float64       
 2   humidity            3006 non-null   float64       
 3   windSpeed           3065 non-null   float64       
 4   windBearing         3065 non-null   float64       
 5   cloudCover          2908 non-null   float64       
 6   uvIndex             3025 non-null   float64       
 7   temperatureMin      3065 non-null   float64       
 8   temperatureMax      3065 non-null   float64       
 9   precipType          3065 non-null   object        
 10  precipIntensity     1676 non-null   float64       
 11  precipAccumulation  3065 non-null   float64       
 12  winningTime         3065 non-null   object        
 13  daylightHours       3065 non-null   float64     

---
#### humidity
There were only 59 values missing. This is a small percentage (under 2%), so I replaced the missing values with the median.

In [66]:
# Replace missing humidity readings with the column-wide median.
# Assigned back to the column: a chained fillna(..., inplace=True) on df[...]
# does not update df under pandas Copy-on-Write.
df['humidity'] = df['humidity'].fillna(df['humidity'].median())

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3065 entries, 0 to 3064
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                3065 non-null   datetime64[ns]
 1   moonPhase           3065 non-null   float64       
 2   humidity            3065 non-null   float64       
 3   windSpeed           3065 non-null   float64       
 4   windBearing         3065 non-null   float64       
 5   cloudCover          2908 non-null   float64       
 6   uvIndex             3025 non-null   float64       
 7   temperatureMin      3065 non-null   float64       
 8   temperatureMax      3065 non-null   float64       
 9   precipType          3065 non-null   object        
 10  precipIntensity     1676 non-null   float64       
 11  precipAccumulation  3065 non-null   float64       
 12  winningTime         3065 non-null   object        
 13  daylightHours       3065 non-null   float64     

---
#### cloudCover
I decided to drop the 'cloudCover' column, because 'uvIndex' seemed like a better proxy for the amount/strength of sunlight on a given day.

In [68]:
# Number of missing 'cloudCover' values (isnull is an alias of isna).
df['cloudCover'].isnull().sum()

157

In [69]:
# Drop the 'cloudCover' column from df (modifies df in place).
df.drop(labels = ['cloudCover'], axis = 'columns', inplace = True)

In [70]:
# Confirm the column list and non-null counts after dropping 'cloudCover'.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3065 entries, 0 to 3064
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                3065 non-null   datetime64[ns]
 1   moonPhase           3065 non-null   float64       
 2   humidity            3065 non-null   float64       
 3   windSpeed           3065 non-null   float64       
 4   windBearing         3065 non-null   float64       
 5   uvIndex             3025 non-null   float64       
 6   temperatureMin      3065 non-null   float64       
 7   temperatureMax      3065 non-null   float64       
 8   precipType          3065 non-null   object        
 9   precipIntensity     1676 non-null   float64       
 10  precipAccumulation  3065 non-null   float64       
 11  winningTime         3065 non-null   object        
 12  daylightHours       3065 non-null   float64       
 13  iceThickness        3065 non-null   object      

---
#### precipIntensity
This is defined as "the intensity (in inches of liquid water per hour) of precipitation occurring at the given time. This value is conditional on probability (that is, assuming any precipitation occurs at all)," so I replaced missing values with zeroes.

In [71]:
# Replace missing precipitation intensity values with 0.
# The filled Series is assigned back to the column because the chained
# df['precipIntensity'].fillna(..., inplace = True) form is deprecated by pandas
# and leaves df unchanged when Copy-on-Write is enabled.
df['precipIntensity'] = df['precipIntensity'].fillna(value = 0)

In [72]:
# Re-check non-null counts after the precipIntensity fill.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3065 entries, 0 to 3064
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                3065 non-null   datetime64[ns]
 1   moonPhase           3065 non-null   float64       
 2   humidity            3065 non-null   float64       
 3   windSpeed           3065 non-null   float64       
 4   windBearing         3065 non-null   float64       
 5   uvIndex             3025 non-null   float64       
 6   temperatureMin      3065 non-null   float64       
 7   temperatureMax      3065 non-null   float64       
 8   precipType          3065 non-null   object        
 9   precipIntensity     3065 non-null   float64       
 10  precipAccumulation  3065 non-null   float64       
 11  winningTime         3065 non-null   object        
 12  daylightHours       3065 non-null   float64       
 13  iceThickness        3065 non-null   object      

---
#### uvIndex
This was another case where the number of missing values was small (40), comprising a little over 1% of the total values. I decided to replace the missing values with the median.

In [73]:
# Impute missing UV index values with the column median.
# The median is computed first (Series.median skips NaN by default), then the
# filled Series is assigned back to the column. The chained
# df['uvIndex'].fillna(..., inplace = True) form is deprecated by pandas and
# leaves df unchanged when Copy-on-Write is enabled.
uv_index_median = df['uvIndex'].median()
df['uvIndex'] = df['uvIndex'].fillna(value = uv_index_median)

In [74]:
# Final check of dtypes and non-null counts before the frame is written to disk.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3065 entries, 0 to 3064
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                3065 non-null   datetime64[ns]
 1   moonPhase           3065 non-null   float64       
 2   humidity            3065 non-null   float64       
 3   windSpeed           3065 non-null   float64       
 4   windBearing         3065 non-null   float64       
 5   uvIndex             3065 non-null   float64       
 6   temperatureMin      3065 non-null   float64       
 7   temperatureMax      3065 non-null   float64       
 8   precipType          3065 non-null   object        
 9   precipIntensity     3065 non-null   float64       
 10  precipAccumulation  3065 non-null   float64       
 11  winningTime         3065 non-null   object        
 12  daylightHours       3065 non-null   float64       
 13  iceThickness        3065 non-null   object      

In [75]:
# Write the cleaned frame to disk. index = True is the to_csv default, spelled
# out here: the row index is saved as an unnamed first column, which shows up
# as 'Unnamed: 0' if the file is later read without index_col = 0.
df.to_csv('../data/cleaned_data.csv', index = True)