# Nenana Ice Classic Data Processing
This notebook was used to process the data gathered for this project.

In [1]:
# imports

# data
import numpy as np
import pandas as pd

# date processing
import datetime

# garbage collection
import gc

## Reading data from files

In [2]:
ice_df = pd.read_csv('../data/ice_thickness_2009-2019.csv')
ice_df.head()

Unnamed: 0,Date,Thickness
0,2019-01-16,16.0
1,2019-02-07,16.0
2,2019-02-26,23.5
3,2019-03-04,32.5
4,2019-03-13,25.7


In [3]:
weather_df = pd.read_csv('../data/weather_2009-2020.csv')
weather_df.head()

Unnamed: 0,Date,time,summary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipProbability,...,temperatureMax,temperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,precipIntensityMaxTime,precipType,precipAccumulation,ozone
0,2009-04-01,1238572800,Partly cloudy throughout the day.,partly-cloudy-day,1238598720,1238647800,0.24,0.0,0.0,0.0,...,22.56,1238623320,-0.27,1238579940,17.69,1238608560,,,,
1,2009-04-02,1238659200,Partly cloudy throughout the day.,partly-cloudy-day,1238684940,1238734380,0.27,0.0,0.0,0.0,...,24.72,1238726100,-19.98,1238684760,20.02,1238735040,,,,
2,2009-04-03,1238745600,Clear throughout the day.,clear-day,1238771100,1238820960,0.31,0.0,0.0,0.0,...,31.73,1238804700,-5.11,1238773740,29.2,1238798700,,,,
3,2009-04-04,1238832000,Clear throughout the day.,clear-day,1238857260,1238907540,0.34,0.0,0.0,0.0,...,25.51,1238896800,-7.95,1238859660,20.35,1238896920,,,,
4,2009-04-05,1238918400,Clear throughout the day.,clear-day,1238943480,1238994120,0.38,0.0,0.0,0.0,...,31.85,1238985180,-7.83,1238929200,28.09,1238989980,,,,


In [4]:
winners_df = pd.read_csv('../data/ice_classic_winning_times.csv')
winners_df.head()

Unnamed: 0,Date,Time
0,1917-04-30,11:30 AM
1,1918-05-11,9:33 AM
2,1919-05-03,2:33 PM
3,1920-05-11,10:46 AM
4,1921-05-11,6:42 AM


## Merge the DataFrames into one

In [5]:
merged_df = weather_df.merge(winners_df, how = 'left', on = 'Date')
merged_df.head()

Unnamed: 0,Date,time,summary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipProbability,...,temperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,precipIntensityMaxTime,precipType,precipAccumulation,ozone,Time
0,2009-04-01,1238572800,Partly cloudy throughout the day.,partly-cloudy-day,1238598720,1238647800,0.24,0.0,0.0,0.0,...,1238623320,-0.27,1238579940,17.69,1238608560,,,,,
1,2009-04-02,1238659200,Partly cloudy throughout the day.,partly-cloudy-day,1238684940,1238734380,0.27,0.0,0.0,0.0,...,1238726100,-19.98,1238684760,20.02,1238735040,,,,,
2,2009-04-03,1238745600,Clear throughout the day.,clear-day,1238771100,1238820960,0.31,0.0,0.0,0.0,...,1238804700,-5.11,1238773740,29.2,1238798700,,,,,
3,2009-04-04,1238832000,Clear throughout the day.,clear-day,1238857260,1238907540,0.34,0.0,0.0,0.0,...,1238896800,-7.95,1238859660,20.35,1238896920,,,,,
4,2009-04-05,1238918400,Clear throughout the day.,clear-day,1238943480,1238994120,0.38,0.0,0.0,0.0,...,1238985180,-7.83,1238929200,28.09,1238989980,,,,,


### Rename a couple of columns to avoid potential confusion
The column 'time' contains the time that weather measurements were taken; 'Time' contains the winning time for a given year. This could be confusing, so I renamed 'time' to 'readingTime' and 'Time' to 'winningTime.'

In [7]:
merged_df.rename(columns = {'time' : 'readingTime', 'Time' : 'winningTime'}, inplace = True)

In [8]:
merged_df.head()

Unnamed: 0,Date,readingTime,summary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipProbability,...,temperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,precipIntensityMaxTime,precipType,precipAccumulation,ozone,winningTime
0,2009-04-01,1238572800,Partly cloudy throughout the day.,partly-cloudy-day,1238598720,1238647800,0.24,0.0,0.0,0.0,...,1238623320,-0.27,1238579940,17.69,1238608560,,,,,
1,2009-04-02,1238659200,Partly cloudy throughout the day.,partly-cloudy-day,1238684940,1238734380,0.27,0.0,0.0,0.0,...,1238726100,-19.98,1238684760,20.02,1238735040,,,,,
2,2009-04-03,1238745600,Clear throughout the day.,clear-day,1238771100,1238820960,0.31,0.0,0.0,0.0,...,1238804700,-5.11,1238773740,29.2,1238798700,,,,,
3,2009-04-04,1238832000,Clear throughout the day.,clear-day,1238857260,1238907540,0.34,0.0,0.0,0.0,...,1238896800,-7.95,1238859660,20.35,1238896920,,,,,
4,2009-04-05,1238918400,Clear throughout the day.,clear-day,1238943480,1238994120,0.38,0.0,0.0,0.0,...,1238985180,-7.83,1238929200,28.09,1238989980,,,,,


### Calculate the number of hours of daylight
Daylight hours fluctuate a lot in Alaska. Articles I had read about the NIC mention that things like how much snow covers the ice, and therefore how much direct sunlight the ice receives, affect the rate the river ice melts.

I may be able to proxy snow depth with precipitation measurements. I theorize that the number of sunlight hours in a day may be a proxy for how much sunlight the river ice receives. I also have information regarding cloud cover which may help in that regard.

I calculated the number of daylight hours as a percentage of the day. That way its value is already scaled for use in modeling later.

In [9]:
def calc_daylight(sunrise, sunset):
    daylight_hours = (sunset - sunrise) / 60 / 60 /24
    return daylight_hours

In [10]:
merged_df['daylightHours'] = calc_daylight(merged_df['sunriseTime'], merged_df['sunsetTime'])

In [12]:
merged_df.head()

Unnamed: 0,Date,readingTime,summary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipProbability,...,apparentTemperatureMin,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,precipIntensityMaxTime,precipType,precipAccumulation,ozone,winningTime,daylightHours
0,2009-04-01,1238572800,Partly cloudy throughout the day.,partly-cloudy-day,1238598720,1238647800,0.24,0.0,0.0,0.0,...,-0.27,1238579940,17.69,1238608560,,,,,,0.568056
1,2009-04-02,1238659200,Partly cloudy throughout the day.,partly-cloudy-day,1238684940,1238734380,0.27,0.0,0.0,0.0,...,-19.98,1238684760,20.02,1238735040,,,,,,0.572222
2,2009-04-03,1238745600,Clear throughout the day.,clear-day,1238771100,1238820960,0.31,0.0,0.0,0.0,...,-5.11,1238773740,29.2,1238798700,,,,,,0.577083
3,2009-04-04,1238832000,Clear throughout the day.,clear-day,1238857260,1238907540,0.34,0.0,0.0,0.0,...,-7.95,1238859660,20.35,1238896920,,,,,,0.581944
4,2009-04-05,1238918400,Clear throughout the day.,clear-day,1238943480,1238994120,0.38,0.0,0.0,0.0,...,-7.83,1238929200,28.09,1238989980,,,,,,0.586111


In [13]:
merged_df['daylightHours'].value_counts()

0.842361    5
0.586111    4
0.806250    4
0.826389    4
0.693056    4
           ..
0.743056    1
0.635417    1
0.596528    1
0.727778    1
0.815278    1
Name: daylightHours, Length: 298, dtype: int64

In [14]:
df = merged_df.merge(ice_df, how = 'left', on = 'Date')

In [15]:
df.head()

Unnamed: 0,Date,readingTime,summary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipProbability,...,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,precipIntensityMaxTime,precipType,precipAccumulation,ozone,winningTime,daylightHours,Thickness
0,2009-04-01,1238572800,Partly cloudy throughout the day.,partly-cloudy-day,1238598720,1238647800,0.24,0.0,0.0,0.0,...,1238579940,17.69,1238608560,,,,,,0.568056,
1,2009-04-02,1238659200,Partly cloudy throughout the day.,partly-cloudy-day,1238684940,1238734380,0.27,0.0,0.0,0.0,...,1238684760,20.02,1238735040,,,,,,0.572222,45.0
2,2009-04-03,1238745600,Clear throughout the day.,clear-day,1238771100,1238820960,0.31,0.0,0.0,0.0,...,1238773740,29.2,1238798700,,,,,,0.577083,
3,2009-04-04,1238832000,Clear throughout the day.,clear-day,1238857260,1238907540,0.34,0.0,0.0,0.0,...,1238859660,20.35,1238896920,,,,,,0.581944,
4,2009-04-05,1238918400,Clear throughout the day.,clear-day,1238943480,1238994120,0.38,0.0,0.0,0.0,...,1238929200,28.09,1238989980,,,,,,0.586111,


### Rename other columns
There are a couple of other columns that I decided to rename to make them a little more descriptive of the values they contain.

In [16]:
df.columns

Index(['Date', 'readingTime', 'summary', 'icon', 'sunriseTime', 'sunsetTime',
       'moonPhase', 'precipIntensity', 'precipIntensityMax',
       'precipProbability', 'temperatureHigh', 'temperatureHighTime',
       'temperatureLow', 'temperatureLowTime', 'apparentTemperatureHigh',
       'apparentTemperatureHighTime', 'apparentTemperatureLow',
       'apparentTemperatureLowTime', 'dewPoint', 'humidity', 'pressure',
       'windSpeed', 'windGust', 'windGustTime', 'windBearing', 'cloudCover',
       'uvIndex', 'uvIndexTime', 'visibility', 'temperatureMin',
       'temperatureMinTime', 'temperatureMax', 'temperatureMaxTime',
       'apparentTemperatureMin', 'apparentTemperatureMinTime',
       'apparentTemperatureMax', 'apparentTemperatureMaxTime',
       'precipIntensityMaxTime', 'precipType', 'precipAccumulation', 'ozone',
       'winningTime', 'daylightHours', 'Thickness'],
      dtype='object')

In [17]:
df.rename(columns = {'summary':'weatherSummary', 'Thickness':'iceThickness', 'pressure':'atmoPressure'}, inplace = True)

In [18]:
df.head()

Unnamed: 0,Date,readingTime,weatherSummary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipProbability,...,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,precipIntensityMaxTime,precipType,precipAccumulation,ozone,winningTime,daylightHours,iceThickness
0,2009-04-01,1238572800,Partly cloudy throughout the day.,partly-cloudy-day,1238598720,1238647800,0.24,0.0,0.0,0.0,...,1238579940,17.69,1238608560,,,,,,0.568056,
1,2009-04-02,1238659200,Partly cloudy throughout the day.,partly-cloudy-day,1238684940,1238734380,0.27,0.0,0.0,0.0,...,1238684760,20.02,1238735040,,,,,,0.572222,45.0
2,2009-04-03,1238745600,Clear throughout the day.,clear-day,1238771100,1238820960,0.31,0.0,0.0,0.0,...,1238773740,29.2,1238798700,,,,,,0.577083,
3,2009-04-04,1238832000,Clear throughout the day.,clear-day,1238857260,1238907540,0.34,0.0,0.0,0.0,...,1238859660,20.35,1238896920,,,,,,0.581944,
4,2009-04-05,1238918400,Clear throughout the day.,clear-day,1238943480,1238994120,0.38,0.0,0.0,0.0,...,1238929200,28.09,1238989980,,,,,,0.586111,


In [19]:
# removed unneeded variables
del merged_df, ice_df, winners_df, weather_df
gc.collect()

0

### Forward fill ice thickness values until next observed value
Since ice thickness is only measured periodically, I chose to make the naive assumption that it remains constant until the next measurement. I knew this wasn't technically correct, but I hoped that it made a decent proxy.

In [24]:
df['iceThickness'].fillna(method = 'pad', inplace = True)

In [25]:
df.head()

Unnamed: 0,Date,readingTime,weatherSummary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipProbability,...,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,precipIntensityMaxTime,precipType,precipAccumulation,ozone,winningTime,daylightHours,iceThickness
0,2009-04-01,1238572800,Partly cloudy throughout the day.,partly-cloudy-day,1238598720,1238647800,0.24,0.0,0.0,0.0,...,1238579940,17.69,1238608560,,,,,,0.568056,
1,2009-04-02,1238659200,Partly cloudy throughout the day.,partly-cloudy-day,1238684940,1238734380,0.27,0.0,0.0,0.0,...,1238684760,20.02,1238735040,,,,,,0.572222,45.0
2,2009-04-03,1238745600,Clear throughout the day.,clear-day,1238771100,1238820960,0.31,0.0,0.0,0.0,...,1238773740,29.2,1238798700,,,,,,0.577083,45.0
3,2009-04-04,1238832000,Clear throughout the day.,clear-day,1238857260,1238907540,0.34,0.0,0.0,0.0,...,1238859660,20.35,1238896920,,,,,,0.581944,45.0
4,2009-04-05,1238918400,Clear throughout the day.,clear-day,1238943480,1238994120,0.38,0.0,0.0,0.0,...,1238929200,28.09,1238989980,,,,,,0.586111,45.0


The very first value was missed by the first fill operation; I backfilled it to eliminate NaN values in the iceThickness column.

In [28]:
df['iceThickness'].fillna(method = 'bfill', inplace = True)

In [29]:
df['iceThickness'].isna().sum()

0

In [30]:
df['iceThickness'].value_counts()

36.5     112
25       105
32.3      50
40.9      46
42.7      46
39        45
28.7      43
29.7      39
40        28
39.9      11
26.8       8
43.3       7
34.7       7
50         7
45.00      5
26.7       4
38.5       4
24.7       4
44.6       4
34.8       4
45.2       4
46.75      4
41.4       4
51         4
37         4
36.7       4
38.7       4
44.3       4
49.6       4
45.3       4
39.5       4
41.6       3
46.3       3
36.8       3
47         3
49.2       3
38.8       3
49.3       3
44.2       3
50.3       3
44         3
46.00      3
36         3
42.5       3
35.6       3
46.4       3
33.7       3
Name: iceThickness, dtype: int64

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 671 entries, 0 to 670
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         671 non-null    object 
 1   readingTime                  671 non-null    int64  
 2   weatherSummary               670 non-null    object 
 3   icon                         668 non-null    object 
 4   sunriseTime                  671 non-null    int64  
 5   sunsetTime                   671 non-null    int64  
 6   moonPhase                    671 non-null    float64
 7   precipIntensity              671 non-null    float64
 8   precipIntensityMax           671 non-null    float64
 9   precipProbability            671 non-null    float64
 10  temperatureHigh              671 non-null    float64
 11  temperatureHighTime          671 non-null    int64  
 12  temperatureLow               671 non-null    float64
 13  temperatureLowTime  

### Dealing With Missing Values
First, since precipAccumulation is defined as "The amount of snowfall accumulation expected to occur (over the hour or day, respectively), in inches. (If no snowfall is expected, this property will not be defined.)," I decided to fill missing values in that column with 0.

In [35]:
df['precipAccumulation'].isna().sum()

632

In [37]:
df['precipAccumulation'].fillna(value = 0, downcast = 'infer', inplace = True)

In [38]:
df['precipAccumulation'].isna().sum()

0

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 671 entries, 0 to 670
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         671 non-null    object 
 1   readingTime                  671 non-null    int64  
 2   weatherSummary               670 non-null    object 
 3   icon                         668 non-null    object 
 4   sunriseTime                  671 non-null    int64  
 5   sunsetTime                   671 non-null    int64  
 6   moonPhase                    671 non-null    float64
 7   precipIntensity              671 non-null    float64
 8   precipIntensityMax           671 non-null    float64
 9   precipProbability            671 non-null    float64
 10  temperatureHigh              671 non-null    float64
 11  temperatureHighTime          671 non-null    int64  
 12  temperatureLow               671 non-null    float64
 13  temperatureLowTime  

Next, I decided to deal with precipType. I wanted to keep the column since rain or snow could impact the river ice, so I filled NaNs with 'none'.

In [40]:
df['precipType'].value_counts()

rain    91
snow    39
Name: precipType, dtype: int64

In [41]:
df['precipType'].fillna(value = 'none', inplace = True)

In [43]:
df['icon'].value_counts()

clear-day            324
partly-cloudy-day    169
rain                  80
fog                   48
snow                  35
cloudy                12
Name: icon, dtype: int64

In [44]:
df.loc[df['icon'].isna() == True]

Unnamed: 0,Date,readingTime,weatherSummary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipProbability,...,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,precipIntensityMaxTime,precipType,precipAccumulation,ozone,winningTime,daylightHours,iceThickness
147,2011-04-26,1303804800,Drizzle in the morning.,,1303825440,1303884420,0.81,0.0,0.0004,0.99,...,1303819140,52.28,1303871760,1303837000.0,none,0.0,,,0.682639,39
152,2011-05-01,1304236800,Drizzle in the evening.,,1304256360,1304317380,0.97,0.0,0.0004,1.0,...,1304260500,48.06,1304304960,1304305000.0,none,0.0,,,0.70625,39
568,2018-04-20,1524211200,Flurries in the evening.,,1524233040,1524289680,0.19,0.0,0.0008,0.99,...,1524290460,23.45,1524246420,1524280000.0,none,0.0,,,0.655556,25


In [45]:
df.loc[df['weatherSummary'].str.contains('Drizzle')]

ValueError: cannot mask with array containing NA / NaN values

In [46]:
df.loc[df['weatherSummary'].isna() == True]

Unnamed: 0,Date,readingTime,weatherSummary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipProbability,...,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,precipIntensityMaxTime,precipType,precipAccumulation,ozone,winningTime,daylightHours,iceThickness
66,2010-04-06,1270540800,,clear-day,1270565700,1270616700,0.77,0.0,0.0,0.0,...,1270623600,37.23,1270601100,,none,0.0,,,0.590278,46.3


In [47]:
df.loc[df['icon'] == 'clear-day']

Unnamed: 0,Date,readingTime,weatherSummary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipProbability,...,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,precipIntensityMaxTime,precipType,precipAccumulation,ozone,winningTime,daylightHours,iceThickness
2,2009-04-03,1238745600,Clear throughout the day.,clear-day,1238771100,1238820960,0.31,0.0000,0.0000,0.00,...,1238773740,29.20,1238798700,,none,0.0,,,0.577083,45.00
3,2009-04-04,1238832000,Clear throughout the day.,clear-day,1238857260,1238907540,0.34,0.0000,0.0000,0.00,...,1238859660,20.35,1238896920,,none,0.0,,,0.581944,45.00
4,2009-04-05,1238918400,Clear throughout the day.,clear-day,1238943480,1238994120,0.38,0.0000,0.0000,0.00,...,1238929200,28.09,1238989980,,none,0.0,,,0.586111,45.00
5,2009-04-06,1239004800,Clear throughout the day.,clear-day,1239029640,1239080700,0.42,0.0000,0.0000,0.00,...,1239029400,36.17,1239068640,,none,0.0,,,0.590972,46.00
6,2009-04-07,1239091200,Clear throughout the day.,clear-day,1239115800,1239167280,0.45,0.0000,0.0000,0.00,...,1239117600,40.05,1239159480,,none,0.0,,,0.595833,46.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,2019-05-16,1557993600,Clear throughout the day.,clear-day,1558010040,1558077240,0.44,0.0002,0.0014,0.07,...,1558011720,72.65,1558056540,1.558074e+09,rain,0.0,389.1,,0.777778,25
659,2019-05-20,1558339200,Clear throughout the day.,clear-day,1558354860,1558423680,0.58,0.0000,0.0001,0.07,...,1558356780,71.09,1558392840,1.558404e+09,none,0.0,426.4,,0.796528,25
660,2019-05-21,1558425600,Clear throughout the day.,clear-day,1558441020,1558510260,0.61,0.0001,0.0001,0.05,...,1558442760,72.52,1558486860,1.558457e+09,rain,0.0,414.9,,0.801389,25
661,2019-05-22,1558512000,Clear throughout the day.,clear-day,1558527240,1558596900,0.64,0.0002,0.0015,0.11,...,1558531620,73.23,1558574640,1.558580e+09,rain,0.0,387.3,,0.806250,25


In [54]:
df['weatherSummary'].loc[66] = 'Clear throughout the day.'

In [55]:
df.loc[66]

Date                                          2010-04-06
readingTime                                   1270540800
weatherSummary                 Clear throughout the day.
icon                                           clear-day
sunriseTime                                   1270565700
sunsetTime                                    1270616700
moonPhase                                           0.77
precipIntensity                                        0
precipIntensityMax                                     0
precipProbability                                      0
temperatureHigh                                    39.51
temperatureHighTime                           1270598580
temperatureLow                                     23.56
temperatureLowTime                            1270656000
apparentTemperatureHigh                            37.23
apparentTemperatureHighTime                   1270601100
apparentTemperatureLow                             17.14
apparentTemperatureLowTime     

In [56]:
df.loc[df['weatherSummary'].str.contains('Drizzle')]

Unnamed: 0,Date,readingTime,weatherSummary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipProbability,...,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,precipIntensityMaxTime,precipType,precipAccumulation,ozone,winningTime,daylightHours,iceThickness
39,2009-05-10,1241942400,Drizzle in the morning and afternoon.,rain,1241959980,1242024900,0.56,0.0005,0.01,0.99,...,1241967540,42.0,1242002040,1241986000.0,rain,0.0,,,0.751389,42.7
91,2010-05-01,1272700800,Drizzle in the morning.,rain,1272720300,1272781440,0.63,0.0005,0.01,0.99,...,1272722280,50.04,1272765720,1272740000.0,rain,0.0,,,0.707639,40.9
102,2010-05-12,1273651200,Drizzle in the evening.,rain,1273668420,1273734060,0.97,0.0003,0.0059,1.0,...,1273670460,64.24,1273706820,1273723000.0,rain,0.0,,,0.759722,40.9
147,2011-04-26,1303804800,Drizzle in the morning.,,1303825440,1303884420,0.81,0.0,0.0004,0.99,...,1303819140,52.28,1303871760,1303837000.0,none,0.0,,,0.682639,39.0
152,2011-05-01,1304236800,Drizzle in the evening.,,1304256360,1304317380,0.97,0.0,0.0004,1.0,...,1304260500,48.06,1304304960,1304305000.0,none,0.0,,,0.70625,39.0
153,2011-05-02,1304323200,Drizzle overnight.,cloudy,1304342520,1304404020,1.0,0.0,0.0,0.0,...,1304346900,45.11,1304394720,,none,0.0,,,0.711806,39.0
173,2011-05-22,1306051200,Drizzle in the morning.,rain,1306066440,1306136100,0.7,0.0005,0.0101,1.0,...,1306083000,69.16,1306121400,1306084000.0,rain,0.0,,,0.80625,39.0
217,2012-05-05,1336204800,Drizzle in the afternoon.,rain,1336223340,1336286340,0.5,0.0005,0.01,0.99,...,1336227240,52.1,1336266840,1336262000.0,rain,0.0,,,0.729167,28.7
339,2014-05-05,1399276800,Drizzle in the evening.,rain,1399295460,1399358280,0.22,0.0005,0.01,0.99,...,1399331880,42.03,1399316460,1399345000.0,rain,0.0,,,0.727083,36.5
341,2014-05-07,1399449600,Drizzle in the morning.,rain,1399467840,1399531440,0.28,0.0014,0.0109,1.0,...,1399469460,46.77,1399527600,1399483000.0,rain,0.0,,,0.736111,36.5


In [60]:
df.loc[df['icon'].isna() == True]

Unnamed: 0,Date,readingTime,weatherSummary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipProbability,...,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,precipIntensityMaxTime,precipType,precipAccumulation,ozone,winningTime,daylightHours,iceThickness
147,2011-04-26,1303804800,Drizzle in the morning.,,1303825440,1303884420,0.81,0.0,0.0004,0.99,...,1303819140,52.28,1303871760,1303837000.0,none,0.0,,0,0.682639,39
152,2011-05-01,1304236800,Drizzle in the evening.,,1304256360,1304317380,0.97,0.0,0.0004,1.0,...,1304260500,48.06,1304304960,1304305000.0,none,0.0,,0,0.70625,39
568,2018-04-20,1524211200,Flurries in the evening.,,1524233040,1524289680,0.19,0.0,0.0008,0.99,...,1524290460,23.45,1524246420,1524280000.0,none,0.0,,0,0.655556,25


In [61]:
df['icon'].loc[147] = 'rain'
df['icon'].loc[152] = 'rain'
df['icon'].loc[568] = 'snow'

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 671 entries, 0 to 670
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         671 non-null    object 
 1   readingTime                  671 non-null    int64  
 2   weatherSummary               671 non-null    object 
 3   icon                         671 non-null    object 
 4   sunriseTime                  671 non-null    int64  
 5   sunsetTime                   671 non-null    int64  
 6   moonPhase                    671 non-null    float64
 7   precipIntensity              671 non-null    float64
 8   precipIntensityMax           671 non-null    float64
 9   precipProbability            671 non-null    float64
 10  temperatureHigh              671 non-null    float64
 11  temperatureHighTime          671 non-null    int64  
 12  temperatureLow               671 non-null    float64
 13  temperatureLowTime  

In [58]:
df['winningTime'].fillna(value = 0, inplace = True)

In [63]:
df.loc[df['atmoPressure'].isna() == True]

Unnamed: 0,Date,readingTime,weatherSummary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipProbability,...,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,precipIntensityMaxTime,precipType,precipAccumulation,ozone,winningTime,daylightHours,iceThickness
598,2018-05-20,1526803200,Light rain overnight.,rain,1526818800,1526887740,0.22,0.0004,0.017,1.0,...,1526822880,67.94,1526857380,1526886000.0,rain,0.0,,0,0.797917,25
599,2018-05-21,1526889600,Light rain in the morning.,rain,1526905020,1526974320,0.25,0.0064,0.0367,0.67,...,1526911020,68.1,1526951220,1526911000.0,rain,0.0,,0,0.802083,25


I decided to drop the two observations with no atmospheric pressure information. They were at the tail end of the typical breakup window, so I felt I wasn't losing anything by dropping them.

In [64]:
df.drop(index = [598, 599], inplace = True)

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 669 entries, 0 to 670
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         669 non-null    object 
 1   readingTime                  669 non-null    int64  
 2   weatherSummary               669 non-null    object 
 3   icon                         669 non-null    object 
 4   sunriseTime                  669 non-null    int64  
 5   sunsetTime                   669 non-null    int64  
 6   moonPhase                    669 non-null    float64
 7   precipIntensity              669 non-null    float64
 8   precipIntensityMax           669 non-null    float64
 9   precipProbability            669 non-null    float64
 10  temperatureHigh              669 non-null    float64
 11  temperatureHighTime          669 non-null    int64  
 12  temperatureLow               669 non-null    float64
 13  temperatureLowTime  

I decided to drop the 'ozone' and 'precipIntensityMaxTime' columns because they were missing so much information, and I had no good strategy for imputing it.

In [71]:
df.drop(columns = 'ozone', inplace = True)
df.drop(columns = 'precipIntensityMaxTime', inplace = True)

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 669 entries, 0 to 670
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         669 non-null    object 
 1   readingTime                  669 non-null    int64  
 2   weatherSummary               669 non-null    object 
 3   icon                         669 non-null    object 
 4   sunriseTime                  669 non-null    int64  
 5   sunsetTime                   669 non-null    int64  
 6   moonPhase                    669 non-null    float64
 7   precipIntensity              669 non-null    float64
 8   precipIntensityMax           669 non-null    float64
 9   precipProbability            669 non-null    float64
 10  temperatureHigh              669 non-null    float64
 11  temperatureHighTime          669 non-null    int64  
 12  temperatureLow               669 non-null    float64
 13  temperatureLowTime  

In [73]:
df['windGust'].value_counts()

14.91    6
13.06    6
10.01    5
11.82    5
14.04    4
        ..
17.39    1
8.05     1
4.96     1
23.06    1
25.25    1
Name: windGust, Length: 495, dtype: int64

In [74]:
df['windGust'].fillna(value = 0, inplace = True)

In [77]:
df.drop(columns = 'windGustTime', inplace = True)

In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 669 entries, 0 to 670
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         669 non-null    object 
 1   readingTime                  669 non-null    int64  
 2   weatherSummary               669 non-null    object 
 3   icon                         669 non-null    object 
 4   sunriseTime                  669 non-null    int64  
 5   sunsetTime                   669 non-null    int64  
 6   moonPhase                    669 non-null    float64
 7   precipIntensity              669 non-null    float64
 8   precipIntensityMax           669 non-null    float64
 9   precipProbability            669 non-null    float64
 10  temperatureHigh              669 non-null    float64
 11  temperatureHighTime          669 non-null    int64  
 12  temperatureLow               669 non-null    float64
 13  temperatureLowTime  

In [79]:
df.loc[df['cloudCover'].isna() == True]

Unnamed: 0,Date,readingTime,weatherSummary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipProbability,...,temperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,precipType,precipAccumulation,winningTime,daylightHours,iceThickness
501,2017-04-14,1492156800,Foggy throughout the day.,fog,1492179900,1492234200,0.62,0.0,0.0,0.0,...,1492225200,23.98,1492181940,51.02,1492225200,none,0.0,0,0.628472,37


In [83]:
df.loc[df['icon'] == 'fog']['cloudCover']

488    0.29
490    0.04
491    0.29
492    0.16
495    0.72
496    0.00
497    0.04
498    0.00
499    0.00
500    0.00
501     NaN
502    0.00
503    0.04
504    0.00
505    0.00
506    0.00
507    0.00
508    0.00
509    0.00
510    0.00
511    0.04
512    0.56
513    0.46
514    0.62
515    0.21
516    0.30
517    0.31
518    0.12
519    0.11
521    0.81
522    0.68
523    0.01
524    0.11
525    0.59
527    0.17
528    0.45
530    0.12
531    0.80
532    0.17
533    0.07
534    0.11
535    0.28
537    0.34
538    0.45
540    0.74
544    0.29
545    0.41
548    0.29
Name: cloudCover, dtype: float64

In [84]:
df['cloudCover'].fillna(value = 0, inplace = True)

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 669 entries, 0 to 670
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         669 non-null    object 
 1   readingTime                  669 non-null    int64  
 2   weatherSummary               669 non-null    object 
 3   icon                         669 non-null    object 
 4   sunriseTime                  669 non-null    int64  
 5   sunsetTime                   669 non-null    int64  
 6   moonPhase                    669 non-null    float64
 7   precipIntensity              669 non-null    float64
 8   precipIntensityMax           669 non-null    float64
 9   precipProbability            669 non-null    float64
 10  temperatureHigh              669 non-null    float64
 11  temperatureHighTime          669 non-null    int64  
 12  temperatureLow               669 non-null    float64
 13  temperatureLowTime  

In [86]:
df.isna().sum()

Date                           0
readingTime                    0
weatherSummary                 0
icon                           0
sunriseTime                    0
sunsetTime                     0
moonPhase                      0
precipIntensity                0
precipIntensityMax             0
precipProbability              0
temperatureHigh                0
temperatureHighTime            0
temperatureLow                 0
temperatureLowTime             0
apparentTemperatureHigh        0
apparentTemperatureHighTime    0
apparentTemperatureLow         0
apparentTemperatureLowTime     0
dewPoint                       0
humidity                       0
atmoPressure                   0
windSpeed                      0
windGust                       0
windBearing                    0
cloudCover                     0
uvIndex                        0
uvIndexTime                    0
visibility                     0
temperatureMin                 0
temperatureMinTime             0
temperatur

In [87]:
df.to_csv('../data/cleaned_data.csv')