In [1]:
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# Load the raw measurement and station tables from the Resources folder.
measurements = pd.read_csv(filepath_or_buffer='Resources/hawaii_measurements.csv')
stations = pd.read_csv(filepath_or_buffer='Resources/hawaii_stations.csv')

In [3]:
measurements.info()

# All columns are fully populated except for 'prcp', which is non-null in
# only 18,103 of the 19,550 rows.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
station    19550 non-null object
date       19550 non-null object
prcp       18103 non-null float64
tobs       19550 non-null int64
dtypes: float64(1), int64(1), object(2)
memory usage: 611.0+ KB


In [4]:
# Check the statistical overview of the full data set (numeric columns only).
measurements.describe()

Unnamed: 0,prcp,tobs
count,18103.0,19550.0
mean,0.160644,73.097954
std,0.468746,4.523527
min,0.0,53.0
25%,0.0,70.0
50%,0.01,73.0
75%,0.11,76.0
max,11.53,87.0


In [5]:
# Isolate the readings that have no precipitation value and summarise them.
null_prcp = measurements.loc[measurements['prcp'].isna()]

null_prcp.describe()

# The tobs profile of the null-prcp rows is pretty similar to that of the full
# data set, indicating that the NaN rows can safely be dropped without
# impacting the overall integrity of the data.

Unnamed: 0,prcp,tobs
count,0.0,1447.0
mean,,74.387699
std,,4.468849
min,,58.0
25%,,71.5
50%,,75.0
75%,,78.0
max,,87.0


In [6]:
# Group every reading by station, and separately group only those readings
# whose prcp value is missing, so the two per-station counts can be compared.
by_station = measurements.groupby('station')
null_by_station = (
    measurements
    .loc[measurements['prcp'].isna()]
    .groupby('station')
)

by_station.count()

Unnamed: 0_level_0,date,prcp,tobs
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
USC00511918,1979,1932,1979
USC00513117,2709,2696,2709
USC00514830,2202,1937,2202
USC00516128,2612,2484,2612
USC00517948,1372,683,1372
USC00518838,511,342,511
USC00519281,2772,2772,2772
USC00519397,2724,2685,2724
USC00519523,2669,2572,2669


In [7]:
null_by_station.count()

# Check how the null-prcp readings are distributed across stations.
# USC00517948 (~50% of prcp values null) and USC00518838 (~33% null) would lose
# a significant share of their data if all NaN rows were dropped.

Unnamed: 0_level_0,date,prcp,tobs
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
USC00511918,47,0,47
USC00513117,13,0,13
USC00514830,265,0,265
USC00516128,128,0,128
USC00517948,689,0,689
USC00518838,169,0,169
USC00519397,39,0,39
USC00519523,97,0,97


In [8]:
# Summary statistics for every reading taken at station USC00517948.
measurements.loc[measurements['station'] == 'USC00517948'].describe()

Unnamed: 0,prcp,tobs
count,683.0,1372.0
mean,0.063602,74.684402
std,0.243931,4.383041
min,0.0,58.0
25%,0.0,72.0
50%,0.0,75.0
75%,0.02,78.0
max,2.8,87.0


In [9]:
# Summary statistics for the USC00517948 readings that are missing prcp.
null_prcp.loc[null_prcp['station'] == 'USC00517948'].describe()

Unnamed: 0,prcp,tobs
count,0.0,689.0
mean,,74.780842
std,,4.466918
min,,59.0
25%,,72.0
50%,,75.0
75%,,78.0
max,,87.0


In [10]:
# Summary statistics for every reading taken at station USC00518838.
measurements.loc[measurements['station'] == 'USC00518838'].describe()

Unnamed: 0,prcp,tobs
count,342.0,511.0
mean,0.207222,72.72407
std,0.508305,4.144946
min,0.0,58.0
25%,0.0025,70.0
50%,0.03,73.0
75%,0.1975,76.0
max,6.3,83.0


In [11]:
# Summary statistics for the USC00518838 readings that are missing prcp.
null_prcp.loc[null_prcp['station'] == 'USC00518838'].describe()

# Even though a large proportion of the USC00517948 and USC00518838 readings
# will be knocked out, the tobs statistical profiles of the dropped rows are
# similar to those of the data set at large, indicating that they can be
# dropped without wildly throwing off the results.

Unnamed: 0,prcp,tobs
count,0.0,169.0
mean,,72.822485
std,,4.355261
min,,58.0
25%,,70.0
50%,,73.0
75%,,76.0
max,,81.0


In [12]:
# Drop every row that contains a NaN in any column; the raw frame is left
# untouched and the result is bound to a new name.
measurements_clean = measurements.dropna(axis=0, how='any')

In [13]:
# Write the cleaned measurements and the station table back to Resources,
# omitting the DataFrame index from both files.
measurements_clean.to_csv(
    path_or_buf='Resources/hawaii_measurements_clean.csv',
    index=False,
)
stations.to_csv(
    path_or_buf='Resources/hawaii_stations_clean.csv',
    index=False,
)