In [9]:
import pandas as pd
# Load the raw station metadata and the daily measurements from the
# local resources folder (one CSV per table).
stations, measures = map(
    pd.read_csv,
    ('resources/hawaii_stations.csv', 'resources/hawaii_measurements.csv'),
)

In [14]:
# Summarise the measurements frame: row count, per-column non-null counts and
# dtypes. A column whose non-null count is below the row count has missing data.
measures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
station    19550 non-null object
date       19550 non-null object
prcp       18103 non-null float64
tobs       19550 non-null int64
dtypes: float64(1), int64(1), object(2)
memory usage: 611.0+ KB


# Cleaning the data
It looks like we have some missing precipitation data: `prcp` has only 18,103 non-null values out of 19,550 rows.

I can see two logical methods for dealing with this:
1. Fill in the missing values with the average across the entire variable.
2. Fill in each missing value by interpolating between the value on the prior date and the value on the next date.

I'm going to opt for the latter option, as it seems to make more sense considering that we are working with weather data organized as a time series.

Let's start by marking our rows with null values

In [20]:
# Flag every row that has at least one missing value in any column, so the
# filled-in rows can still be identified after the gaps are interpolated.
has_missing = measures.isna().any(axis='columns')
measures['nulls'] = has_missing

# Preview the first 40 rows together with the new flag column.
print(measures.head(n=40))

        station        date  prcp  tobs  nulls
0   USC00519397  2010-01-01  0.08    65  False
1   USC00519397  2010-01-02  0.00    63  False
2   USC00519397  2010-01-03  0.00    74  False
3   USC00519397  2010-01-04  0.00    76  False
4   USC00519397  2010-01-06   NaN    73   True
5   USC00519397  2010-01-07  0.06    70  False
6   USC00519397  2010-01-08  0.00    64  False
7   USC00519397  2010-01-09  0.00    68  False
8   USC00519397  2010-01-10  0.00    73  False
9   USC00519397  2010-01-11  0.01    64  False
10  USC00519397  2010-01-12  0.00    61  False
11  USC00519397  2010-01-14  0.00    66  False
12  USC00519397  2010-01-15  0.00    65  False
13  USC00519397  2010-01-16  0.00    68  False
14  USC00519397  2010-01-17  0.00    64  False
15  USC00519397  2010-01-18  0.00    72  False
16  USC00519397  2010-01-19  0.00    66  False
17  USC00519397  2010-01-20  0.00    66  False
18  USC00519397  2010-01-21  0.00    69  False
19  USC00519397  2010-01-22  0.00    67  False
20  USC005193

# Using Pandas Interpolate function to fill in the missing values

Pandas has an `interpolate` method that, by default, fills in missing values linearly between the previous and the next valid rows (for a single missing row this is the midpoint of its two neighbours).
 
We can see what values were changed by looking for rows where 'nulls' == True

In [22]:
# Interpolate precipitation only, and separately for each station.
# Calling interpolate() on the whole frame also runs over the object-dtype
# station/date columns (deprecated in recent pandas) and works purely by row
# position, so a gap at the boundary between two stations would be filled from
# another station's readings (assumes the frame stacks several stations, as
# the 'station' column suggests).
# limit_direction='both' also fills gaps at the very start of a station's
# series, which the default forward-only direction would leave as NaN.
measures = measures.assign(
    prcp=measures.groupby('station')['prcp'].transform(
        lambda station_prcp: station_prcp.interpolate(limit_direction='both')
    )
)

# Preview only the first rows; rows with nulls == True are the ones that were
# just filled in.
print(measures.head(20))

           station        date      prcp  tobs  nulls
0      USC00519397  2010-01-01  0.080000    65  False
1      USC00519397  2010-01-02  0.000000    63  False
2      USC00519397  2010-01-03  0.000000    74  False
3      USC00519397  2010-01-04  0.000000    76  False
4      USC00519397  2010-01-06  0.030000    73   True
5      USC00519397  2010-01-07  0.060000    70  False
6      USC00519397  2010-01-08  0.000000    64  False
7      USC00519397  2010-01-09  0.000000    68  False
8      USC00519397  2010-01-10  0.000000    73  False
9      USC00519397  2010-01-11  0.010000    64  False
10     USC00519397  2010-01-12  0.000000    61  False
11     USC00519397  2010-01-14  0.000000    66  False
12     USC00519397  2010-01-15  0.000000    65  False
13     USC00519397  2010-01-16  0.000000    68  False
14     USC00519397  2010-01-17  0.000000    64  False
15     USC00519397  2010-01-18  0.000000    72  False
16     USC00519397  2010-01-19  0.000000    66  False
17     USC00519397  2010-01-

## Now we just have to save the csv file

In [None]:
# Persist the cleaned measurements. With to_csv's defaults the row index is
# written as the first (unnamed) column, and the helper 'nulls' flag column is
# saved alongside the data.
clean_csv_path = 'resources/clean_measures.csv'
measures.to_csv(clean_csv_path)