In [31]:
# Import the Pandas library
import pandas as pd
import datetime as dt

In [32]:
# Relative locations of the two input CSV files (measurements, stations).
path_meas, path_stat = (
    "Resources/hawaii_measurements.csv",
    "Resources/hawaii_stations.csv",
)

In [33]:
# Read both source tables into DataFrames:
#   meas_df -- Hawaii measurements
#   stat_df -- Hawaii stations
meas_df, stat_df = (
    pd.read_csv(csv_path) for csv_path in (path_meas, path_stat)
)

In [34]:
# Inspect the schemas of both tables. Only the last expression in the cell
# (meas_df.dtypes) is rendered, so the column listings are recorded in
# comments instead.
meas_df.columns
# 'station', 'date', 'prcp', 'tobs'
stat_df.columns
# 'station', 'name', 'latitude', 'longitude', 'elevation'
meas_df.dtypes
# station     object
# date        object  -- held as strings; may need converting to datetimes
# prcp       float64
# tobs         int64

station     object
date        object
prcp       float64
tobs         int64
dtype: object

In [35]:
# Preview the first rows of the measurement table.
meas_df.head(20)
# Columns:
#   station -- station ID
#   date    -- measurement date
#   prcp    -- precipitation measurement
#   tobs    -- observed temperature (appears to be in degrees Fahrenheit)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73
5,USC00519397,2010-01-07,0.06,70
6,USC00519397,2010-01-08,0.0,64
7,USC00519397,2010-01-09,0.0,68
8,USC00519397,2010-01-10,0.0,73
9,USC00519397,2010-01-11,0.01,64


In [36]:
# Non-null count per column, used to spot missing values.
meas_df.count()
# station, date and tobs are populated on every row, but prcp has fewer
# non-null entries: some precipitation readings are missing, in addition to
# rows where prcp is explicitly 0.
# Options considered:
#   - df.dropna(how='any'): removes the incomplete rows, but also throws away
#     their temperature readings.
#   - fillna: keeps every row, at the cost of substituting a value that is
#     not a true precipitation measurement.
# Either choice influences the results. Decision: use fillna.

station    19550
date       19550
prcp       18103
tobs       19550
dtype: int64

In [37]:
# Number of measurement rows recorded for each station ID.
meas_counts_by_stat = meas_df["station"].value_counts()
meas_counts_by_stat

USC00519281    2772
USC00519397    2724
USC00513117    2709
USC00519523    2669
USC00516128    2612
USC00514830    2202
USC00511918    1979
USC00517948    1372
USC00518838     511
Name: station, dtype: int64

In [38]:
# Convert the 'date' column from 'YYYY-MM-DD' strings to datetime values.
# pd.to_datetime is vectorized and safe to re-run: a column that is already
# datetime is passed through, whereas applying dt.datetime.strptime row by
# row raises TypeError the second time the cell is executed.
meas_df['date'] = pd.to_datetime(meas_df['date'], format='%Y-%m-%d')
meas_df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [39]:
# Write the measurement table, as it stands before missing values are
# handled, to a new CSV file (no index column, header row included).
meas_df.to_csv(
    "Output/raw_meas.csv",
    index=False,
    header=True,
    encoding="utf-8",
)

In [41]:
# Treat a missing precipitation reading as 0 so that every row, and its
# temperature observation, is retained. The fill is limited to 'prcp', the
# only column with gaps, so a missing value in any other column stays visible
# instead of silently becoming 0.
# NOTE: substituting 0 is an assumption and can pull precipitation averages
# down.
meas_cln_df = meas_df.fillna({"prcp": 0})
meas_cln_df.head(10)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,0.0,73
5,USC00519397,2010-01-07,0.06,70
6,USC00519397,2010-01-08,0.0,64
7,USC00519397,2010-01-09,0.0,68
8,USC00519397,2010-01-10,0.0,73
9,USC00519397,2010-01-11,0.01,64


In [42]:
# Sanity-check the cleaned frame. Only the final expression (min) is rendered
# by the cell; the results of the other two are recorded in comments.
meas_cln_df.count()
# station    19550
# date       19550
# prcp       19550   <- no missing precipitation values remain
# tobs       19550
meas_cln_df.max()
# station    USC00519523
# date        2017-08-23
# prcp             11.53
# tobs                87
# dtype: object
meas_cln_df.min()
# station    USC00511918
# date        2010-01-01
# prcp                 0
# tobs                53
# dtype: object

station            USC00511918
date       2010-01-01 00:00:00
prcp                         0
tobs                        53
dtype: object

In [43]:
# Write the cleaned measurement table to CSV (no index column, header row
# included).
meas_cln_df.to_csv(
    "Output/clean_meas.csv",
    index=False,
    header=True,
    encoding="utf-8",
)

In [45]:
# Inspect the station table's schema. Only the last expression
# (stat_df.dtypes) is rendered; the column listing is recorded in a comment.
stat_df.columns
# 'station', 'name', 'latitude', 'longitude', 'elevation'
stat_df.dtypes
# station       object
# name          object
# latitude     float64
# longitude    float64
# elevation    float64

station       object
name          object
latitude     float64
longitude    float64
elevation    float64
dtype: object

In [47]:
# Preview the station table; the rendered output lists 9 stations, fewer than
# the 20-row limit.
stat_df.head(20)

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6
5,USC00519523,"WAIMANALO EXPERIMENTAL FARM, HI US",21.33556,-157.71139,19.5
6,USC00519281,"WAIHEE 837.5, HI US",21.45167,-157.84889,32.9
7,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9
8,USC00516128,"MANOA LYON ARBO 785.2, HI US",21.3331,-157.8025,152.4


In [48]:
# Non-null count per column, to check the station table for missing values.
stat_df.count()

station      9
name         9
latitude     9
longitude    9
elevation    9
dtype: int64

In [51]:
# The station table is written unchanged to both the "raw" and the "clean"
# output files.
for out_path in ("Output/raw_stat.csv", "Output/clean_stat.csv"):
    stat_df.to_csv(out_path, index=False, header=True, encoding="utf-8")