# Step 1 - Data Engineering

The climate data for Hawaii is provided through two CSV files. Start by using Python and Pandas to inspect the content of these files and clean the data.

* Create a Jupyter Notebook file called `data_engineering.ipynb` and use this to complete all of your Data Engineering tasks.

* Use Pandas to read in the measurement and station CSV files as DataFrames.

* Inspect the data for NaNs and missing values. You must decide what to do with this data.

* Save your cleaned CSV files with the prefix `clean_`.

In [2]:
import os
import json
import pprint
import pandas as pd
import numpy
import csv

In [3]:
# Build the paths to both raw input CSVs, which live side by side in the
# Resources folder.
stations_file, measurements_file = (
    os.path.join('Resources', csv_name)
    for csv_name in ('hawaii_stations.csv', 'hawaii_measurements.csv')
)

In [4]:
# Load the raw station table and preview its first five rows.
stations_df = pd.read_csv(filepath_or_buffer=stations_file)
stations_df.head(5)

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6


In [5]:
# Summary statistics for the numeric station columns; the `count` row is a
# quick check for missing values (every column should report the same count).
stations_df.describe()

Unnamed: 0,latitude,longitude,elevation
count,9.0,9.0,9.0
mean,21.393826,-157.867098,60.977778
std,0.086442,0.103873,103.465547
min,21.2716,-158.0111,0.9
25%,21.3331,-157.9751,7.0
50%,21.3934,-157.8374,14.6
75%,21.45167,-157.8025,32.9
max,21.5213,-157.71139,306.6


In [6]:
# Give every station column an explicit "st_" prefix; the station code
# becomes `st_id`.
station_column_names = {
    'station': 'st_id',
    'name': 'st_name',
    'latitude': 'st_latitude',
    'longitude': 'st_longitude',
    'elevation': 'st_elevation',
}
stations_df = stations_df.rename(columns=station_column_names)

stations_df.head(5)

Unnamed: 0,st_id,st_name,st_latitude,st_longitude,st_elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6


In [7]:
# Load the raw measurement table and preview its first five rows.
measurements_df = pd.read_csv(filepath_or_buffer=measurements_file)
measurements_df.head(5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [8]:
# Summary statistics for the numeric measurement columns. Compare the `count`
# row across columns: a lower count for `prcp` than for `tobs` means some
# precipitation readings are missing.
measurements_df.describe()

Unnamed: 0,prcp,tobs
count,18103.0,19550.0
mean,0.160644,73.097954
std,0.468746,4.523527
min,0.0,53.0
25%,0.0,70.0
50%,0.01,73.0
75%,0.11,76.0
max,11.53,87.0


In [9]:
# Cleaning decision: discard every row that has at least one missing value,
# then rename the key and date columns (`prcp` and `tobs` keep their names).
measurements_df = (
    measurements_df
    .dropna(how='any')
    .rename(columns={'station': 'st_id', 'date': 'rec_date'})
)

measurements_df.head(5)

Unnamed: 0,st_id,rec_date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
5,USC00519397,2010-01-07,0.06,70


In [10]:
# Report the first and last recording dates. The dates are read as
# "YYYY-MM-DD" strings, so plain string ordering is also chronological.
recorded_dates = measurements_df['rec_date']
print(min(recorded_dates), max(recorded_dates), sep='\n')

2010-01-01
2017-08-23


In [11]:
# Write both cleaned tables to CSV: keep the header row, drop the Pandas index.
# NOTE(review): the task description asks for the prefix `clean_`, but these
# files are written with `cleaned_`. Names are left unchanged here in case other
# notebooks already read them - confirm before renaming.
for frame, csv_path in (
    (stations_df, "Resources/cleaned_hawaii_stations.csv"),
    (measurements_df, "Resources/cleaned_hawaii_measurements.csv"),
):
    frame.to_csv(csv_path, index=False, header=True)