In [None]:
#Dependencies

from matplotlib.figure import Figure 
import matplotlib.pyplot as plt 
import matplotlib.dates as mdates 
import pandas as pd 
import numpy as np 
import datetime as dt 
import seaborn as sns 
# Seed NumPy's global random number generator with the sum of the character
# codes of the string "aesthetics", so later random draws are repeatable.
np.random.seed(sum(ord(ch) for ch in "aesthetics"))


In [None]:
# Path of the measurements CSV, kept in a variable so it is set in one place.
weather_file = "hawaii_measurements.csv"
# Load the CSV with pandas, converting the "date" column to datetime on read.
weather_df = pd.read_csv(filepath_or_buffer=weather_file, parse_dates=["date"])
# Show the first rows as the cell output.
weather_df.head(n=5)

In [None]:
# Report how many rows weather_df currently holds.
print(weather_df.shape[0])

# Data cleansing notes on removed records:

### Stations with 2010 - 2017 data
    KANEOHE 838.1, HI US
    KUALOA RANCH HEADQUARTERS 886.9, HI US
    MANOA LYON ARBO 785.2, HI US
    PEARL CITY, HI US
    WAIHEE 837.5, HI US
    WAIKIKI 717.2, HI US
    WAIMANALO EXPERIMENTAL FARM, HI US

### Stations with 2010 - 2015 data
    HONOLULU OBSERVATORY 702.2, HI US
    UPPER WAHIAWA 874.3, HI US

### Eight of the nine stations have blank prcp values, for a total of 1,447 records with blanks.
    USC00511918	  47 blank prcp
    USC00513117	  13 blank prcp
    USC00514830	  265 blank prcp
    USC00516128	 128 blank prcp
    USC00517948	 689 blank prcp
    USC00518838	169 blank prcp
    USC00519397	39 blank prcp
    USC00519523	97 blank prcp

### The missing data is spread across the years covered, so removing these records does not appear to be a major concern; it would be more worrying if the missing data were concentrated in a single station or year.

### Measurement Records Information
    19,550 total records before changes
    1,447 records with blank precipitation (col = "prcp")
    18,103 records with no known issues after removing 1,447 with blank prcp
    Percentage of records removed is 7.4% (1,447 / 19,550)
    Percentage of records remaining is 92.6% (18,103 / 19,550)


In [None]:
# Discard every row that has a missing value, then print the remaining row
# count and preview the result.
# NOTE(review): with how="any" a NaN in *any* column removes the row, not only
# a blank "prcp" -- confirm this matches the intended cleansing rule.
weather_df = weather_df.dropna(axis="index", how="any")
print(weather_df.shape[0])
weather_df.head(n=5)

In [None]:
# Order the rows by "station" and then by "date" (both ascending) and renumber
# the index from zero, discarding the old index.
weather_df = (
    weather_df
    .sort_values(by=["station", "date"], ascending=True)
    .reset_index(drop=True)
)

In [None]:
# Preview the first five rows of weather_df.
weather_df.head(n=5)

In [None]:
# Write weather_df to CSV for downstream analysis: header row included, row
# index omitted.
# Excel alternative, currently disabled:
#   weather_df.to_excel("../output/clean_weather_source.xlsx", index=False, header=True)
weather_df.to_csv(path_or_buf="clean_weather_source.csv", header=True, index=False)

In [None]:
# Inspect weather_df.
# Lighter-weight alternatives, left disabled:
#   len(weather_df.index)
#   weather_df.dtypes
# info() prints its report itself; describe() is deliberately the final bare
# expression so its summary table becomes the cell's displayed output.
weather_df.info()
weather_df.describe()

In [None]:
# Path of the stations CSV, kept in a variable so it is set in one place.
station_file = "hawaii_stations.csv"
# Load the CSV with pandas (no date parsing is requested for this file), order
# the rows by the "station" column, and renumber the index from zero.
station_df = (
    pd.read_csv(station_file)
    .sort_values(by="station")
    .reset_index(drop=True)
)
# Show up to the first ten rows as the cell output.
station_df.head(n=10)

In [None]:
# Write station_df to CSV for downstream analysis: header row included, row
# index omitted.
# Excel alternative, currently disabled:
#   station_df.to_excel("../output/clean_station_source.xlsx", index=False, header=True)
station_df.to_csv(path_or_buf="clean_station_source.csv", header=True, index=False)

In [None]:
# Inspect station_df.
# Lighter-weight alternatives, left disabled:
#   len(station_df.index)
#   station_df.dtypes
# info() prints its report itself; describe() is deliberately the final bare
# expression so its summary table becomes the cell's displayed output.
station_df.info()
station_df.describe()

In [None]:
# Outer-join the station table with the measurement table on their shared
# "station" column, keep the listed columns in this fixed order, and sort the
# result by station and then date (both ascending).
# Optional display tweak, currently disabled:
#   pd.options.display.max_rows = 100
hiw_df = (
    station_df
    .merge(weather_df, how="outer", on="station", suffixes=("_x", "_y"))
    .loc[:, ["station", "name", "latitude", "longitude", "elevation", "date", "prcp", "tobs"]]
    .sort_values(by=["station", "date"], ascending=True)
)
# Preview the first five merged rows.
hiw_df.head(n=5)

In [None]:
# Write the merged hiw_df to CSV for downstream analysis: header row included,
# row index omitted.
# Excel alternative, currently disabled:
#   hiw_df.to_excel("../output/clean_hiw_merged.xlsx", index=False, header=True)
hiw_df.to_csv(path_or_buf="clean_hiw_merged.csv", header=True, index=False)

In [None]:
# Inspect the combined data set (hiw_df).
# Lighter-weight alternative, left disabled: len(hiw_df.index)
# info() prints its report itself; describe() is deliberately the final bare
# expression so its summary table becomes the cell's displayed output.
hiw_df.info()
hiw_df.describe()
# Another disabled alternative: hiw_df.dtypes