In [2]:
#Dependencies

import datetime as dt
import os

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.figure import Figure
# Seed NumPy's global RNG so random draws are repeatable between runs.
# The seed is the sum of the character codes of "aesthetics" (= 1069).
np.random.seed(sum(ord(ch) for ch in "aesthetics"))


In [3]:
# Path to the raw measurements CSV
weather_file = "hawaii_measurements.csv"
# Load the measurements with pandas, converting the "date" column to datetime
weather_df = pd.read_csv(
    weather_file,
    parse_dates=["date"],
)
# Preview the first five rows
weather_df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [4]:
# Number of measurement rows before any cleaning
print(weather_df.shape[0])

19550


# Data cleansing notes on removed records:

### Stations with 2010 - 2017 data
    KANEOHE 838.1, HI US
    KUALOA RANCH HEADQUARTERS 886.9, HI US
    MANOA LYON ARBO 785.2, HI US
    PEARL CITY, HI US
    WAIHEE 837.5, HI US
    WAIKIKI 717.2, HI US
    WAIMANALO EXPERIMENTAL FARM, HI US

### Stations with 2010 - 2015 data
    HONOLULU OBSERVATORY 702.2, HI US
    UPPER WAHIAWA 874.3, HI US

### 8 out of 9 stations have blank prcp values for a total of 1447 records with blanks.
    USC00511918	  47 blank prcp
    USC00513117	  13 blank prcp
    USC00514830	  265 blank prcp
    USC00516128	 128 blank prcp
    USC00517948	 689 blank prcp
    USC00518838	169 blank prcp
    USC00519397	39 blank prcp
    USC00519523	97 blank prcp

### The missing data is spread out over the years of data, so removing these records doesn't seem to be a large concern; it would be more of a concern if the missing data were concentrated in a single station or year.

### Measurement Records Information
    19,550 total records before changes
    1,447 records with blank precipitation (col = "prcp")
    18,103 records with no known issues after removing 1,447 with blank prcp
    Percentage of records removed is 7.4%
    Percentage of records remaining is 92.6%


In [5]:
# Remove every row that has a NaN in any column, then report how many rows
# remain and preview the result.
weather_df = weather_df.dropna(axis="index", how="any")
print(weather_df.shape[0])
weather_df.head(n=5)

18103


Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
5,USC00519397,2010-01-07,0.06,70


In [6]:
# Order the measurements by station, then by date (both ascending), and
# renumber the index from 0 so it follows the new row order.
weather_df = (
    weather_df
    .sort_values(by=["station", "date"], ascending=True)
    .reset_index(drop=True)
)

In [7]:
# Preview the first five rows after sorting
weather_df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00511918,2010-01-01,0.05,66
1,USC00511918,2010-01-02,0.0,70
2,USC00511918,2010-01-03,0.0,75
3,USC00511918,2010-01-04,0.0,75
4,USC00511918,2010-01-05,0.0,75


In [8]:
# Persist the cleaned measurements for later analysis: CSV with a header row
# and without the DataFrame index.
# Excel alternative (disabled):
# weather_df.to_excel("../output/clean_weather_source.xlsx", index=False, header=True)
weather_df.to_csv(
    "clean_weather_source.csv",
    header=True,
    index=False,
)

In [9]:
# Analyze the data
# Other quick checks that can be swapped in here:
# len(weather_df.index)
# weather_df.dtypes
# Print the column dtypes, non-null counts and memory usage.
weather_df.info()
# Summary statistics for the numeric columns; as the cell's last expression
# this is the value that gets displayed.
weather_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18103 entries, 0 to 18102
Data columns (total 4 columns):
station    18103 non-null object
date       18103 non-null datetime64[ns]
prcp       18103 non-null float64
tobs       18103 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 565.8+ KB


Unnamed: 0,prcp,tobs
count,18103.0,18103.0
mean,0.160644,72.994863
std,0.468746,4.512107
min,0.0,53.0
25%,0.0,70.0
50%,0.01,73.0
75%,0.11,76.0
max,11.53,87.0


In [10]:
# Path to the station metadata CSV
station_file = "hawaii_stations.csv"
# Load the station file (it has no date column, so nothing is parsed as
# datetime), order the rows by station id and renumber the index from 0.
station_df = (
    pd.read_csv(station_file)
    .sort_values(by="station")
    .reset_index(drop=True)
)
# Display up to ten rows
station_df.head(n=10)

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00516128,"MANOA LYON ARBO 785.2, HI US",21.3331,-157.8025,152.4
4,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
5,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6
6,USC00519281,"WAIHEE 837.5, HI US",21.45167,-157.84889,32.9
7,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
8,USC00519523,"WAIMANALO EXPERIMENTAL FARM, HI US",21.33556,-157.71139,19.5


In [11]:
# Persist the sorted station table for later analysis: CSV with a header row
# and without the DataFrame index.
# Excel alternative (disabled):
# station_df.to_excel("../output/clean_station_source.xlsx", index=False, header=True)
station_df.to_csv(
    "clean_station_source.csv",
    header=True,
    index=False,
)

In [12]:
# Analyze the data
# Other quick checks that can be swapped in here:
# len(station_df.index)
# station_df.dtypes
# Print the column dtypes, non-null counts and memory usage.
station_df.info()
# Summary statistics for the numeric columns (latitude, longitude, elevation);
# as the cell's last expression this is the value that gets displayed.
station_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 5 columns):
station      9 non-null object
name         9 non-null object
latitude     9 non-null float64
longitude    9 non-null float64
elevation    9 non-null float64
dtypes: float64(3), object(2)
memory usage: 440.0+ bytes


Unnamed: 0,latitude,longitude,elevation
count,9.0,9.0,9.0
mean,21.393826,-157.867098,60.977778
std,0.086442,0.103873,103.465547
min,21.2716,-158.0111,0.9
25%,21.3331,-157.9751,7.0
50%,21.3934,-157.8374,14.6
75%,21.45167,-157.8025,32.9
max,21.5213,-157.71139,306.6


In [13]:
# Join the station metadata onto the measurements through the shared "station"
# key. The outer join keeps keys that appear in only one of the two frames.
# Optional display tweak (disabled): pd.options.display.max_rows = 100
hiw_df = (
    pd.merge(station_df, weather_df, how="outer", on="station")
    # Fix the column order: station attributes first, then the measurements.
    .loc[:, ["station", "name", "latitude", "longitude", "elevation", "date", "prcp", "tobs"]]
    # Order by station, then date; the index is intentionally not reset here.
    .sort_values(by=["station", "date"], ascending=True)
)
hiw_df.head(n=5)

Unnamed: 0,station,name,latitude,longitude,elevation,date,prcp,tobs
0,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9,2010-01-01,0.05,66
1,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9,2010-01-02,0.0,70
2,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9,2010-01-03,0.0,75
3,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9,2010-01-04,0.0,75
4,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9,2010-01-05,0.0,75


In [14]:
# For data analysis: export the merged frame as CSV, without the index and
# with a header row.
# Excel alternative (disabled):
# hiw_df.to_excel("../output/clean_hiw_merged.xlsx", index=False, header=True)
# DataFrame.to_csv does not create missing parent directories, so make sure
# "../output" exists first; without it the export raises when the folder is
# absent. exist_ok=True makes this a no-op when the folder is already there.
os.makedirs("../output", exist_ok=True)
hiw_df.to_csv("../output/clean_hiw_merged.csv", index=False, header=True)

In [15]:
# Analyze combined data set
# Other quick checks that can be swapped in here:
# len(hiw_df.index)
# Print the column dtypes, non-null counts and memory usage of the merged frame.
hiw_df.info()
# Summary statistics for the numeric columns of the merged frame.
hiw_df.describe()
# hiw_df.dtypes

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18103 entries, 0 to 18102
Data columns (total 8 columns):
station      18103 non-null object
name         18103 non-null object
latitude     18103 non-null float64
longitude    18103 non-null float64
elevation    18103 non-null float64
date         18103 non-null datetime64[ns]
prcp         18103 non-null float64
tobs         18103 non-null int64
dtypes: datetime64[ns](1), float64(4), int64(1), object(2)
memory usage: 1.2+ MB


Unnamed: 0,latitude,longitude,elevation,prcp,tobs
count,18103.0,18103.0,18103.0,18103.0,18103.0
mean,21.379572,-157.83381,38.425327,0.160644,72.994863
std,0.07907,0.082249,61.237607,0.468746,4.512107
min,21.2716,-158.0111,0.9,0.0,53.0
25%,21.3152,-157.84889,3.0,0.0,70.0
50%,21.33556,-157.8168,14.6,0.01,73.0
75%,21.45167,-157.8015,32.9,0.11,76.0
max,21.5213,-157.71139,306.6,11.53,87.0
