# Bonus: Temperature Analysis I

In [129]:
import pandas as pd
from datetime import datetime as dt

In [130]:
# "tobs" is "temperature observations"
df = pd.read_csv('Resources/hawaii_measurements.csv')
# Preview the first rows and the raw column dtypes (date is still a string here)
print(df.head(), df.dtypes, sep='\n')

       station        date  prcp  tobs
0  USC00519397  2010-01-01   2.0  18.3
1  USC00519397  2010-01-02   0.0  17.2
2  USC00519397  2010-01-03   0.0  23.3
3  USC00519397  2010-01-04   0.0  24.4
4  USC00519397  2010-01-06   NaN  22.8
station     object
date        object
prcp       float64
tobs       float64
dtype: object


In [131]:
# Parse the ISO-formatted date strings into datetime64 values
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# Show the first five rows followed by the updated dtypes
print(df.head(), df.dtypes, sep='\n')

       station       date  prcp  tobs
0  USC00519397 2010-01-01   2.0  18.3
1  USC00519397 2010-01-02   0.0  17.2
2  USC00519397 2010-01-03   0.0  23.3
3  USC00519397 2010-01-04   0.0  24.4
4  USC00519397 2010-01-06   NaN  22.8
station            object
date       datetime64[ns]
prcp              float64
tobs              float64
dtype: object


In [132]:
# Confirm the column dtypes after the datetime conversion
df.dtypes

station            object
date       datetime64[ns]
prcp              float64
tobs              float64
dtype: object

In [133]:
# Index the frame by date while keeping 'date' available as a regular column;
# set_index returns a new DataFrame, so df itself is left untouched
df_index = df.set_index('date', drop=False)
print(df_index.head())

                station       date  prcp  tobs
date                                          
2010-01-01  USC00519397 2010-01-01   2.0  18.3
2010-01-02  USC00519397 2010-01-02   0.0  17.2
2010-01-03  USC00519397 2010-01-03   0.0  23.3
2010-01-04  USC00519397 2010-01-04   0.0  24.4
2010-01-06  USC00519397 2010-01-06   NaN  22.8


In [134]:
# Drop the date column
# NOTE: drop() returns a new DataFrame and the result is not assigned here, so
# this cell only displays a copy without the column. df_index itself keeps its
# 'date' column (it is still visible in the outputs of later cells).
df_index.drop(columns='date')

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,2.0,18.3
2010-01-02,USC00519397,0.0,17.2
2010-01-03,USC00519397,0.0,23.3
2010-01-04,USC00519397,0.0,24.4
2010-01-06,USC00519397,,22.8
...,...,...,...
2017-08-19,USC00516128,2.3,21.7
2017-08-20,USC00516128,,25.6
2017-08-21,USC00516128,14.2,24.4
2017-08-22,USC00516128,12.7,24.4


### Compare June and December data across all years 

In [135]:
from scipy import stats

In [136]:
# Keep only the June observations (month 6) from every year,
# selecting on the datetime index that was built from the date column
june_data_df = df_index.loc[df_index.index.month == 6]
june_data_df

Unnamed: 0_level_0,station,date,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-06-01,USC00519397,2010-06-01,0.0,25.6
2010-06-02,USC00519397,2010-06-02,0.3,24.4
2010-06-03,USC00519397,2010-06-03,0.0,25.6
2010-06-04,USC00519397,2010-06-04,0.0,24.4
2010-06-05,USC00519397,2010-06-05,0.0,25.0
...,...,...,...,...
2017-06-26,USC00516128,2017-06-26,0.5,26.1
2017-06-27,USC00516128,2017-06-27,2.5,23.3
2017-06-28,USC00516128,2017-06-28,0.5,23.3
2017-06-29,USC00516128,2017-06-29,1.0,24.4


In [137]:
# Keep only the December observations (month 12) from every year,
# selecting on the datetime index that was built from the date column
december_data_df = df_index.loc[df_index.index.month == 12]
december_data_df

Unnamed: 0_level_0,station,date,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-12-01,USC00519397,2010-12-01,1.0,24.4
2010-12-03,USC00519397,2010-12-03,0.0,23.3
2010-12-04,USC00519397,2010-12-04,0.0,23.3
2010-12-06,USC00519397,2010-12-06,0.0,17.8
2010-12-07,USC00519397,2010-12-07,0.0,17.8
...,...,...,...,...
2016-12-27,USC00516128,2016-12-27,3.6,21.7
2016-12-28,USC00516128,2016-12-28,3.6,21.7
2016-12-29,USC00516128,2016-12-29,26.2,20.6
2016-12-30,USC00516128,2016-12-30,60.2,18.3


In [138]:
# Mean of all June temperature observations (tobs)
june_temp_average = june_data_df["tobs"].mean()
print(june_temp_average)

23.857705882352818


In [139]:
# Mean of all December temperature observations (tobs)
december_temp_average = december_data_df["tobs"].mean()
print(december_temp_average)

21.690507580751465


In [140]:
# Collect the June temperature observations as a Series for the t-test
june_data = june_data_df["tobs"]
june_data

date
2010-06-01    25.6
2010-06-02    24.4
2010-06-03    25.6
2010-06-04    24.4
2010-06-05    25.0
              ... 
2017-06-26    26.1
2017-06-27    23.3
2017-06-28    23.3
2017-06-29    24.4
2017-06-30    23.9
Name: tobs, Length: 1700, dtype: float64

In [141]:
# Collect the December temperature observations as a Series for the t-test
december_data = december_data_df["tobs"]
december_data

date
2010-12-01    24.4
2010-12-03    23.3
2010-12-04    23.3
2010-12-06    17.8
2010-12-07    17.8
              ... 
2016-12-27    21.7
2016-12-28    21.7
2016-12-29    20.6
2016-12-30    18.3
2016-12-31    18.3
Name: tobs, Length: 1517, dtype: float64

In [143]:
# Run an unpaired (independent) two-sample t-test on the June and December
# temperatures; stats.ttest_ind is the independent-samples test, not a paired one.
stats.ttest_ind(june_data, december_data)

Ttest_indResult(statistic=31.59056548087541, pvalue=5.361211725471411e-191)

### Analysis

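The difference between the mean temperatures for June (approx. 23.9 degrees Celsius) and December (approx. 21.7 degrees Celsius) is approx. 2.2 degrees Celsius. Although the t-test shows that this difference is statistically significant, it is small in practical terms, indicating that Hawaii does indeed have mild weather all year round.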

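I have chosen an unpaired (independent) t-test because we are comparing the means of two different groups of temperatures (June and December). The groups contain different numbers of observations (1700 vs. 1517), so the values cannot be matched pair by pair. The very low p-value indicates a statistically significant difference between the two means.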