# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
from datetime import datetime as dt

In [2]:
# "tobs" is "temperature observations"
# Load the raw measurements CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Resources/hawaii_measurements.csv')
df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [3]:
# Parse the date strings into datetime values, then print the frame and its
# column dtypes to confirm the conversion took effect.
df = df.assign(date=pd.to_datetime(df['date']))

print(df)
print(df.dtypes)

           station       date  prcp  tobs
0      USC00519397 2010-01-01  0.08    65
1      USC00519397 2010-01-02  0.00    63
2      USC00519397 2010-01-03  0.00    74
3      USC00519397 2010-01-04  0.00    76
4      USC00519397 2010-01-06   NaN    73
...            ...        ...   ...   ...
19545  USC00516128 2017-08-19  0.09    71
19546  USC00516128 2017-08-20   NaN    78
19547  USC00516128 2017-08-21  0.56    76
19548  USC00516128 2017-08-22  0.50    76
19549  USC00516128 2017-08-23  0.45    76

[19550 rows x 4 columns]
station            object
date       datetime64[ns]
prcp              float64
tobs                int64
dtype: object


In [4]:
# Promote the date column to the DataFrame index. The result is assigned back
# to `df` instead of mutating with inplace=True, so the rebinding is explicit.
df = df.set_index('date')
df

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,,73
...,...,...,...
2017-08-19,USC00516128,0.09,71
2017-08-20,USC00516128,,78
2017-08-21,USC00516128,0.56,76
2017-08-22,USC00516128,0.50,76


In [5]:
# Drop the date column: reset_index(drop=True) restores the default integer
# index and discards the date index instead of writing it back as a column.
# Ref: https://stackoverflow.com/questions/20107570/removing-index-column-in-pandas-when-reading-a-csv
df = df.reset_index(drop=True)
df

Unnamed: 0,station,prcp,tobs
0,USC00519397,0.08,65
1,USC00519397,0.00,63
2,USC00519397,0.00,74
3,USC00519397,0.00,76
4,USC00519397,,73
...,...,...,...
19545,USC00516128,0.09,71
19546,USC00516128,,78
19547,USC00516128,0.56,76
19548,USC00516128,0.50,76


### Compare June and December data across all years 

In [6]:
# Dependencies
import scipy
from scipy import stats

In [7]:
# Re-read the measurements and reduce each date to a monthly period
# (year-month granularity) in a single chained expression.
# Ref: https://www.interviewqs.com/ddi-code-snippets/extract-month-year-pandas
df1 = pd.read_csv('Resources/hawaii_measurements.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date']).dt.to_period('M')
)

df1.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01,0.08,65
1,USC00519397,2010-01,0.0,63
2,USC00519397,2010-01,0.0,74
3,USC00519397,2010-01,0.0,76
4,USC00519397,2010-01,,73


In [8]:
# Drop unnecessary columns: only the date and tobs columns are used from here on.
df1 = df1.drop(columns=['station', 'prcp'])
df1.head()

Unnamed: 0,date,tobs
0,2010-01,65
1,2010-01,63
2,2010-01,74
3,2010-01,76
4,2010-01,73


In [9]:
# Add a numeric Month column taken from the period-valued date column, then
# rename the original columns to Date / Temperature.
# Ref: https://stackoverflow.com/questions/25146121/extracting-just-month-and-year-separately-from-pandas-datetime-column
df1 = (
    df1
    .assign(Month=df1['date'].dt.month)
    .rename(columns={"date": "Date", "tobs": "Temperature"})
)
df1

Unnamed: 0,Date,Temperature,Month
0,2010-01,65,1
1,2010-01,63,1
2,2010-01,74,1
3,2010-01,76,1
4,2010-01,73,1
...,...,...,...
19545,2017-08,71,8
19546,2017-08,78,8
19547,2017-08,76,8
19548,2017-08,76,8


In [10]:
# Keep only the rows whose Month is June (6) or December (12).
df2 = df1[df1["Month"].isin([6, 12])]
df2

Unnamed: 0,Date,Temperature,Month
133,2010-06,78,6
134,2010-06,76,6
135,2010-06,78,6
136,2010-06,76,6
137,2010-06,77,6
...,...,...,...
19492,2017-06,79,6
19493,2017-06,74,6
19494,2017-06,74,6
19495,2017-06,76,6


In [11]:
# Average temperature per month (June vs. December), rounded to one decimal.
(
    df2
    .groupby('Month')['Temperature']
    .agg('mean')
    .round(1)
)

Month
6     74.9
12    71.0
Name: Temperature, dtype: float64

In [12]:
# Run an independent two-sample t-test (stats.ttest_ind) on the June and
# December temperatures. NOTE: this is NOT a paired test, despite the earlier
# wording; a paired test (stats.ttest_rel) needs matched samples of equal
# length -- TODO confirm whether the June/December rows can be paired that way.
# ref:https://stackoverflow.com/questions/13404468/t-test-in-pandas
June = df2[df2['Month']==6]
Dec = df2[df2['Month']==12]

stats.ttest_ind(June['Temperature'], Dec['Temperature'])

Ttest_indResult(statistic=31.60372399000329, pvalue=3.9025129038616655e-191)

### Analysis