# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
from datetime import datetime as dt
from scipy import stats
from scipy.stats import ttest_ind
import numpy as np
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func


In [2]:
# "tobs" is "temperature observations"
measurements_csv = 'Resources/hawaii_measurements.csv'
df = pd.read_csv(measurements_csv)
# Preview the first rows to confirm the file loaded
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [3]:
# Parse the string values in the date column into pandas datetimes
# https://www.geeksforgeeks.org/convert-the-column-type-from-string-to-datetime-format-in-pandas-dataframe/
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [4]:
# Use the date column as the DataFrame index; set_index removes it from the
# regular columns by default (drop=True).
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.set_index.html
# Guarded and non-inplace so the cell can safely be re-run: once 'date' is
# already the index, calling set_index('date') again would raise a KeyError.
if 'date' in df.columns:
    df = df.set_index('date')
df.head()

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.0,63
2010-01-03,USC00519397,0.0,74
2010-01-04,USC00519397,0.0,76
2010-01-06,USC00519397,,73


### Compare June and December data across all years 

In [5]:
# Prepare the SQL side of the analysis.
# Open an engine on the SQLite database file.
database_url = "sqlite:///hawaii.sqlite"
engine = create_engine(database_url)
# Reflect the existing tables into automapped classes.
# NOTE(review): newer SQLAlchemy releases deprecate `reflect=True` in favour of
# `autoload_with=engine` -- confirm the installed version before changing this call.
Base = automap_base()
Base.prepare(engine, reflect=True)
# Show the names of the reflected classes
Base.classes.keys()

['measurement', 'station']

In [6]:
# Bind the two reflected classes to short names for use in queries
Measurement, Station = Base.classes.measurement, Base.classes.station

In [7]:
# Open an ORM session bound to the engine
session = Session(bind=engine)

In [8]:
# June
# Select (date, tobs) for every measurement whose month component is "06",
# regardless of year.
june = session.query(Measurement.date, Measurement.tobs).filter(
    func.strftime("%m", Measurement.date) == "06"
)
# Materialize the rows and name the columns explicitly, so the frame always has
# 'date' and 'tobs' columns -- including when the query returns no rows, and
# without relying on pandas inferring names from the row objects.
june_df = pd.DataFrame(june.all(), columns=["date", "tobs"])
# Preview the first rows
june_df.head()


Unnamed: 0,date,tobs
0,2010-06-01,78.0
1,2010-06-02,76.0
2,2010-06-03,78.0
3,2010-06-04,76.0
4,2010-06-05,77.0


In [9]:
# December
# Select (date, tobs) for every measurement whose month component is "12",
# regardless of year.
dec = session.query(Measurement.date, Measurement.tobs).filter(
    func.strftime("%m", Measurement.date) == "12"
)
# Materialize the rows and name the columns explicitly, so the frame always has
# 'date' and 'tobs' columns -- including when the query returns no rows, and
# without relying on pandas inferring names from the row objects.
dec_df = pd.DataFrame(dec.all(), columns=["date", "tobs"])
# Preview the first rows
dec_df.head()

Unnamed: 0,date,tobs
0,2010-12-01,76.0
1,2010-12-03,74.0
2,2010-12-04,74.0
3,2010-12-06,64.0
4,2010-12-07,64.0


In [10]:
# Mean of the June temperature observations
june_avg_temp = june_df.loc[:, "tobs"].mean()
june_avg_temp

74.94411764705882

In [11]:
# Mean of the December temperature observations
dec_avg_temp = dec_df.loc[:, "tobs"].mean()
dec_avg_temp

71.04152933421226

In [12]:
# Pull out each month's temperature observations as a standalone Series
june_collection, dec_collection = june_df['tobs'], dec_df['tobs']


In [13]:
# Run an unpaired (independent two-sample) t-test.
# ttest_ind compares the means of two independent groups; it is not a paired
# test. The June and December rows come from separate queries and are not
# matched up observation-by-observation, so a paired test does not apply.
# equal_var=True is scipy's default (Student's t-test); it is spelled out here
# so the equal-variance assumption is visible to the reader.
ttest_ind(june_df['tobs'], dec_df['tobs'], equal_var=True)

Ttest_indResult(statistic=31.60372399000329, pvalue=3.9025129038616655e-191)

### Analysis

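We use an unpaired (independent two-sample) t-test to identify potential differences between June and December temperatures, because the June and December observations are separate groups rather than matched pairs. With a p-value of 3.902e-191, the difference between the June and December average temperatures is statistically significant.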