# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
from datetime import datetime as dt

In [2]:
# "tobs" is "temperature observations"
# Load the Hawaii measurements CSV into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer='Resources/hawaii_measurements.csv')
df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [3]:
# Goal of the next cells: convert the date column from string to datetime.
#-----------------------------------------------------------
# First inspect the current dtypes; "date" shows up as object (string) here.
df.dtypes

station     object
date        object
prcp       float64
tobs         int64
dtype: object

In [4]:
# Parse the date strings (year-first layout) into datetime values,
# then re-check the dtypes to confirm the conversion took effect.
df['date'] = pd.to_datetime(df['date'], yearfirst=True)
df.dtypes

station            object
date       datetime64[ns]
prcp              float64
tobs                int64
dtype: object

In [5]:
# Add a "month" column holding the abbreviated month name (e.g. 'Jan'),
# used later to filter rows by month.
# NOTE(review): '%b' follows the active locale; the filters below assume English names.
df['month'] = df.date.dt.strftime('%b')

In [6]:
# Use the dates as the DataFrame index.
# The keys are passed as a Series rather than a column label, so the
# "date" column itself is still present afterwards (visible in the preview).
df = df.set_index(keys=df['date'])
df.head()

Unnamed: 0_level_0,station,date,prcp,tobs,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-01,USC00519397,2010-01-01,0.08,65,Jan
2010-01-02,USC00519397,2010-01-02,0.0,63,Jan
2010-01-03,USC00519397,2010-01-03,0.0,74,Jan
2010-01-04,USC00519397,2010-01-04,0.0,76,Jan
2010-01-06,USC00519397,2010-01-06,,73,Jan


In [7]:
# Drop the now-redundant date column (the dates already live in the index).
# Use the explicit `columns=` keyword: passing the axis positionally
# (`df.drop('date', 1)`) was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.
df = df.drop(columns='date')
df

Unnamed: 0_level_0,station,prcp,tobs,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,USC00519397,0.08,65,Jan
2010-01-02,USC00519397,0.00,63,Jan
2010-01-03,USC00519397,0.00,74,Jan
2010-01-04,USC00519397,0.00,76,Jan
2010-01-06,USC00519397,,73,Jan
...,...,...,...,...
2017-08-19,USC00516128,0.09,71,Aug
2017-08-20,USC00516128,,78,Aug
2017-08-21,USC00516128,0.56,76,Aug
2017-08-22,USC00516128,0.50,76,Aug


### Compare June and December data across all years 

In [8]:
from scipy import stats

In [15]:
# Keep only the rows whose month is June (the filter is on month alone,
# so every year in the data is included).
june_df = df.loc[df['month'] == 'Jun']
june_df.head()

Unnamed: 0_level_0,station,prcp,tobs,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-06-01,USC00519397,0.0,78,Jun
2010-06-02,USC00519397,0.01,76,Jun
2010-06-03,USC00519397,0.0,78,Jun
2010-06-04,USC00519397,0.0,76,Jun
2010-06-05,USC00519397,0.0,77,Jun


In [28]:
# Keep only the rows whose month is December.
decmbr_df = df.loc[df['month'] == 'Dec']
decmbr_df.head()

Unnamed: 0_level_0,station,prcp,tobs,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-12-01,USC00519397,0.04,76,Dec
2010-12-03,USC00519397,0.0,74,Dec
2010-12-04,USC00519397,0.0,74,Dec
2010-12-06,USC00519397,0.0,64,Dec
2010-12-07,USC00519397,0.0,64,Dec


In [18]:
# Mean of all June temperature observations ("tobs").
avgJune_temp = june_df.tobs.mean()
avgJune_temp

74.94411764705882

In [19]:
# Mean of all December temperature observations ("tobs").
avgDec_temp = decmbr_df.tobs.mean()
avgDec_temp

71.04152933421226

In [26]:
# Combine the June and December temperature observations into one Series
# (June rows first, then December, original date index preserved).
# `Series.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` is the supported equivalent.
temp_collection = pd.concat([june_df.tobs, decmbr_df.tobs])
temp_collection

date
2010-06-01    78
2010-06-02    76
2010-06-03    78
2010-06-04    76
2010-06-05    77
              ..
2016-12-27    71
2016-12-28    71
2016-12-29    69
2016-12-30    65
2016-12-31    65
Name: tobs, Length: 3217, dtype: int64

In [21]:
# Run an unpaired (independent two-sample) t-test: stats.ttest_ind compares
# the means of two independent samples. A paired test would be stats.ttest_rel.
stats.ttest_ind(june_df.tobs, decmbr_df.tobs)

Ttest_indResult(statistic=31.60372399000329, pvalue=3.9025129038616655e-191)

### Analysis

 ##### Is there a meaningful difference between the temperature in, for example, June and December?
     - With a p-value of about 3.9e-191 (far below 0.05), the observed difference is very unlikely to be due to random 
       chance. We reject the null hypothesis, which states that there is no significant difference in temperature 
       values between the two months.

     - The average temperatures (about 74.9 in June and 71.0 in December) are only about 4 degrees apart, so although 
       the difference is statistically significant, both months have pleasant temperatures for tourism.
     
     

#### Will you use a paired t-test, or an unpaired t-test? Why?
     - An unpaired (independent) t-test is best suited for this analysis, and it is what `stats.ttest_ind` above runs.

     - According to https://www.technologynetworks.com/, a paired t-test is designed to compare the means of the same 
       group or item under two separate scenarios. An unpaired t-test compares the means of two independent or 
       unrelated groups.

     - The June and December samples contain different numbers of observations, so they cannot be matched into 
       one-to-one pairs; the unpaired t-test is therefore the appropriate fit.