# Bonus: Temperature Analysis I

In [37]:
import pandas as pd
from datetime import datetime as dt

In [38]:
# "tobs" is "temperature observations"
# Load the raw measurements CSV (path is relative to this notebook) and preview the first rows
measurements_csv = '../Resources/hawaii_measurements.csv'
df = pd.read_csv(measurements_csv)
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [39]:
# Check the dtype of the 'date' column before any conversion.
# read_csv was called without date parsing, so 'date' is expected to show up
# as a plain object (string) column here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   station  19550 non-null  object 
 1   date     19550 non-null  object 
 2   prcp     18103 non-null  float64
 3   tobs     19550 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 611.1+ KB


In [40]:
# Parse the 'date' strings into real datetimes so the .dt accessor can be used on the column
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

# Confirm the conversion: 'date' should now be reported as datetime64[ns]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   station  19550 non-null  object        
 1   date     19550 non-null  datetime64[ns]
 2   prcp     18103 non-null  float64       
 3   tobs     19550 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 611.1+ KB


In [41]:
# Build a date-indexed version of the data.
# set_index returns a new frame, so df itself keeps 'date' as a regular column.
df_index_date = df.set_index(keys="date")
df_index_date.head()

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.0,63
2010-01-03,USC00519397,0.0,74
2010-01-04,USC00519397,0.0,76
2010-01-06,USC00519397,,73


In [42]:
# Build a version of the measurements without the 'date' column.
# reset_index(drop=True) guarantees a plain sequential 0..n-1 index, and
# drop(columns=...) removes the column. Both calls return new DataFrames, so df
# is left untouched and neither an explicit .copy() nor inplace=True is needed.
drop_date_df = df.reset_index(drop=True).drop(columns=['date'])
drop_date_df

Unnamed: 0,station,prcp,tobs
0,USC00519397,0.08,65
1,USC00519397,0.00,63
2,USC00519397,0.00,74
3,USC00519397,0.00,76
4,USC00519397,,73
...,...,...,...
19545,USC00516128,0.09,71
19546,USC00516128,,78
19547,USC00516128,0.56,76
19548,USC00516128,0.50,76


### Compare June and December data across all years 

In [43]:
from scipy import stats

In [44]:
# Average temperature observation ('tobs') per calendar month, pooled across all years.
# month_name() returns English month names by default, whereas strftime('%B') follows
# the process locale; the month lookups further down compare against the English
# names "June" and "December", so the labels must not depend on the locale.
# The grouping key keeps the name 'date', so the result is a Series indexed by month name.
month_avg_df = df.groupby(df['date'].dt.month_name())['tobs'].mean()
month_avg_df

date
April        72.357268
August       76.412454
December     71.041529
February     69.442236
January      68.726115
July         76.082408
June         74.944118
March        70.059067
May          73.680900
November     73.252688
October      75.391388
September    76.164865
Name: tobs, dtype: float64

In [45]:
# Turn the month-name index into a regular 'date' column so rows can be filtered by month.
# NOTE: run this cell only once per kernel session -- re-running it on the already-reset
# frame would add an extra 'index' column.
month_avg_df = month_avg_df.reset_index()
month_avg_df

Unnamed: 0,date,tobs
0,April,72.357268
1,August,76.412454
2,December,71.041529
3,February,69.442236
4,January,68.726115
5,July,76.082408
6,June,74.944118
7,March,70.059067
8,May,73.6809
9,November,73.252688


In [46]:
# Average June temperature: pick the 'June' row out of the monthly means
is_june = month_avg_df['date'] == "June"
june_avg = month_avg_df.loc[is_june, ['tobs']]
# Exactly one row is expected per month; take its scalar value
june_avg_tobs = june_avg['tobs'].iloc[0]
print(f"The average temperature for June across all available years in this sample was {june_avg_tobs}")


The average temperature for June across all available years in this sample was 74.94411764705882


In [47]:
# Average December temperature: pick the 'December' row out of the monthly means
is_december = month_avg_df['date'] == "December"
dec_avg = month_avg_df.loc[is_december, ['tobs']]
# Exactly one row is expected per month; take its scalar value
dec_avg_tobs = dec_avg['tobs'].iloc[0]
print(f"The average temperature for December across all available years in this sample was {dec_avg_tobs}")

The average temperature for December across all available years in this sample was 71.04152933421226


In [48]:
# Show the measurements frame again before filtering by month
# (the rendered output elides the middle rows)
df

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.00,63
2,USC00519397,2010-01-03,0.00,74
3,USC00519397,2010-01-04,0.00,76
4,USC00519397,2010-01-06,,73
...,...,...,...,...
19545,USC00516128,2017-08-19,0.09,71
19546,USC00516128,2017-08-20,,78
19547,USC00516128,2017-08-21,0.56,76
19548,USC00516128,2017-08-22,0.50,76


In [49]:
# Collect the raw temperature observations for the two months being compared.
# Step 1: select the June rows and the December rows (all years in the data).
# Step 2: pull each month's 'tobs' values into a plain list to feed the t-test.
# month_name() is used instead of strftime('%B') because it returns English month
# names by default regardless of the process locale, so the comparisons below
# cannot silently produce empty selections. The labels are computed once and reused.
month_names = df['date'].dt.month_name()

june_tobs_df = df.loc[month_names == 'June']
dec_tobs_df = df.loc[month_names == 'December']

june_tobs = june_tobs_df['tobs'].to_list()
dec_tobs = dec_tobs_df['tobs'].to_list()


In [50]:
# Run an independent (unpaired) two-sample t-test:
# ttest_ind compares the means of two separate samples
stats.ttest_ind(june_tobs, dec_tobs)

Ttest_indResult(statistic=31.60372399000329, pvalue=3.9025129038616655e-191)

### Analysis

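We are using an unpaired (independent two-sample) t-test, `stats.ttest_ind`, because the June and December temperatures are two separate collections of observations that are not matched one-to-one and need not contain the same number of values.
A paired t-test is used to compare means of the same group measured at different times, and it requires each June value to be paired with exactly one December value (for example, per-station, per-year monthly means); unpaired t-tests are used to compare means of different groups.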

Regarding results, we can see that the p-value (about 3.9e-191) is very close to 0 and of course much less than the typical reference value of 0.05. We can therefore reject the null hypothesis of equal means and affirm that (for the sample values observed) there is a statistically significant difference between temperature means in June (about 74.9) and December (about 71.0) in Hawaii.
