# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
from datetime import datetime as dt

In [3]:
# "tobs" is "temperature observations"
df = pd.read_csv('Resources/hawaii_measurements.csv')
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [4]:
# Convert the date column format from string to datetime
datetime_format = "%Y-%m-%d"
df["date"] = pd.to_datetime(df["date"], format=datetime_format)

In [5]:
# Set the date column as the DataFrame index
df.set_index("date", inplace = True)

In [6]:
# Drop the date column
if "prcp" in df.columns:
    df.drop("prcp", axis=1, inplace=True)
df

Unnamed: 0_level_0,station,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,USC00519397,65
2010-01-02,USC00519397,63
2010-01-03,USC00519397,74
2010-01-04,USC00519397,76
2010-01-06,USC00519397,73
...,...,...
2017-08-19,USC00516128,71
2017-08-20,USC00516128,78
2017-08-21,USC00516128,76
2017-08-22,USC00516128,76


### Compare June and December data across all years 

In [7]:
from scipy import stats

In [8]:
# Filter data for desired months June
june_df = df.filter(regex ="\d\d\d\d-06-\d\d", axis=0).sort_index()
june_df

Unnamed: 0_level_0,station,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-06-01,USC00519397,78
2010-06-01,USC00514830,73
2010-06-01,USC00517948,77
2010-06-01,USC00518838,69
2010-06-01,USC00519523,76
...,...,...
2017-06-30,USC00519281,76
2017-06-30,USC00519397,75
2017-06-30,USC00519523,75
2017-06-30,USC00513117,74


In [10]:
# Identify the average temperature for June
def get_year_ranges(data):
    min_date = min(data.index)
    max_date = max(data.index)
    return list(range(min_date.year, max_date.year))


all_june_data = {}
for year in get_year_ranges(june_df):
    key = str(year)
    value = june_df.filter(regex = f"{year}-\d\d-", axis=0)["tobs"]
    all_june_data[key] = value
    print(f"year: {year}, mean: {round(value.mean(), 2)}")

year: 2010, mean: 74.93
year: 2011, mean: 73.94
year: 2012, mean: 74.0
year: 2013, mean: 74.6
year: 2014, mean: 75.03
year: 2015, mean: 74.99
year: 2016, mean: 75.18


In [15]:
# Identify the average temperature for December
dec_df = df.filter(regex ="\d\d\d\d-12-\d\d", axis=0).sort_index()

all_dec_data = {}
for year in get_year_ranges(dec_df):
    key = str(year)
    value = dec_df.filter(regex = f"{year}-\d\d-", axis=0)["tobs"]
    all_dec_data[key] = value
    print(f"year: {year}, mean: {round(value.mean(), 2)}")

year: 2010, mean: 70.21
year: 2011, mean: 70.82
year: 2012, mean: 71.19
year: 2013, mean: 71.09
year: 2014, mean: 69.9
year: 2015, mean: 73.42


In [None]:
# Create collections of temperature data

In [16]:
# Run paired t-test
t_value, p_value = stats.ttest_ind(june_df["tobs"], dec_df["tobs"])
print(f"t_value {t_value}")
print(f"p_value {p_value}")

t_value 31.60372399000329
p_value 3.9025129038616655e-191


### Analysis