In [14]:
# statistical analysis of temperature data, QuantPy on YouTube
# heating degree days (HDD), cooling degree days (CDD) "average" Temperature = Tmax + Tmin / 2

# KANSAS CITY ?, 1889-01-01 to 12/31/1933
# KANSAS CITY DOWNTOWN AIRPORT, 1/1/1934 to 9/30/1972
# KANSAS CITY INTL AIRPORT, 10/1/1972 to 12/31/2021

# Tmax, Tmin, Precipitation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

kansas_city = pd.read_csv("data/USW00003947.csv")
st_louis = pd.read_csv("data/USW00013994.csv")
bhm = pd.read_csv("data/USW00013876.csv")

### Check for missing data

In [15]:
### Checking for missing max and min temperatures #was 63 and 56 with 54 misaligned; restricted to KCI there are 0
max_temp = kansas_city[["Date","tmax"]]
min_temp = kansas_city[["Date","tmin"]]
print(max_temp.isnull().value_counts())
print(min_temp.isnull().value_counts())

count = 0
for mx, mn in zip(np.where(max_temp.isnull())[0], np.where(min_temp.isnull())[0]):
    if mx != mn:
        count += 1
print('\nNumber of misaligned null values equals', count)
###

Date   tmax 
False  False    48514
       True        63
Name: count, dtype: int64
Date   tmin 
False  False    48521
       True        56
Name: count, dtype: int64

Number of misaligned null values equals 54


### Calculate average temps and drop missing values

In [16]:
kansas_city["Date"] = pd.to_datetime(kansas_city["Date"]) #Thanks skbrimmer!
kansas_city.set_index("Date", inplace=True)
kc_temps = kansas_city[["tmax", "tmin"]]

def avg_temp(row):
    return (row.tmax+row.tmin)/2

kc_temps["Tavg"] = kc_temps.apply(avg_temp,axis=1)
#drop na values here
kc_temps = kc_temps.dropna()
print("Kansas City")
print(kc_temps)
# print(kc_temps.describe())

Kansas City
            tmax  tmin   Tavg
Date                         
1889-01-01  44.0  19.0  31.50
1889-01-02  48.0  28.0  38.00
1889-01-03  52.0  33.0  42.50
1889-01-04  42.0  31.0  36.50
1889-01-05  30.0  25.0  27.50
...          ...   ...    ...
2021-12-27  64.0  39.0  51.50
2021-12-28  57.0  23.2  40.10
2021-12-29  33.1  19.2  26.15
2021-12-30  44.1  28.2  36.15
2021-12-31  54.0  30.9  42.45

[48461 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kc_temps["Tavg"] = kc_temps.apply(avg_temp,axis=1)


In [18]:
st_louis["Date"] = pd.to_datetime(st_louis["Date"]) #Thanks skbrimmer!
st_louis.set_index("Date", inplace=True)
stl_temps = st_louis[["tmax", "tmin"]]

stl_temps["Tavg"] = stl_temps.apply(avg_temp,axis=1)
#drop na values here
stl_temps = stl_temps.dropna()
print("St Louis")
print(stl_temps)
# print(stl_temps.describe())

St Louis
            tmax  tmin   Tavg
Date                         
1893-01-01  32.0  26.0  29.00
1893-01-04  34.0  18.0  26.00
1893-01-05  37.0  14.0  25.50
1893-01-06  23.0   4.0  13.50
1893-01-07  34.0  22.0  28.00
...          ...   ...    ...
2021-12-27  64.0  44.1  54.05
2021-12-28  46.9  39.0  42.95
2021-12-29  44.1  35.1  39.60
2021-12-30  50.0  37.0  43.50
2021-12-31  68.0  41.0  54.50

[46989 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stl_temps["Tavg"] = stl_temps.apply(avg_temp,axis=1)


In [19]:
bhm["Date"] = pd.to_datetime(bhm["Date"]) #Thanks skbrimmer!
bhm.set_index("Date", inplace=True)
bhm_temps = bhm[["tmax", "tmin"]]

bhm_temps["Tavg"] = bhm_temps.apply(avg_temp,axis=1)
#drop na values here
bhm_temps = bhm_temps.dropna()
print("Birmingham")
print(bhm_temps)
# print(bhm_temps.describe())

Birmingham
            tmax  tmin   Tavg
Date                         
1896-01-01  47.0  26.0  36.50
1896-01-02  56.0  33.0  44.50
1896-01-03  56.0  30.0  43.00
1896-01-04  34.0  14.0  24.00
1896-01-05  44.0  15.0  29.50
...          ...   ...    ...
2021-12-27  73.9  64.9  69.40
2021-12-28  75.9  64.0  69.95
2021-12-29  77.0  66.0  71.50
2021-12-30  75.0  55.9  65.45
2021-12-31  77.0  55.9  66.45

[36385 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bhm_temps["Tavg"] = bhm_temps.apply(avg_temp,axis=1)


### Set start date to 1 Jan 1896 for KC and STL, join dataframes

In [20]:
kc_temps_1896 = kc_temps[kc_temps["Date"] > "1896-01-01"]
# stl_temps_1896 = stl_temps[stl_temps["Date"] > "2009-12-31"]
# tricity_temps = kc_temps_1896.join([stl_temps_1896, bhm_temps])

KeyError: 'Date'

### Visually explore data

In [None]:
kc_temps[-5000:].plot(figsize=(8,6))
plt.show()

### Distributions

In [None]:
plt.figure(figsize=(8,6))
kc_temps.tmin.hist(bins=60, alpha=0.6, label="Tmin")
kc_temps.tmax.hist(bins=60, alpha=0.6, label="Tmax")
kc_temps["Tavg"].hist(bins=60, alpha=0.6, label="Tavg")
plt.legend()
plt.show()

### Summer and Winter

In [None]:
plt.figure(figsize=(8,6))
kc_temps_season[kc_temps_season["winter"] == 1]["Tavg"].hist(bins=60, alpha=0.8, label="winter")
kc_temps_season[kc_temps_season["summer"] == 1]["Tavg"].hist(bins=60, alpha=0.8, label="summer")
plt.legend()
plt.show()

### Investigate temperature records

In [None]:
# resample by month start, calculate mins and maxes for tmax, tmin and Tavg
date_list = kc_temps.index.tolist()
mth_kc_temps = pd.DataFrame(data=date_list, index=date_list).resample("MS")[0].agg([min,max])
mth_kc_temps["month"] = mth_kc_temps.index.month
def min_max_temps(row):
    stats = kc_temps[(kc_temps.index >= row["min"]) & (kc_temps.index <= row["max"])].agg([min, max])
    row["tmax_max"] = stats.loc["max", "tmax"]
    row["tmax_min"] = stats.loc["min", "tmax"]
    row["tmin_max"] = stats.loc["max", "tmin"]
    row["tmin_min"] = stats.loc["min", "tmin"]
    row["Tavg_max"] = stats.loc["max", "Tavg"]
    row["Tavg_min"] = stats.loc["min", "Tavg"]
    return row

mth_kc_temps = mth_kc_temps.apply(min_max_temps,axis=1)
mth_kc_temps

### Extremes on Record

In [None]:
grouped_mths_kc = mth_kc_temps.groupby(mth_kc_temps.month)[["tmax_max", "tmax_min", "tmin_max", "tmin_min", "Tavg_max", "Tavg_min"]].agg([min, max])
grouped_mths_kc['months'] = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
grouped_mths_kc = grouped_mths_kc.set_index('months')
print(grouped_mths_kc[[("tmax_max", "max"),("tmin_min", "min"),("tmax_min", "min"),("tmin_max", "max")]])

In [None]:
# Look at the max and min of the Tavg max and min
print(grouped_mths_kc[[("Tavg_max", "max"),("Tavg_max", "min"),("Tavg_min", "max"),("Tavg_min", "min")]])

### Decomposition of temperatures into seasonality and trends

In [None]:
# Now, decomposition of time-series components
# trend - decreasin, constant or increasing?
# seasonality - periodic signal
# noise - variation in signal not accounted for by trend or seasonailty, a.k.a. "remainder"
from statsmodels.tsa.seasonal import seasonal_decompose
kc_temps.sort_index(inplace=True)
print(kc_temps)

In [None]:
kc_temps["Tavg"].rolling(window = 365*10).mean().plot(figsize=(8,4), color="tab:red", title="Rolling mean over a 10 year window")
plt.show()

In [None]:
kc_temps["Tavg"].rolling(window = 365*10).var().plot(figsize=(8,4), color="tab:red", title="Rolling variance over a 10 year window")
plt.show()

In [None]:
# seasonal decomposition
decompose_result = seasonal_decompose(kc_temps['Tavg'], model='additive', period=int(365*10), extrapolate_trend='freq')
 
trend = decompose_result.trend
seasonal = decompose_result.seasonal
residual = decompose_result.resid
 
decompose_result.plot()
plt.show()

In [None]:
# visualize 10 years
years_examine = 365*5
start_date = 3*years_examine
fig, axs = plt.subplots(3, figsize=(8,6))
fig.suptitle('Removed Trend and Seasonality')
axs[0].plot(trend[-start_date:-years_examine])
axs[1].plot(seasonal[-start_date:-years_examine])
axs[1].set_ylim([-25,25])
axs[2].plot(residual[-start_date:-years_examine])
axs[2].set_ylim([-20,20])
plt.show()

In [None]:
# check residual distribution
residual.hist(bins=60, figsize=(8,6))