In [85]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [86]:
# Read the cleaned seasonal climate table from disk. The path is relative,
# so it resolves against the notebook's working directory.
seasonal_csv_path = "../data/Cleaned_Data_Seasonal.csv"
climate_df: pd.DataFrame = pd.read_csv(seasonal_csv_path)

# Preview the first five rows as the cell's rich output.
climate_df.head(5)

Unnamed: 0,Region,Year,Season,ACI_Combined_Seasonal_Smoothed,ACI_Combined_Seasonal_Unsmoothed,CDD_Seasonal_Smoothed,CDD_Seasonal_Unsmoothed,CDD_Seasonal_Unsmoothed_Unstandardized,Rx5Day_Seasonal_Smoothed,Rx5Day_Seasonal_Unsmoothed,...,Sea_Level_Seasonal_Unsmoothed_Unstandardized,T10_Seasonal_Smoothed,T10_Seasonal_Unsmoothed,T10_Seasonal_Unsmoothed_Unstandardized,T90_Seasonal_Smoothed,T90_Seasonal_Unsmoothed,T90_Seasonal_Unsmoothed_Unstandardized,WP90_Seasonal_Smoothed,WP90_Seasonal_Unsmoothed,WP90_Seasonal_Unsmoothed_Unstandardized
0,ALA,1961,1,0.24,0.58,0.13,-0.26,33.96,0.01,-0.57,...,7266.74,0.05,-1.51,1.31,-0.11,0.42,13.04,0.14,0.62,11.51
1,CAR,1961,1,-0.12,0.32,,,,0.08,0.87,...,,0.1,-0.98,5.47,-0.3,-0.12,10.12,-0.18,-0.47,8.62
2,CEA,1961,1,-0.31,-0.8,-0.18,-1.01,14.93,-0.17,-0.33,...,6917.55,0.09,1.19,15.88,-0.1,-0.41,8.26,-0.44,-1.05,7.18
3,CWP,1961,1,0.06,0.33,0.44,0.75,50.62,-0.49,-0.82,...,7126.53,-0.27,-1.29,2.7,0.06,0.79,13.28,-0.1,-0.27,9.0
4,MID,1961,1,0.05,-0.25,0.76,-0.47,22.53,-0.14,-1.01,...,,-0.15,-0.65,6.86,-0.15,0.49,11.98,-0.34,-0.92,7.69


In [87]:
# Restrict the data to rows whose Region code is "NPL" or "SPL"
# (presumably the northern/southern plains -- confirm against the data dictionary).
plains_mask = climate_df["Region"].isin(["NPL", "SPL"])
plains_df: pd.DataFrame = climate_df.loc[plains_mask]
print(f"Records before dropping empty rows: {len(plains_df)}")

# Remove every row that has a missing value in *any* column, then report how
# many rows survived so the effect of the filter is visible in the output.
plains_df = plains_df.dropna(how="any")
print(f"Records after dropping empty rows: {len(plains_df)}")

# Preview the filtered frame.
plains_df.head(5)

Records before dropping empty rows: 500
Records after dropping empty rows: 500


Unnamed: 0,Region,Year,Season,ACI_Combined_Seasonal_Smoothed,ACI_Combined_Seasonal_Unsmoothed,CDD_Seasonal_Smoothed,CDD_Seasonal_Unsmoothed,CDD_Seasonal_Unsmoothed_Unstandardized,Rx5Day_Seasonal_Smoothed,Rx5Day_Seasonal_Unsmoothed,...,Sea_Level_Seasonal_Unsmoothed_Unstandardized,T10_Seasonal_Smoothed,T10_Seasonal_Unsmoothed,T10_Seasonal_Unsmoothed_Unstandardized,T90_Seasonal_Smoothed,T90_Seasonal_Unsmoothed,T90_Seasonal_Unsmoothed_Unstandardized,WP90_Seasonal_Smoothed,WP90_Seasonal_Unsmoothed,WP90_Seasonal_Unsmoothed_Unstandardized
7,NPL,1961,1,0.18,0.84,-0.32,0.75,33.9,-0.39,0.45,...,7395.0,-0.02,-1.52,2.29,-0.27,0.53,13.22,0.78,0.57,11.68
10,SPL,1961,1,-0.12,-0.05,0.82,-0.54,38.19,-0.21,1.27,...,6854.45,-0.2,-1.09,5.26,-0.09,-0.23,9.37,0.05,-0.44,9.16
22,NPL,1961,2,0.21,0.21,-0.26,0.54,33.15,-0.39,-1.47,...,7316.67,-0.07,0.14,10.91,-0.23,-0.18,9.29,0.82,1.27,12.95
25,SPL,1961,2,-0.13,-0.38,0.65,-0.43,38.55,-0.13,-1.28,...,6938.56,-0.22,-0.05,9.86,-0.14,-0.35,8.78,0.1,1.12,12.51
37,NPL,1961,3,0.23,0.65,-0.19,0.33,32.39,-0.53,-2.39,...,7428.33,-0.14,-1.4,5.31,-0.08,2.96,22.06,0.75,0.19,10.41


In [88]:
# Build the design matrix: the numeric Year plus one indicator column per
# distinct Season value (columns are named "Season_<value>").
# NOTE(review): every season level is kept (no drop_first), so the indicators
# sum to 1 on each row; fine for prediction, but keep it in mind before
# interpreting individual coefficients of a model fitted with an intercept.
season_dummies: pd.DataFrame = pd.get_dummies(plains_df["Season"], prefix="Season")
year_column = plains_df[["Year"]]
features: pd.DataFrame = pd.concat([year_column, season_dummies], axis="columns")

# Regression target: the smoothed seasonal CDD series.
cdd_smoothed = plains_df["CDD_Seasonal_Smoothed"]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    cdd_smoothed,
    test_size=0.3,
    random_state=1,
)

print(x_train.head())

      Year  Season_1  Season_2  Season_3  Season_4
100   1962     False     False      True     False
460   1968     False     False      True     False
2242  1998     False      True     False     False
295   1965     False     False     False      True
2707  2006      True     False     False     False


In [89]:
# Fit a linear regression on the current training split (`fit` returns the
# estimator itself, so construction and fitting can be chained).
model = LinearRegression().fit(x_train, y_train)

# Evaluate on the held-out rows only.
y_prediction = model.predict(x_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_prediction)
r_squared = r2_score(y_true=y_test, y_pred=y_prediction)

# Rounding is for display only; `mse` and `r_squared` keep full precision.
print(f"The MSE is {round(mse, 3)}.")
print(f"The r-squared value is {round(r_squared, 3)}.")

The MSE is 0.336.
The r-squared value is 0.025.


In [90]:
# Extend the design matrix with one indicator column per Region value
# (columns are named "Region_<value>").
region_dummies: pd.DataFrame = pd.get_dummies(plains_df["Region"], prefix="Region")

# `features` is rebuilt from itself, so make the step idempotent: strip any
# Region_* columns left over from an earlier run of this cell before adding
# them again. On a first run nothing is dropped and the result is simply
# `features` + `region_dummies`; on a re-run the columns are replaced instead
# of being appended a second time.
features = pd.concat(
    [
        features.drop(columns=region_dummies.columns, errors="ignore"),
        region_dummies,
    ],
    axis=1,
)

# Re-split with the same test fraction and seed as the season-only split.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    plains_df["CDD_Seasonal_Smoothed"],
    test_size=0.3,
    random_state=1,
)
print(x_train.head())

      Year  Season_1  Season_2  Season_3  Season_4  Region_NPL  Region_SPL
100   1962     False     False      True     False       False        True
460   1968     False     False      True     False       False        True
2242  1998     False      True     False     False        True       False
295   1965     False     False     False      True       False        True
2707  2006      True     False     False     False        True       False


In [91]:
# Refit a fresh linear regression on whichever train/test split is current
# (in notebook order, the split produced by the preceding cell) and predict
# the held-out rows in one chained call.
model = LinearRegression()
y_prediction = model.fit(x_train, y_train).predict(x_test)

# Same two metrics as the earlier fit, so the results are directly comparable.
mse = mean_squared_error(y_test, y_prediction)
r_squared = r2_score(y_test, y_prediction)

# Values are rounded to three decimals in the printout only.
print(f"The MSE is {round(mse, 3)}.")
print(f"The r-squared value is {round(r_squared, 3)}.")

The MSE is 0.334.
The r-squared value is 0.032.
