### Analysis part 1: interest rates

In [39]:
import pandas as pd
import numpy as np
import sklearn as sk

In [42]:
df = pd.read_csv('data/cleaned data/total_market_data_merged.csv')
df = df.drop(['Unnamed: 0', "YEAR_x", "YEAR_y"], axis = 1)

In [43]:
df.head()

Unnamed: 0,Five-Digit ZIP Code,Year,Annual Change (%),HPI,HPI from 2012,HPI with 2012 base,normalized_sale_price,RECESSION_FLAG,avg_rate_for_year
0,1001.0,1984,.,100.0,279.95,0.357207,58369.68857,0,10.225
1,1002.0,1984,15.40,115.4,348.54,0.331095,91121.681896,0,10.225
2,1020.0,1984,14.40,114.4,296.68,0.385601,54643.995979,0,10.225
3,1027.0,1984,24.50,124.5,400.14,0.311141,67051.04593,0,10.225
4,1028.0,1984,15.12,197.78,537.26,0.368127,82988.087351,0,10.225


### Part 4/5/bonus: machine learning

In [None]:
# This is the data I'm going to make an annual change % with: zip code, year, recession flag, avg rate
new_data = np.array([[97405, 2023, 0, 5.0],
                     [97405, 2024, 0, 5.0],
                     [97405, 2025, 0, 5.0]])

In [83]:
# create a dataset with no missing annual change values - just drop them

dfml = df[df["Annual Change (%)"].notna()]
dfml=dfml[~dfml.isin(["."]).any(axis=1)]

In [84]:
# define my target var
mltarget = dfml["Annual Change (%)"]

In [92]:
# define my features
mlfeatures = dfml[["Five-Digit ZIP Code", "Year", "RECESSION_FLAG", "avg_rate_for_year"]]
mlfeatures['Year'] = pd.to_datetime(mlfeatures['Year'], format='%Y')
mlfeatures['Year'] = pd.to_datetime(mlfeatures['Year']).dt.year
#mlfeatures.set_index('Year', inplace=True)
print(mlfeatures.dtypes)

Five-Digit ZIP Code    float64
Year                     int64
RECESSION_FLAG           int64
avg_rate_for_year      float64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mlfeatures['Year'] = pd.to_datetime(mlfeatures['Year'], format='%Y')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mlfeatures['Year'] = pd.to_datetime(mlfeatures['Year']).dt.year


In [93]:
mlfeatures.head()

Unnamed: 0,Five-Digit ZIP Code,Year,RECESSION_FLAG,avg_rate_for_year
1,1002.0,1984,0,10.225
2,1020.0,1984,0,10.225
3,1027.0,1984,0,10.225
4,1028.0,1984,0,10.225
5,1040.0,1984,0,10.225


In [94]:
mlfeatures.shape

(435906, 4)

#### Creating training and testing datasets

In [130]:


# Split the data into training and testing sets
# First using random
mlfeatures_train, mlfeatures_test, mltarget_train, mltarget_test = train_test_split(mlfeatures, mltarget, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets
print("Shape of mlfeatures_train:", mlfeatures_train.shape)
print("Shape of mltarget_train:", mltarget_train.shape)
print("Shape of mlfeatures_test:", mlfeatures_test.shape)
print("Shape of mltarget_test:", mltarget_test.shape)

# Second split: using dates
dfmltrim = dfml[["Five-Digit ZIP Code", "Year", "RECESSION_FLAG", "avg_rate_for_year", 'Annual Change (%)']]
train_date_df = dfmltrim[dfmltrim['Year'] < 2019]
test_date_df = dfmltrim[dfmltrim['Year'] >= 2019]

# Separate features and target variables
mlfeatures_train_date = train_date_df.drop('Annual Change (%)', axis=1)
mltarget_train_date = train_date_df['Annual Change (%)']
mlfeatures_test_date = test_date_df.drop('Annual Change (%)', axis=1)
mltarget_test_date = test_date_df['Annual Change (%)']

print("Shape of mlfeatures_train_date:", mlfeatures_train_date.shape)
print("Shape of mltarget_train_date:", mltarget_train_date.shape)
print("Shape of mlfeatures_test_date:", mlfeatures_test_date.shape)
print("Shape of mltarget_test_date:", mltarget_test_date.shape)


Shape of mlfeatures_train: (348724, 4)
Shape of mltarget_train: (348724,)
Shape of mlfeatures_test: (87182, 4)
Shape of mltarget_test: (87182,)
Shape of mlfeatures_train_date: (385985, 4)
Shape of mltarget_train_date: (385985,)
Shape of mlfeatures_test_date: (49921, 4)
Shape of mltarget_test_date: (49921,)


## Linear Regression

In [131]:
from sklearn.linear_model import LinearRegression

# Create a Linear Regression model for each
lr_model = LinearRegression()
lr_model_date = LinearRegression()

# Fit the model to the training data
lr_model.fit(mlfeatures_train, mltarget_train)
lr_model_date.fit(mlfeatures_train_date, mltarget_train_date)


LinearRegression()

In [132]:
from sklearn.metrics import mean_squared_error, r2_score

# Make predictions on the testing data
mltarget_pred = lr_model.predict(mlfeatures_test) # 80/20 split
mltarget_date_pred = lr_model_date.predict(mlfeatures_test_date) # date-based split

print("80-20 split results")
# Compute the mean squared error
mse = mean_squared_error(mltarget_test, mltarget_pred)
print("Mean Squared Error:", mse)
# Compute the root mean squared error
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)
# Compute the R-squared score
r2 = r2_score(mltarget_test, mltarget_pred)
print("R-squared:", r2)


print("date-based results")
mse_date = mean_squared_error(mltarget_test, mltarget_pred)
print("Mean Squared Error:", mse_date)
# Compute the root mean squared error
rmse_date = np.sqrt(mse_date)
print("Root Mean Squared Error:", rmse_date)
# Compute the R-squared score
r2_date = r2_score(mltarget_test_date, mltarget_date_pred)
print("R-squared:", r2)


80-20 split results
Mean Squared Error: 57.164272290929546
Root Mean Squared Error: 7.560705806399925
R-squared: 0.11279030306727944
date-based results
Mean Squared Error: 57.164272290929546
Root Mean Squared Error: 7.560705806399925
R-squared: 0.11279030306727944


In [133]:
# TESTING THE MODEL OUT


# Use the trained model to make predictions for the new dataset
predictions = lr_model.predict(new_data)
predictions_date = lr_model_date.predict(new_data)
# Print the predicted annual change (%) for each year
print("Predicted Annual Change (%):", predictions)
print("Predicted Annual Change (%):", predictions_date, " based on date-split train test")


Predicted Annual Change (%): [13.42194258 13.02620294 12.6304633 ]
Predicted Annual Change (%): [9.15039505 8.75175197 8.35310889]  based on date-split train test




## Random Forest

In [134]:
from sklearn.ensemble import RandomForestRegressor

# Create a Random Forest model
rf_model = RandomForestRegressor(n_estimators=200, random_state=42)
# Fit the model to the training data
rf_model.fit(mlfeatures_train, mltarget_train)

print("Results based on 80-20 split")
mltarget_pred = rf_model.predict(mlfeatures_test)
# Compute the mean squared error
mse = mean_squared_error(mltarget_test, mltarget_pred)
print("Mean Squared Error:", mse)
# Compute the root mean squared error
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)
# Compute the R-squared score
r2 = r2_score(mltarget_test, mltarget_pred)
print("R-squared:", r2)





Results based on 80-20 split
Mean Squared Error: 31.102743928544506
Root Mean Squared Error: 5.576983407590927
R-squared: 0.517274428437034


In [138]:
# Date based split
# Create a Random Forest model
rf_model_date = RandomForestRegressor(n_estimators=200, random_state=42)
# Fit the model to the training data
rf_model_date.fit(mlfeatures_train_date, mltarget_train_date)

# Make predictions on the testing data
print("Results based on date split")
mltarget_date_pred = rf_model_date.predict(mlfeatures_test_date)
# Compute the mean squared error
mse_date = mean_squared_error(mltarget_test_date, mltarget_date_pred)
print("Mean Squared Error:", mse_date)
# Compute the root mean squared error
rmse_date = np.sqrt(mse_date)
print("Root Mean Squared Error:", rmse_date)
# Compute the R-squared score
r2_date = r2_score(mltarget_test_date, mltarget_date_pred)
print("R-squared:", r2_date)

Results based on date split
Mean Squared Error: 73.9727855158881
Root Mean Squared Error: 8.600743311824164
R-squared: -0.35972662959902957


In [139]:
# TESTING THE RANDOM SPLIT MODEL OUT


# Use the trained model to make predictions for the new dataset
predictions = rf_model.predict(new_data)

# Print the predicted annual change (%) for each year
print("Predicted Annual Change (%):", predictions)



Predicted Annual Change (%): [7.1146 7.1146 7.1146]


In [140]:
# TESTING THE DATE MODEL OUT


# Use the trained model to make predictions for the new dataset
predictions = rf_model_date.predict(new_data)

# Print the predicted annual change (%) for each year
print("Predicted Annual Change (%):", predictions)

Predicted Annual Change (%): [7.1146 7.1146 7.1146]




## Gradient Boost

In [107]:
from sklearn.ensemble import GradientBoostingRegressor

# Create a Gradient Boosting Regressor model
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)

# Fit the model to the training data
gb_model.fit(mlfeatures_train, mltarget_train)

# Make predictions on the testing data
mltarget_pred = gb_model.predict(mlfeatures_test)

# Compute the mean squared error
mse = mean_squared_error(mltarget_test, mltarget_pred)
print("Mean Squared Error:", mse)

# Compute the root mean squared error
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

# Compute the R-squared score
r2 = r2_score(mltarget_test, mltarget_pred)
print("R-squared:", r2)


Mean Squared Error: 30.065853951864266
Root Mean Squared Error: 5.483233895418311
R-squared: 0.5333673271147457


In [141]:
# Create a Gradient Boosting Regressor model
gb_model_date = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)

# Fit the model to the training data
gb_model_date.fit(mlfeatures_train_date, mltarget_train_date)

# Make predictions on the testing data
mltarget_pred_date = gb_model_date.predict(mlfeatures_test_date)

# Compute the mean squared error
mse_date = mean_squared_error(mltarget_test_date, mltarget_pred_date)
print("Mean Squared Error:", mse)

# Compute the root mean squared error
rmse_date = np.sqrt(mse_date)
print("Root Mean Squared Error:", rmse)

# Compute the R-squared score
r2 = r2_score(mltarget_test_date, mltarget_pred_date)
print("R-squared:", r2)

Mean Squared Error: 31.102743928544506
Root Mean Squared Error: 5.576983407590927
R-squared: -0.18100187121452893


In [143]:
# TESTING THE SPLIT MODEL OUT

# Use the trained model to make predictions for the new dataset
predictions = gb_model.predict(new_data)

# Print the predicted annual change (%) for each year
print("Predicted Annual Change (%):", predictions)

Predicted Annual Change (%): [16.3668514  16.3668514  15.68058264]




In [142]:
# TESTING THE DATE MODEL OUT


# Use the trained model to make predictions for the new dataset
predictions_date = gb_model_date.predict(new_data)

# Print the predicted annual change (%) for each year
print("Predicted Annual Change (%):", predictions_date)

Predicted Annual Change (%): [7.38601276 8.50069072 8.33317877]




## Voting regressor to combine them

In [109]:
from sklearn.ensemble import VotingRegressor

# Create the voting regressor
voting_model = VotingRegressor([('rf', rf_model), ('gb', gb_model), ('lr', lr_model)])

# Fit the voting regressor to the training data
voting_model.fit(mlfeatures_train, mltarget_train)

# Make predictions on the testing data
mltarget_pred = voting_model.predict(mlfeatures_test)

# Compute the mean squared error
mse = mean_squared_error(mltarget_test, mltarget_pred)
print("Mean Squared Error:", mse)

# Compute the root mean squared error
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

# Compute the R-squared score
r2 = r2_score(mltarget_test, mltarget_pred)
print("R-squared:", r2)


Mean Squared Error: 30.15628962009876
Root Mean Squared Error: 5.491474266542525
R-squared: 0.5319637336009878


In [144]:
# Create the voting regressor
voting_model_date = VotingRegressor([('rf', rf_model_date), ('gb', gb_model_date), ('lr', lr_model_date)])

# Fit the voting regressor to the training data
voting_model_date.fit(mlfeatures_train_date, mltarget_train_date)

# Make predictions on the testing data
mltarget_pred_date = voting_model_date.predict(mlfeatures_test_date)

# Compute the mean squared error
mse_date = mean_squared_error(mltarget_test_date, mltarget_pred_date)
print("Mean Squared Error:", mse_date)

# Compute the root mean squared error
rmse_date = np.sqrt(mse_date)
print("Root Mean Squared Error:", rmse_date)

# Compute the R-squared score
r2_date = r2_score(mltarget_test_date, mltarget_pred_date)
print("R-squared:", r2_date)

Mean Squared Error: 68.16872023276142
Root Mean Squared Error: 8.25643508015181
R-squared: -0.2530395274389585


In [146]:
# TESTING THE MODEL OUT

# Create a new dataset with the desired feature values
new_data = np.array([[97405, 2023, 0, 5.0],
                     [97405, 2024, 0, 5.0],
                     [97405, 2025, 0, 5.0]])

# Use the trained model to make predictions for the new dataset
predictions = voting_model.predict(new_data)

# Print the predicted annual change (%) for each year
print("Predicted Annual Change (%):", predictions)

Predicted Annual Change (%): [15.59551466 15.68364844 15.77178221]




In [147]:
# TESTING THE MODEL OUT

# Use the trained model to make predictions for the new dataset
predictions_date = voting_model_date.predict(new_data)

# Print the predicted annual change (%) for each year
print("Predicted Annual Change (%):", predictions_date)

Predicted Annual Change (%): [7.88366927 7.91741429 7.95115931]


