### Analysis part 1: interest rates

In [199]:
import pandas as pd
import numpy as np
import sklearn as sk

In [200]:
df = pd.read_csv('data/cleaned data/total_market_data_merged.csv')
df = df.drop(['Unnamed: 0', "YEAR_x", "YEAR_y", 'DATE'], axis = 1)

In [201]:
df.head()

Unnamed: 0,Five-Digit ZIP Code,Year,Annual Change (%),HPI,HPI from 2012,HPI with 2012 base,normalized_sale_price,RECESSION_FLAG,avg_rate_for_year,Median_hh_income,Year_Avg_Unempl
0,1001.0,1984,.,100.0,279.95,0.357207,58369.68857,0,10.225,55828,7.7
1,1002.0,1984,15.40,115.4,348.54,0.331095,91121.681896,0,10.225,55828,7.7
2,1020.0,1984,14.40,114.4,296.68,0.385601,54643.995979,0,10.225,55828,7.7
3,1027.0,1984,24.50,124.5,400.14,0.311141,67051.04593,0,10.225,55828,7.7
4,1028.0,1984,15.12,197.78,537.26,0.368127,82988.087351,0,10.225,55828,7.7


### Part 4/5/bonus: machine learning

In [202]:
# This is the data I'm going to make an annual change % with: zip code, year, recession flag, avg rate
new_data = np.array([[97405, 2023, 0, 5.0, 71000, 5],
                     [97405, 2024, 0, 5.0, 71000, 5],
                     [97405, 2025, 0, 5.0, 71000, 5]])

In [203]:
# create a dataset with no missing annual change values - just drop them

dfml = df[df["Annual Change (%)"].notna()]
dfml=dfml[~dfml.isin(["."]).any(axis=1)]

In [204]:
# define my target var
mltarget = dfml["Annual Change (%)"]
y_mltarget = mltarget.values

In [205]:
# define my features
mlfeatures = dfml[["Five-Digit ZIP Code", "Year", "RECESSION_FLAG", "avg_rate_for_year", "Median_hh_income", "Year_Avg_Unempl"]]
mlfeatures['Year'] = pd.to_datetime(mlfeatures['Year'], format='%Y')
mlfeatures['Year'] = pd.to_datetime(mlfeatures['Year']).dt.year
#mlfeatures.set_index('Year', inplace=True)
print(mlfeatures.dtypes)

Five-Digit ZIP Code    float64
Year                     int64
RECESSION_FLAG           int64
avg_rate_for_year      float64
Median_hh_income         int64
Year_Avg_Unempl        float64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mlfeatures['Year'] = pd.to_datetime(mlfeatures['Year'], format='%Y')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mlfeatures['Year'] = pd.to_datetime(mlfeatures['Year']).dt.year


In [206]:
mlfeatures.head()

Unnamed: 0,Five-Digit ZIP Code,Year,RECESSION_FLAG,avg_rate_for_year,Median_hh_income,Year_Avg_Unempl
1,1002.0,1984,0,10.225,55828,7.7
2,1020.0,1984,0,10.225,55828,7.7
3,1027.0,1984,0,10.225,55828,7.7
4,1028.0,1984,0,10.225,55828,7.7
5,1040.0,1984,0,10.225,55828,7.7


In [207]:
mlfeatures.shape

(414660, 6)

#### Creating training and testing datasets

In [208]:


# Split the data into training and testing sets
# First using random
mlfeatures_train, mlfeatures_test, mltarget_train, mltarget_test = train_test_split(mlfeatures, mltarget, test_size=0.2, random_state=42)
x_mlfeatures_train = mlfeatures_train.values #creating numpy versions here
x_mlfeatures_test = mlfeatures_test.values
y_mltarget_train = mltarget_train.values
y_mltarget_test = mltarget_test.values

# Print the shapes of the training and testing sets
print("Shape of mlfeatures_train:", mlfeatures_train.shape)
print("Shape of mltarget_train:", mltarget_train.shape)
print("Shape of mlfeatures_test:", mlfeatures_test.shape)
print("Shape of mltarget_test:", mltarget_test.shape)

# Second split: using dates
dfmltrim = dfml[["Five-Digit ZIP Code", "Year", "RECESSION_FLAG", "avg_rate_for_year",'Median_hh_income','Year_Avg_Unempl', 'Annual Change (%)']]
train_date_df = dfmltrim[dfmltrim['Year'] < 2019]
x_train_date_df = train_date_df.values
test_date_df = dfmltrim[dfmltrim['Year'] >= 2019]
x_test_date_df = test_date_df.values
# Separate features and target variables
mlfeatures_train_date = train_date_df.drop('Annual Change (%)', axis=1)
x_mlfeatures_train_date = mlfeatures_train_date.values #create a numpy version 
mltarget_train_date = train_date_df['Annual Change (%)']
y_mltarget_train_date = mltarget_train_date.values # numpy again, and also below...
mlfeatures_test_date = test_date_df.drop('Annual Change (%)', axis=1)
x_mlfeatures_test_date = mlfeatures_test_date.values
mltarget_test_date = test_date_df['Annual Change (%)']
y_mltarget_test_date = mltarget_test_date.values

print("Shape of mlfeatures_train_date:", mlfeatures_train_date.shape)
print("Shape of mltarget_train_date:", mltarget_train_date.shape)
print("Shape of mlfeatures_test_date:", mlfeatures_test_date.shape)
print("Shape of mltarget_test_date:", mltarget_test_date.shape)


Shape of mlfeatures_train: (331728, 6)
Shape of mltarget_train: (331728,)
Shape of mlfeatures_test: (82932, 6)
Shape of mltarget_test: (82932,)
Shape of mlfeatures_train_date: (364739, 6)
Shape of mltarget_train_date: (364739,)
Shape of mlfeatures_test_date: (49921, 6)
Shape of mltarget_test_date: (49921,)


In [209]:
mlfeatures_train.head()

Unnamed: 0,Five-Digit ZIP Code,Year,RECESSION_FLAG,avg_rate_for_year,Median_hh_income,Year_Avg_Unempl
174416,30021.0,2003,0,1.1275,63967,5.8
169693,95073.0,2002,0,1.666667,64047,5.9
196029,98072.0,2004,0,1.349167,63745,5.6
118214,45241.0,1998,0,5.353333,64781,4.5
397582,47136.0,2020,1,0.375833,71186,5.1


In [210]:
mlfeatures_train.columns

Index(['Five-Digit ZIP Code', 'Year', 'RECESSION_FLAG', 'avg_rate_for_year',
       'Median_hh_income', 'Year_Avg_Unempl'],
      dtype='object')

## Linear Regression

In [211]:
from sklearn.linear_model import LinearRegression

# Create a Linear Regression model for each
lr_model = LinearRegression()
lr_model_date = LinearRegression()

# Fit the model to the training data
lr_model.fit(x_mlfeatures_train, y_mltarget_train)
lr_model_date.fit(x_mlfeatures_train_date, y_mltarget_train_date)


LinearRegression()

In [212]:
from sklearn.metrics import mean_squared_error, r2_score

# Make predictions on the testing data
y_mltarget_pred = lr_model.predict(x_mlfeatures_test) # 80/20 split
y_mltarget_date_pred = lr_model_date.predict(x_mlfeatures_test_date) # date-based split

print("80-20 split results")
# Print the key stats: mse, rmse, r-squared
mse = mean_squared_error(y_mltarget_test, y_mltarget_pred)
print("Mean Squared Error:", mse)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)
r2 = r2_score(y_mltarget_test, y_mltarget_pred)
print("R-squared:", r2)


print("date-based results")
# Print the key stats: mse, rmse, r-squared
mse_date = mean_squared_error(y_mltarget_test, y_mltarget_pred)
print("Mean Squared Error:", mse_date)
rmse_date = np.sqrt(mse_date)
print("Root Mean Squared Error:", rmse_date)
r2_date = r2_score(y_mltarget_test_date, y_mltarget_date_pred)
print("R-squared:", r2)


80-20 split results
Mean Squared Error: 46.86627544855477
Root Mean Squared Error: 6.845894788013819
R-squared: 0.21805482735883686
date-based results
Mean Squared Error: 46.86627544855477
Root Mean Squared Error: 6.845894788013819
R-squared: 0.21805482735883686


In [213]:
# TESTING THE MODEL OUT


# Use the trained model to make predictions for the new dataset
predictions = lr_model.predict(new_data)
predictions_date = lr_model_date.predict(new_data)
# Print the predicted annual change (%) for each year
print("Predicted Annual Change (%):", predictions)
print("Predicted Annual Change (%):", predictions_date, " based on date-split train test")


Predicted Annual Change (%): [7.43032164 7.3854586  7.34059556]
Predicted Annual Change (%): [3.31083825 3.2106276  3.11041696]  based on date-split train test


## Random Forest

In [214]:
from sklearn.ensemble import RandomForestRegressor

# Create a Random Forest model
rf_model = RandomForestRegressor(n_estimators=200, random_state=42)
# Fit the model to the training data
rf_model.fit(x_mlfeatures_train, y_mltarget_train)

y_mltarget_pred = rf_model.predict(x_mlfeatures_test)

In [215]:
print("Results based on 80-20 split")
# Print the key stats: mse, rmse, r-squared
mse = mean_squared_error(y_mltarget_test, y_mltarget_pred)
print("Mean Squared Error:", mse)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)
r2 = r2_score(y_mltarget_test, y_mltarget_pred)
print("R-squared:", r2)

Results based on 80-20 split
Mean Squared Error: 27.434710029783623
Root Mean Squared Error: 5.237815387142203
R-squared: 0.542262770717767


In [216]:
# Date based split
# Create a Random Forest model
rf_model_date = RandomForestRegressor(n_estimators=200, random_state=42)
# Fit the model to the training data
rf_model_date.fit(x_mlfeatures_train_date, y_mltarget_train_date)

# Make predictions on the testing data

y_mltarget_date_pred = rf_model_date.predict(x_mlfeatures_test_date)

In [217]:
print("Results based on date split")
# Print the key stats: mse, rmse, r-squared
mse_date = mean_squared_error(y_mltarget_test_date, y_mltarget_date_pred)
print("Mean Squared Error:", mse_date)
rmse_date = np.sqrt(mse_date)
print("Root Mean Squared Error:", rmse_date)
r2_date = r2_score(y_mltarget_test_date, y_mltarget_date_pred)
print("R-squared:", r2_date)

Results based on date split
Mean Squared Error: 74.11522006213326
Root Mean Squared Error: 8.609019692283974
R-squared: -0.3623447822636039


In [218]:
# TESTING THE RANDOM SPLIT MODEL OUT


# Use the trained model to make predictions for the new dataset
predictions = rf_model.predict(new_data)

# Print the predicted annual change (%) for each year
print("Predicted Annual Change (%):", predictions)

Predicted Annual Change (%): [15.5711 15.5711 15.5711]


In [219]:
# TESTING THE DATE MODEL OUT


# Use the trained model to make predictions for the new dataset
predictions = rf_model_date.predict(new_data)

# Print the predicted annual change (%) for each year
print("Predicted Annual Change (%):", predictions)

Predicted Annual Change (%): [5.57675 5.57675 5.57675]


## Gradient Boost

In [220]:
from sklearn.ensemble import GradientBoostingRegressor

# Create an empty Gradient Boosting Regressor model
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)

# Fit the model to the training data
gb_model.fit(x_mlfeatures_train, y_mltarget_train)

# Make predictions on the testing data
y_mltarget_pred = gb_model.predict(x_mlfeatures_test)


In [221]:

print("results from 80-20 split")
# Print the key stats: mse, rmse, r-squared
mse = mean_squared_error(y_mltarget_test, y_mltarget_pred)
print("Mean Squared Error:", mse)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)
r2 = r2_score(y_mltarget_test, y_mltarget_pred)
print("R-squared:", r2)


results from 80-20 split
Mean Squared Error: 25.785538313017287
Root Mean Squared Error: 5.077946269213301
R-squared: 0.5697785451299523


In [222]:
# Create a Gradient Boosting Regressor model
gb_model_date = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)

# Fit the model to the training data
gb_model_date.fit(mlfeatures_train_date, mltarget_train_date)

# Make predictions on the testing data
mltarget_pred_date = gb_model_date.predict(mlfeatures_test_date)


In [223]:
# Print the key stats: mse, rmse, r-squared
mse_date = mean_squared_error(mltarget_test_date, mltarget_pred_date)
print("Mean Squared Error:", mse)
rmse_date = np.sqrt(mse_date)
print("Root Mean Squared Error:", rmse)
r2 = r2_score(mltarget_test_date, mltarget_pred_date)
print("R-squared:", r2)

Mean Squared Error: 25.785538313017287
Root Mean Squared Error: 5.077946269213301
R-squared: -0.18059401467350544


In [224]:
# TESTING THE 80-20 SPLIT MODEL OUT

# Use the trained model to make predictions for the new dataset
predictions = gb_model.predict(new_data)

# Print the predicted annual change (%) for each year
print("Predicted Annual Change (%) 80-20 model:", predictions)

Predicted Annual Change (%): [11.8800865 11.8800865 11.8800865]


In [None]:
# TESTING THE DATE MODEL OUT


# Use the trained model to make predictions for the new dataset
predictions_date = gb_model_date.predict(new_data)

# Print the predicted annual change (%) for each year
print("Predicted Annual Change (%) date-split model:", predictions_date)

Predicted Annual Change (%): [6.55908359 6.55908359 6.55908359]




## Voting regressor to combine them

In [226]:
from sklearn.ensemble import VotingRegressor

# Create the voting regressor
voting_model = VotingRegressor([('rf', rf_model), ('gb', gb_model), ('lr', lr_model)])

# Fit the voting regressor to the training data
voting_model.fit(x_mlfeatures_train, y_mltarget_train)

# Make predictions on the testing data
y_mltarget_pred = voting_model.predict(mlfeatures_test)






In [227]:
# Print the key stats: mse, rmse, r-squared
mse = mean_squared_error(y_mltarget_test, y_mltarget_pred)
print("Mean Squared Error:", mse)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)
r2 = r2_score(y_mltarget_test, y_mltarget_pred)
print("R-squared:", r2)

Mean Squared Error: 25.958450006126125
Root Mean Squared Error: 5.094943572418258
R-squared: 0.5668935822771204


In [230]:
# Create the voting regressor
voting_model_date = VotingRegressor([('rf', rf_model_date), ('gb', gb_model_date), ('lr', lr_model_date)])

# Fit the voting regressor to the training data
voting_model_date.fit(x_mlfeatures_train_date, y_mltarget_train_date)

# Make predictions on the testing data
y_mltarget_pred_date = voting_model_date.predict(x_mlfeatures_test_date)


In [None]:
# Print the key stats: mse, rmse, r-squared
mse_date = mean_squared_error(y_mltarget_test_date, y_mltarget_pred_date)
print("Mean Squared Error:", mse_date)
rmse_date = np.sqrt(mse_date)
print("Root Mean Squared Error:", rmse_date)
r2_date = r2_score(y_mltarget_test_date, y_mltarget_pred_date)
print("R-squared:", r2_date)

In [None]:

# Use the trained model to make predictions for the new dataset
predictions = voting_model.predict(new_data)

# Print the predicted annual change (%) for each year
print("Predicted Annual Change (%):", predictions)



Predicted Annual Change (%): [13.28816043 13.27320609 13.25825174]




In [None]:
# TESTING THE MODEL OUT

# Use the trained model to make predictions for the new dataset
predictions_date = voting_model_date.predict(new_data)

# Print the predicted annual change (%) for each year
print("Predicted Annual Change (%):", predictions_date)

Predicted Annual Change (%): [5.35903653 5.32563299 5.29222944]


