# Housing Regression Example
## Train Models Notebook

In [None]:
import os
import pandas as pd
from joblib import dump
import matplotlib.pyplot as plt

In [None]:
# Input CSVs are read from the `data` directory one level above the
# current working directory.
data_path = os.path.join(os.pardir, 'data')

In [None]:
# Location of the training feature (X) CSV inside the data directory
X_train_data_path = os.path.join(data_path, "X_train.csv")

In [None]:
# Location of the training target (y) CSV inside the data directory
y_train_data_path = os.path.join(data_path, "y_train.csv")

In [None]:
# Load the training features and targets from their CSV files
# into pandas DataFrames.
X_train = pd.read_csv(filepath_or_buffer=X_train_data_path)
y_train = pd.read_csv(filepath_or_buffer=y_train_data_path)

In [None]:
# Check the current working directory: the '..'-relative paths used in
# this notebook (data, models, imgs) are resolved against it.
os.getcwd()

In [None]:
# Fitted models are written to the `models` directory one level up;
# create it up front (no error if it is already there).
model_path = os.path.join(os.pardir, 'models')
os.makedirs(model_path, exist_ok=True)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Output file paths for the serialized models
# (linear regression and random forest, respectively)
lr_model_name, rf_model_name = (
    os.path.join(model_path, filename) for filename in ('lr.pkl', 'rf.pkl')
)

In [None]:
# Instantiate a linear regression model, fit it, and save it with joblib.
# Fit on the 1-D `target` column (the same target the random forest cell
# trains on) rather than the whole y DataFrame: a 2-D y makes
# LinearRegression return predictions shaped (n_samples, n_targets)
# instead of (n_samples,), so the two models' outputs would not line up.
lr = LinearRegression()
lr.fit(X_train, y_train.target)
dump(lr, lr_model_name)

In [None]:
# Train a random forest regressor (default hyperparameters) on the
# `target` column, then serialize the fitted estimator with joblib.
rf = RandomForestRegressor()
rf.fit(X=X_train, y=y_train['target'])
dump(rf, rf_model_name)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# In-sample predictions from each fitted model.
# NOTE: these are training-set predictions; metrics on held-out test data
# are more insightful.
lr_train_preds, rf_train_preds = lr.predict(X_train), rf.predict(X_train)

In [None]:
# Training-set mean squared error of each model's predictions
lr_mse = mean_squared_error(y_true=y_train, y_pred=lr_train_preds)
rf_mse = mean_squared_error(y_true=y_train, y_pred=rf_train_preds)

In [None]:
# Report each model's training MSE rounded to two decimal places.
# The format spec is ':.2f' with no space: a space after the colon is the
# "space" sign flag, which pads non-negative values with an extra blank.
print(f"The MSE for the linear regression model is: {lr_mse:.2f}")
print(f"The MSE for the random forest regression model is: {rf_mse:.2f}")

In [None]:
# Figures are saved to the `imgs` directory one level up (created if
# missing); the training scatter plot is written there as a PNG.
image_path = os.path.join(os.pardir, 'imgs')
os.makedirs(image_path, exist_ok=True)
error_plot_path = os.path.join(image_path, 'Train_Scatterplot.png')

In [None]:
# Scatter each model's training predictions against the true targets
# on one square figure, one colour per model.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(y_train, lr_train_preds, c='crimson', label='Linear Regression')
ax.scatter(y_train, rf_train_preds, c='gold', label='RF Regression')

ax.set_xlabel('True Values', fontsize=15)
ax.set_ylabel('Predictions', fontsize=15)
ax.set_title('Training Error', fontsize=15)

ax.legend()
fig.tight_layout()
# Write the figure to disk, then display it
fig.savefig(error_plot_path)
plt.show()