# Phase 2 Project Office Hours - Jan 20

Woo py files!

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from yellowbrick.regressor import ResidualsPlot
import eli5

In [None]:
# load the house data csv into a dataframe
data_path = 'data/kc_house_data.csv'
df = pd.read_csv(data_path)

In [None]:
# preview the first rows of the dataframe
df.head()

In [None]:
# print the dataframe summary (columns, non-null counts, dtypes)
df.info()

In [None]:
# summary statistics for the dataframe's columns
df.describe()

In [None]:
# replace every missing value in the dataframe with 0
df = df.fillna(0)

## First Model

In [None]:
# columns left out of the feature set: the target 'price' plus the
# fields this first model does not use
excluded_cols = {
    'price', 'id', 'date', 'view', 'sqft_above', 'sqft_basement',
    'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15',
}

# keep every remaining column, in the dataframe's own column order
used_cols = [col for col in df.columns if col not in excluded_cols]

In [None]:
# features are the selected columns; the target is the 'price' column
X = df[used_cols]
y = df['price']

# print a summary of the feature frame
X.info()

In [None]:
# hold out 33% of the rows for testing; the fixed random_state keeps
# the split the same from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# fit the min-max scaler on the training features only, then apply that
# same fitted transform to both splits
scaler = MinMaxScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# fit a linear regression on the scaled training features
lr = LinearRegression().fit(X_train_scaled, y_train)

# predictions for both splits
y_train_preds = lr.predict(X_train_scaled)
y_test_preds = lr.predict(X_test_scaled)

In [None]:
# evaluate: report R2, MAE and RMSE for the train and test splits

# RMSE is computed as the square root of the MSE rather than by passing
# squared=False: that keyword was deprecated in scikit-learn 1.4 and
# removed in 1.6, so np.sqrt keeps this cell working on old and new versions
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_preds))

print(f"Train R2: {r2_score(y_train, y_train_preds):.3f}")
print(f"Test R2: {r2_score(y_test, y_test_preds):.3f}")
print("---")
print(f"Train MAE: {mean_absolute_error(y_train, y_train_preds):.3f}")
print(f"Test MAE: {mean_absolute_error(y_test, y_test_preds):.3f}")
print("---")
print(f"Train RMSE: {train_rmse:.3f}")
print(f"Test RMSE: {test_rmse:.3f}")

In [None]:
# evaluate residuals
# hist=False / qqplot=True are passed explicitly: histogram off, Q-Q plot on
visualizer = ResidualsPlot(lr, hist=False, qqplot=True)

visualizer.fit(X_train_scaled, y_train)  # Fit the visualizer on the scaled training data
visualizer.score(X_test_scaled, y_test)  # Evaluate the model on the scaled test data
visualizer.show()                        # Finalize and render the figure

In [None]:
# without yellowbrick:

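# residuals here are actual minus predicted
# residuals_train = y_train - y_train_preds
# residuals_test = y_test - y_test_preds

# plt.scatter(y_train_preds, residuals_train, alpha=.75)
# plt.scatter(y_test_preds, residuals_test, color='g', alpha=.75)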

# plt.axhline(y=0, color='black')

# plt.title('Residuals for Linear Regression Model')
# plt.ylabel('Residuals')
# plt.xlabel('Predicted Values')

In [None]:
# check feature importance
# lr was fit on the min-max scaled features, so the weights shown apply to
# the scaled inputs; feature_names labels them with the column names used
eli5.show_weights(lr, feature_names=used_cols)

In [None]:
# without eli5:

# print(f"Intercept: {lr.intercept_}")
# dict(zip(X.columns, lr.coef_))