In [80]:
import pathlib
import pickle

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [81]:
# The pickled dataset is looked up relative to the kernel's working directory:
# <cwd>/data/processed/ames_clean.pkl
DATA_DIR = pathlib.Path.cwd()
clean_data_path = DATA_DIR.joinpath('data', 'processed', 'ames_clean.pkl')

# NOTE: unpickling can execute arbitrary code; only load files you trust.
with clean_data_path.open('rb') as file:
    data = pickle.load(file)

# Feature engineering below works on a copy, leaving `data` as loaded.
model_data = data.copy()
# Uncomment to inspect column dtypes and null counts:
# data.info()

In [82]:
## Disabled experiment: the error went up.
## Collapse the four bathroom counts into a single 'Bathrooms' feature.
# model_data['Bathrooms'] = model_data['Full.Bath'] + model_data['Half.Bath'] + model_data['Bsmt.Full.Bath'] + model_data['Bsmt.Half.Bath']
# model_data = model_data.drop(columns=['Full.Bath', 'Half.Bath', 'Bsmt.Full.Bath', 'Bsmt.Half.Bath'])

## Disabled experiment: the error went up.
## Collapse the four porch areas into a single 'Porch' feature.
# model_data['Porch'] = model_data['Open.Porch.SF'] + model_data['Enclosed.Porch'] + model_data['X3Ssn.Porch'] + model_data['Screen.Porch']
# model_data = model_data.drop(columns=['Open.Porch.SF', 'Enclosed.Porch', 'X3Ssn.Porch', 'Screen.Porch'])

# Kept: the error went down.
# Sum every square-footage column into one 'Total.Liv.SF' feature, then drop
# the individual columns that were folded into it.
# NOTE(review): 'Gr.Liv.Area' may already contain the 1st/2nd floor and
# low-quality finished areas, in which case they are counted twice here --
# confirm against the Ames data dictionary.
area_columns = [
    'X1st.Flr.SF',
    'X2nd.Flr.SF',
    'Gr.Liv.Area',
    'BsmtFin.SF.1',
    'BsmtFin.SF.2',
    'Low.Qual.Fin.SF',
    'Bsmt.Unf.SF',
]
total_area = model_data[area_columns[0]]
for area_col in area_columns[1:]:
    # Plain `+` (not DataFrame.sum) so a missing value in any part stays missing.
    total_area = total_area + model_data[area_col]
model_data['Total.Liv.SF'] = total_area
model_data = model_data.drop(columns=area_columns)

In [83]:

# -------------- Split categorical columns by kind -------------- #
# Ordered categoricals are integer-coded below; unordered ones are left for
# one-hot encoding.
categorical_columns = []
ordinal_columns = []
for col in model_data.select_dtypes('category').columns:
    if model_data[col].cat.ordered:
        ordinal_columns.append(col)
    else:
        categorical_columns.append(col)

# -------------- Factorize -------------- #
# Codes are derived from model_data itself (the frame being encoded) rather
# than from the untouched `data` frame, so the two cannot drift apart if an
# earlier step ever filters or reorders rows.
# pd.factorize marks missing values with the sentinel code -1.
for col in ordinal_columns:
    codes, _ = pd.factorize(model_data[col], sort=True)
    model_data[col] = codes

# -------------- One Hot Encoding -------------- #
# Grab the raw 'Exterior' column before the full-frame encoding replaces it;
# it is only used for the illustrative preview at the end of the cell.
original_data = model_data['Exterior']

# Encode every remaining non-numeric column; drop_first removes one level per
# variable so the dummies of a variable are not perfectly collinear.
model_data = pd.get_dummies(model_data, drop_first=True)

# Preview: dummy columns for 'Exterior' next to the original label.
# `assign` returns a new frame, so `encoded_data` keeps only the dummies.
encoded_data = pd.get_dummies(original_data, drop_first=True)
aux_dataframe = encoded_data.assign(Exterior=original_data)

# Last expression of the cell, so the preview is actually rendered.
aux_dataframe.head().transpose()

In [84]:
# Fit a linear model on a train split and report RMSE on the held-out split.
# Ridge with a very small penalty is used in place of plain LinearRegression.
# NOTE(review): the original note blamed LinearRegression misbehaving on some
# setups -- presumably numerical trouble with the many one-hot columns; confirm.
# `Ridge` is imported with the other sklearn names in the first cell.
TEST_SIZE = 0.25      # fraction of rows held out for evaluation
RANDOM_STATE = 42     # fixed seed so the split is reproducible
RIDGE_ALPHA = 0.01    # L2 penalty strength

# `drop` already returns a new frame, so no extra copy is needed for X.
X = model_data.drop(columns=['SalePrice'])
y = model_data['SalePrice'].copy()

Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

model = Ridge(alpha=RIDGE_ALPHA)
model.fit(Xtrain, ytrain)
ypred = model.predict(Xtest)

# Root-mean-squared error on the held-out rows, in the units of SalePrice.
RMSE = np.sqrt(mean_squared_error(ytest, ypred))
RMSE

0.06047644625183051

In [85]:
# Convert the RMSE into a relative error in percent.
# NOTE(review): raising 10 to the RMSE assumes SalePrice was log10-transformed
# upstream -- confirm against the preprocessing notebook.
relative_error = 10**RMSE - 1
error_percent = 100 * relative_error
print(f'Average error is {error_percent:.2f}%')

Average error is 14.94%
