In [1]:
import pickle
import pathlib

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [2]:
# The processed dataset is looked up relative to the current working
# directory, so the notebook must be run from the folder that contains data/.
DATA_DIR = pathlib.Path.cwd()
clean_data_path = DATA_DIR.joinpath('data', 'processed', 'ames_clean.pkl')

# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
with clean_data_path.open('rb') as file:
    data = pickle.load(file)

# Uncomment for a quick look at the loaded object:
# data.info()

In [3]:
# Work on a copy so the frame loaded from disk is left untouched.
model_data = data.copy()

# -------------- Split category columns by orderedness -------------- #
# Ordered categoricals become integer codes below; unordered ones are kept
# in `categorical_columns` for the one-hot encoding step.
category_dtype_columns = model_data.select_dtypes('category').columns
ordinal_columns = [
    name for name in category_dtype_columns if model_data[name].cat.ordered
]
categorical_columns = [
    name for name in category_dtype_columns if not model_data[name].cat.ordered
]

# -------------- Factorize ordinal columns -------------- #
# pd.factorize returns (codes, uniques); only the integer codes are stored.
# Values are read from `data`, which still equals `model_data` for these
# columns because each one is read before it is overwritten.
for col in ordinal_columns:
    model_data[col] = pd.factorize(data[col], sort=True)[0]

# -------------- One-hot encoding, illustrated on one column -------------- #
original_data = model_data['Exterior']

# `aux_dataframe` and `encoded_data` are two names for the same object, so the
# column added below shows up under both names.
aux_dataframe = encoded_data = pd.get_dummies(original_data, drop_first=True)
aux_dataframe['Exterior'] = original_data.copy()

# Side-by-side view of the original value and its dummy columns.
# NOTE(review): this is not the last expression of the cell, so the result is
# computed and discarded rather than rendered.
aux_dataframe.head().T

# -------------- One-hot encode the whole frame -------------- #
# With no `columns` argument, get_dummies encodes the object/string/category
# columns still present; drop_first=True omits one level per encoded column.
model_data = pd.get_dummies(model_data, drop_first=True)

# For every unordered categorical, collect the (quoted) names of the dummy
# columns generated from it, matched by the "<column>_" prefix.
# NOTE(review): `dummies` and `dummies_str` are overwritten on each pass and
# are not used in the cells visible here; only the last column's values remain.
for cat in categorical_columns:
    prefix = cat + "_"
    dummies = [
        f'"{name}"' for name in model_data.columns if name.startswith(prefix)
    ]
    dummies_str = ', '.join(dummies)

In [10]:
# Experiment settings, named so they are easy to find and tune.
# (Ridge is imported in the notebook's import cell together with the other
# scikit-learn names.)
TEST_SIZE = 0.25      # fraction of rows held out for evaluation
RANDOM_STATE = 42     # fixed seed so the split is the same on every run
RIDGE_ALPHA = 0.01    # `alpha` passed to Ridge

# Separate the target column from the predictors.
X = model_data.drop(columns=['SalePrice']).copy()
y = model_data['SalePrice'].copy()

Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Fit on the training split only; predict on the held-out split.
model = Ridge(alpha=RIDGE_ALPHA)
model.fit(Xtrain, ytrain)
ypred = model.predict(Xtest)

# Root-mean-squared error on the held-out split, in the same units as the
# SalePrice column stored in model_data.
RMSE = np.sqrt(mean_squared_error(ytest, ypred))
RMSE

0.06112328105082156

In [11]:
# Convert the RMSE into a percentage: 10**RMSE is read as a multiplicative
# error factor, and the excess over 1 is expressed in percent.
# NOTE(review): assumes SalePrice is stored as log10(price) — TODO confirm
# against the data-cleaning notebook.
error_factor = 10 ** RMSE
error_percent = 100 * (error_factor - 1)
print(f'Average error is {error_percent:.2f}%')

Average error is 15.11%
