In [30]:
import os
import notebook
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import re

# Folder that holds train.csv / test.csv. The default is the original local
# checkout; set the HOUSE_PRICE_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
directory = os.environ.get(
    'HOUSE_PRICE_DATA_DIR',
    'c:\\Users\\Callsoom\\Documents\\ML-Getting-Started\\house_price_prediction',
)
train_data_path = os.path.join(directory, 'train.csv')
test_data_path = os.path.join(directory, 'test.csv')

# Keep the frames as read from disk under *_raw names; later cells derive
# cleaned copies from them.
train_data_raw = pd.read_csv(train_data_path)
test_data_raw = pd.read_csv(test_data_path)

In [31]:

def pythonize_field_names(field_names):
    """Convert CamelCase-style field names to lowercase snake_case.

    Every run of consecutive ASCII capitals starts a new word: the run is
    lowercased and prefixed with an underscore, all other text is lowercased,
    and underscores at the very start of the result are stripped. A run of
    capitals is never split, so it fuses with the lowercase letters that
    follow it (``'MSSubClass'`` becomes ``'mssub_class'``).

    Args:
        field_names: Iterable of strings, e.g. ``list(df.columns)``.

    Returns:
        A new list with one converted name per input name, in input order.
        An empty name maps to an empty string instead of raising.
    """
    pythonized_names = []

    for name in field_names:
        # The capturing group makes re.split keep every uppercase run as its
        # own part, alternating with the text between the runs.
        parts = re.split('([A-Z]+)', name)
        new_name = [
            '_' + part.lower() if part.isupper() else part.lower()
            for part in parts
        ]
        # A name that starts with a capital picks up a leading underscore
        # above; strip it so 'SalePrice' yields 'sale_price'.
        pythonized_names.append(''.join(new_name).lstrip('_'))

    return pythonized_names

In [32]:
def fill_missing(df, stats_from=None):
    """Return a copy of ``df`` with a fixed set of columns imputed.

    Fill rules:
      * the listed categorical columns get the literal string ``'no'``
        (presumably "feature not present" - confirm against the data
        dictionary),
      * ``electrical`` gets the most frequent value,
      * ``mas_vnr_area`` gets ``0`` and ``garage_yr_blt`` gets ``-1``,
      * ``lot_frontage`` gets the column mean.

    Columns that are not listed are left untouched, including any missing
    values they contain.

    Args:
        df: DataFrame to impute. Must contain ``electrical`` and
            ``lot_frontage`` unless ``stats_from`` is given.
        stats_from: Optional DataFrame from which the ``electrical`` mode and
            the ``lot_frontage`` mean are taken (for example the training
            frame when imputing a test frame). Defaults to ``df`` itself.

    Returns:
        A new DataFrame; ``df`` is not modified.

    Raises:
        KeyError: If the frame supplying the statistics lacks ``electrical``
            or ``lot_frontage``.
    """
    source = df if stats_from is None else stats_from

    no_marker_cols = [
        'bsmt_qual', 'bsmt_cond', 'alley', 'bsmt_fin_type2', 'garage_type',
        'bsmt_fin_type1', 'mas_vnr_type', 'fireplace_qu', 'bsmt_exposure',
        'fence', 'garage_cond', 'misc_feature', 'pool_qc', 'garage_qual',
        'garage_finish',
    ]
    fillers = dict.fromkeys(no_marker_cols, 'no')
    fillers['mas_vnr_area'] = 0
    fillers['garage_yr_blt'] = -1
    fillers['lot_frontage'] = source['lot_frontage'].mean()

    # mode() returns an empty Series when every value is missing; indexing it
    # would raise, so in that case the column is simply left as it is.
    electrical_modes = source['electrical'].mode()
    if not electrical_modes.empty:
        fillers['electrical'] = electrical_modes.iloc[0]

    # fillna without inplace already returns a new frame, so no explicit copy
    # is needed to protect the caller's data.
    return df.fillna(value=fillers)

def feature_engineering(df, reference_year=2024):
    """Return a copy of ``df`` with a derived ``property_age`` column.

    ``property_age`` is ``reference_year - year_built`` for every row.

    Args:
        df: DataFrame containing a ``year_built`` column.
        reference_year: Year the age is measured from. Defaults to 2024, the
            value that was previously hard-coded.

    Returns:
        A new DataFrame; ``df`` is not modified.

    Raises:
        KeyError: If ``df`` has no ``year_built`` column.
    """
    # assign() works on a copy, so the caller's frame stays untouched.
    return df.assign(property_age=reference_year - df['year_built'])

def preprocessing_pipeline(numerical_cols, catagorical_cols):
    """Build the unfitted column-wise preprocessor used in front of the model.

    Args:
        numerical_cols: Column names routed through a ``StandardScaler``.
        catagorical_cols: Column names routed through a dense
            ``OneHotEncoder`` that ignores categories unseen during fit.
            (The spelling of this parameter name is kept for existing callers.)

    Returns:
        A ``ColumnTransformer`` with a ``'num'`` and a ``'cat'`` branch. Only
        the columns named in the two lists are routed to a transformer.
    """
    scale_numeric = Pipeline(steps=[('scaler', StandardScaler())])
    encode_categorical = Pipeline(
        steps=[('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]
    )

    branches = [
        ('num', scale_numeric, numerical_cols),
        ('cat', encode_categorical, catagorical_cols),
    ]
    return ColumnTransformer(transformers=branches)

In [33]:
# Rename the raw columns to snake_case; every later lookup uses those keys.
train_data_raw.columns = pythonize_field_names(train_data_raw.columns.tolist())
test_data_raw.columns = pythonize_field_names(test_data_raw.columns.tolist())

# Impute first, then derive features. Both helpers return new frames, so the
# *_raw frames keep their original values.
train_data = feature_engineering(fill_missing(train_data_raw))
test_data = feature_engineering(fill_missing(test_data_raw))

# Column lists for the preprocessor. 'sale_price' (the target) and 'id' are
# taken out of the numeric list so they are never used as features.
categorical_cols = train_data.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols.remove('sale_price')
numerical_cols.remove('id')

# Target vector and feature matrix.
y = train_data['sale_price']
X = train_data.drop(columns=['sale_price', 'id'])

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing and the ridge regressor live in one pipeline, so the scaler and
# encoder are fitted only on whatever data the pipeline is fitted on.
preprocessor = preprocessing_pipeline(numerical_cols, categorical_cols)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0)),
])
model.fit(X_train, y_train)

# Hold-out performance.
val_predictions = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
print("Validation RMSE:", rmse)

# 5-fold cross-validation on the full training frame. The scorer returns the
# negated MSE per fold; the figure below is the square root of the mean MSE,
# which is not the same number as the mean of the per-fold RMSEs.
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
average_rmse = np.sqrt(-scores.mean())
print("Average RMSE from cross-validation:", average_rmse)

Validation RMSE: 29742.679559040822
Average RMSE from cross-validation: 33170.43478694476
