In [787]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import scipy.stats as stats
import os
import mlflow
import mlflow.sklearn
import pickle
import dagshub
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import mstats
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from scipy.stats import skew
from scipy.stats import zscore
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt
import pprint

In [788]:
# Connect to the DagsHub-hosted MLflow tracking server (no data is downloaded here).
dagshub.init(repo_owner='egval20', repo_name='ml_Assignment1', mlflow=True)

# Point MLflow at the remote server BEFORE selecting the experiment, so that
# 'House Prices' is looked up / created on that server rather than on whatever
# tracking URI happened to be active when set_experiment was called.
mlflow.set_tracking_uri('https://dagshub.com/egval20/ml_Assignment1.mlflow')
mlflow.set_experiment('House Prices')


# Feature Selection

Drop the `Id` column, then remove columns that are near-constant, mostly missing, or mostly zero.

In [789]:
def remove_columns(X, threshold=0.9, missing_threshold=0.95, zero_threshold=0.9):
    """Drop the ``Id`` column and columns that carry little information.

    Filters are applied in this order, each on the result of the previous one:

    1. ``Id`` is dropped if present.
    2. Columns whose most frequent non-missing value has a share of at least
       ``threshold`` are dropped.
    3. Columns with fewer than ``missing_threshold * len(X)`` non-missing
       values are dropped.
    4. Columns whose share of entries equal to 0 is at least ``zero_threshold``
       are dropped.
    5. Columns with at most one distinct non-missing value are dropped.

    The shape of the frame is printed after each stage.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 0.9
        Minimum share of the dominant value for a column to be dropped.
    missing_threshold : float, default 0.95
        Minimum fraction of non-missing values a column needs to be kept.
    zero_threshold : float, default 0.9
        Minimum share of zeros for a column to be dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the surviving columns.
    """
    print("Original shape:", X.shape)

    # errors="ignore": the frame may already have had Id removed.
    X = X.drop(columns=["Id"], errors="ignore")

    # near-constant columns; value_counts sorts descending, so iloc[0] is the
    # share of the dominant value (missing values are not counted)
    dominant_cols = []
    for col in X.columns:
        shares = X[col].value_counts(normalize=True)
        if not shares.empty and shares.iloc[0] >= threshold:
            dominant_cols.append(col)
    X = X.drop(columns=dominant_cols)
    print(X.shape)

    # many none: dropna's `thresh` is a count of required non-missing values,
    # so the fraction is converted to an integer count (rounded up)
    min_non_null = int(np.ceil(missing_threshold * len(X)))
    X = X.dropna(axis=1, thresh=min_non_null)
    print(X.shape)

    # many zeros
    zero_proportion = (X == 0).mean()
    mostly_zero_cols = zero_proportion[zero_proportion >= zero_threshold].index
    X = X.drop(columns=mostly_zero_cols)

    print(f"Updated shape: {X.shape}")

    # anything left with a single distinct value cannot help a model
    X = X.loc[:, X.nunique() > 1]
    print(X.shape)
    return X

In [790]:
def domain_knowledge_remove_columns(X):
    """Keep a hand-picked subset of columns, one or two per feature group.

    Every column not listed below is discarded by the final selection, so no
    separate drop step is needed. ``SalePrice`` is kept when the frame has it
    and skipped otherwise, which lets the same function run on frames without
    the target column. The selected shape and column list are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the selected columns.

    Raises
    ------
    KeyError
        If any of the selected feature columns is missing from ``X``.
    """
    # Representative(s) kept from each group of related columns:
    # structural: GrLivArea, TotalBsmtSF, 1stFlrSF, 2ndFlrSF, BsmtFinSF1
    keep_structural = ['GrLivArea']
    # quality: OverallQual, ExterQual, KitchenQual, BsmtQual
    keep_quality = ['OverallQual']
    # time: YearBuilt, YearRemodAdd, MoSold, YrSold
    keep_time = ['YearBuilt']
    # counts: BedroomAbvGr, FullBath, HalfBath, TotRmsAbvGrd, Fireplaces
    keep_counts = ['BedroomAbvGr', 'FullBath']
    # garage: GarageCars, GarageArea
    keep_garage = ['GarageCars']
    # external: LotArea, LotShape, LandContour, LotConfig, WoodDeckSF, OpenPorchSF, EnclosedPorch
    keep_external = ['LotArea']
    # categorical: MSSubClass, MSZoning, BldgType, HouseStyle, RoofStyle, Foundation, SaleType, SaleCondition
    keep_categorical = ['BldgType', 'SaleCondition']

    features_to_keep = (
        keep_structural + keep_quality + keep_time + keep_counts
        + keep_garage + keep_external + keep_categorical
    )
    # The target is only present in frames that carry SalePrice.
    if 'SalePrice' in X.columns:
        features_to_keep.append('SalePrice')

    # .copy() so later in-place column assignments do not touch (or warn about) the input
    X = X[features_to_keep].copy()
    print(X.shape)
    print(features_to_keep)
    return X

# Feature Engineering

In [791]:
def map_ordinal(df, features, mapping):
    """Encode the given columns of ``df`` through ``mapping``, in place.

    Each column named in ``features`` is passed through ``Series.map(mapping)``;
    entries that come back missing (no match in ``mapping``) are set to 0.
    The same DataFrame object is modified and returned.
    """
    for column in features:
        encoded = df[column].map(mapping)
        df[column] = encoded.fillna(0)
    return df

In [792]:
def tranfer_ordinal(X):
    """Replace every object-dtype column of ``X`` with integer codes, in place.

    For each object column the distinct non-missing values are sorted and
    numbered from 0 in that order; missing values have no code and stay missing.
    The int64/float64 column names, the object column names, and the object
    column names remaining after encoding are printed. Returns the same
    DataFrame object that was passed in.
    """
    numeric_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)
    object_cols = list(X.select_dtypes(include=["object"]).columns)

    print(numeric_cols)
    print(object_cols)

    for name in object_cols:
        labels = sorted(X[name].dropna().unique())
        # label -> position in sorted order
        codes = dict(zip(labels, range(len(labels))))
        X[name] = X[name].map(codes)

    remaining_object_cols = list(X.select_dtypes(include=["object"]).columns)
    print(remaining_object_cols)
    return X

### Outlier Handling

In [793]:
def outlier_handling(X, z_threshold=3):
    """Drop rows that have an outlier in any numeric column.

    A value is an outlier when its absolute z-score, computed per numeric
    column, is at least ``z_threshold``. Missing values are left out of the
    mean / standard deviation and never mark a row as an outlier; the same
    goes for columns whose z-score is undefined (e.g. constant columns).
    Without this, a single missing value would turn the whole column's
    z-scores into NaN and every row would be discarded.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified. Non-numeric columns are ignored when
        scoring but kept in the result.
    z_threshold : float, default 3
        Absolute z-score at or above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        A new frame with the outlier rows removed.
    """
    numeric = X.select_dtypes(include=[np.number])
    if numeric.shape[1] == 0 or len(numeric) == 0:
        # nothing to score
        return X.copy()

    values = numeric.to_numpy(dtype=float, na_value=np.nan)
    # nan_policy="omit": NaNs stay NaN in the output but do not affect the
    # statistics used for the other entries of the column
    z_scores = np.abs(stats.zscore(values, axis=0, nan_policy="omit"))

    # NaN >= threshold is False, so undefined scores never flag a row
    with np.errstate(invalid="ignore"):
        is_outlier = z_scores >= z_threshold

    return X[~is_outlier.any(axis=1)]

### Handling Missing Values

In [794]:
def missing_value_hanlding(X):
    """Fill missing values in numeric columns with that column's median.

    Medians are computed over numeric columns only, so a frame that still
    contains non-numeric columns does not raise; missing values in those
    columns are left as they are. A numeric column with no observed values
    has no median and therefore also stays missing.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with numeric gaps filled.
    """
    column_medians = X.median(numeric_only=True)
    return X.fillna(column_medians)

In [795]:
# NOTE: this cell reads `train_data` and `X`, which are created by the data
# preparation cell in the "Training" section - run that cell first.

# Cut-off logged as "threshold_same_values". No notebook-level `threshold`
# variable is defined (it only exists as a parameter of remove_columns), so the
# value is spelled out here, matching remove_columns' default.
SAME_VALUE_THRESHOLD = 0.9

with mlflow.start_run(run_name="Feature_Selection_Model"):
    # "- 2": presumably excludes Id and SalePrice from the count - TODO confirm
    mlflow.log_param("initial_features_count", train_data.shape[1] - 2)
    mlflow.log_param("threshold_same_values", SAME_VALUE_THRESHOLD)
    mlflow.log_param("threshold_missing_values", 0.95)
    mlflow.log_param("outlier_z_score_threshold", 3)
    mlflow.log_param("final_features_count", X.shape[1])
    mlflow.log_param("final_features", list(X.columns))
    mlflow.log_param("samples_after_filtering", X.shape[0])
    

🏃 View run Feature_Selection_Model at: https://dagshub.com/egval20/ml_Assignment1.mlflow/#/experiments/0/runs/f12ad488ef3d4d689d8cf4cea0110317
🧪 View experiment at: https://dagshub.com/egval20/ml_Assignment1.mlflow/#/experiments/0


# Training

In [796]:
# Read the train and test CSV files; only the training frame is processed below.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Preparation steps, applied in order to a copy of the training frame:
# column selection -> ordinal encoding -> outlier removal -> median imputation.
X = (
    train_data.copy()
    .pipe(domain_knowledge_remove_columns)
    .pipe(tranfer_ordinal)
    .pipe(outlier_handling)
    .pipe(missing_value_hanlding)
)
print(X.shape)
print(X.columns)

(1460, 10)
['GrLivArea', 'OverallQual', 'YearBuilt', 'BedroomAbvGr', 'FullBath', 'GarageCars', 'LotArea', 'BldgType', 'SaleCondition', 'SalePrice']
['GrLivArea', 'OverallQual', 'YearBuilt', 'BedroomAbvGr', 'FullBath', 'GarageCars', 'LotArea', 'SalePrice']
['BldgType', 'SaleCondition']
[]
(1303, 10)
Index(['GrLivArea', 'OverallQual', 'YearBuilt', 'BedroomAbvGr', 'FullBath',
       'GarageCars', 'LotArea', 'BldgType', 'SaleCondition', 'SalePrice'],
      dtype='object')


In [799]:
def build_preprocessor():
    """Return a new, unfitted median-imputation + standard-scaling pipeline."""
    return Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])


preprocessor = build_preprocessor()

# Linear baselines. Defined here but not fitted in this cell; the loop below
# only evaluates `overfitting_models`.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5)
}

# High-capacity models.
overfitting_models = {
    "Polynomial Regression (Degree 5)": Pipeline([
        ('poly', PolynomialFeatures(degree=5)),
        ('scaler', StandardScaler()),
        ('linear', LinearRegression())
    ]),

    "Random Forest (Deep Trees)": RandomForestRegressor(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features=None,
        random_state=42
    ),

    "Neural Network (Complex)": MLPRegressor(
        hidden_layer_sizes=(100, 100, 100),
        max_iter=1000,
        activation='relu',
        solver='adam',
        random_state=42
    )
}


# name -> validation metrics
results = {}

X_features = X.drop('SalePrice', axis=1)
y = X['SalePrice']

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X_features, y, test_size=0.2, random_state=42)

for name, model in overfitting_models.items():
    # Each model gets its own preprocessor: Pipeline.fit fits its steps in
    # place, so sharing one instance would let a later fit overwrite the
    # fitted state held by an earlier pipeline.
    pipeline = Pipeline([
        ("preprocess", build_preprocessor()),
        ("model", model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    # Evaluate the model on the held-out validation split
    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_val, y_pred)

    results[name] = {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R2': r2
    }

    pprint.pprint(f"Name: {name}")
    pprint.pprint(results[name])

'Name: Polynomial Regression (Degree 5)'
{'MAE': 47863796.525396064,
 'MSE': 4.002111975789436e+16,
 'R2': -9465256.77367763,
 'RMSE': 200052792.427135}
'Name: Random Forest (Deep Trees)'
{'MAE': 18492.960993431854,
 'MSE': 750075770.4480399,
 'R2': 0.8226021520629716,
 'RMSE': 27387.5112131066}
'Name: Neural Network (Complex)'
{'MAE': 17544.647367488193,
 'MSE': 659865470.3582209,
 'R2': 0.8439374807433372,
 'RMSE': 25687.846744291764}
