In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import scipy.stats as stats
import os
import mlflow
import mlflow.sklearn
import pickle
import dagshub
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import mstats
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from scipy.stats import skew
from scipy.stats import zscore
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt
import pprint
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import learning_curve
import re

In [2]:
# Connect to DagsHub and point MLflow at the repository's tracking server.
dagshub.init(repo_owner='egval20', repo_name='ml_Assignment1', mlflow=True)

# Configure the tracking URI *before* selecting the experiment:
# set_experiment() looks the experiment up (or creates it) on whichever
# tracking server is active at the moment it is called.
mlflow.set_tracking_uri('https://dagshub.com/egval20/ml_Assignment1.mlflow')
mlflow.set_experiment('House Prices')


# Feature Selection

Drop the `Id` column and other low-information columns.

In [5]:
def remove_columns(X, threshold=0.9, missing_threshold=0.95, zero_threshold=0.9):
    """Drop columns that carry little or no information.

    The filters are applied in this order:

    1. the ``Id`` column, when present;
    2. columns whose most frequent non-null value accounts for at least
       ``threshold`` of the non-null entries;
    3. columns with fewer than ``missing_threshold * len(X)`` non-null values;
    4. columns in which at least ``zero_threshold`` of the rows equal 0;
    5. columns with at most one distinct non-null value.

    The frame's shape is printed after each stage.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame. It is not modified; every step works on a new frame.
    threshold : float, default 0.9
        Minimum share of the dominant value for a column to be dropped.
    missing_threshold : float, default 0.95
        Minimum fraction of non-null rows a column needs in order to be kept.
    zero_threshold : float, default 0.9
        Minimum share of zeros for a column to be dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the surviving columns.
    """
    print("Original shape:", X.shape)

    # errors="ignore" lets the function run on frames that have no Id column
    # (for example when it is applied a second time).
    X = X.drop(columns=["Id"], errors="ignore")

    dominant_cols = []
    for col in X.columns:
        # value_counts() sorts by frequency, so the first entry is the mode's share.
        res = X[col].value_counts(normalize=True)
        if not res.empty and res.iloc[0] >= threshold:
            dominant_cols.append(col)
    X = X.drop(columns=dominant_cols)
    print(X.shape)

    # Mostly-missing columns. dropna() documents `thresh` as an integer count;
    # rounding up keeps the comparison identical to the fractional bound.
    min_non_null = int(np.ceil(missing_threshold * len(X)))
    X = X.dropna(axis=1, thresh=min_non_null)
    print(X.shape)

    # Mostly-zero columns.
    zero_proportion = (X == 0).mean()
    mostly_zero_cols = zero_proportion[zero_proportion >= zero_threshold].index
    X = X.drop(columns=mostly_zero_cols)

    print(f"Updated shape: {X.shape}")

    # Columns that ended up constant.
    X = X.loc[:, X.nunique() > 1]
    print(X.shape)
    return X

In [6]:
def domain_knowledge_remove_columns(X, is_train):
    """Keep only a hand-picked subset of columns.

    One or two representatives are kept from each group of related columns;
    every other column is discarded by the selection below, so no separate
    ``drop`` call is needed (and the input does not have to contain the
    discarded columns at all).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame that contains at least the kept columns listed below.
    is_train : bool
        When true, ``SalePrice`` is kept as well.

    Returns
    -------
    pandas.DataFrame
        An independent copy restricted to the kept columns, in the order
        listed below. The shape and the kept column names are printed.

    Raises
    ------
    KeyError
        If one of the kept columns is missing from ``X``.
    """
    features_to_keep = [
        'GrLivArea',                  # structural / living-area group
        'OverallQual',                # quality-rating group
        'YearBuilt',                  # time-related group
        'BedroomAbvGr', 'FullBath',   # room and bathroom counts
        'GarageCars',                 # garage group
        'LotArea',                    # lot / exterior group
        'BldgType', 'SaleCondition',  # categorical descriptors
    ]
    if is_train:
        features_to_keep.append('SalePrice')

    # .copy() makes the result independent of the caller's frame, so later
    # in-place column assignments do not act on a slice of it.
    X = X[features_to_keep].copy()
    print(X.shape)
    print(features_to_keep)
    return X

# Feature Engineering

In [8]:
def map_ordinal(df, features, mapping):
    """Replace labels with ordinal codes in the given columns.

    Every column named in ``features`` is overwritten in ``df`` with the
    result of looking its values up in ``mapping``. Values that have no
    entry in ``mapping`` (missing values included) become 0.

    ``df`` is modified in place and also returned.
    """
    for name in features:
        encoded = df[name].map(mapping)
        df[name] = encoded.fillna(0)
    return df

In [9]:
def tranfer_ordinal(X):
    """Encode every object-dtype column of ``X`` as integer codes.

    For each object column the distinct non-null values are sorted and
    numbered from 0; the column is then overwritten in ``X`` with those
    numbers. Missing values have no code and stay missing.

    The int64/float64 column names, the object column names, and the object
    column names remaining after encoding are printed. ``X`` is modified in
    place and also returned.
    """
    numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
    object_cols = X.select_dtypes(include=["object"]).columns.tolist()

    print(numeric_cols)
    print(object_cols)

    for col in object_cols:
        levels = sorted(X[col].dropna().unique())
        codes = dict(zip(levels, range(len(levels))))
        X[col] = X[col].map(codes)

    remaining = X.select_dtypes(include=["object"]).columns.tolist()
    print(remaining)
    return X

### Outlier Handling

In [11]:
def outlier_handling(X, threshold=3):
    """Drop rows that hold an outlier in any numeric column.

    A value is an outlier when the absolute z-score within its column is
    ``threshold`` or more. Z-scores are computed while ignoring missing
    values; a cell whose z-score is undefined (the value is missing, or the
    column is constant) is never treated as an outlier. Without this, one
    missing value would turn the whole column's z-scores into NaN and every
    row would be discarded.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame. Non-numeric columns are ignored when looking for
        outliers but are kept in the result.
    threshold : float, default 3
        Absolute z-score at or above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        The rows of ``X`` with no outlier, with their original index labels.
    """
    numeric = X.select_dtypes(include=[np.number])
    if numeric.empty:
        # No rows or no numeric columns: nothing can be an outlier.
        return X.copy()

    values = numeric.to_numpy(dtype=float, na_value=np.nan)
    # Constant columns produce 0/0; the resulting NaN is handled below.
    with np.errstate(invalid="ignore", divide="ignore"):
        z_scores = np.abs(stats.zscore(values, nan_policy="omit"))

    within_range = (z_scores < threshold) | np.isnan(z_scores)
    return X[within_range.all(axis=1)]

### Handling Missing Values

In [13]:
def missing_value_hanlding(X):
    """Fill missing numeric values with the median of their column.

    Only numeric columns contribute a median (``numeric_only=True``), so a
    frame that still contains non-numeric columns does not raise; missing
    values in those columns are left as they are.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the numeric gaps filled.
    """
    column_medians = X.median(numeric_only=True)
    return X.fillna(column_medians)

# Training

In [15]:
# Load the raw train and test tables.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Run the training table through the preparation steps, in order:
# column selection -> ordinal encoding -> outlier removal -> median imputation.
X = (
    train_data.copy()
    .pipe(domain_knowledge_remove_columns, True)
    .pipe(tranfer_ordinal)
    .pipe(outlier_handling)
    .pipe(missing_value_hanlding)
)
print(X.shape)
print(X.columns)

(1460, 10)
['GrLivArea', 'OverallQual', 'YearBuilt', 'BedroomAbvGr', 'FullBath', 'GarageCars', 'LotArea', 'BldgType', 'SaleCondition', 'SalePrice']
['GrLivArea', 'OverallQual', 'YearBuilt', 'BedroomAbvGr', 'FullBath', 'GarageCars', 'LotArea', 'SalePrice']
['BldgType', 'SaleCondition']
[]
(1303, 10)
Index(['GrLivArea', 'OverallQual', 'YearBuilt', 'BedroomAbvGr', 'FullBath',
       'GarageCars', 'LotArea', 'BldgType', 'SaleCondition', 'SalePrice'],
      dtype='object')


In [16]:
with mlflow.start_run(run_name="Feature_Selection_Model"):
    # Count predictors only. `initial_features_count` already leaves the
    # target out (the "- 2" removes two columns from the raw table), so the
    # final figures must exclude SalePrice as well to be comparable.
    final_features = [col for col in X.columns if col != 'SalePrice']

    mlflow.log_param("initial_features_count", train_data.shape[1] - 2)
    mlflow.log_param("threshold_same_values", 0.9)
    mlflow.log_param("threshold_missing_values", 0.95)
    mlflow.log_param("outlier_z_score_threshold", 3)
    mlflow.log_param("final_features_count", len(final_features))
    mlflow.log_param("final_features", final_features)
    mlflow.log_param("samples_after_filtering", X.shape[0])
    

🏃 View run Feature_Selection_Model at: https://dagshub.com/egval20/ml_Assignment1.mlflow/#/experiments/0/runs/1946d7323cd0422dbb4cb8dcaefa7fc7
🧪 View experiment at: https://dagshub.com/egval20/ml_Assignment1.mlflow/#/experiments/0


In [17]:
# Separate the target from the predictors, then hold out 20% for validation.
y = X['SalePrice']
X_features = X.drop(columns=['SalePrice'])

X_train, X_val, y_train, y_val = train_test_split(
    X_features,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Preprocessing used by every model pipeline below:
# median imputation followed by standardisation.
preprocessor = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [19]:
# Baseline expected to underfit: always predicts the mean training target.
underfitting_models = {
    "Constant Predictor": Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('model', DummyRegressor(strategy='constant', constant=y_train.mean())),
        ]
    ),
}

In [20]:
# Linear models on the preprocessed features: plain least squares and
# ridge (alpha=1.0).
properly_fitted_models = {
    "Linear Regression": Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('model', LinearRegression()),
        ]
    ),
    "Ridge Regression": Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('model', Ridge(alpha=1.0)),
        ]
    ),
}

In [21]:
# Model expected to overfit: unregularised linear regression on a degree-5
# polynomial expansion of the preprocessed features.
overfitting_models = {
    "Polynomial Regression (Degree 5)": Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('poly', PolynomialFeatures(degree=5)),
            ('model', LinearRegression()),
        ]
    ),
}

In [22]:
def evaluate_models(models_dict, X_train, y_train, X_val, y_val):
    """Fit each pipeline and score it on the training and validation data.

    Parameters
    ----------
    models_dict : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place on the training data.
    X_train, y_train
        Data the estimators are fitted on and scored against.
    X_val, y_val
        Held-out data the fitted estimators are scored against.

    Returns
    -------
    dict
        ``{name: {'train': ..., 'validation': ..., 'gap': ...}}``. The first
        two entries hold ``MAE``, ``MSE``, ``RMSE`` and ``R2``; ``gap`` holds
        the training value minus the validation value of each metric under
        the keys ``MAE_gap``, ``MSE_gap``, ``RMSE_gap`` and ``R2_gap``.
    """
    metric_names = ('MAE', 'MSE', 'RMSE', 'R2')

    def score(y_true, y_pred):
        # RMSE is derived from the MSE value computed here.
        mse = mean_squared_error(y_true, y_pred)
        return {
            'MAE': mean_absolute_error(y_true, y_pred),
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'R2': r2_score(y_true, y_pred),
        }

    results = {}
    for model_name, estimator in models_dict.items():
        estimator.fit(X_train, y_train)

        train_pred = estimator.predict(X_train)
        val_pred = estimator.predict(X_val)

        train_scores = score(y_train, train_pred)
        val_scores = score(y_val, val_pred)

        results[model_name] = {
            'train': train_scores,
            'validation': val_scores,
            'gap': {
                f'{metric}_gap': train_scores[metric] - val_scores[metric]
                for metric in metric_names
            },
        }
    return results

In [23]:
# Evaluate the three model groups on the same split, in this order:
# underfitting, properly fitted, overfitting.
underfit_results, proper_results, overfit_results = (
    evaluate_models(group, X_train, y_train, X_val, y_val)
    for group in (underfitting_models, properly_fitted_models, overfitting_models)
)

In [24]:
def print_model_evaluation_results(results):
    """Print a text report for every model in ``results``.

    Parameters
    ----------
    results : dict
        Maps a model name to a dict with the keys ``'train'``,
        ``'validation'`` (each providing ``RMSE``, ``MAE`` and ``R2``) and
        ``'gap'`` (providing ``RMSE_gap`` and ``R2_gap``).

    For each model the report contains a blank line, the model name, and
    three labelled sections: training metrics, validation metrics, and the
    training-minus-validation gap.
    """
    for name, metrics in results.items():
        train = metrics['train']
        validation = metrics['validation']
        gap = metrics['gap']

        print(f"\n{name}:")
        # Label the training figures so they cannot be mistaken for
        # validation figures; the other two sections carry a header too.
        print("Training Metrics:")
        print(f"RMSE: {train['RMSE']:.2f}")
        print(f"MAE: {train['MAE']:.2f}")
        print(f"R²: {train['R2']:.4f}")
        print("Validation Metrics:")
        print(f"RMSE: {validation['RMSE']:.2f}")
        print(f"MAE: {validation['MAE']:.2f}")
        print(f"R²: {validation['R2']:.4f}")
        print("Performance Gap (Training - Validation):")
        print(f"RMSE Gap: {gap['RMSE_gap']:.2f}")
        print(f"R² Gap: {gap['R2_gap']:.4f}")

In [25]:
# Report order: underfitting, overfitting, properly fitted.
for group_results in (underfit_results, overfit_results, proper_results):
    print_model_evaluation_results(group_results)


Constant Predictor:
RMSE: 65913.11
MAE: 50783.77
R²: 0.0000
Validation Metrics:
RMSE: 65698.45
MAE: 50239.81
R²: -0.0208
Performance Gap (Training - Validation):
RMSE Gap: 214.67
R² Gap: 0.0208

Polynomial Regression (Degree 5):
RMSE: 7403.42
MAE: 3342.94
R²: 0.9874
Validation Metrics:
RMSE: 588363227.34
MAE: 108254137.59
R²: -81871782.8925
Performance Gap (Training - Validation):
RMSE Gap: -588355823.92
R² Gap: 81871783.8799

Linear Regression:
RMSE: 26841.45
MAE: 20119.51
R²: 0.8342
Validation Metrics:
RMSE: 29019.60
MAE: 21429.81
R²: 0.8008
Performance Gap (Training - Validation):
RMSE Gap: -2178.14
R² Gap: 0.0333

Ridge Regression:
RMSE: 26841.51
MAE: 20115.88
R²: 0.8342
Validation Metrics:
RMSE: 29018.16
MAE: 21429.26
R²: 0.8008
Performance Gap (Training - Validation):
RMSE Gap: -2176.64
R² Gap: 0.0333


In [26]:
def log_models_to_mlflow(results):
    """Log each model's metrics to MLflow, one run per model.

    All runs are created in the 'House Prices' experiment. The run name is
    the model name with every character other than word characters,
    whitespace and hyphens removed.

    Parameters
    ----------
    results : dict
        Maps a model name to a dict with the keys ``'train'``,
        ``'validation'`` (each providing ``RMSE``, ``MAE`` and ``R2``) and
        ``'gap'`` (providing ``RMSE_gap`` and ``R2_gap``).
    """
    mlflow.set_experiment('House Prices')
    for name, metrics in results.items():
        safe_name = re.sub(r'[^\w\s-]', '', name)
        with mlflow.start_run(run_name=safe_name):
            mlflow.log_metrics({
                "train_RMSE": metrics['train']['RMSE'],
                "train_MAE": metrics['train']['MAE'],
                "train_R2": metrics['train']['R2'],
                "val_RMSE": metrics['validation']['RMSE'],
                "val_MAE": metrics['validation']['MAE'],
                "val_R2": metrics['validation']['R2'],
                "RMSE_Gap": metrics['gap']['RMSE_gap'],
                "R2_Gap": metrics['gap']['R2_gap'],
            })

            # Each model is a separate *run* inside one experiment.
            print(f"Logged {name} to MLflow as a separate run")

In [27]:
# Logging order: underfitting, overfitting, properly fitted.
for group_results in (underfit_results, overfit_results, proper_results):
    log_models_to_mlflow(group_results)

Logged Constant Predictor to MLflow as a separate experiment
🏃 View run Constant Predictor at: https://dagshub.com/egval20/ml_Assignment1.mlflow/#/experiments/0/runs/000a75e425044ef1bcf28e438293f34e
🧪 View experiment at: https://dagshub.com/egval20/ml_Assignment1.mlflow/#/experiments/0
Logged Polynomial Regression (Degree 5) to MLflow as a separate experiment
🏃 View run Polynomial Regression Degree 5 at: https://dagshub.com/egval20/ml_Assignment1.mlflow/#/experiments/0/runs/76b267d21a5449b2b6a8e218c4d79618
🧪 View experiment at: https://dagshub.com/egval20/ml_Assignment1.mlflow/#/experiments/0
Logged Linear Regression to MLflow as a separate experiment
🏃 View run Linear Regression at: https://dagshub.com/egval20/ml_Assignment1.mlflow/#/experiments/0/runs/0d832c4d75044d09bfdd73bd1a219a14
🧪 View experiment at: https://dagshub.com/egval20/ml_Assignment1.mlflow/#/experiments/0
Logged Ridge Regression to MLflow as a separate experiment
🏃 View run Ridge Regression at: https://dagshub.com/egva