## Import necessary libraries

In [144]:
# Clear memory
%reset -f

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')


In [145]:
# System imports
import sys
import importlib
from pathlib import Path

# Visualization imports
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# Project setup
project_root = Path().resolve().parent
sys.path.append(str(project_root))

# Project modules
MODULES = [
    'iowa_dream.utils',
    'iowa_dream.data',
    'iowa_dream.feature_engineering',
    'iowa_dream.feature_engineering.lot_frontage_imputer',
    'iowa_dream.evaluation',
    'iowa_dream.evaluation.metrics_plot'
]

# Reload modules
for module in MODULES:
    if module in sys.modules:
        importlib.reload(sys.modules[module])
    else:
        __import__(module)

# Project imports
from iowa_dream.data.importer import load_config
from iowa_dream.data.loader import preliminary_loader
from iowa_dream.utils.sample_split import create_sample_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from glum import GeneralizedLinearRegressor, GeneralizedLinearRegressorCV
from iowa_dream.feature_engineering.lot_frontage_imputer import LotFrontageGroupMedianImputer
from iowa_dream.feature_engineering.add_drop_features import Add_Drop_Attributes
from iowa_dream.feature_engineering.categotical_transformer import *
from iowa_dream.feature_engineering.numerical_transformer import *
from iowa_dream.evaluation.metrics_plot import *

In [146]:
# Example usage
data_file = project_root / load_config()['kaggle']['cleaned_path'] / 'cleaned_AmesHousing.csv'
df = preliminary_loader(data_file)

In [147]:
# Get data dictionary from config
config = load_config()
cleaned_data_dict = config['cleaned_data_dict']

# Extract feature groups
ordinal_features = cleaned_data_dict['ordinal']['columns']
nominal_features = cleaned_data_dict['nominal']['columns'] 
discrete_features = cleaned_data_dict['discrete']['columns']
continuous_features = cleaned_data_dict['continuous']['columns']
proximity_data = {
    neighborhood: group['category'] 
    for group in config['university_proximity']
    for neighborhood in group['neighborhoods']
}
glm_data_dict = config['glm_data_dict']
try:
    glm_ordinal_features = glm_data_dict['categorical']['ordinal']['columns']
    glm_nominal_features = glm_data_dict['categorical']['nominal']['columns']
    glm_numerical_features = glm_data_dict['numerical']['columns']
except KeyError as e:
    print(f"Error accessing data dictionary structure: {e}")
    # Use default feature lists from earlier if glm structure not found
    glm_ordinal_features = cleaned_data_dict['ordinal']['columns']
    glm_nominal_features = cleaned_data_dict['nominal']['columns']
    glm_numerical_features = continuous_features + discrete_features

In [148]:
df = create_sample_split(df, 'pid')
train_df = df[df['sample'] == 'train']
test_df = df[df['sample'] == 'test']
y = df['saleprice']

# Separate features (X) and target (y)
X_train = train_df.drop(['saleprice', 'sample', 'pid'], axis=1)
y_train = train_df['saleprice']
X_test = test_df.drop(['saleprice', 'sample', 'pid'], axis=1)
y_test = test_df['saleprice']

In [149]:
X_train.columns

Index(['lot_frontage', 'lot_area', 'lot_config', 'neighborhood', 'overall_qu',
       'overall_cond', 'year_blt', 'exterior_1st', 'mas_vnr_type', 'exter_qu',
       'foundation', 'bsmt_qu', 'bsmt_exposure', 'bsmt_unf_sf',
       'total_bsmt_sf', 'heating_qu', '2nd_flr_sf', 'gr_liv_area', 'full_bath',
       'half_bath', 'bedroom_abvgr', 'kitchen_qu', 'fireplaces',
       'fireplace_qu', 'garage_area', 'wood_deck_sf', 'mo_sold', 'year_sold'],
      dtype='object')

# 0. Baseline median predictor

#1. Median predictor

#1. Baseline GLM

In [150]:
baseline_preprocessor = ColumnTransformer(
    transformers=[
        (
            "group_impute",
            LotFrontageGroupMedianImputer(
                group_cols=['neighborhood', 'lot_config'],
                target_col='lot_frontage'
            ),
            ['neighborhood', 'lot_config', 'lot_frontage'],
        ),
        ("cat", OneHotEncoder(sparse_output=False, drop="first", handle_unknown='ignore'), nominal_features),
    ]
)
baseline_preprocessor.set_output(transform="pandas")
baseline_GLM_model_pipeline = Pipeline(
    [
        ("preprocess", baseline_preprocessor),
        (
            "estimate",
            GeneralizedLinearRegressor(
                family='gamma', l1_ratio=1, fit_intercept=True
            ),
        ),
    ]
)
baseline_GLM_model_pipeline.fit(X_train, y_train)
reevaluate_models([baseline_GLM_model_pipeline], X_train, y_train)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,39739.53,21.92%,16.47%,19772.31,0.75


This is already quite good :), indicating that our model is capturing quite well the pattern in the data. 

# 2. GLM with combined (added features) and processed data

In [151]:
# Preprocessing pipeline for numerical features
numerical_pipeline = Pipeline(steps=[
    ('winsorized_scaler', WinsorizedRobustScaler(range_min=10, range_max=99))
])

# Preprocessing pipeline for ordinal features
ordinal_pipeline = Pipeline(steps=[
    ('ordinal_merger', OrdinalMerger(min_obs=10))
])

# Preprocessing pipeline for nominal features
nominal_pipeline = Pipeline(steps=[
    ('nominal_grouper', NominalGrouper(min_obs=10)),
    ('onehot', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'))
])

# Combine preprocessing pipelines
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, glm_numerical_features),
    ('ord', ordinal_pipeline, glm_ordinal_features),
    ('nom', nominal_pipeline, glm_nominal_features)
])

# Full pipeline
glm_1_pipeline = Pipeline(steps=[
    ('imputer', LotFrontageGroupMedianImputer(group_cols=['neighborhood', 'lot_config'], target_col='lot_frontage')),
    ('feature_add_drop', Add_Drop_Attributes(proximity_data=proximity_data)),
    ('preprocessor', preprocessor),
    ('glm', GeneralizedLinearRegressor(family='gamma', link='log', l1_ratio=1, fit_intercept=True))  # GeneralizedLinearRegressor equivalent for gamma family
])

glm_1_pipeline

In [152]:
# Fit and evaluate
glm_1_pipeline.fit(X_train, y_train)
reevaluate_models([baseline_GLM_model_pipeline, glm_1_pipeline], X_train, y_train)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,39739.53,21.92%,16.47%,19772.31,0.75
Model 2,21330.47,11.77%,9.21%,10413.38,0.93


In [153]:
reevaluate_models([baseline_GLM_model_pipeline,  glm_1_pipeline], X_test, y_test)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,44376.3,25.23%,16.92%,18936.76,0.64
Model 2,22821.78,12.98%,9.37%,10450.47,0.9


In [154]:
from sklearn.preprocessing import PolynomialFeatures

# Define interaction terms
interaction_features = [
    ('age', 'exter_qu'),
    ('gr_liv_area', 'overall_score'),
    ('gr_liv_area', 'neighborhood_score'), 
    ('gr_liv_area', 'age'),
    ('lot_area', 'lot_frontage'),
]

# Preprocessing pipeline for interaction terms
interaction_pipeline = Pipeline(steps=[
    ('interaction', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False))
])

# Combine preprocessing pipelines
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, glm_numerical_features),
    ('ord', ordinal_pipeline, glm_ordinal_features),
    ('nom', nominal_pipeline, glm_nominal_features),
    ('interaction', interaction_pipeline, [f[0] for f in interaction_features] + [f[1] for f in interaction_features])
])

# Full pipeline
glm_2_pipeline = Pipeline(steps=[
    ('imputer', LotFrontageGroupMedianImputer(group_cols=['neighborhood', 'lot_config'], target_col='lot_frontage')),
    ('feature_add_drop', Add_Drop_Attributes(proximity_data=proximity_data)),
    ('preprocessor', preprocessor),
    ('glm', GeneralizedLinearRegressor(family='gamma', link='log', l1_ratio=1, fit_intercept=True))
])

glm_2_pipeline

In [155]:
# Fit and evaluate
glm_2_pipeline.fit(X_train, y_train)
reevaluate_models([baseline_GLM_model_pipeline, glm_1_pipeline, glm_2_pipeline], X_train, y_train)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,39739.53,21.92%,16.47%,19772.31,0.75
Model 2,21330.47,11.77%,9.21%,10413.38,0.93
Model 3,20120.58,11.10%,8.44%,9597.33,0.94


In [156]:
reevaluate_models([baseline_GLM_model_pipeline, glm_1_pipeline, glm_2_pipeline], X_test, y_test)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,44376.3,25.23%,16.92%,18936.76,0.64
Model 2,22821.78,12.98%,9.37%,10450.47,0.9
Model 3,22197.94,12.62%,8.82%,10352.69,0.91
