In [498]:
# Clear memory
%reset -f

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')


In [499]:
# System imports
import sys
import importlib
from pathlib import Path

# Visualization imports
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# Project setup
project_root = Path().resolve().parent
sys.path.append(str(project_root))

# Project modules
MODULES = [
    'iowa_dream.data',
    'iowa_dream.utils.describer',
    'iowa_dream.feature_engineering',
    'iowa_dream.feature_engineering.add_drop_features',
    'iowa_dream.utils.plotting_EDA',
    'iowa_dream.feature_engineering.categotical_transformer',
    'iowa_dream.feature_engineering.numerical_transformer',
]

# Reload modules
for module in MODULES:
    if module in sys.modules:
        importlib.reload(sys.modules[module])
    else:
        __import__(module)

# Project imports
from iowa_dream.data.importer import load_config
from iowa_dream.data.loader import preliminary_loader
from iowa_dream.utils.describer import (
    categorical_describer,
    numerical_describer, 
    analyze_categorical_sparsity
)
from iowa_dream.utils.plotting_EDA import (
    plot_missing_data_heatmap,
    box_plot_dist,
    output_distribution_plotting,
    plot_target_over_time,
    plot_feature_distributions_interactive,
    plot_feature_target_relationships_interactive,
    plot_price_vs_sale_condition,
    plot_cramer_v_associations,
    anova_categorical_feature_importance,
    plot_numerical_correlation_matrix
)
from sklearn.preprocessing import OneHotEncoder
from glum import GeneralizedLinearRegressor
from iowa_dream.feature_engineering.lot_frontage_imputer import *
from iowa_dream.feature_engineering.add_drop_features import *
from iowa_dream.feature_engineering.categotical_transformer import *
from iowa_dream.feature_engineering.numerical_transformer import *


In [500]:
# Example usage
data_file = project_root / load_config()['kaggle']['cleaned_path'] / 'cleaned_AmesHousing.csv'
df = preliminary_loader(data_file, standardize=False)

In [501]:
# Extract feature column names for different data types
config = load_config()
data_dict = config['data_dict']
preliminary_dropped_features = config['preliminary_dropped_features']

# Extract column lists for each data type, excluding dropped features
ordinal = [col for col in data_dict['ordinal_columns']['columns'] 
          if col not in preliminary_dropped_features]
nominal = [col for col in data_dict['nominal_columns']['columns']
          if col not in preliminary_dropped_features] 
continuous = [col for col in data_dict['continuous_columns']['columns']
             if col not in preliminary_dropped_features]
discrete = [col for col in data_dict['discrete_columns']['columns'] 
           if col not in preliminary_dropped_features]

# Create mapping of neighborhoods to their university proximity category
proximity_data = {
    neighborhood: group['category'] 
    for group in config['university_proximity']
    for neighborhood in group['neighborhoods']
}

In [503]:
# Initialize and apply the GroupMedianImputer for lot_frontage
lot_frontage_imputer = GroupMedianImputer(
    group_cols=['neighborhood', 'lot_config'],
    target_col='lot_frontage'
)

# Transform the data
df = lot_frontage_imputer.fit_transform(df)

# Verify no missing values remain in lot_frontage
print(f"Missing values in lot_frontage after imputation: {df['lot_frontage'].isna().sum()}")

# Initialize transformers
add_numerical = AddAttributes_Numerical(add_attributes=True)

# Apply transformations
df_transformed = add_numerical.fit_transform(df)

add_ordinal = AddAttributes_Ordinal(add_attributes=True, proximity_data=proximity_data)
df_transformed = add_ordinal.fit_transform(df_transformed)

drop_features = DropFeatures(col_drop=preliminary_dropped_features)
df_transformed = drop_features.fit_transform(df_transformed)

# Compare columns
dropped_cols = set(df.columns) - set(df_transformed.columns)
new_cols = set(df_transformed.columns) - set(df.columns)

# Initialize numerical transformer
continuous = continuous + ['pct_finished_bsmt_sf', 'pct_half_bath', 'timing_remodel_index', 'total_area']

numerical_transformer = RobustScalerWithIndicator()

# Apply transformation
df_transformed_numerical = numerical_transformer.fit_transform(df_transformed[continuous+discrete])

# Replace original columns with transformed ones
df_transformed = df_transformed.drop(columns=continuous+discrete)
df_transformed = pd.concat([df_transformed, df_transformed_numerical], axis=1)

# Apply nominal transformer
nominal_transformer = NominalTransformer()
df_transformed_nominal = nominal_transformer.fit_transform(df_transformed[nominal])

# Apply ordinal merger
ordinal = ordinal + ['exterior_quality_score', 'interior_quality_score', 'university_proximity_category']
ordinal_merger = OrdinalMerger(min_obs=int(0.01 * len(df_transformed)))  # 1% of data size
df_transformed_ordinal = ordinal_merger.fit_transform(df_transformed[ordinal])

# Replace original columns with transformed ones
df_transformed = df_transformed.drop(columns=nominal + ordinal)
df_transformed = pd.concat([df_transformed, df_transformed_nominal, df_transformed_ordinal], axis=1)

# Get columns that contain 'zero' in their name
zero_indicator = [col for col in df_transformed.columns if 'zero' in col.lower()]

# Initialize encoders for both nominal and ordinal variables
nominal_encoder = OneHotEncoder(sparse_output=False, drop='first')  # Drop first category to avoid multicollinearity
ordinal_encoder = OneHotEncoder(sparse_output=False, drop='first')

# Fit and transform nominal variables
nominal_encoded = pd.DataFrame(
    nominal_encoder.fit_transform(df_transformed[nominal]),
    columns=nominal_encoder.get_feature_names_out(nominal),
    index=df_transformed.index
)

# Fit and transform ordinal variables 
ordinal_encoded = pd.DataFrame(
    ordinal_encoder.fit_transform(df_transformed[ordinal]),
    columns=ordinal_encoder.get_feature_names_out(ordinal),
    index=df_transformed.index
)

# Replace original nominal and ordinal columns with encoded ones
df_transformed = df_transformed.drop(columns=nominal + ordinal)
df_transformed = pd.concat([df_transformed, nominal_encoded, ordinal_encoded], axis=1)


# Initialize GLM with Gamma distribution and log link function
glm = GeneralizedLinearRegressor(family='gamma', link='log')



Dropped columns:
['bldg_type', 'garage_area', 'garage_year_blt', 'house_style', 'pool_qu', 'utilities']

New columns:
['exterior_quality_score', 'interior_quality_score', 'pct_finished_bsmt_sf', 'pct_half_bath', 'timing_remodel_index', 'total_area', 'university_proximity_category']


In [508]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

# Select top 50 features based on mutual information
selector = SelectKBest(score_func=mutual_info_regression, k=50)

# Prepare X and y
X = df_transformed.drop('saleprice', axis=1)
y = df_transformed['saleprice']

# Fit and transform
X = pd.DataFrame(
    selector.fit_transform(X, y),
    columns=X.columns[selector.get_support()],
    index=X.index
)

# Print selected features
print("\nSelected features:")
print(sorted(list(X.columns)))


Selected features:
['1st_flr_sf', '2nd_flr_sf', 'bsmt_qu_2', 'bsmt_qu_4', 'bsmt_qu_6', 'bsmt_unf_sf', 'bsmtfin_sf_1', 'bsmtfin_type_1_6', 'exter_qu_2', 'exterior_1st_VinylSd', 'fireplace_qu_4', 'fireplaces', 'fireplaces_zero_indicator', 'foundation_CBlock', 'foundation_PConc', 'full_bath', 'garage_cars', 'garage_finish_2', 'garage_finish_3', 'garage_finish_4', 'garage_qu_2', 'garage_type_Attchd', 'garage_type_Detchd', 'gr_liv_area', 'heating_qu_5', 'kitchen_qu_3', 'kitchen_qu_5', 'lot_area', 'lot_frontage', 'mas_vnr_area', 'mas_vnr_area_zero_indicator', 'mas_vnr_type_nan', 'ms_subclass_60', 'open_porch_sf', 'open_porch_sf_zero_indicator', 'order', 'overall_cond_5', 'overall_qu_5', 'overall_qu_7', 'overall_qu_8', 'overall_qu_9', 'pct_half_bath', 'pid', 'sale_type_New', 'timing_remodel_index', 'total_area', 'total_bsmt_sf', 'totrms_abvgr', 'wood_deck_sf', 'year_blt_51+']
