In [None]:
import pandas as pd
path="train.csv"
train=pd.read_csv(path)
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
test=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/test.csv')

In [None]:
# Make copies to avoid modifying the original dataframes
train_processed = train.copy()
test_processed = test.copy()

In [None]:
# Store IDs for later
train_ID = train_processed['Id']
test_ID = test_processed['Id']


In [None]:
# Store the target variable
target = train_processed['SalePrice']
train_processed.drop(['Id', 'SalePrice'], axis=1, inplace=True)
test_processed.drop(['Id'], axis=1, inplace=True)

In [None]:
# Combine train and test for consistent preprocessing
all_data = pd.concat([train_processed, test_processed], axis=0)
print(f"Original dataset shape: {all_data.shape}")

Original dataset shape: (2919, 79)


In [None]:
# ================ 1. MISSING VALUE TREATMENT ================
print("\nSTEP 1: Missing Value Treatment")

# Check missing values before imputation
missing_before = all_data.isnull().sum()[all_data.isnull().sum() > 0].sort_values(ascending=False)
print(f"Features with missing values before imputation: {len(missing_before)}")

# LotFrontage: Impute with neighborhood median
lot_frontage_by_neighborhood = all_data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.median())
all_data['LotFrontage'] = all_data['LotFrontage'].fillna(lot_frontage_by_neighborhood)
# If any remain NA, use overall median
all_data['LotFrontage'] = all_data['LotFrontage'].fillna(all_data['LotFrontage'].median())

# Masonry veneer
all_data['MasVnrType'] = all_data['MasVnrType'].fillna('None')
all_data['MasVnrArea'] = all_data['MasVnrArea'].fillna(0)

# Basement features
basement_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
for col in basement_cols:
    all_data[col] = all_data[col].fillna('No_Basement')

# Garage features
garage_cols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
for col in garage_cols:
    all_data[col] = all_data[col].fillna('No_Garage')
all_data['GarageYrBlt'] = all_data['GarageYrBlt'].fillna(0)

#fireplace
all_data['FireplaceQu'] = all_data['FireplaceQu'].fillna('No_Fireplace')
# Features with rare occurrences
rare_cols = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']
for col in rare_cols:
    all_data[col] = all_data[col].fillna('None')

# Electrical - fill with mode
all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])

# Check missing values after imputation
missing_after = all_data.isnull().sum()[all_data.isnull().sum() > 0].sort_values(ascending=False)
print(f"Features with missing values after imputation: {len(missing_after)}")
if len(missing_after) > 0:
    print("Remaining missing values:")
    print(missing_after)


STEP 1: Missing Value Treatment
Features with missing values before imputation: 34
Features with missing values after imputation: 15
Remaining missing values:
MSZoning        4
Utilities       2
BsmtFullBath    2
BsmtHalfBath    2
Functional      2
Exterior2nd     1
Exterior1st     1
BsmtUnfSF       1
BsmtFinSF2      1
BsmtFinSF1      1
TotalBsmtSF     1
KitchenQual     1
GarageCars      1
GarageArea      1
SaleType        1
dtype: int64


In [None]:
# ================ 2. CREATE BINARY FLAGS FOR PRESENCE/ABSENCE ================
print("\nSTEP 2: Creating Binary Flags")

all_data['HasAlley'] = (all_data['Alley'] != 'None').astype(int)
all_data['HasPool'] = (all_data['PoolQC'] != 'None').astype(int)
all_data['HasFence'] = (all_data['Fence'] != 'None').astype(int)
all_data['HasMiscFeature'] = (all_data['MiscFeature'] != 'None').astype(int)
all_data['HasBasement'] = (all_data['BsmtQual'] != 'No_Basement').astype(int)
all_data['HasGarage'] = (all_data['GarageType'] != 'No_Garage').astype(int)
all_data['HasMasVnr'] = (all_data['MasVnrType'] != 'None').astype(int)
all_data['HasSecondFloor'] = (all_data['2ndFlrSF'] > 0).astype(int)
all_data['HasFireplace'] = (all_data['Fireplaces'] > 0).astype(int)


STEP 2: Creating Binary Flags


In [None]:
all_data

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,SaleCondition,HasAlley,HasPool,HasFence,HasMiscFeature,HasBasement,HasGarage,HasMasVnr,HasSecondFloor,HasFireplace
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,Normal,0,0,0,0,1,1,1,1,0
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,Normal,0,0,0,0,1,1,0,0,1
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,Normal,0,0,0,0,1,1,1,1,1
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,Abnorml,0,0,0,0,1,1,0,1,1
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,Normal,0,0,0,0,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,Normal,0,0,0,0,1,0,0,1,0
1455,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,Abnorml,0,0,0,0,1,1,0,1,0
1456,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,Abnorml,0,0,0,0,1,1,0,0,1
1457,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,Normal,0,0,1,1,1,0,0,0,0


In [None]:
# ================ 3. AGE-RELATED FEATURES ================
print("\nSTEP 3: Creating Age-Related Features")

# Extract year information
all_data['PropertyAge'] = all_data['YrSold'] - all_data['YearBuilt']
all_data['RecentRemodel'] = ((all_data['YrSold'] - all_data['YearRemodAdd']) <= 5).astype(int)
all_data['AgeWhenRemodeled'] = all_data['YearRemodAdd'] - all_data['YearBuilt']
# Handle cases where remodel year is the same as built year
all_data.loc[all_data['AgeWhenRemodeled'] == 0, 'AgeWhenRemodeled'] = 0
all_data['RecentConstruction'] = (all_data['PropertyAge'] <= 5).astype(int)
all_data['YearsSinceRemodel'] = all_data['YrSold'] - all_data['YearRemodAdd']
all_data['IsNewHome'] = (all_data['YearBuilt'] == all_data['YrSold']).astype(int)


STEP 3: Creating Age-Related Features


In [None]:
# ================ 4. SIZE & AREA METRICS ================
print("\nSTEP 4: Creating Size & Area Metrics")

all_data['TotalSF'] = all_data['GrLivArea'] + all_data['TotalBsmtSF']
all_data['TotalBathrooms'] = (
    all_data['FullBath'] +
    (0.5 * all_data['HalfBath']) +
    all_data['BsmtFullBath'] +
    (0.5 * all_data['BsmtHalfBath'])
)



STEP 4: Creating Size & Area Metrics


In [None]:

# Avoid division by zero for percentage features
all_data['PctUnfinishedBasement'] = 0
mask = all_data['TotalBsmtSF'] > 0
all_data.loc[mask, 'PctUnfinishedBasement'] = all_data.loc[mask, 'BsmtUnfSF'] / all_data.loc[mask, 'TotalBsmtSF']

all_data['LivingAreaRatio'] = all_data['GrLivArea'] / all_data['LotArea']
all_data['TotalPorchArea'] = (
    all_data['OpenPorchSF'] +
    all_data['EnclosedPorch'] +
    all_data['3SsnPorch'] +
    all_data['ScreenPorch']
)


In [None]:
# ================ 5. QUALITY ENCODINGS ================
print("\nSTEP 5: Encoding Quality Variables")

# Create quality mapping dictionaries
quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No_Basement': 0, 'No_Garage': 0, 'None': 0}
exposure_map = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'No_Basement': 0, 'None': 0}
finish_map = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'No_Basement': 0, 'None': 0}
fireplace_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No_Fireplace': 0}


map_dict = {
    'ExterQual': quality_map,
    'ExterCond': quality_map,
    'BsmtQual': quality_map,
    'BsmtCond': quality_map,
    'KitchenQual': quality_map,
    'FireplaceQu': fireplace_map,
    'GarageQual': quality_map,
    'GarageCond': quality_map,
    'PoolQC': quality_map,
    'BsmtExposure': exposure_map,
    'BsmtFinType1': finish_map,
    'BsmtFinType2': finish_map,
}

for col, mapper in map_dict.items():
    if col in all_data.columns:
        all_data[f'{col}_Encoded'] = all_data[col].map(mapper).fillna(0)


# Apply special mappings
all_data['BsmtExposure_Encoded'] = all_data['BsmtExposure'].map(exposure_map)

finish_cols = ['BsmtFinType1', 'BsmtFinType2']
for col in finish_cols:
    all_data[f'{col}_Encoded'] = all_data[col].map(finish_map)



STEP 5: Encoding Quality Variables


In [None]:
# ================ 6. QUALITY COMPOUND METRICS ================
print("\nSTEP 6: Creating Quality Compound Metrics")

all_data['OverallGrade'] = all_data['OverallQual'] * all_data['OverallCond']
all_data['KitchenScore'] = all_data['KitchenQual_Encoded'] * all_data['KitchenAbvGr']
# Only count kitchens that exist
all_data.loc[all_data['KitchenAbvGr'] == 0, 'KitchenScore'] = 0

all_data['ExteriorScore'] = all_data['ExterQual_Encoded'] + all_data['ExterCond_Encoded']
all_data['BathroomScore'] = all_data['TotalBathrooms'] * (
    (all_data['FullBath'] * 1.5 + all_data['HalfBath']) /
    (all_data['TotalBathrooms'] + 0.01)  # Avoid division by zero
)



STEP 6: Creating Quality Compound Metrics


In [None]:
# ================ 7. NEIGHBORHOOD & LOCATION ANALYSIS ================
print("\nSTEP 7: Neighborhood & Location Analysis")

# Calculate neighborhood metrics using the training data
# We need the target variable for this
train_data = all_data.iloc[:len(train)]
test_data = all_data.iloc[len(train):]

# Calculate mean price by neighborhood
neighborhood_df = pd.DataFrame({'Neighborhood': train_data['Neighborhood'], 'SalePrice': target})
neighborhood_means = neighborhood_df.groupby('Neighborhood')['SalePrice'].mean()

# Get neighborhood price tiers (quintiles)
neighborhood_tiers = pd.qcut(
    neighborhood_means,
    q=5,
    labels=['Economic', 'Standard', 'Medium', 'High', 'Luxury']
).to_dict()

# Add neighborhood features to all data
all_data['MeanNeighborhoodPrice'] = all_data['Neighborhood'].map(neighborhood_means)
all_data['NeighborhoodTier'] = all_data['Neighborhood'].map(neighborhood_tiers)

# Encode neighborhood tiers
tier_map = {'Luxury': 5, 'High': 4, 'Medium': 3, 'Standard': 2, 'Economic': 1}
all_data['NeighborhoodTier_Encoded'] = all_data['NeighborhoodTier'].map(tier_map)

# Fill missing values for test data neighborhoods not in train
all_data['MeanNeighborhoodPrice'].fillna(neighborhood_means.mean(), inplace=True)
all_data['NeighborhoodTier_Encoded'].fillna(3, inplace=True)  # Default to Medium

# Simplified zoning density
zone_density = {
    'A': 1, 'C': 2, 'FV': 3, 'I': 1,
    'RH': 4, 'RL': 2, 'RP': 2, 'RM': 3
}
# Extract first character from MSZoning to get base zone type
all_data['ZoneType'] = all_data['MSZoning'].str[0]
all_data['ZoneDensity'] = all_data['ZoneType'].map(zone_density).fillna(2)


STEP 7: Neighborhood & Location Analysis


In [None]:
# ================ 8. SIMPLIFIED CATEGORICAL FEATURES ================
print("\nSTEP 8: Simplifying Categorical Features")

# Simplify house style
all_data['StyleSimplified'] = all_data['HouseStyle'].apply(
    lambda x: '1Story' if x in ['1Story', '1.5Fin', '1.5Unf']
    else '2Story' if x in ['2Story', '2.5Fin', '2.5Unf']
    else 'Other'
)

# Simplify building type
all_data['HomeType'] = all_data['BldgType'].apply(
    lambda x: 'SingleFamily' if x == '1Fam'
    else 'TownhouseEnd' if x == 'TwnhsE'
    else 'TownhouseInside' if x == 'TwnhsI'
    else 'Duplex' if x == '2fmCon'
    else 'Other'
)


STEP 8: Simplifying Categorical Features


In [None]:
# ================ 9. CONDITION & PROXIMITY FEATURES ================
print("\nSTEP 9: Creating Condition & Proximity Features")

# Create binary indicators for proximity features
pos_conditions = ['Norm', 'PosN', 'PosA']
neg_conditions = ['Artery', 'RRAn', 'RRAe', 'RRNn', 'RRNe']

all_data['NearPositiveFeature'] = (
    (all_data['Condition1'].isin(pos_conditions)) |
    (all_data['Condition2'].isin(pos_conditions))
).astype(int)

all_data['NearNegativeFeature'] = (
    (all_data['Condition1'].isin(neg_conditions)) |
    (all_data['Condition2'].isin(neg_conditions))
).astype(int)

# Simplified access quality
all_data['MainRoadAccess'] = ((all_data['Street'] == 'Pave') & (all_data['PavedDrive'] == 'Y')).astype(int)



STEP 9: Creating Condition & Proximity Features


In [None]:
# ================ 10. INTERACTION TERMS ================
print("\nSTEP 10: Creating Interaction Terms")

# Quality-Size interaction
all_data['QualitySize'] = all_data['OverallQual'] * all_data['TotalSF']

# Location-Quality interaction
all_data['LocationQuality'] = all_data['NeighborhoodTier_Encoded'] * all_data['OverallQual']

# Age-Quality interaction
all_data['AgeQuality'] = all_data['PropertyAge'] * all_data['OverallQual']
all_data['AgeQuality'] = all_data['AgeQuality'].apply(lambda x: np.log1p(x) if x > 0 else 0)

# Garage Quality-Size interaction
all_data['GarageQualityCars'] = all_data['GarageQual_Encoded'] * all_data['GarageCars']

# Basement Quality-Size interaction
all_data['BasementQualitySF'] = all_data['BsmtQual_Encoded'] * all_data['TotalBsmtSF']


STEP 10: Creating Interaction Terms


In [None]:
# ================ 11. TEMPORAL FEATURES ================
print("\nSTEP 11: Creating Temporal Features")

# Convert month sold to season
all_data['SeasonSold'] = all_data['MoSold'].apply(
    lambda x: 'Winter' if x in [12, 1, 2]
    else 'Spring' if x in [3, 4, 5]
    else 'Summer' if x in [6, 7, 8]
    else 'Fall'
)

# Create era based on year sold
all_data['SaleEra'] = pd.cut(
    all_data['YrSold'],
    bins=[1900, 2000, 2007, 2010, 2100],
    labels=['Pre2000', 'PreRecession', 'Recession', 'Recovery']
)


STEP 11: Creating Temporal Features


In [None]:
# ================ 12. POLYNOMIAL & RATIO FEATURES ================
print("\nSTEP 12: Creating Polynomial & Ratio Features")

# Squared terms for key numeric features
all_data['GrLivArea_Sq'] = all_data['GrLivArea'] ** 2
all_data['TotalBsmtSF_Sq'] = all_data['TotalBsmtSF'] ** 2
all_data['OverallQual_Sq'] = all_data['OverallQual'] ** 2
all_data['TotalSF_Sq'] = all_data['TotalSF'] ** 2
all_data['LotArea_Sq'] = all_data['LotArea'] ** 2

# Ratio features
all_data['RoomDensity'] = all_data['TotRmsAbvGrd'] / (all_data['GrLivArea'] + 1)  # Avoid division by zero
all_data['BedroomRatio'] = all_data['BedroomAbvGr'] / (all_data['TotRmsAbvGrd'] + 1)  # Avoid division by zero
all_data['BathroomRatio'] = all_data['TotalBathrooms'] / (all_data['BedroomAbvGr'] + 1)  # Avoid division by zero
all_data['LivingRoomRatio'] = (all_data['TotRmsAbvGrd'] - all_data['BedroomAbvGr']) / (all_data['TotRmsAbvGrd'] + 1)



STEP 12: Creating Polynomial & Ratio Features


In [None]:
print("\nSTEP 13: Applying Log Transformations to Skewed Features")

# List of numeric columns to check for skewness
numeric_cols = all_data.select_dtypes(include=['int64', 'float64']).columns
skewed_cols = []

for col in numeric_cols:
    # Skip binary columns
    if all_data[col].nunique() <= 2:
        continue

    # Skip columns that already have log versions
    if col.endswith('_Sq') or col.endswith('_Log'):
        continue

    # Check skewness
    skewness = all_data[col].skew()
    if abs(skewness) > 0.75:  # Threshold for deciding if a column is skewed
        skewed_cols.append(col)
        # Apply log transformation (add 1 to handle zeros)
        all_data[f'{col}_Log'] = np.log1p(all_data[col])

print(f"Log-transformed {len(skewed_cols)} skewed features")


STEP 13: Applying Log Transformations to Skewed Features
Log-transformed 42 skewed features


In [None]:
# ================ 14. ONE-HOT ENCODING FOR CATEGORICAL FEATURES ================
print("\nSTEP 14: One-Hot Encoding Categorical Features")

# First, identify which categorical columns to encode
categorical_cols = [
    'MSZoning', 'Neighborhood', 'BldgType', 'HouseStyle',
    'RoofStyle', 'Foundation', 'CentralAir', 'SaleType',
    'SaleCondition', 'SeasonSold', 'SaleEra',
    'StyleSimplified', 'HomeType'
]

# Filter to ensure columns exist
categorical_cols = [col for col in categorical_cols if col in all_data.columns]

# Use pd.get_dummies for one-hot encoding
all_data_encoded = pd.get_dummies(
    all_data,
    columns=categorical_cols,
    drop_first=True,  # Drop first category to avoid multicollinearity
    dummy_na=False  # Don't create a column for NA values
)

print(f"Shape after one-hot encoding: {all_data_encoded.shape}")



STEP 14: One-Hot Encoding Categorical Features
Shape after one-hot encoding: (2919, 244)


In [None]:

for col in all_data_encoded.columns:
    if all_data_encoded[col].dtype not in ['int64', 'float64']:
        dummies = pd.get_dummies(all_data_encoded[col], prefix=col, dummy_na=False)
        all_data_encoded = pd.concat([all_data_encoded.drop(columns=[col]), dummies], axis=1)


In [None]:
bool_cols = all_data_encoded.select_dtypes(include='bool').columns
all_data_encoded[bool_cols] = all_data_encoded[bool_cols].astype(int)


In [None]:
all_data_encoded

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,StyleSimplified_2Story_False,StyleSimplified_2Story_True,StyleSimplified_Other_False,StyleSimplified_Other_True,HomeType_Other_False,HomeType_Other_True,HomeType_SingleFamily_False,HomeType_SingleFamily_True,HomeType_TownhouseEnd_False,HomeType_TownhouseEnd_True
0,60,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,...,0,1,1,0,1,0,0,1,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,...,1,0,1,0,1,0,0,1,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,...,0,1,1,0,1,0,0,1,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,...,0,1,1,0,1,0,0,1,1,0
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,...,0,1,1,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,21.0,1936,4,7,1970,1970,0.0,0.0,0.0,...,0,1,1,0,0,1,1,0,1,0
1455,160,21.0,1894,4,5,1970,1970,0.0,252.0,0.0,...,0,1,1,0,1,0,1,0,0,1
1456,20,160.0,20000,5,7,1960,1996,0.0,1224.0,0.0,...,1,0,1,0,1,0,0,1,1,0
1457,85,62.0,10441,5,5,1992,1992,0.0,337.0,0.0,...,1,0,0,1,1,0,0,1,1,0


In [None]:
# ================ 15. FEATURE SELECTION ================
print("\nSTEP 15: Feature Selection")

# Split back into train and test sets
train_processed = all_data_encoded.iloc[:len(train)]
test_processed = all_data_encoded.iloc[len(train):]

# Apply log transformation to target (common practice for house prices)
target_log = np.log1p(target)

# Initialize RandomForest model for feature importance
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(train_processed, target_log)

# Get feature importances
importances = pd.DataFrame({
    'Feature': train_processed.columns,
    'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)

# Select top 100 features (adjust based on your preference)
n_features = 100
selected_features = importances.head(n_features)['Feature'].tolist()

print(f"Selected top {n_features} features")
print("\nTop 20 features by importance:")
print(importances.head(20))

# Filter data to include only selected features
train_selected = train_processed[selected_features]
test_selected = test_processed[selected_features]

# ================ 16. FINAL PREPARATION ================
print("\nSTEP 16: Final Preparation")

# Build final train dataset
train_final = pd.DataFrame({
    'Id': train_ID,
    'SalePrice': target,
    'SalePrice_Log': target_log
})
train_final = pd.concat([train_final, train_selected], axis=1)

# Build final test dataset with NaNs for target columns
test_final = pd.DataFrame({
    'Id': test_ID,
    'SalePrice': np.nan,
    'SalePrice_Log': np.nan
})
test_final = pd.concat([test_final, test_selected], axis=1)

# Save processed datasets
train_final.to_csv('train_engineered.csv', index=False)
test_final.to_csv('test_engineered.csv', index=False)

print("\nFeature engineering complete!")
print(f"Final training set shape: {train_final.shape}")
print(f"Final test set shape: {test_final.shape}")



STEP 15: Feature Selection
Selected top 100 features

Top 20 features by importance:
                       Feature  Importance
78                 QualitySize    0.384135
130            QualitySize_Log    0.361760
79             LocationQuality    0.041929
72       MeanNeighborhoodPrice    0.017784
129  MeanNeighborhoodPrice_Log    0.012442
68                OverallGrade    0.007336
26                  GarageArea    0.006207
49           YearsSinceRemodel    0.004978
435          CentralAir_Y_True    0.004602
71               BathroomScore    0.003647
434         CentralAir_Y_False    0.003647
88                 RoomDensity    0.003631
51                     TotalSF    0.003624
6                 YearRemodAdd    0.003457
10                   BsmtUnfSF    0.003265
53       PctUnfinishedBasement    0.003245
98               BsmtUnfSF_Log    0.003084
4                  OverallCond    0.003079
8                   BsmtFinSF1    0.003010
45                 PropertyAge    0.002792

STEP 16: F

In [None]:
# Find columns not in common between train_final and test_final
train_cols = set(train_final.columns)
test_cols = set(test_final.columns)

# Columns only in train or only in test
only_in_train = train_cols - test_cols
only_in_test = test_cols - train_cols

print("Columns only in train_final:")
print(sorted(only_in_train))

print("\nColumns only in test_final:")
print(sorted(only_in_test))


Columns only in train_final:
[]

Columns only in test_final:
[]


In [None]:
!pip install xgboost



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, roc_auc_score
import numpy as np

# ========== DATA PREPARATION ==========
X = train_final.drop(columns=['Id', 'SalePrice', 'SalePrice_Log'])
y = train_final['SalePrice_Log']

# Initial train-test split
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler_X = StandardScaler()
X_train_full_scaled = scaler_X.fit_transform(X_train_full)
X_test_scaled = scaler_X.transform(X_test)

# Validation split from training data
X_train, X_val, y_train, y_val = train_test_split(X_train_full_scaled, y_train_full, test_size=0.2, random_state=42)

# ========== BINARY LABEL FOR AUC ==========
y_binary_full = (y > y.median()).astype(int)
_, y_test_bin = train_test_split(y_binary_full, test_size=0.2, random_state=42)
_, y_val_bin = train_test_split(y_binary_full[y_train_full.index], test_size=0.2, random_state=42)

# ========== MODELS ==========
models = {
    'Ridge': Ridge(alpha=10),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=3, random_state=42)
}

# ========== TRAIN & EVALUATE ==========
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    # Metrics
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    mae = mean_absolute_error(y_val, preds)
    r2 = r2_score(y_val, preds)

    # AUC (approximate using binary threshold)
    preds_bin = (preds > y.median()).astype(int)
    auc = roc_auc_score(y_val_bin, preds_bin)

    print(f"{name} Results:")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE : {mae:.4f}")
    print(f"  R²  : {r2:.4f}")
    print(f"  AUC : {auc:.4f}")



Training Ridge...
Ridge Results:
  RMSE: 0.1191
  MAE : 0.0792
  R²  : 0.9140
  AUC : 0.9229

Training RandomForest...
RandomForest Results:
  RMSE: 0.1413
  MAE : 0.0942
  R²  : 0.8790
  AUC : 0.9156

Training XGBoost...
XGBoost Results:
  RMSE: 0.1358
  MAE : 0.0870
  R²  : 0.8882
  AUC : 0.9242


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
import numpy as np

# ========== DATA PREPARATION ==========
X = train_final.drop(columns=['Id', 'SalePrice', 'SalePrice_Log'])
y = train_final['SalePrice_Log']

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = RobustScaler()
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_test_scaled = scaler.transform(X_test)

X_train, X_val, y_train, y_val = train_test_split(X_train_full_scaled, y_train_full, test_size=0.2, random_state=42)

# ========== MODEL SETUP ==========
cv = KFold(n_splits=3, shuffle=True, random_state=42)

models = {
    'Lasso': (Lasso(max_iter=10000), {'alpha': [0.001, 0.01]}),
    'RandomForest': (RandomForestRegressor(random_state=42), {'n_estimators': [100], 'max_depth': [10]}),
    'XGBoost': (XGBRegressor(objective='reg:squarederror', random_state=42), {'n_estimators': [100], 'max_depth': [3]}),
    'CatBoost': (CatBoostRegressor(verbose=0, random_state=42), {'depth': [3], 'learning_rate': [0.1]})
}

best_models = []
results = []

# Grid Search for each model
for name, (model, params) in models.items():
    grid = GridSearchCV(model, params, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
    grid.fit(X_train_full_scaled, y_train_full)
    best_model = grid.best_estimator_
    best_models.append((name.lower(), best_model))

    y_test_pred = best_model.predict(X_test_scaled)
    results.append({
        'Model': name,
        'RMSE': np.sqrt(mean_squared_error(y_test, y_test_pred)),
        'MAE': mean_absolute_error(y_test, y_test_pred),
        'R²': r2_score(y_test, y_test_pred)
    })

# ========== STACKED MODEL ==========
stacked_model = StackingRegressor(
    estimators=best_models,
    final_estimator=Ridge(),
    cv=5,
    n_jobs=-1
)

stacked_model.fit(X_train_full_scaled, y_train_full)
y_test_pred_stacked = stacked_model.predict(X_test_scaled)

results.append({
    'Model': 'Stacking',
    'RMSE': np.sqrt(mean_squared_error(y_test, y_test_pred_stacked)),
    'MAE': mean_absolute_error(y_test, y_test_pred_stacked),
    'R²': r2_score(y_test, y_test_pred_stacked)
})

# ========== DISPLAY FINAL RESULTS ==========
results_df = pd.DataFrame(results).sort_values('RMSE')
print("\nFinal Test Set Performance:")
print(results_df.to_string(index=False))

best_model_name = results_df.iloc[0]['Model']
print(f"\n✅ Best Model on Test Set: {best_model_name}")



Final Test Set Performance:
       Model     RMSE      MAE       R²
    Stacking 0.123310 0.081722 0.918519
    CatBoost 0.126885 0.086149 0.913725
       Lasso 0.128020 0.085824 0.912175
     XGBoost 0.135468 0.090388 0.901659
RandomForest 0.141886 0.092627 0.892120

✅ Best Model on Test Set: Stacking


In [None]:
# ========== CUSTOM CATBOOST EVALUATION ==========
from catboost import CatBoostRegressor
import pandas as pd # Import pandas if not already imported
import numpy as np  # Import numpy if not already imported
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score # Import necessary metrics

catboost_model = CatBoostRegressor(
    learning_rate=0.01,
    l2_leaf_reg=0,
    iterations=3500,
    depth=4,
    colsample_bylevel=0.7,
    border_count=50,
    bootstrap_type='Bayesian',
    bagging_temperature=1,
    random_seed=42,
    verbose=0
)

# Assuming X_train_full_scaled and y_train_full are available from previous cells
# If not, make sure to run the preceding cells
catboost_model.fit(X_train_full_scaled, y_train_full)
y_test_pred_catboost_custom = catboost_model.predict(X_test_scaled)

# Append result using pd.concat
custom_result = {
    'Model': 'CatBoost (Custom)',
    'RMSE': np.sqrt(mean_squared_error(y_test, y_test_pred_catboost_custom)),
    'MAE': mean_absolute_error(y_test, y_test_pred_catboost_custom),
    'R²': r2_score(y_test, y_test_pred_catboost_custom)
}

# Convert custom_result to a DataFrame row
custom_result_df = pd.DataFrame([custom_result])

# Use pd.concat to add the new row
# Assuming results_df is already defined from the previous code block
results_df = pd.concat([results_df, custom_result_df], ignore_index=True).sort_values('RMSE')


# Display updated results
print("\n🔍 Updated Test Set Performance Including Custom CatBoost:")
print(results_df.to_string(index=False))

best_model_name = results_df.iloc[0]['Model']
print(f"\n✅ Updated Best Model on Test Set: {best_model_name}")


🔍 Updated Test Set Performance Including Custom CatBoost:
            Model     RMSE      MAE       R²
CatBoost (Custom) 0.122691 0.081540 0.919335
         Stacking 0.123310 0.081722 0.918519
         CatBoost 0.126885 0.086149 0.913725
            Lasso 0.128020 0.085824 0.912175
          XGBoost 0.135468 0.090388 0.901659
     RandomForest 0.141886 0.092627 0.892120

✅ Updated Best Model on Test Set: CatBoost (Custom)


In [None]:
best_models.append(('catboost(new)', catboost_model))

In [None]:
best_models

[('lasso', Lasso(alpha=0.001, max_iter=10000)),
 ('randomforest', RandomForestRegressor(max_depth=10, random_state=42)),
 ('xgboost',
  XGBRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=3, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=100, n_jobs=None,
               num_parallel_tree=None, random_state=42, ...)),
 ('catboost', <catboost.core.CatBoostRegressor at 0x7bba6d594d10>),
 ('catboost(new)', <catboost.core

In [None]:

for name, model in best_models:
    if name == 'catboost(new)':
        bm = model
        break

In [None]:
bm

<catboost.core.CatBoostRegressor at 0x7bba41b5f0d0>

In [None]:
test_final

Unnamed: 0,Id,SalePrice,SalePrice_Log,QualitySize,QualitySize_Log,LocationQuality,MeanNeighborhoodPrice,MeanNeighborhoodPrice_Log,OverallGrade,GarageArea,...,PavedDrive_Y,NeighborhoodTier_Standard,HeatingQC_Fa,MSSubClass_Log,Condition1_Norm,Electrical_SBrkr,BedroomAbvGr,OverallQual,LandContour_Bnk,MSZoning_RL_True
0,1461,,,8890.0,9.092795,15,145847.080000,11.890321,30,730.0,...,1,0,0,3.044522,0,1,2,5,0,0
1,1462,,,15948.0,9.677151,18,145847.080000,11.890321,36,312.0,...,1,0,0,3.044522,1,1,3,6,0,1
2,1463,,,12785.0,9.456106,15,192854.506329,12.169697,25,482.0,...,1,0,0,4.110874,1,1,3,5,0,1
3,1464,,,15180.0,9.627800,18,192854.506329,12.169697,36,470.0,...,1,0,0,4.110874,1,1,3,6,0,1
4,1465,,,20480.0,9.927253,40,310499.000000,12.645939,40,506.0,...,1,0,0,4.795791,1,1,2,8,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,,,6552.0,8.787678,4,98576.470588,11.498598,28,0.0,...,1,0,0,5.081404,1,1,3,4,0,0
1455,2916,,,6552.0,8.787678,4,98576.470588,11.498598,20,286.0,...,1,0,0,5.081404,1,1,3,4,0,0
1456,2917,,,12240.0,9.412546,15,156270.122449,11.959348,35,576.0,...,1,0,0,3.044522,1,1,4,5,0,1
1457,2918,,,9410.0,9.149634,15,156270.122449,11.959348,25,0.0,...,1,0,0,4.454347,1,1,3,5,0,1


In [None]:
# ========== FINAL TRAINING ON FULL DATA ==========

bm.fit(X_train_full_scaled, y_train_full)



# Drop 'SalePrice' and 'SalePrice_Log' from test_final if they exist
test_final = test_final.drop(columns=[col for col in ['SalePrice', 'SalePrice_Log'] if col in test_final.columns])



In [None]:
scaler_X.feature_names_in_

array(['QualitySize', 'QualitySize_Log', 'LocationQuality',
       'MeanNeighborhoodPrice', 'MeanNeighborhoodPrice_Log',
       'OverallGrade', 'GarageArea', 'YearsSinceRemodel',
       'CentralAir_Y_True', 'BathroomScore', 'CentralAir_Y_False',
       'RoomDensity', 'TotalSF', 'YearRemodAdd', 'BsmtUnfSF',
       'PctUnfinishedBasement', 'BsmtUnfSF_Log', 'OverallCond',
       'BsmtFinSF1', 'PropertyAge', 'LotArea', 'TotalSF_Log',
       'TotalSF_Sq', '1stFlrSF_Log', '1stFlrSF', 'GrLivArea_Log',
       'LotArea_Log', 'BsmtFinSF1_Log', 'TotalBathrooms',
       'GarageYrBlt_Log', 'GrLivArea', 'ZoneType_R', 'YearBuilt',
       'LotArea_Sq', 'GrLivArea_Sq', 'KitchenQual_Encoded', 'GarageYrBlt',
       'BsmtFinType1_Encoded', 'GarageQualityCars', 'LotFrontage',
       'BasementQualitySF', '2ndFlrSF', 'MoSold', 'LotFrontage_Log',
       'BathroomRatio', 'BasementQualitySF_Log', 'LivingAreaRatio',
       'AgeQuality', 'MSZoning_RM_False', 'LivingAreaRatio_Log',
       'AgeQuality_Log', 'SaleCo

In [None]:
X_submission = test_final.drop(columns=['Id'])


In [None]:
# Get feature names used during scaling
scaled_features = scaler_X.feature_names_in_

# Get submission feature columns
submission_features = X_submission.columns

# Find differences
missing_in_submission = set(scaled_features) - set(submission_features)
extra_in_submission = set(submission_features) - set(scaled_features)

# Display differences
print("Features missing in submission:", missing_in_submission)
print("Extra features in submission:", extra_in_submission)


Features missing in submission: set()
Extra features in submission: set()


In [None]:

# Scale test data (same scaler used)
# Replace infinite values with a large finite value
import numpy as np
X_submission = X_submission.replace([np.inf, -np.inf], np.nan)
# If you have NaNs after this step, you can impute them with the mean/median
# or another appropriate value for your dataset
X_submission = X_submission.fillna(X_submission.mean()) # replace NaNs with the mean of each column

# Scale test data (same scaler used)
X_submission_scaled = scaler_X.transform(X_submission)

In [None]:


# Predict log SalePrice
log_preds = bm.predict(X_submission_scaled)

# Inverse log transformation to get actual prices
final_preds = np.expm1(log_preds)

# ========== GENERATE SUBMISSION FILE ==========

submission = test_final[['Id']].copy()
submission['SalePrice'] = final_preds


In [None]:
submission

Unnamed: 0,Id,SalePrice
0,1461,102843.016338
1,1462,147629.079014
2,1463,187893.336025
3,1464,202378.114381
4,1465,212814.768027
...,...,...
1454,2915,77022.950209
1455,2916,80680.394329
1456,2917,156499.216401
1457,2918,112511.570735


In [None]:
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/submission9.csv', index=False)

print(" Submission file saved as 'submission9.csv'.")

 Submission file saved as 'submission9.csv'.
