In [1]:
import pandas as pd

X_train_filled= pd.read_csv('/content/X_train_filled.csv')
X_test_filled= pd.read_csv('/content/X_test_filled.csv')
y_train= pd.read_csv('/content/y_train.csv')

In [2]:
X_train_filled.shape, X_test_filled.shape, y_train.shape

((1460, 75), (1459, 75), (1460, 1))

##**Oulier Handling**

In [3]:
import pandas as pd

# اختيار الأعمدة العددية فقط
num_cols = X_train_filled.select_dtypes(include=['int64', 'float64']).columns

# قاموس لتخزين النتائج
outliers_summary = {}

for col in num_cols:
    Q1 = X_train_filled[col].quantile(0.25)
    Q3 = X_train_filled[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outlier_mask = (X_train_filled[col] < lower_bound) | (X_train_filled[col] > upper_bound)
    outliers_count = outlier_mask.sum()
    outliers_percentage = (outliers_count / len(X_train_filled)) * 100
    outliers_summary[col] = [outliers_count, outliers_percentage]

# تحويل القاموس لجدول
outliers_df = pd.DataFrame(outliers_summary, index=['Outlier Count', 'Outlier %']).T

# ترتيب الجدول حسب النسبة
outliers_df = outliers_df.sort_values('Outlier %', ascending=False)

# عرض الجدول
outliers_df

Unnamed: 0,Outlier Count,Outlier %
EnclosedPorch,208.0,14.246575
BsmtFinSF2,167.0,11.438356
OverallCond,125.0,8.561644
ScreenPorch,116.0,7.945205
MSSubClass,103.0,7.054795
MasVnrArea,96.0,6.575342
LotFrontage,87.0,5.958904
BsmtHalfBath,82.0,5.616438
OpenPorchSF,77.0,5.273973
LotArea,69.0,4.726027


In [4]:
import pandas as pd
import numpy as np

# ---- Columns with outliers based on previous analysis ----
outlier_cols = [
    'EnclosedPorch','BsmtFinSF2','OverallCond','ScreenPorch','MSSubClass',
    'MasVnrArea','LotFrontage','BsmtHalfBath','OpenPorchSF','LotArea',
    'KitchenAbvGr','TotalBsmtSF','MiscVal','BedroomAbvGr','WoodDeckSF',
    'GrLivArea','TotRmsAbvGrd','BsmtUnfSF','LowQualFinSF','3SsnPorch',
    'GarageArea','1stFlrSF','YearBuilt','PoolArea','BsmtFinSF1','GarageCars',
    'Fireplaces','OverallQual','2ndFlrSF','GarageYrBlt','BsmtFullBath'
]

# ---- Function to apply Capping based on IQR ----
def cap_outliers(df, cols):
    df_copy = df.copy()
    for col in cols:
        if col in df_copy.columns:
            Q1 = df_copy[col].quantile(0.25)
            Q3 = df_copy[col].quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR
            df_copy[col] = np.where(df_copy[col] < lower, lower, df_copy[col])
            df_copy[col] = np.where(df_copy[col] > upper, upper, df_copy[col])
    return df_copy

# ---- Apply capping ----
X_train_capped = cap_outliers(X_train_filled, outlier_cols)
X_test_capped = cap_outliers(X_test_filled, outlier_cols)

# ---- Check ----
print("After outlier handling:")
print("X_train shape:", X_train_capped.shape)
print("X_test shape:", X_test_capped.shape)


After outlier handling:
X_train shape: (1460, 75)
X_test shape: (1459, 75)


In [5]:
import pandas as pd

# ---- الأعمدة العددية فقط ----
num_cols = X_train_capped.select_dtypes(include=['int64', 'float64']).columns

# ---- قاموس لتخزين النتائج ----
outliers_summary_after = {}

for col in num_cols:
    Q1 = X_train_capped[col].quantile(0.25)
    Q3 = X_train_capped[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outlier_mask = (X_train_capped[col] < lower_bound) | (X_train_capped[col] > upper_bound)
    outliers_count = outlier_mask.sum()
    outliers_percentage = (outliers_count / len(X_train_capped)) * 100
    outliers_summary_after[col] = [outliers_count, outliers_percentage]

# ---- تحويل القاموس لجدول ----
outliers_df_after = pd.DataFrame(outliers_summary_after, index=['Outlier Count', 'Outlier %']).T

# ---- ترتيب الجدول حسب النسبة ----
outliers_df_after = outliers_df_after.sort_values('Outlier %', ascending=False)

# ---- عرض الجدول ----
outliers_df_after

Unnamed: 0,Outlier Count,Outlier %
MSSubClass,0.0,0.0
LotFrontage,0.0,0.0
LotArea,0.0,0.0
OverallQual,0.0,0.0
OverallCond,0.0,0.0
YearBuilt,0.0,0.0
YearRemodAdd,0.0,0.0
MasVnrArea,0.0,0.0
BsmtFinSF1,0.0,0.0
BsmtFinSF2,0.0,0.0


######**Skewed Feature Transformation, and Numerical Feature Scaling**

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler

# ---- البيانات بعد معالجة outliers ----
X_train_capped = X_train_capped.copy()
X_test_capped = X_test_capped.copy()

# ---- اختيار الأعمدة العددية ----
num_cols = X_train_capped.select_dtypes(include=['int64', 'float64']).columns

# ---- Optional: Detect skewed columns (skew > 1) ----
skewed_cols = X_train_capped[num_cols].apply(lambda x: x.skew()).sort_values(ascending=False)
skewed_cols = skewed_cols[skewed_cols > 1].index.tolist()

# ---- Transform skewed features (log1p for positive values) ----
for col in skewed_cols:
    X_train_capped[col] = np.log1p(X_train_capped[col])
    X_test_capped[col] = np.log1p(X_test_capped[col])

# ---- Scale numerical columns using RobustScaler ----
scaler = RobustScaler()
X_train_scaled = X_train_capped.copy()
X_test_scaled = X_test_capped.copy()

X_train_scaled[num_cols] = scaler.fit_transform(X_train_capped[num_cols])
X_test_scaled[num_cols] = scaler.transform(X_test_capped[num_cols])

print("After skew transform + scaling:")
print("X_train shape:", X_train_scaled.shape)
print("X_test shape:", X_test_scaled.shape)

After skew transform + scaling:
X_train shape: (1460, 75)
X_test shape: (1459, 75)


##**Encoding**

In [7]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# ---- 1. Ordinal Encoding ----
ordinal_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                'KitchenQual', 'HeatingQC', 'FireplaceQu', 'GarageQual', 'GarageCond']

ordinal_mapping = {
    'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0, 'NoBsmt': 0, 'NoFireplace': 0, 'NoGarage': 0
}

for col in ordinal_cols:
    X_train_capped[col] = X_train_capped[col].map(ordinal_mapping)
    X_test_capped[col] = X_test_capped[col].map(ordinal_mapping)

# ---- 2. One-Hot Encoding ----
ohe_cols = ['MSZoning', 'Street', 'LotShape', 'Neighborhood', 'BldgType', 'HouseStyle']

ohe = OneHotEncoder(sparse_output=False, drop='first')  # لتجنب dummy variable trap

X_train_ohe = pd.DataFrame(
    ohe.fit_transform(X_train_capped[ohe_cols]),
    columns=ohe.get_feature_names_out(ohe_cols),
    index=X_train_capped.index
)

X_test_ohe = pd.DataFrame(
    ohe.transform(X_test_capped[ohe_cols]),
    columns=ohe.get_feature_names_out(ohe_cols),
    index=X_test_capped.index
)

# ---- دمج الأعمدة المرمّزة مع الباقي ----
X_train_encoded = pd.concat([X_train_capped.drop(columns=ohe_cols), X_train_ohe], axis=1)
X_test_encoded = pd.concat([X_test_capped.drop(columns=ohe_cols), X_test_ohe], axis=1)

# ---- Align الأعمدة للتأكد من التطابق ----
X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)

print("X_train_encoded shape:", X_train_encoded.shape)
print("X_test_encoded shape:", X_test_encoded.shape)

X_train_encoded shape: (1460, 112)
X_test_encoded shape: (1459, 112)


In [8]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# ---- التأكد من أن كل الأعمدة الرقمية فقط قبل تدريب الموديل ----
# اكتشاف أي أعمدة categorical متبقية
remaining_cat_cols_train = X_train_encoded.select_dtypes(include='object').columns.tolist()
remaining_cat_cols_test = X_test_encoded.select_dtypes(include='object').columns.tolist()

# دمج الأعمدة المتبقية من train و test لضمان التوافق
all_remaining_cat_cols = list(set(remaining_cat_cols_train + remaining_cat_cols_test))

if all_remaining_cat_cols:
    ohe_final = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')

    # Apply OHE to X_train_encoded
    X_train_ohe = pd.DataFrame(
        ohe_final.fit_transform(X_train_encoded[all_remaining_cat_cols]),
        columns=ohe_final.get_feature_names_out(all_remaining_cat_cols),
        index=X_train_encoded.index
    )
    X_train_encoded = pd.concat([X_train_encoded.drop(columns=all_remaining_cat_cols), X_train_ohe], axis=1)

    # Apply OHE to X_test_encoded
    X_test_ohe = pd.DataFrame(
        ohe_final.transform(X_test_encoded[all_remaining_cat_cols]),
        columns=ohe_final.get_feature_names_out(all_remaining_cat_cols),
        index=X_test_encoded.index
    )
    X_test_encoded = pd.concat([X_test_encoded.drop(columns=all_remaining_cat_cols), X_test_ohe], axis=1)

print("X_train_encoded shape after final encoding:", X_train_encoded.shape)
print("X_test_encoded shape after final encoding:", X_test_encoded.shape)

X_train_encoded shape after final encoding: (1460, 218)
X_test_encoded shape after final encoding: (1459, 218)


In [9]:
# حفظ الملفات بعد المعالجة
X_train_encoded.to_csv('/content/X_train_final.csv', index=False)
X_test_encoded.to_csv('/content/X_test_final.csv', index=False)
y_train.to_csv('/content/y_train_final.csv', index=False)

##**Gradient Boosting**

In [56]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

# ---- Load the data ----
X_train = pd.read_csv('/content/X_train_final.csv')
X_test = pd.read_csv('/content/X_test_final.csv')
y_train = pd.read_csv('/content/y_train_final.csv')

# ---- Clean column names ----
X_train.columns = X_train.columns.str.strip().str.replace(' ', '_')
X_test.columns = X_test.columns.str.strip().str.replace(' ', '_')

# ---- Train/validation split ----
X_train_part, X_val, y_train_part, y_val = train_test_split(
    X_train, y_train.values.ravel(), test_size=0.2, random_state=42
)

# ---- Gradient Boosting Regressor ----
boost_tree = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)
boost_tree.fit(X_train_part, y_train_part)

# ---- Predict ----
y_pred_train = boost_tree.predict(X_train_part)
y_pred_val = boost_tree.predict(X_val)
y_pred_test = boost_tree.predict(X_test)  # لو حابة تحصلي على النتائج

# ---- R² Scores ----
r2_train = r2_score(y_train_part, y_pred_train)
r2_val = r2_score(y_val, y_pred_val)

print("Gradient Boosting - Training R²:", r2_train)
print("Gradient Boosting - Validation R²:", r2_val)

Gradient Boosting - Training R²: 0.9820059969501734
Gradient Boosting - Validation R²: 0.9127823549068426


In [57]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# ---- Define estimators for Stacking ----
estimators = [
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=100, random_state=42))
]

stack_model = StackingRegressor(
    estimators=estimators,
    final_estimator=LinearRegression()
)

# ---- Train the Stacking model ----
stack_model.fit(X_train_part, y_train_part)

# ---- Predict ----
y_pred_train_stack = stack_model.predict(X_train_part)
y_pred_val_stack = stack_model.predict(X_val)
y_pred_test_stack = stack_model.predict(X_test)  # لو حابة تحصلي على النتائج

# ---- R² Scores ----
r2_train_stack = r2_score(y_train_part, y_pred_train_stack)
r2_val_stack = r2_score(y_val, y_pred_val_stack)

print("Stacking Regressor - Training R²:", r2_train_stack)
print("Stacking Regressor - Validation R²:", r2_val_stack)

Stacking Regressor - Training R²: 0.9909637865850773
Stacking Regressor - Validation R²: 0.9162248845591724


In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# ---- Train/validation split ----
X_train_part, X_val, y_train_part, y_val = train_test_split(
    X_train, y_train.values.ravel(), test_size=0.2, random_state=42
)

# ---- XGBoost Regressor ----
xgb_model = XGBRegressor(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=4,  # أقل من 6
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# ---- Train the model ----
xgb_model.fit(X_train_part, y_train_part)

# ---- Predict & Evaluate ----
y_pred_xgb = xgb_model.predict(X_val)
r2_xgb = r2_score(y_val, y_pred_xgb)

print("XGBoost Regressor R²:", r2_xgb)

XGBoost Regressor R²: 0.9287456274032593


In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# ---- Train/validation split ----
X_train_part, X_val, y_train_part, y_val = train_test_split(
    X_train, y_train.values.ravel(), test_size=0.2, random_state=42
)

# ---- XGBoost Regressor ----
xgb_model = XGBRegressor(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=4,  # أقل من 6
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# ---- Train the model ----
xgb_model.fit(X_train_part, y_train_part)

# ---- Predict ----
y_pred_train_xgb = xgb_model.predict(X_train_part)
y_pred_val_xgb = xgb_model.predict(X_val)
y_pred_test_xgb = xgb_model.predict(X_test)  # لو حابة تحصلي على النتائج

# ---- R² Scores ----
r2_train_xgb = r2_score(y_train_part, y_pred_train_xgb)
r2_val_xgb = r2_score(y_val, y_pred_val_xgb)

print("XGBoost Regressor - Training R²:", r2_train_xgb)
print("XGBoost Regressor - Validation R²:", r2_val_xgb)

XGBoost Regressor - Training R²: 0.9915261268615723
XGBoost Regressor - Validation R²: 0.9287456274032593


In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# ---- Train/validation split ----
X_train_part, X_val, y_train_part, y_val = train_test_split(
    X_train, y_train.values.ravel(), test_size=0.2, random_state=42
)

# ---- Define models ----
models = {
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=400, learning_rate=0.05, max_depth=4, subsample=0.8, colsample_bytree=0.8, random_state=42),
    'Stacking': StackingRegressor(
        estimators=[
            ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
            ('xgb', XGBRegressor(n_estimators=100, random_state=42))
        ],
        final_estimator=LinearRegression()
    )
}

# ---- Evaluate models ----
results = []

for name, model in models.items():
    model.fit(X_train_part, y_train_part)
    y_pred_train = model.predict(X_train_part)
    y_pred_val = model.predict(X_val)

    r2_train = r2_score(y_train_part, y_pred_train)
    r2_val = r2_score(y_val, y_pred_val)

    overfit = "⚠ Overfitting" if r2_train - r2_val > 0.05 else "✅ Stable"

    results.append({
        'Model': name,
        'Train R²': r2_train,
        'Validation R²': r2_val
    })

# ---- Display results ----
results_df = pd.DataFrame(results)
print(results_df)

               Model  Train R²  Validation R²
0  Gradient Boosting  0.982006       0.912782
1            XGBoost  0.991526       0.928746
2           Stacking  0.990964       0.916225


In [67]:
import pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# ---- توقعات Train لكل موديل ----
y_train_part_pred_gb = boost_tree.predict(X_train_part)
y_train_part_pred_xgb = xgb_model.predict(X_train_part)
y_train_part_pred_stack = stack_model.predict(X_train_part)

# ---- توقعات Validation لكل موديل ----
y_val_pred_gb = boost_tree.predict(X_val)
y_val_pred_xgb = xgb_model.predict(X_val)
y_val_pred_stack = stack_model.predict(X_val)

# ---- قائمة الموديلات ----
models = ['Gradient Boosting', 'XGBoost', 'Stacking']
train_preds = [y_train_part_pred_gb, y_train_part_pred_xgb, y_train_part_pred_stack]
val_preds   = [y_val_pred_gb, y_val_pred_xgb, y_val_pred_stack]

# ---- حساب Metrics ----
results = []

for i, name in enumerate(models):
    r2_train = r2_score(y_train_part, train_preds[i])
    r2_val   = r2_score(y_val, val_preds[i])

    mae_train = mean_absolute_error(y_train_part, train_preds[i])
    mae_val   = mean_absolute_error(y_val, val_preds[i])

    rmse_train = np.sqrt(mean_squared_error(y_train_part, train_preds[i]))
    rmse_val   = np.sqrt(mean_squared_error(y_val, val_preds[i]))

    results.append({
        'Model': name,
        'Train R²': r2_train,
        'Validation R²': r2_val,
        'Train MAE': mae_train,
        'Validation MAE': mae_val,
        'Train RMSE': rmse_train,
        'Validation RMSE': rmse_val
    })

# ---- إنشاء جدول وعرضه ----
results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Model,Train R²,Validation R²,Train MAE,Validation MAE,Train RMSE,Validation RMSE
0,Gradient Boosting,0.982006,0.912782,7823.635335,16633.958449,10359.8329,25864.798257
1,XGBoost,0.991526,0.928746,5353.031738,15243.513672,7109.347931,23378.2901
2,Stacking,0.990964,0.916225,4591.511609,16421.92489,7341.454226,25349.210313
