# Real Estate Price Prediction - COMPLETE BEST CODE

## Objective
Predict property prices (TND) using machine learning with CRISP-DM methodology

## Dataset
- 2,458 Tunisian properties
- 26 columns: location, amenity, and condition features, plus the listing identifier and the two price columns
- Target: price_tnd (Price in Tunisian Dinars)

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import pickle


## STEP 1: Load Data

In [2]:
# Read the cleaned listings; the first CSV column becomes the row index.
data = pd.read_csv('dataset_clean.csv', index_col=0)

# Summarise the regression target before inspecting individual rows.
price_tnd = data['price_tnd']
price_summary = {
    'Min': price_tnd.min(),
    'Max': price_tnd.max(),
    'Mean': price_tnd.mean(),
    'Median': price_tnd.median(),
}

print(f'Dataset shape: {data.shape}')
print('\nPrice Statistics (TND):')
for label, value in price_summary.items():
    print(f'  {label}: {value:,.0f}')
print('\nFirst few rows:')
data.head()

Dataset shape: (2458, 26)

Price Statistics (TND):
  Min: 8,000
  Max: 15,000,000
  Mean: 655,379
  Median: 429,898

First few rows:


Unnamed: 0,id,price_tnd,price_eur,location,city,governorate,Area,pieces,room,bathroom,...,garden,concierge,beach_view,mountain_view,pool,elevator,furnished,equipped_kitchen,central_heating,air_conditioning
1,863e62e5-0bfe-49f3-ad97-e0ae91be68e9,3250000.0,1007500.0,El Kantaoui,Hammam Sousse,Sousse,1000.0,26.0,16.0,14.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
5,67e06050-57c0-4665-8a2e-a9010b578e4a,1000000.0,310000.0,Sousse Riadh,Sousse Riadh,Sousse,1000.0,23.0,16.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,da303b71-fba9-42e9-a236-16ec176309d9,1200000.0,372000.0,Bou Mhel,Boumhel Bassatine,Ben Arous,510.0,18.0,14.0,4.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
16,ffc63109-309e-4026-aecf-6ccb22447c38,950000.0,294500.0,El Gourjani,Tunis,tunis,538.0,17.0,16.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,12c80a59-8f9d-4cee-9554-eeae7aa3ab9c,650000.0,201500.0,Raoued,Raoued,Ariana,400.0,17.0,9.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


## STEP 2: Prepare Data

In [3]:
# Candidate predictors: every numeric column except the two price columns
# (price_tnd is the regression target).
numeric_cols = data.select_dtypes(include=[np.number]).columns
price_columns = ('price_tnd', 'price_eur')
feature_names = [name for name in numeric_cols if name not in price_columns]

X = data.loc[:, feature_names]
y = data.loc[:, 'price_tnd']

# Keep only the rows where every feature and the target are present.
valid_idx = X.notnull().all(axis=1) & y.notnull()
X = X.loc[valid_idx]
y = y.loc[valid_idx]

print(f'Features: {len(feature_names)}')
print(f'Records: {len(X)}')
print('\nFeatures:')
print(feature_names)

Features: 19
Records: 2458

Features:
['Area', 'pieces', 'room', 'bathroom', 'state', 'latt', 'long', 'distance_to_capital', 'garage', 'garden', 'concierge', 'beach_view', 'mountain_view', 'pool', 'elevator', 'furnished', 'equipped_kitchen', 'central_heating', 'air_conditioning']


In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Learn the scaling statistics from the training rows only, then apply them
# to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

for split_name, split_frame in (('Train', X_train), ('Test', X_test)):
    print(f'{split_name} set: {len(split_frame)} records')
print('✓ Data prepared and scaled')

Train set: 1966 records
Test set: 492 records
✓ Data prepared and scaled


## CRISP-DM Compliance Section

This section is added to explicitly satisfy project requirements:
- Business Understanding
- Data Understanding
- Data Preparation
- Modeling (≥2 regression + ≥2 classification models)
- Testing & Evaluation (Regression R² ≥ 0.70, Classification Accuracy ≥ 80%)
- Model Deployment (Streamlit + save/load `.pkl`)

### 1) Business Understanding
Goal: estimate Tunisian property prices and classify listings into low/high price groups to support valuation and decision making.

### 2) Data Understanding
- Source: `dataset_clean.csv`
- Main target for regression: `price_tnd`
- Important predictors include area, geographic coordinates, amenities, and locality fields (`location`, `city`, `governorate`).

### 3) Data Preparation
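- Remove the listing identifier (`id`) and the leakage column (`price_eur`, the same price expressed in euros).
- Handle missing values (numeric columns: median; categorical columns: a `missing` label).
- Keep only listings whose price lies between the 5th and 95th percentiles; the reported metrics apply to this price band.
- Use leakage-safe (out-of-fold) target encoding for the categorical fields (`location`, `city`, `governorate`, `age`).
- Keep a fixed-seed 80/20 train/test split for evaluation.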

In [5]:
# CRISP-DM - Unified data preparation for compliant modeling
#
# Builds the train/test feature matrices shared by the regression and
# classification cells: numeric columns, two interaction features, and
# target-mean encodings of the categorical columns.
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb

# Load data
compliance_df = pd.read_csv('dataset_clean.csv', index_col=0)

# Base features/target: drop the price columns and the identifier when present
non_feature_cols = [c for c in ['price_tnd', 'price_eur', 'id'] if c in compliance_df.columns]
X_base = compliance_df.drop(columns=non_feature_cols).copy()
y_reg = compliance_df['price_tnd'].copy()

# Missing value handling: median for numeric columns, a 'missing' label
# (cast to str) for everything else
for col in X_base.columns:
    column_values = X_base[col]
    if pd.api.types.is_numeric_dtype(column_values):
        X_base[col] = column_values.fillna(column_values.median())
    else:
        X_base[col] = column_values.fillna('missing').astype(str)

# Outlier filter on regression target (same strategy that performed best)
# NOTE(review): the bounds are computed on the full target before the
# train/test split, so the held-out rows are trimmed by the same target-based
# rule; the test metrics describe listings inside this price band only.
q_low, q_high = y_reg.quantile(0.05), y_reg.quantile(0.95)
mask = y_reg.between(q_low, q_high)
X_base = X_base.loc[mask].copy()
y_reg = y_reg.loc[mask].copy()

# Train-test split (80/20, fixed seed)
X_train_raw, X_test_raw, y_train_reg, y_test_reg = train_test_split(
    X_base,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Binary classification target (high vs low): the cut-off is the median of the
# training prices only and is reused unchanged for the test rows
train_median_price = y_train_reg.median()
y_train_cls = y_train_reg.ge(train_median_price).astype(int)
y_test_cls = y_test_reg.ge(train_median_price).astype(int)

# Settings for the target encoding of categorical columns
cat_cols = [c for c in ['location', 'city', 'governorate', 'age'] if c in X_train_raw.columns]
kf = KFold(n_splits=5, shuffle=True, random_state=42)
global_mean = y_train_reg.mean()

X_train_num = X_train_raw.select_dtypes(include=[np.number]).copy()
X_test_num = X_test_raw.select_dtypes(include=[np.number]).copy()

# Extra interactions, added to both splits in the same column order
add_lat_long = {'latt', 'long'}.issubset(X_train_num.columns)
add_area_per_room = {'Area', 'room'}.issubset(X_train_num.columns)
for numeric_frame in (X_train_num, X_test_num):
    if add_lat_long:
        numeric_frame['lat_long_interaction'] = numeric_frame['latt'] * numeric_frame['long']
    if add_area_per_room:
        # +1 in the denominator avoids dividing by zero when room == 0
        numeric_frame['area_per_room'] = numeric_frame['Area'] / (numeric_frame['room'] + 1)

te_maps = {}
for col in cat_cols:
    train_categories = X_train_raw[col]
    oof = pd.Series(index=X_train_raw.index, dtype=float)

    # Each training row is encoded with category means learned from the other
    # folds; categories unseen in those folds fall back to the global mean.
    for tr_idx, val_idx in kf.split(X_train_raw):
        fold_map = y_train_reg.iloc[tr_idx].groupby(train_categories.iloc[tr_idx]).mean()
        held_out_encoded = train_categories.iloc[val_idx].map(fold_map)
        oof.iloc[val_idx] = held_out_encoded.fillna(global_mean).values

    # Test rows are encoded with means from the whole training split.
    full_map = y_train_reg.groupby(train_categories).mean()
    te_maps[col] = full_map.to_dict()
    X_train_num[f'{col}_te'] = oof.values
    X_test_num[f'{col}_te'] = X_test_raw[col].map(full_map).fillna(global_mean).values

print('CRISP-DM preparation complete')
print(f'Train shape: {X_train_num.shape}, Test shape: {X_test_num.shape}')
print(f'Encoded categorical columns: {cat_cols}')

CRISP-DM preparation complete
Train shape: (1783, 25), Test shape: (446, 25)
Encoded categorical columns: ['location', 'city', 'governorate', 'age']


### 4) Modeling — Regression (Best 2 Models)
This step trains the two selected regression models:
- Random Forest Regressor
- XGBoost Regressor

In [6]:
# 4) Modeling - Regression (best 2 models)
# Both regressors are fitted on the same prepared training matrix and scored
# (R² and MAE) on the held-out test matrix.
rf_reg_params = {
    'n_estimators': 500,
    'max_depth': 25,
    'min_samples_split': 3,
    'min_samples_leaf': 1,
    'random_state': 42,
    'n_jobs': -1,
}
xgb_reg_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 1600,
    'max_depth': 5,
    'learning_rate': 0.01,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'min_child_weight': 5,
    'reg_alpha': 0.0,
    'reg_lambda': 2.0,
    'random_state': 42,
    'n_jobs': -1,
}

# Random Forest regressor
reg_rf = RandomForestRegressor(**rf_reg_params)
reg_rf.fit(X_train_num, y_train_reg)
reg_rf_pred = reg_rf.predict(X_test_num)
reg_rf_r2 = r2_score(y_test_reg, reg_rf_pred)
reg_rf_mae = mean_absolute_error(y_test_reg, reg_rf_pred)

# XGBoost regressor
reg_xgb = xgb.XGBRegressor(**xgb_reg_params)
reg_xgb.fit(X_train_num, y_train_reg)
reg_xgb_pred = reg_xgb.predict(X_test_num)
reg_xgb_r2 = r2_score(y_test_reg, reg_xgb_pred)
reg_xgb_mae = mean_absolute_error(y_test_reg, reg_xgb_pred)

print('Regression results')
print(f'RF  -> R²: {reg_rf_r2:.4f}, MAE: {reg_rf_mae:,.0f}')
print(f'XGB -> R²: {reg_xgb_r2:.4f}, MAE: {reg_xgb_mae:,.0f}')

Regression results
RF  -> R²: 0.7328, MAE: 119,876
XGB -> R²: 0.7621, MAE: 112,514


### 4) Modeling — Classification (Best 2 Models)
This step trains the two selected classification models:
- Random Forest Classifier
- Gradient Boosting Classifier

In [7]:
# 4) Modeling - Classification (best 2 models)
# Both classifiers predict the binary high/low price label from the same
# prepared matrices and are scored by accuracy on the held-out rows.
rf_clf_params = {
    'n_estimators': 600,
    'max_depth': 12,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}
gb_clf_params = {
    'n_estimators': 400,
    'learning_rate': 0.03,
    'max_depth': 4,
    'random_state': 42,
}

# Random Forest classifier
clf_rf = RandomForestClassifier(**rf_clf_params)
clf_rf.fit(X_train_num, y_train_cls)
clf_rf_pred = clf_rf.predict(X_test_num)
clf_rf_acc = accuracy_score(y_test_cls, clf_rf_pred)

# Gradient Boosting classifier
clf_gb = GradientBoostingClassifier(**gb_clf_params)
clf_gb.fit(X_train_num, y_train_cls)
clf_gb_pred = clf_gb.predict(X_test_num)
clf_gb_acc = accuracy_score(y_test_cls, clf_gb_pred)

print('Classification results')
print(f'RF Classifier -> Accuracy: {clf_rf_acc:.4f}')
print(f'GB Classifier -> Accuracy: {clf_gb_acc:.4f}')

Classification results
RF Classifier -> Accuracy: 0.8879
GB Classifier -> Accuracy: 0.8789


### 5) Testing & Evaluation
Validate project thresholds:
- Regression: R² ≥ 0.70
- Classification: Accuracy ≥ 0.80

In [8]:
# 5) Testing & Evaluation - requirement checks
# Compare the best score of each task against the project thresholds.
best_reg_r2 = max(reg_rf_r2, reg_xgb_r2)
best_cls_acc = max(clf_rf_acc, clf_gb_acc)

# One row per requirement: (label, best score achieved, minimum required)
requirement_rows = [
    ('Regression R² >= 0.70', best_reg_r2, 0.70),
    ('Classification Accuracy >= 0.80', best_cls_acc, 0.80),
]
requirements_eval = pd.DataFrame(
    requirement_rows,
    columns=['Requirement', 'Best Achieved', 'Threshold'],
)
requirements_eval['Pass'] = requirements_eval['Best Achieved'].ge(requirements_eval['Threshold'])

regression_passed = best_reg_r2 >= 0.70
classification_passed = best_cls_acc >= 0.80

print(requirements_eval.to_string(index=False))
print('\nRequirement status:')
print(f'- Regression pass: {regression_passed}')
print(f'- Classification pass: {classification_passed}')

                    Requirement  Best Achieved  Threshold  Pass
          Regression R² >= 0.70       0.762139        0.7  True
Classification Accuracy >= 0.80       0.887892        0.8  True

Requirement status:
- Regression pass: True
- Classification pass: True


### Final Selected Models (Best 4)
This notebook keeps only these models:
- Regression: XGBoost Regressor, Random Forest Regressor
- Classification: Random Forest Classifier, Gradient Boosting Classifier

In [9]:
# Final selection: keep only the best 4 models
# A single roster drives both the name -> estimator dictionaries and the score table.
selected_models = [
    ('Regression', 'XGBoost Regressor', reg_xgb, 'R²', reg_xgb_r2),
    ('Regression', 'Random Forest Regressor', reg_rf, 'R²', reg_rf_r2),
    ('Classification', 'Random Forest Classifier', clf_rf, 'Accuracy', clf_rf_acc),
    ('Classification', 'Gradient Boosting Classifier', clf_gb, 'Accuracy', clf_gb_acc),
]

final_regression_models = {
    model_name: estimator
    for task, model_name, estimator, metric_name, score in selected_models
    if task == 'Regression'
}

final_classification_models = {
    model_name: estimator
    for task, model_name, estimator, metric_name, score in selected_models
    if task == 'Classification'
}

# Score table, grouped by task with the best score first within each task.
score_rows = [
    (task, model_name, metric_name, score)
    for task, model_name, estimator, metric_name, score in selected_models
]
final_model_scores = pd.DataFrame(score_rows, columns=['Type', 'Model', 'Metric', 'Score'])
final_model_scores = final_model_scores.sort_values(['Type', 'Score'], ascending=[True, False])

print('BEST 4 MODELS KEPT:')
print(final_model_scores.to_string(index=False))

BEST 4 MODELS KEPT:
          Type                        Model   Metric    Score
Classification     Random Forest Classifier Accuracy 0.887892
Classification Gradient Boosting Classifier Accuracy 0.878924
    Regression            XGBoost Regressor       R² 0.762139
    Regression      Random Forest Regressor       R² 0.732828


### 6) Model Deployment
Save and reload selected models and preprocessing artifacts (`.pkl`) for Streamlit inference.

In [10]:
# 6) Model deployment artifacts - save and load .pkl
#
# Refits the winning regressor and classifier on every filtered row (train and
# test together) and persists them with the preprocessing artifacts that the
# inference app needs to rebuild the same feature columns.
import pickle

from sklearn.base import clone

# clone() returns unfitted copies with identical hyperparameters. Fitting the
# originals here would silently retrain the models that were evaluated above
# (and are stored in final_regression_models / final_classification_models)
# on data that includes the test split.
best_reg_model = clone(reg_xgb if reg_xgb_r2 >= reg_rf_r2 else reg_rf)
best_cls_model = clone(clf_rf if clf_rf_acc >= clf_gb_acc else clf_gb)

X_deploy_raw = X_base.copy()
y_deploy_reg = y_reg.copy()

# Same numeric features and interactions, in the same column order, as the
# evaluated pipeline.
X_deploy_num = X_deploy_raw.select_dtypes(include=[np.number]).copy()
if {'latt', 'long'}.issubset(X_deploy_num.columns):
    X_deploy_num['lat_long_interaction'] = X_deploy_num['latt'] * X_deploy_num['long']
if {'Area', 'room'}.issubset(X_deploy_num.columns):
    X_deploy_num['area_per_room'] = X_deploy_num['Area'] / (X_deploy_num['room'] + 1)

te_maps_deploy = {}
global_mean_deploy = y_deploy_reg.mean()
for col in cat_cols:
    deploy_categories = X_deploy_raw[col]

    # Training features use out-of-fold category means, exactly like the
    # validated pipeline. Encoding a row with a mean that contains its own
    # price would let the model lean on the *_te columns far more than it can
    # at inference time, where the price is unknown.
    oof_deploy = pd.Series(index=X_deploy_raw.index, dtype=float)
    for fit_pos, hold_pos in kf.split(X_deploy_raw):
        fold_means = y_deploy_reg.iloc[fit_pos].groupby(deploy_categories.iloc[fit_pos]).mean()
        held_out_encoded = deploy_categories.iloc[hold_pos].map(fold_means)
        oof_deploy.iloc[hold_pos] = held_out_encoded.fillna(global_mean_deploy).values
    X_deploy_num[f'{col}_te'] = oof_deploy.values

    # The mapping saved for inference is built from all rows; categories that
    # are absent from it fall back to 'global_mean' in the artifacts.
    mapping = y_deploy_reg.groupby(deploy_categories).mean()
    te_maps_deploy[col] = mapping.to_dict()

best_reg_model.fit(X_deploy_num, y_deploy_reg)

# High/low label uses the median of all deployment prices; the same value is
# stored as 'class_threshold'.
class_threshold_deploy = float(y_deploy_reg.median())
y_deploy_cls = (y_deploy_reg >= class_threshold_deploy).astype(int)
best_cls_model.fit(X_deploy_num, y_deploy_cls)

artifacts = {
    'feature_columns': list(X_deploy_num.columns),
    'cat_cols': cat_cols,
    'te_maps': te_maps_deploy,
    'global_mean': float(global_mean_deploy),
    'class_threshold': class_threshold_deploy
}

with open('regression_model.pkl', 'wb') as f:
    pickle.dump(best_reg_model, f)
with open('classification_model.pkl', 'wb') as f:
    pickle.dump(best_cls_model, f)
with open('preprocessing_artifacts.pkl', 'wb') as f:
    pickle.dump(artifacts, f)

# Reload what was just written and check that it is usable.
# (pickle.load executes arbitrary code: only load files you created yourself.)
with open('regression_model.pkl', 'rb') as f:
    reloaded_reg_model = pickle.load(f)
with open('classification_model.pkl', 'rb') as f:
    reloaded_cls_model = pickle.load(f)
with open('preprocessing_artifacts.pkl', 'rb') as f:
    reloaded_artifacts = pickle.load(f)

# Round-trip check: the reloaded objects must reproduce the in-memory ones.
sample_rows = X_deploy_num.head()
assert np.allclose(reloaded_reg_model.predict(sample_rows), best_reg_model.predict(sample_rows))
assert (reloaded_cls_model.predict(sample_rows) == best_cls_model.predict(sample_rows)).all()
assert reloaded_artifacts['feature_columns'] == artifacts['feature_columns']

print('Saved and reloaded deployment files successfully:')
print('- regression_model.pkl')
print('- classification_model.pkl')
print('- preprocessing_artifacts.pkl')

Saved and reloaded deployment files successfully:
- regression_model.pkl
- classification_model.pkl
- preprocessing_artifacts.pkl
