# Analyse the Influence of Outliers

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Models & Normalization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Plotly rendering setup (static output is what makes PDF export possible)
import plotly.io as pio

# Configuration:
#   EXPORT_MODE = True  -> static images (PDF export)
#   EXPORT_MODE = False -> interactive zooming/hovering (exploration)
EXPORT_MODE = True

if not EXPORT_MODE:
    # Interactive plotly output inside the notebook
    pio.renderers.default = "notebook_connected"
    print("✅ Interactive mode. Charts will include zoom/hover.")
else:
    # Every chart is rendered as a static PNG (requires 'pip install -U kaleido')
    pio.renderers.default = "png"
    print("⚠️ EXPORT_MODE is ON. Charts will be static images.")

## Open the clean data

In [None]:
# Relative path of the cleaned house-sales CSV
file_path = "Data/v0_cleaned_house_sales.csv"

# Load the table and preview its first rows
df_clean = pd.read_csv(file_path)
df_clean.head()

## Analysing Prices

In [None]:
# Distribution of the sale prices
ax = sns.histplot(df_clean["price"])
ax.set_title("Histogram: price")

In [None]:
# Boxplot of the sale prices to make the upper-tail outliers visible
ax = sns.boxplot(df_clean["price"])
ax.set_title("Boxplot: price")

In [None]:
df_clean["price"].describe()

In [None]:
# Rows priced above 4 million (the original note counted 11 such houses)
df_clean[df_clean.price > 4e6]

## How much of the data is represented by outliers? 

In [None]:
df_sorted = df_clean["price"].sort_values()

# Cumulative distribution (ECDF) of the house prices
fig = px.ecdf(df_sorted, x="price", title="Cumulative Distribution of House Prices")

# Add marker lines at the 95th and 99th percentiles; the two annotations sit
# on opposite sides of their lines so the labels do not overlap.
fig.add_hline(y=0.95, line_dash="dot", annotation_text="95% of data", annotation_position="bottom right")
fig.add_hline(y=0.99, line_dash="dot", annotation_text="99% of data", annotation_position="top right")
fig.show()


In [None]:
# Upper-tail price thresholds (99th and 95th percentiles)
prices = df_clean["price"]
quantile_99 = prices.quantile(0.99)
quantile_95 = prices.quantile(0.95)

for label, threshold in (("99", quantile_99), ("95", quantile_95)):
    print(f"Quantile {label}: ", threshold)


In [None]:
# Flag columns: 1 when the price is strictly below the threshold (inside the
# quantile), 0 when it is at or above it.
for flag_column, threshold in (("q_99", quantile_99), ("q_95", quantile_95)):
    df_clean[flag_column] = df_clean["price"].lt(threshold).astype(int)

df_clean.head()

In [None]:
# Rows excluded by each threshold = total rows minus rows flagged as inside it
n_rows = len(df_clean)
for label, flag_column in (("99th", "q_99"), ("95th", "q_95")):
    print(f"The {label} quantile exclude {n_rows - df_clean[flag_column].sum()} datapoints")

## Sensitivity to outliers (metrics)

In [None]:
# Import only the helpers this notebook calls instead of `from utils import *`,
# so the origin of each name is explicit and nothing else gets shadowed.
from utils import add_new_metrics, create_metrics_df

# Metrics table that every experiment below extends through add_new_metrics
metrics_df = create_metrics_df()

### Random Forest flagging the outliers

In [None]:
# Train/test split for the "flagging" variant: the q_99 / q_95 columns stay in
# the feature matrix.
seed = 13

# Features: every column except the target
X = df_clean.drop(columns=["price"])
# Target: the sale price
y = df_clean["price"]

# 80/20 split, reproducible through the seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

In [None]:
X_train

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Default hyperparameters; only random_state is pinned (seed = 13)
rf_regressor = RandomForestRegressor(random_state=seed)
rf_regressor.fit(X_train, y_train)

# Record the metrics of the fitted forest on both splits under the same label
run_note = "Outliers flagging, no normalization."
for split_name, features, target in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
):
    metrics_df = add_new_metrics(
        metrics_df,
        rf_regressor,
        features,
        target,
        split=split_name,
        comments=run_note,
    )

In [None]:
metrics_df

### Random Forest removing top 1% outliers

In [None]:
# Keep only the rows flagged as inside the 99th percentile (top 1% removed)
inliers = df_clean.loc[df_clean["q_99"] == 1]

# Target: the sale price of the remaining rows
y = inliers["price"]

# Features: everything except the target and the two flag columns
X = inliers.drop(columns=["price", "q_99", "q_95"])

# 80/20 split, reproducible through the seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

In [None]:
X_train

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Default hyperparameters; only random_state is pinned (seed = 13)
rf_regressor = RandomForestRegressor(random_state=seed)
rf_regressor.fit(X_train, y_train)

# Record the metrics of the fitted forest on both splits under the same label
run_note = "Removing top 1%, no normalization."
for split_name, features, target in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
):
    metrics_df = add_new_metrics(
        metrics_df,
        rf_regressor,
        features,
        target,
        split=split_name,
        comments=run_note,
    )

In [None]:
metrics_df

### Random Forest removing top 5% outliers

In [None]:
# Keep only the rows flagged as inside the 95th percentile (top 5% removed)
inliers = df_clean.loc[df_clean["q_95"] == 1]

# Target: the sale price of the remaining rows
y = inliers["price"]

# Features: everything except the target and the two flag columns
X = inliers.drop(columns=["price", "q_99", "q_95"])

# 80/20 split, reproducible through the seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

In [None]:
X_train

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Default hyperparameters; only random_state is pinned (seed = 13)
rf_regressor = RandomForestRegressor(random_state=seed)
rf_regressor.fit(X_train, y_train)

# Record the metrics of the fitted forest on both splits under the same label
run_note = "Removing top 5%, no normalization."
for split_name, features, target in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
):
    metrics_df = add_new_metrics(
        metrics_df,
        rf_regressor,
        features,
        target,
        split=split_name,
        comments=run_note,
    )

In [None]:
metrics_df

### XGBoost flagging the outliers

In [None]:
# Train/test split for the "flagging" variant: the q_99 / q_95 columns stay in
# the feature matrix.
seed = 13

# Features: every column except the target
X = df_clean.drop(columns=["price"])
# Target: the sale price
y = df_clean["price"]

# 80/20 split, reproducible through the seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

In [None]:
import xgboost as xgb

# XGBoost regressor with no arguments other than the seed.
# (Despite the `_clf` suffix, this is a regressor, not a classifier.)
xgb_clf = xgb.XGBRegressor(seed = seed)
xgb_clf.fit(X_train, y_train)

In [None]:
# Record the metrics of the fitted XGBoost model on both splits
run_note = "Outliers flagging, no normalization."
for split_name, features, target in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
):
    metrics_df = add_new_metrics(
        metrics_df,
        xgb_clf,
        features,
        target,
        split=split_name,
        comments=run_note,
    )

In [None]:
metrics_df

### XGBoost removing top 1% 

In [None]:
# Keep only the rows flagged as inside the 99th percentile (top 1% removed)
inliers = df_clean.loc[df_clean["q_99"] == 1]

# Target: the sale price of the remaining rows
y = inliers["price"]

# Features: everything except the target and the two flag columns
X = inliers.drop(columns=["price", "q_99", "q_95"])

# 80/20 split, reproducible through the seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

In [None]:
X_train

In [None]:
import xgboost as xgb

# XGBoost regressor with no arguments other than the seed.
# (Despite the `_clf` suffix, this is a regressor, not a classifier.)
xgb_clf = xgb.XGBRegressor(seed = seed)
xgb_clf.fit(X_train, y_train)

In [None]:
# Record the metrics of the fitted XGBoost model on both splits
run_note = "Removing top 1%, no normalization."
for split_name, features, target in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
):
    metrics_df = add_new_metrics(
        metrics_df,
        xgb_clf,
        features,
        target,
        split=split_name,
        comments=run_note,
    )

In [None]:
metrics_df

### XGBoost removing top 5% 

In [None]:
# Keep only the rows flagged as inside the 95th percentile (top 5% removed)
inliers = df_clean.loc[df_clean["q_95"] == 1]

# Target: the sale price of the remaining rows
y = inliers["price"]

# Features: everything except the target and the two flag columns
X = inliers.drop(columns=["price", "q_99", "q_95"])

# 80/20 split, reproducible through the seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

In [None]:
import xgboost as xgb

# XGBoost regressor with no arguments other than the seed.
# (Despite the `_clf` suffix, this is a regressor, not a classifier.)
xgb_clf = xgb.XGBRegressor(seed = seed)
xgb_clf.fit(X_train, y_train)

In [None]:
# Record the metrics of the fitted XGBoost model on both splits
run_note = "Removing top 5%, no normalization."
for split_name, features, target in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
):
    metrics_df = add_new_metrics(
        metrics_df,
        xgb_clf,
        features,
        target,
        split=split_name,
        comments=run_note,
    )

In [None]:
metrics_df

Flagging the outliers (with both the 99% and 95% indicator columns) worked better than removing them, which suggests that the information about the top-priced houses is still important
for predicting prices accurately. We will therefore continue the analysis with the outlier flags as features instead of dropping the outlier rows.

# Data Leakage Analysis: Outlier Flagging

Our initial approach calculated the outlier thresholds using the entire dataset, introducing data leakage by allowing test-set information to influence the training features. While this is common in exploratory phases, it risks inflating model performance. In this section, we apply hypothesis testing to determine whether correcting this leakage results in a statistically significant difference in model performance (R² scores).

In [None]:
# Apply outlier flagging
def apply_flagging(X, y):
    """Return a copy of ``X`` with two binary outlier-flag columns added.

    The thresholds are the 99th and 95th percentiles of ``y`` (both are
    printed). ``q_99`` is 1 where the target is strictly below the 99th
    percentile and 0 otherwise; ``q_95`` is the same for the 95th percentile.
    Existing ``q_99`` / ``q_95`` columns are overwritten in the returned frame.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is not modified.
    y : pandas.Series
        Target values; the flags are assigned to ``X`` by index alignment.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` including the ``q_99`` and ``q_95`` columns.
    """
    # quantiles
    quantile_99 = y.quantile(0.99)
    quantile_95 = y.quantile(0.95)

    print("Quantile 99: ", quantile_99)
    print("Quantile 95: ", quantile_95)

    # Work on a copy: writing the columns into the caller's frame would be a
    # hidden side effect (and can trigger SettingWithCopyWarning on slices).
    flagged = X.copy()

    # Flag the datapoints inside each quantiles 99 and 95
    flagged["q_99"] = (y < quantile_99).astype(int)
    flagged["q_95"] = (y < quantile_95).astype(int)

    return flagged

## Preparing Datasets

In [None]:
# Rebuild the feature matrix and the target from the full cleaned table
seed = 13

# Features: every column except the target
X = df_clean.drop(columns=["price"])

# Target: the sale price
y = df_clean["price"]

### Dataset without Leakage

In [None]:
# Data without leakage: split first, then derive the outlier thresholds from
# the training target only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

# Flagging (thresholds are computed from y_train)
X_train = apply_flagging(X_train, y_train)

# Flag the test rows with the *training* thresholds. Computing the quantiles
# from y_test would use test-set statistics and give the flags a different
# meaning in train and test.
X_test = X_test.copy()
X_test["q_99"] = (y_test < y_train.quantile(0.99)).astype(int)
X_test["q_95"] = (y_test < y_train.quantile(0.95)).astype(int)

### Dataset with Leakage

In [None]:
# Data with leakage

# The thresholds are computed from the full target `y` before splitting, so
# the flags of the training rows also depend on the test-set prices.
X_leakage = apply_flagging(X, y)

# Same test_size and seed as the split without leakage
X_train_leak, X_test_leak, y_train_leak, y_test_leak = train_test_split(X_leakage, y, test_size=0.2, random_state=seed)

### Experiment metrics df

In [None]:
# New metrics dataframe

# Separate table for the leakage experiment, kept apart from metrics_df
leak_df = create_metrics_df()
leak_df

## Statistical Significance Test: Random Forest Performance

### Without leakage 

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Default hyperparameters; only random_state is pinned (seed = 13)
rf_regressor = RandomForestRegressor(random_state=seed)
rf_regressor.fit(X_train, y_train)

# Record the metrics on both splits of the data flagged after splitting
run_note = "No leakage."
for split_name, features, target in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
):
    leak_df = add_new_metrics(
        leak_df,
        rf_regressor,
        features,
        target,
        split=split_name,
        comments=run_note,
    )

leak_df

### With leakage

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Default hyperparameters; only random_state is pinned (seed = 13)
rf_regressor_leak = RandomForestRegressor(random_state=seed)
rf_regressor_leak.fit(X_train_leak, y_train_leak)

# Record the metrics on both splits of the data flagged before splitting
run_note = "with leakage."
for split_name, features, target in (
    ("train", X_train_leak, y_train_leak),
    ("test", X_test_leak, y_test_leak),
):
    leak_df = add_new_metrics(
        leak_df,
        rf_regressor_leak,
        features,
        target,
        split=split_name,
        comments=run_note,
    )

leak_df

In [None]:
# Hypothesis testing for the test scores
from scipy import stats
from sklearn.model_selection import cross_val_score
import numpy as np

# H0: scores_leakage = scores_no_leakage
# Ha: scores_leakage != scores_no_leakage
# NOTE(review): cross_val_score is expected to refit a clone of the estimator
# on every fold of the data passed in, so these scores would not come from the
# models fitted earlier — confirm this is the intended comparison.
cv_settings = {"cv": 5, "scoring": "r2"}

# Method 1: with leakage
scores_leakage = cross_val_score(rf_regressor_leak, X_test_leak, y_test_leak, **cv_settings)

# Method 2: without leakage
scores_no_leakage = cross_val_score(rf_regressor, X_test, y_test, **cv_settings)

# Paired t-test: the two score arrays are compared fold by fold
# (assumes both runs split the rows into the same folds — TODO confirm)
t_stat, p_value = stats.ttest_rel(scores_leakage, scores_no_leakage)

for label, fold_scores in (
    ("with leakage", scores_leakage),
    ("without leakage", scores_no_leakage),
):
    print(f"Mean R² {label}: {fold_scores.mean():.4f} ± {fold_scores.std():.4f}")
print(f"p-value: {p_value:.4f}")

# Decision at the 5% significance level
print(
    "Difference is statistically significant. We reject the null hypothesis."
    if p_value < 0.05
    else "Difference is NOT statistically significant. We cannot reject the null hypothesis."
)


In [None]:
# Hypothesis testing for the train scores
from scipy import stats
from sklearn.model_selection import cross_val_score
import numpy as np

# H0: scores_leakage = scores_no_leakage
# Ha: scores_leakage != scores_no_leakage
# NOTE(review): cross_val_score is expected to refit a clone of the estimator
# on every fold of the data passed in, so these scores would not come from the
# models fitted earlier — confirm this is the intended comparison.
cv_settings = {"cv": 5, "scoring": "r2"}

# Method 1: with leakage
scores_leakage = cross_val_score(rf_regressor_leak, X_train_leak, y_train_leak, **cv_settings)

# Method 2: without leakage
scores_no_leakage = cross_val_score(rf_regressor, X_train, y_train, **cv_settings)

# Paired t-test: the two score arrays are compared fold by fold
# (assumes both runs split the rows into the same folds — TODO confirm)
t_stat, p_value = stats.ttest_rel(scores_leakage, scores_no_leakage)

for label, fold_scores in (
    ("with leakage", scores_leakage),
    ("without leakage", scores_no_leakage),
):
    print(f"Mean R² {label}: {fold_scores.mean():.4f} ± {fold_scores.std():.4f}")
print(f"p-value: {p_value:.4f}")

# Decision at the 5% significance level
print(
    "Difference is statistically significant. We reject the null hypothesis."
    if p_value < 0.05
    else "Difference is NOT statistically significant. We cannot reject the null hypothesis."
)


## Statistical Significance Test: XGBoost Performance

### No leakage

In [None]:
import xgboost as xgb

# XGBoost regressor with only the seed set, trained on X_train / y_train
xgb_clf = xgb.XGBRegressor(seed=seed)
xgb_clf.fit(X_train, y_train)

# Record the metrics on both splits of the data flagged after splitting
run_note = "No leakage."
for split_name, features, target in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
):
    leak_df = add_new_metrics(
        leak_df,
        xgb_clf,
        features,
        target,
        split=split_name,
        comments=run_note,
    )

leak_df

In [None]:
# XGBoost regressor with only the seed set, trained on the leaked training data
xgb_clf_leak = xgb.XGBRegressor(seed=seed)
xgb_clf_leak.fit(X_train_leak, y_train_leak)

# Record the metrics on both splits of the data flagged before splitting
run_note = "with leakage."
for split_name, features, target in (
    ("train", X_train_leak, y_train_leak),
    ("test", X_test_leak, y_test_leak),
):
    leak_df = add_new_metrics(
        leak_df,
        xgb_clf_leak,
        features,
        target,
        split=split_name,
        comments=run_note,
    )

leak_df

In [None]:
# Hypothesis testing for the test scores
from scipy import stats
from sklearn.model_selection import cross_val_score
import numpy as np

# H0: scores_leakage = scores_no_leakage
# Ha: scores_leakage != scores_no_leakage
# NOTE(review): cross_val_score is expected to refit a clone of the estimator
# on every fold of the data passed in, so these scores would not come from the
# models fitted earlier — confirm this is the intended comparison.
cv_settings = {"cv": 5, "scoring": "r2"}

# Method 1: with leakage
scores_leakage = cross_val_score(xgb_clf_leak, X_test_leak, y_test_leak, **cv_settings)

# Method 2: without leakage
scores_no_leakage = cross_val_score(xgb_clf, X_test, y_test, **cv_settings)

# Paired t-test: the two score arrays are compared fold by fold
# (assumes both runs split the rows into the same folds — TODO confirm)
t_stat, p_value = stats.ttest_rel(scores_leakage, scores_no_leakage)

for label, fold_scores in (
    ("with leakage", scores_leakage),
    ("without leakage", scores_no_leakage),
):
    print(f"Mean R² {label}: {fold_scores.mean():.4f} ± {fold_scores.std():.4f}")
print(f"p-value: {p_value:.4f}")

# Decision at the 5% significance level
print(
    "Difference is statistically significant. We reject the null hypothesis."
    if p_value < 0.05
    else "Difference is NOT statistically significant. We cannot reject the null hypothesis."
)


In [None]:
# Hypothesis testing for the train scores
from scipy import stats
from sklearn.model_selection import cross_val_score
import numpy as np

# H0: scores_leakage = scores_no_leakage
# Ha: scores_leakage != scores_no_leakage
# NOTE(review): cross_val_score is expected to refit a clone of the estimator
# on every fold of the data passed in, so these scores would not come from the
# models fitted earlier — confirm this is the intended comparison.
cv_settings = {"cv": 5, "scoring": "r2"}

# Method 1: with leakage
scores_leakage = cross_val_score(xgb_clf_leak, X_train_leak, y_train_leak, **cv_settings)

# Method 2: without leakage
scores_no_leakage = cross_val_score(xgb_clf, X_train, y_train, **cv_settings)

# Paired t-test: the two score arrays are compared fold by fold
# (assumes both runs split the rows into the same folds — TODO confirm)
t_stat, p_value = stats.ttest_rel(scores_leakage, scores_no_leakage)

for label, fold_scores in (
    ("with leakage", scores_leakage),
    ("without leakage", scores_no_leakage),
):
    print(f"Mean R² {label}: {fold_scores.mean():.4f} ± {fold_scores.std():.4f}")
print(f"p-value: {p_value:.4f}")

# Decision at the 5% significance level
print(
    "Difference is statistically significant. We reject the null hypothesis."
    if p_value < 0.05
    else "Difference is NOT statistically significant. We cannot reject the null hypothesis."
)


## Conclusion on Leakage

Statistical testing showed no significant difference between the approaches. For the scope of this analysis, we will proceed with the current implementation for simplicity, while acknowledging that a production-grade deployment would require the leakage-free pipeline to ensure strict data isolation.

## Export Dataset and Metrics

In [None]:
# filename_metrics = "Metrics/outlier_analysis_metrics.csv"

# metrics_df.to_csv(filename_metrics, index = False)

In [None]:
# df_clean

In [None]:
# filename_data = "Data/v1_house_sales.csv"

# df_clean.to_csv(filename_data, index = False)