# Different Approach for estimating the median price

In [1]:
import numbers
import statistics

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

set_config(transform_output="pandas")

In [2]:
# Load the full dataset. The path is relative to the current working directory,
# so this cell raises FileNotFoundError unless the notebook is started from a
# directory that contains `flood_tool/` (presumably the repository root — confirm).
df = pd.read_csv('flood_tool/resources/all_data.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'flood_tool/resources/all_data.csv'

In [None]:
# Inspect the freshly loaded table.
df

Instead of considering the whole dataset, we will try another approach: we will estimate the median price of a house just by looking at its postcode. Our assumption is that houses located in the same area should have similar prices.

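So we will split each UK postcode into its `district` (the part before the space, e.g. `SW1A`) and its `sector` (the district plus the first character after the space, e.g. `SW1A 1`).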

In [None]:
# Keep only what this approach needs: the postcode and the target price.
df = df.loc[:, ['postcode', 'medianPrice']]

In [None]:
# Display the frame after the column selection above.
df

In [None]:
# Work on an independent copy so the derived columns never touch `df`.
data = df.copy(deep=True)

In [None]:
# District = everything before the first space of the postcode.
data['district'] = data['postcode'].str.split(' ').str[0]

In [None]:
# Sector = the postcode without its last two characters, trimmed of whitespace.
data['sector'] = data['postcode'].str.slice(stop=-2).str.strip()

In [None]:
# Sector number = first character of the part after the space.
# Derived from `data` (not `df`) like the other derived columns, so the cell
# does not silently rely on `df` and `data` sharing the same index.
data['sector_number'] = data['postcode'].str.split(' ').str[1].str[0]

# Defining X and y + Train/Test split

In [None]:
# Target: the median price column.
y = data.loc[:, 'medianPrice']

In [None]:
# Features: every column except the target.
X = data.drop(columns=['medianPrice'])

In [None]:
# 80/20 split with a fixed seed so the evaluation is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Mean training price (missing values are skipped).
y_train_mean = y_train.mean(skipna=True)

In [None]:
# Inspect the table together with its derived postcode columns.
data

# KNN Regressor method

We will follow this approach:

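1. If the postcode appears in the training set, reuse its known median price.
2. Otherwise, if the postcode's sector is known, use the mean median price of that sector.
3. Otherwise, if the postcode's district is known, predict the price with a KNN regressor fitted on the sector number.
4. Otherwise, fall back to the mean median price of the training set.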

In [None]:
def calculate_median_price(X_train, y_train, X_test, data):
    """Estimate the median house price of each test postcode from training data.

    Each postcode in ``X_test`` is resolved with the first rule that applies:

    1. the postcode occurs in the training set with a known price -> that price;
    2. its sector (e.g. ``'AB1 2'``) has priced training rows -> the mean of
       those prices;
    3. its district (e.g. ``'AB1'``) occurs in the training set -> prediction of
       a 3-nearest-neighbour regressor fitted on the sector digit;
    4. otherwise (including malformed postcodes) -> the mean training price.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; only the ``'postcode'`` column is read.
    y_train : pandas.Series
        Training prices in the same row order as ``X_train``. May contain NaN.
    X_test : pandas.DataFrame
        Rows to predict; only the ``'postcode'`` column is read.
    data : pandas.DataFrame
        Accepted for backward compatibility and ignored. Earlier versions took
        exact prices and sector means from this frame, which leaked the test
        targets into the predictions whenever it also contained the test rows.

    Returns
    -------
    numpy.ndarray
        One float per row of ``X_test``; predictions that are still NaN are
        replaced by the mean training price.
    """

    def split_postcode(code):
        """Return (district, sector, sector digit), or None if malformed."""
        pieces = code.split() if isinstance(code, str) else []
        if len(pieces) < 2:
            return None
        district, inward = pieces[0], pieces[1]
        return district, district + ' ' + inward[0], inward[0]

    # Computed once instead of once per fallback row.
    global_mean = y_train.mean()

    # Pair postcodes and prices by position: train_test_split keeps X and y in
    # the same row order.
    train_parts = [split_postcode(code) for code in X_train['postcode']]
    train = pd.DataFrame({
        'postcode': X_train['postcode'].to_numpy(),
        'district': [parts[0] if parts else None for parts in train_parts],
        'sector': [parts[1] if parts else None for parts in train_parts],
        'sector_number': [parts[2] if parts else None for parts in train_parts],
        'price': np.asarray(y_train, dtype=float),
    })
    # Rows without a price cannot inform any lookup or the regressor.
    priced = train.dropna(subset=['price'])

    # Constant-time lookups replace the per-row scans over the full frame.
    known_districts = set(train['district'].dropna())
    price_by_postcode = priced.groupby('postcode')['price'].first().to_dict()
    price_by_sector = priced.groupby('sector')['price'].mean().to_dict()

    median_price = []
    knn_data = []      # sector digits waiting for a KNN prediction
    knn_indices = []   # their positions in `median_price`

    for i, code in enumerate(X_test['postcode']):
        parts = split_postcode(code)
        if code in price_by_postcode:
            median_price.append(price_by_postcode[code])
        elif parts is None:
            median_price.append(global_mean)
        elif parts[1] in price_by_sector:
            median_price.append(price_by_sector[parts[1]])
        elif parts[0] in known_districts:
            knn_data.append(parts[2])
            knn_indices.append(i)
            median_price.append(np.nan)  # placeholder for the KNN prediction
        else:
            median_price.append(global_mean)

    # Batch KNN predictions
    if knn_data:
        # The regressor needs numeric, NaN-free inputs.
        fit_rows = priced.assign(
            sector_number=pd.to_numeric(priced['sector_number'], errors='coerce')
        ).dropna(subset=['sector_number'])
        queries = pd.DataFrame({
            'sector_number': pd.to_numeric(pd.Series(knn_data), errors='coerce')
        })
        usable = queries['sector_number'].notna().to_numpy()

        if len(fit_rows) > 0 and usable.any():
            KNN_model = KNeighborsRegressor(
                n_neighbors=min(3, len(fit_rows)),  # never ask for more neighbours than rows
                weights='distance',
                n_jobs=-1,
            )
            KNN_model.fit(fit_rows[['sector_number']], fit_rows['price'])
            y_pred = KNN_model.predict(queries.loc[usable])

            for pos, pred in zip(np.flatnonzero(usable), y_pred):
                median_price[knn_indices[pos]] = pred

    # Anything still missing falls back to the mean training price.
    return np.nan_to_num(median_price, nan=global_mean)

In [None]:
# Predict the test prices with the KNN-based fallback chain.
median_price_prediction = calculate_median_price(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    data=data,
)

In [None]:
# Replace missing test targets with the mean training price so the metrics
# below can be computed.
# NOTE(review): scoring imputed targets distorts the metrics; consider masking
# the rows with a missing target in both arrays instead.
y_test_no_nan = np.nan_to_num(
    x=y_test,
    nan=y_train.mean(),
)

In [None]:
# Mean squared error of the KNN-based predictions.
mse = mean_squared_error(
    y_true=y_test_no_nan,
    y_pred=median_price_prediction,
)
print("Mean Squared Error:", mse)

In [None]:
# Root of `mse`, which puts the error back in the units of the price.
rmse = np.sqrt(mse)

In [None]:
# Show the RMSE computed in the previous cell.
rmse

In [None]:
# Coefficient of determination of the KNN-based predictions.
# `r2_score` comes from the import cell at the top of the notebook.
r2 = r2_score(y_test_no_nan, median_price_prediction)
print("R² on Test Data:", r2)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
def calculate_median_price_lr(X_train, y_train, X_test, data):
    """Estimate test-postcode prices, using linear regression as the model step.

    Each postcode in ``X_test`` is resolved with the first rule that applies:

    1. the postcode occurs in the training set with a known price -> that price;
    2. its sector (e.g. ``'AB1 2'``) has priced training rows -> the mean of
       those prices;
    3. its district (e.g. ``'AB1'``) occurs in the training set -> prediction of
       a linear regression fitted on the sector digit;
    4. otherwise (including malformed postcodes) -> the mean training price.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; only the ``'postcode'`` column is read.
    y_train : pandas.Series
        Training prices in the same row order as ``X_train``. May contain NaN.
    X_test : pandas.DataFrame
        Rows to predict; only the ``'postcode'`` column is read.
    data : pandas.DataFrame
        Accepted for backward compatibility and ignored. Earlier versions took
        exact prices and sector means from this frame, which leaked the test
        targets into the predictions whenever it also contained the test rows.

    Returns
    -------
    numpy.ndarray
        One float per row of ``X_test``; predictions that are still NaN are
        replaced by the mean training price.
    """

    def split_postcode(code):
        """Return (district, sector, sector digit), or None if malformed."""
        pieces = code.split() if isinstance(code, str) else []
        if len(pieces) < 2:
            return None
        district, inward = pieces[0], pieces[1]
        return district, district + ' ' + inward[0], inward[0]

    # Computed once instead of once per fallback row.
    global_mean = y_train.mean()

    # Pair postcodes and prices by position: train_test_split keeps X and y in
    # the same row order.
    train_parts = [split_postcode(code) for code in X_train['postcode']]
    train = pd.DataFrame({
        'postcode': X_train['postcode'].to_numpy(),
        'district': [parts[0] if parts else None for parts in train_parts],
        'sector': [parts[1] if parts else None for parts in train_parts],
        'sector_number': [parts[2] if parts else None for parts in train_parts],
        'price': np.asarray(y_train, dtype=float),
    })
    # Rows without a price cannot inform any lookup or the regression.
    priced = train.dropna(subset=['price'])

    # Constant-time lookups replace the per-row scans over the full frame.
    known_districts = set(train['district'].dropna())
    price_by_postcode = priced.groupby('postcode')['price'].first().to_dict()
    price_by_sector = priced.groupby('sector')['price'].mean().to_dict()

    median_price = []
    model_inputs = []    # sector digits waiting for a regression prediction
    model_indices = []   # their positions in `median_price`

    for i, code in enumerate(X_test['postcode']):
        parts = split_postcode(code)
        if code in price_by_postcode:
            median_price.append(price_by_postcode[code])
        elif parts is None:
            median_price.append(global_mean)
        elif parts[1] in price_by_sector:
            median_price.append(price_by_sector[parts[1]])
        elif parts[0] in known_districts:
            model_inputs.append(parts[2])
            model_indices.append(i)
            median_price.append(np.nan)  # placeholder for the regression prediction
        else:
            median_price.append(global_mean)

    # Batch linear-regression predictions
    if model_inputs:
        # The regression needs numeric, NaN-free inputs.
        fit_rows = priced.assign(
            sector_number=pd.to_numeric(priced['sector_number'], errors='coerce')
        ).dropna(subset=['sector_number'])
        queries = pd.DataFrame({
            'sector_number': pd.to_numeric(pd.Series(model_inputs), errors='coerce')
        })
        usable = queries['sector_number'].notna().to_numpy()

        if len(fit_rows) > 0 and usable.any():
            regressor = LinearRegression()
            regressor.fit(fit_rows[['sector_number']], fit_rows['price'])
            y_pred = regressor.predict(queries.loc[usable])

            for pos, pred in zip(np.flatnonzero(usable), y_pred):
                median_price[model_indices[pos]] = pred

    # Anything still missing falls back to the mean training price.
    return np.nan_to_num(median_price, nan=global_mean)

In [None]:
# Predict the test prices with the linear-regression fallback chain.
median_price_prediction_lr = calculate_median_price_lr(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    data=data,
)

In [None]:
# Replace missing test targets with the mean training price so the metrics
# below can be computed.
# NOTE(review): scoring imputed targets distorts the metrics; consider masking
# the rows with a missing target in both arrays instead.
y_test_no_nan = np.nan_to_num(
    x=y_test,
    nan=y_train.mean(),
)

In [None]:
# Mean squared error of the linear-regression-based predictions.
mse_2 = mean_squared_error(
    y_true=y_test_no_nan,
    y_pred=median_price_prediction_lr,
)
print("Mean Squared Error:", mse_2)

In [None]:
# RMSE of the linear-regression variant: take the root of `mse_2` (computed in
# the cell above), not of `mse`, which belongs to the KNN variant.
rmse = np.sqrt(mse_2)

In [None]:
# Show the RMSE computed in the previous cell.
rmse

In [None]:
# Coefficient of determination of the linear-regression-based predictions.
# `r2_score` comes from the import cell at the top of the notebook.
r2 = r2_score(y_test_no_nan, median_price_prediction_lr)
print("R² on Test Data:", r2)