# Different Approach for estimating the median price

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsRegressor

import numbers
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

import statistics

from sklearn import set_config
set_config(transform_output="pandas")

In [2]:
df = pd.read_csv('flood_tool/resources/postcodes_labelled.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,postcode,easting,northing,soilType,elevation,localAuthority,riskLabel,medianPrice,historicallyFlooded
0,0,OL9 7NS,390978,403269,Unsurveyed/Urban,130,Oldham,1,119100.0,False
1,1,WV13 2LR,396607,298083,Unsurveyed/Urban,130,Walsall,1,84200.0,False
2,2,LS12 1LZ,427859,432937,Unsurveyed/Urban,60,Leeds,1,134900.0,False
3,3,SK15 1TS,395560,397900,Unsurveyed/Urban,120,Tameside,1,170200.0,False
4,4,TS17 9NN,445771,515362,Unsurveyed/Urban,20,Stockton-on-Tees,1,190600.0,False
...,...,...,...,...,...,...,...,...,...,...
79995,49995,L31 1DW,339403,399211,Unsurveyed/Urban,20,Sefton,1,221200.0,False
79996,49996,DL2 2WA,429177,514182,Unsurveyed/Urban,40,Darlington,1,295300.0,False
79997,49997,OL6 6HL,394074,399425,Unsurveyed/Urban,120,Tameside,1,126400.0,False
79998,49998,SR1 3BD,439200,557000,Unsurveyed/Urban,30,Sunderland,1,602400.0,False


Instead of considering whole dataset, we will try another approach. We will calculate the median price of the houses just by looking at the postcode. We think that houses that are located in same areas, should have a similar price.

So, we will split the UK postcode by the `sector` and the `district`

In [4]:
df = df[['postcode','medianPrice']]

In [5]:
df

Unnamed: 0,postcode,medianPrice
0,OL9 7NS,119100.0
1,WV13 2LR,84200.0
2,LS12 1LZ,134900.0
3,SK15 1TS,170200.0
4,TS17 9NN,190600.0
...,...,...
79995,L31 1DW,221200.0
79996,DL2 2WA,295300.0
79997,OL6 6HL,126400.0
79998,SR1 3BD,602400.0


In [6]:
data = df.copy()

In [7]:
data['district'] = data['postcode'].str.split(' ', expand=True)[0]

In [8]:
data['sector'] = data['postcode'].str[:-2].str.strip()

In [9]:
data['sector_number'] = df['postcode'].str.split(' ').str[1].str[0]

# Defining X and y + Train/Test splt

In [10]:
y = data['medianPrice'] 

In [11]:
X = data.drop(columns='medianPrice')

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [13]:
y_train_mean = y_train.median()

In [14]:
data

Unnamed: 0,postcode,medianPrice,district,sector,sector_number
0,OL9 7NS,119100.0,OL9,OL9 7,7
1,WV13 2LR,84200.0,WV13,WV13 2,2
2,LS12 1LZ,134900.0,LS12,LS12 1,1
3,SK15 1TS,170200.0,SK15,SK15 1,1
4,TS17 9NN,190600.0,TS17,TS17 9,9
...,...,...,...,...,...
79995,L31 1DW,221200.0,L31,L31 1,1
79996,DL2 2WA,295300.0,DL2,DL2 2,2
79997,OL6 6HL,126400.0,OL6,OL6 6,6
79998,SR1 3BD,602400.0,SR1,SR1 3,3


# KNN Regressor method

We will follow this approach:

1. 

In [15]:
def calculate_median_price(X_train, y_train, X_test, data):
    
    median_price = []
    for code in X_test['postcode']:
        
        if code in X_train['postcode'].values:
            median_price.append(data[data['postcode'] == code]['medianPrice'].values[0])
            
        elif code.split(' ')[0] + ' ' + code.split(' ')[1][0] in data['sector'].values:
            sec = code.split(' ')[0] + ' ' + code.split(' ')[1][0]
            median_price.append(data[data['sector'] == sec]['medianPrice'].median())
            
        elif code.split(' ')[0] in data['district'].values:
            district = data[data['district'] == code.split(' ')[0]]
            X_test_sec_num = code.split(' ')[1][0]

            dt_model = DecisionTreeRegressor(max_depth=7, min_samples_leaf=0.23, random_state=3)
            X = district[['sec_num']]
            y = district['medianPrice']
            dt_model.fit(X_train[['sec_num']], y_train)
            y_pred = dt_model.predict(pd.DataFrame([X_test_sec_num], columns=['sec_num']))
            median_price.append(y_pred[0])
            
        else:
            median_price.append(y_train.median())

    median_price = np.array(median_price)
    median_price_no_nan = np.nan_to_num(median_price, nan=y_train.mean())

    return median_price_no_nan

In [16]:
median_price_prediction = calculate_median_price(X_train, y_train, X_test, data)

KeyboardInterrupt: 

In [None]:
y_test_no_nan = np.nan_to_num(y_test, nan=y_train.mean())

In [None]:
mse = mean_squared_error(y_test_no_nan, median_price_prediction)
print("Mean Squared Error:", mse)

In [None]:
rmse = np.sqrt(mse)

In [None]:
rmse

# To deploy in the class:

In [None]:
def calculate_median_price_from_postcodes(self, postcode):
    
        sector_means = self.data.groupby('sector')['medianPrice'].mean()
        knn_data = []
        knn_indices = []
        median_price = []

        for code in postcode:
            
            if code in self.X_train['postcode'].values:
                median_price.append(self.data[self.data['postcode'] == code]['medianPrice'].values[0])
                
            elif (sec := code.split(' ')[0] + ' ' + code.split(' ')[1][0]) in sector_means:
                median_price.append(sector_means[sec])
                
            elif code.split(' ')[0] in self.data['district'].values:
                knn_data.append(code.split(' ')[1][0])
                knn_indices.append(i)
                median_price.append(None)  # Placeholder for KNN prediction
                
            else:
                median_price.append(self.y_train.median())

        if knn_data:
            X_test_sector_number = pd.DataFrame(knn_data, columns=['sector_number'])
            KNN_model = KNeighborsRegressor(n_neighbors=3, weights='distance', n_jobs=-1)
            KNN_model.fit(self.X_train[['sector_number']], self.y_train)
            y_pred = KNN_model.predict(X_test_sector_number)

            for idx, pred in zip(knn_indices, y_pred):
                median_price[idx] = pred

        median_price_no_nan = np.nan_to_num(median_price, nan=self.y_train.median())
        
        return pd.Series(data= median_price_no_nan, index = postcode, name='extra')

     
    

In [None]:
class Tool(object):
    # Existing __init__ and other methods remain unchanged

    def prepare_data(self):
        # Assuming 'self.df_combined_stacked' contains the necessary 'postcode' and 'medianPrice' data
        data = self.df_combined_stacked[['postcode', 'medianPrice']].copy()
        data['district'] = data['postcode'].str.split(' ', expand=True)[0]
        data['sector'] = data['postcode'].str[:-2].str.strip()
        data['sector_number'] = data['postcode'].str.split(' ').str[1].str[0]
        return data

    def calculate_median_price(self, X_train, y_train, X_test):
        data = self.prepare_data()
        sector_means = data.groupby('sector')['medianPrice'].mean()
        knn_data = []
        knn_indices = []
        median_price = []

        for i, code in enumerate(X_test['postcode']):
            if code in X_train['postcode'].values:
                median_price.append(data[data['postcode'] == code]['medianPrice'].values[0])
            elif (sec := code.split(' ')[0] + ' ' + code.split(' ')[1][0]) in sector_means:
                median_price.append(sector_means[sec])
            elif code.split(' ')[0] in data['district'].values:
                knn_data.append(code.split(' ')[1][0])
                knn_indices.append(i)
                median_price.append(None)  # Placeholder for KNN prediction
            else:
                median_price.append(y_train.mean())

        if knn_data:
            X_test_sector_number = pd.DataFrame(knn_data, columns=['sector_number'])
            KNN_model = KNeighborsRegressor(n_neighbors=3, weights='distance', n_jobs=-1)
            KNN_model.fit(X_train[['sector_number']], y_train)
            y_pred = KNN_model.predict(X_test_sector_number)

            for idx, pred in zip(knn_indices, y_pred):
                median_price[idx] = pred

        median_price_no_nan = np.nan_to_num(median_price, nan=y_train.mean())
        return median_price_no_nan

    def predict_median_house_price(self, postcodes, method="custom_knn"):
        data = self.prepare_data()
        y = data['medianPrice']
        X = data.drop(columns='medianPrice')
        X_train, X_test, y_train, _ = train_test_split(X, y, train_size=0.8, random_state=42)

        X_test_filtered = X_test[X_test['postcode'].isin(postcodes)]

        if method == "custom_knn":
            median_price_prediction = self.calculate_median_price(X_train, y_train, X_test_filtered)
            return pd.Series(median_price_prediction, index=X_test_filtered['postcode'], name="medianPrice")
        else:
            # Handle other methods
            pass

# Usage
tool = Tool(
    # Initialize with necessary data paths
)
predicted_prices = tool.predict_median_house_price(['postcode1', 'postcode2'])

In [None]:
df = pd.read_csv('flood_tool/resources/all_data.csv')
df = df[['postcode','medianPrice']]
data = df.copy()
data['district'] = data['postcode'].str.split(' ', expand=True)[0]
data['sector'] = data['postcode'].str[:-2].str.strip()
data['sector_number'] = df['postcode'].str.split(' ').str[1].str[0]
y = data['medianPrice'] 
X = data.drop(columns='medianPrice')
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)
y_train_mean = y_train.mean()

def calculate_median_price(self, postcode):
    # Pre-compute mean median prices for each sector
    sector_means = data.groupby('sector')['medianPrice'].mean()

    # Prepare for batch d_tree prediction
    d_tree_data = []
    d_tree_indices = []

    median_price = []
    
    for i, code in enumerate(postcode):
        
        if code in X_train['postcode'].values:
            median_price.append(data[data['postcode'] == code]['medianPrice'].values[0])
            
        elif (sec := code.split(' ')[0] + ' ' + code.split(' ')[1][0]) in sector_means:
            median_price.append(sector_means[sec])
            
        elif code.split(' ')[0] in data['district'].values:
            d_tree_data.append(code.split(' ')[1][0])
            d_tree_indices.append(i)
            median_price.append(None) 
            
        else:
            median_price.append(y_train.mean())

  
    if d_tree_data:
        X_test_sector_number = pd.DataFrame(d_tree_data, columns=['sector_number'])
        d_tree_model = DecisionTreeRegressor(max_depth=7, min_samples_leaf=0.23, random_state=3)
        d_tree_model.fit(X_train[['sector_number']], y_train)
        y_pred = d_tree_model.predict(X_test_sector_number)

        for idx, pred in zip(d_tree_indices, y_pred):
            median_price[idx] = pred

    median_price_no_nan = np.nan_to_num(median_price, nan=y_train.mean())
    return median_price_no_nan

median_price_prediction = calculate_median_price(X_train, y_train, X_test, data)
y_test_no_nan = np.nan_to_num(y_test, nan=y_train.mean())
mse = mean_squared_error(y_test_no_nan, median_price_prediction)
rmse = np.sqrt(mse)


In [None]:
postcode = [1,2,2,2,3,4,4]

for i, code in enumerate(postcode):
    print(code)