# Different Approach for estimating the median price

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsRegressor

import numbers
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

import statistics

from sklearn import set_config
set_config(transform_output="pandas")

In [3]:
df = pd.read_csv('flood_tool/resources/postcodes_labelled.csv')

In [4]:
df

Unnamed: 0.1,Unnamed: 0,postcode,easting,northing,soilType,elevation,localAuthority,riskLabel,medianPrice,historicallyFlooded
0,0,OL9 7NS,390978,403269,Unsurveyed/Urban,130,Oldham,1,119100.0,False
1,1,WV13 2LR,396607,298083,Unsurveyed/Urban,130,Walsall,1,84200.0,False
2,2,LS12 1LZ,427859,432937,Unsurveyed/Urban,60,Leeds,1,134900.0,False
3,3,SK15 1TS,395560,397900,Unsurveyed/Urban,120,Tameside,1,170200.0,False
4,4,TS17 9NN,445771,515362,Unsurveyed/Urban,20,Stockton-on-Tees,1,190600.0,False
...,...,...,...,...,...,...,...,...,...,...
79995,49995,L31 1DW,339403,399211,Unsurveyed/Urban,20,Sefton,1,221200.0,False
79996,49996,DL2 2WA,429177,514182,Unsurveyed/Urban,40,Darlington,1,295300.0,False
79997,49997,OL6 6HL,394074,399425,Unsurveyed/Urban,120,Tameside,1,126400.0,False
79998,49998,SR1 3BD,439200,557000,Unsurveyed/Urban,30,Sunderland,1,602400.0,False


In [5]:
df.columns

Index(['Unnamed: 0', 'postcode', 'easting', 'northing', 'soilType',
       'elevation', 'localAuthority', 'riskLabel', 'medianPrice',
       'historicallyFlooded'],
      dtype='object')

We will just work with 2 columns, the `postcode` and the `MedianPrice`.

The `postcode` will be splitted into the `sector`, the `district`, and the `sector number`.


And we will follow this structure:


- If the exact postcode is found in the training set, the corresponding median price is used.

- If not, but the postcode's sector is found, the average median price for that sector is used.

- If the sector is not found but the outward district is found, a regressor algorithm (Decision Tree Regressor, KNN Regressor) is trained on the fly for that specific district using 'sector_number' as a feature, and the model is used to predict the median price (we took sector number so that we can peform regression with a numerical feature)

- If none of these conditions are met, we replace the medianPrice, by the mean medianPrice of our whole train set.

In [11]:
df = df[['postcode', 'medianPrice']]
df['outwardDistrict'] = df['postcode'].apply(lambda x: x.split(' ')[0])
df['sector'] = df['postcode'].apply(lambda x: x[:-2].strip())
df['sec_num'] = df['postcode'].apply(lambda x: x.split(' ')[1][0])

# Splitting into features and target
y = df['medianPrice'] 
X = df.drop(columns='medianPrice')

# Splitting into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
y_train_mean = y_train.mean()

# Prediction logic
median_price = []
for code in X_test['postcode']:
    
    if code in X_train['postcode']:
        median_price.append(df[df['postcode'] == code]['medianPrice'].values[0])
        
    elif code.split(' ')[0] + ' ' + code.split(' ')[1][0] in df['sector'].values:
        sec = code.split(' ')[0] + ' ' + code.split(' ')[1][0]
        median_price.append(df[df['sector'] == sec]['medianPrice'].mean())
        
    elif code.split(' ')[0] in df['outwardDistrict'].values:
        district = df[df['outwardDistrict'] == code.split(' ')[0]]
        X_test_sec_num = code.split(' ')[1][0]

        rf_model = DecisionTreeRegressor(max_depth=5, min_samples_leaf=0.13, n_jobs=-1)
        X = district[['sec_num']]
        y = district['medianPrice']
        KNN_model.fit(X_train[['sec_num']], y_train)
        y_pred = KNN_model.predict(pd.DataFrame([X_test_sec_num], columns=['sec_num']))
        median_price.append(y_pred[0])
    else:
        median_price.append(np.nan)

median_price = np.array(median_price)
median_price_no_nan = np.nan_to_num(median_price, nan=y_train_mean)

# Handling NaN values in y_test
y_test_no_nan = np.nan_to_num(y_test, nan=y_train_mean)

# Calculating Mean Squared Error
mse = mean_squared_error(y_test_no_nan, median_price_no_nan)
print("Mean Squared Error:", mse)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['outwardDistrict'] = df['postcode'].apply(lambda x: x.split(' ')[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sector'] = df['postcode'].apply(lambda x: x[:-2].strip())


Mean Squared Error: 303255015900.22675


In [17]:
df

Unnamed: 0,postcode,medianPrice,outwardDistrict,sector,sec_num
0,OL9 7NS,119100.0,OL9,OL9 7,7
1,WV13 2LR,84200.0,WV13,WV13 2,2
2,LS12 1LZ,134900.0,LS12,LS12 1,1
3,SK15 1TS,170200.0,SK15,SK15 1,1
4,TS17 9NN,190600.0,TS17,TS17 9,9
...,...,...,...,...,...
79995,L31 1DW,221200.0,L31,L31 1,1
79996,DL2 2WA,295300.0,DL2,DL2 2,2
79997,OL6 6HL,126400.0,OL6,OL6 6,6
79998,SR1 3BD,602400.0,SR1,SR1 3,3


In [12]:
rmse = np.sqrt(mse)

In [13]:
rmse

550685.9503385089

In [14]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test_no_nan, median_price_no_nan)
print("R² on Test Data:", r2)

R² on Test Data: 0.9024512646955822


We obtain a really high R2 !!!

## Adapting it so that it works in our class

In [16]:
class AuxiliarPricePredictor():
    def prepare_data(self):
        # Assuming 'self.df_combined_stacked' contains the necessary 'postcode' and 'medianPrice' data
        data = self.training_data[['postcode', 'medianPrice']].copy()
        data['district'] = data['postcode'].str.split(' ', expand=True)[0]
        data['sector'] = data['postcode'].str[:-2].str.strip()
        data['sector_number'] = data['postcode'].str.split(' ').str[1].str[0]
        self.data = data
        return self.data
    
    def calculate_median_price(self, postcode):
        # Pre-compute mean median prices for each sector
        sector_means = self.data.groupby('sector')['medianPrice'].mean()
 
        # Prepare for batch d_tree prediction
        d_tree_data = []
        d_tree_indices = []
 
        median_price = []
        
        for i, code in enumerate(postcode):
            
            if code in self.data['postcode'].values:
                median_price.append(self.data[self.data['postcode'] == code]['medianPrice'].values[0])
                
            elif (sec := code.split(' ')[0] + ' ' + code.split(' ')[1][0]) in sector_means:
                median_price.append(sector_means[sec])
                
            elif code.split(' ')[0] in self.data['district'].values:
                d_tree_data.append(code.split(' ')[1][0])
                d_tree_indices.append(i)
                median_price.append(None)
                
            else:
                median_price.append(self.data['medianPrice'].mean())
 
    
        if d_tree_data:
            X_test_sector_number = pd.DataFrame(d_tree_data, columns=['sector_number'])
            d_tree_model = DecisionTreeRegressor(max_depth=7, min_samples_leaf=0.23, random_state=3)
            d_tree_model.fit(self.data[['sector_number']], self.data['medianPrice'])
            y_pred = d_tree_model.predict(X_test_sector_number)
 
            for idx, pred in zip(d_tree_indices, y_pred):
                median_price[idx] = pred
 
        median_price_no_nan = np.nan_to_num(median_price, nan=self.data['medianPrice'].mean())
        return pd.Series(data= median_price_no_nan, index = postcode, name="decision tree method")
    
 
    def predict_median_house_price(
            self, postcodes: list[str], method: str = "all_england_median"
        ) -> pd.Series:
            """
            Generate series predicting median house price for a collection
            of poscodes.

            Parameters
            ----------

            postcodes : sequence of strs
                Sequence of postcodes.
            method : int (optional)
                optionally specify (via a key in the
                get_house_price_methods dict) the regression
                method to be used.

            Returns
            -------

            pandas.Series
                Series of median house price estimates indexed by postcodes.
            """

            # retrieve postcodes from postcode data source
            X = self.combined_data[self.combined_data["postcode"].isin(postcodes)].sort_values(
                by="postcode"
            )
            X.drop_duplicates(inplace=True)

            match method:
                case "decision_tree":
                    aux_price_predictor = AuxiliarPricePredictor()
                    aux_price_predictor.data = self.training_data
                    return aux_price_predictor.calculate_median_price(X["postcode"])