In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the sampled postcode table. The path is relative, so it is resolved
# against the kernel's current working directory.
postcodes_csv = '../flood_tool/resources/postcodes_sampled.csv'
df = pd.read_csv(postcodes_csv)

df

Unnamed: 0,postcode,sector,easting,northing,localAuthority,altitude,soilType,riskLabel,medianPrice
0,PO7 8PR,PO7 8,469395.0,108803.0,Havant,30,Planosols,1,233500.0
1,SO17 1NS,SO17 1,442771.0,114321.0,Southampton,20,Unsurveyed/Urban,1,291800.0
2,TN28 8XN,TN28 8,606861.0,124689.0,Folkestone and Hythe,10,Cambisols,1,326500.0
3,KT3 4JW,KT3 4,521649.0,168848.0,Kingston upon Thames,20,Unsurveyed/Urban,1,875200.0
4,CT2 8AA,CT2 8,614532.0,158074.0,Canterbury,10,Unsurveyed/Urban,10,303500.0
...,...,...,...,...,...,...,...,...,...
39995,SE22 8BE,SE22 8,533403.0,175417.0,Southwark,20,Unsurveyed/Urban,1,674300.0
39996,SW10 0JB,SW10 0,526500.0,177609.0,Kensington and Chelsea,0,Unsurveyed/Urban,1,884900.0
39997,HP21 9QS,HP21 9,482072.0,211761.0,Buckinghamshire,90,Cambisols,1,302000.0
39998,TN15 8NY,TN15 8,560877.0,157522.0,Tonbridge and Malling,90,Luvisols,1,190000.0


In [3]:
# Missing-value count per column (isna is the same check as isnull).
df.isna().sum()

postcode          0
sector            0
easting           0
northing          0
localAuthority    0
altitude          0
soilType          0
riskLabel         0
medianPrice       0
dtype: int64

In [4]:
# Number of rows that exactly repeat an earlier row across all columns;
# the first occurrence of each row is not counted as a duplicate.
duplicates_count = df.duplicated(keep='first').sum()
print(duplicates_count)

0


In [5]:
# Class distribution of the target column, most frequent label first.
df['riskLabel'].value_counts()

1     36114
4      1588
6       870
8       461
5       254
7       205
3       161
10      131
2       114
9       102
Name: riskLabel, dtype: int64

In [11]:
# Reproducible random half of the rows whose riskLabel is 7.
df.loc[df['riskLabel'] == 7].sample(frac=0.5, random_state=42)

Unnamed: 0,postcode,sector,easting,northing,localAuthority,altitude,soilType,riskLabel,medianPrice
3950,SL4 3JX,SL4 3,496347.0,175583.0,Windsor and Maidenhead,30,Arenosols,7,661200.0
2204,SW19 1DW,SW19 1,526567.0,170312.0,Merton,10,Unsurveyed/Urban,7,720900.0
20602,DA1 1BG,DA1 1,554142.0,174232.0,Dartford,10,Unsurveyed/Urban,7,19900.0
26439,SP6 1JY,SP6 1,414691.0,114632.0,New Forest,30,Luvisols,7,300900.0
13833,SW17 0DE,SW17 0,526974.0,171045.0,Wandsworth,10,Unsurveyed/Urban,7,595700.0
...,...,...,...,...,...,...,...,...,...
6971,OX49 5NQ,OX49 5,466595.0,195305.0,South Oxfordshire,90,Luvisols,7,647300.0
2443,ME20 7AJ,ME20 7,573882.0,158746.0,Maidstone,10,Luvisols,7,904300.0
38876,SM5 1HX,SM5 1,527811.0,166904.0,Sutton,20,Unsurveyed/Urban,7,334100.0
31436,SL1 5TF,SL1 5,494433.0,180160.0,Slough,30,Luvisols,7,662400.0


In [6]:
# Build a class-balanced subset of `df`: the raw data is dominated by
# riskLabel 1, so draw the same number of rows from every risk label.
SAMPLES_PER_CLASS = 100

# Iterate over the labels actually present (sorted) instead of assuming they
# are exactly 1..k, and request an absolute row count (n=...) rather than a
# fraction so the per-class size does not depend on float rounding.
# Each class is sampled once with a fixed seed; sample() raises ValueError if
# a label has fewer than SAMPLES_PER_CLASS rows.
weight_lst = [
    df[df['riskLabel'] == label].sample(n=SAMPLES_PER_CLASS, random_state=42)
    for label in sorted(df['riskLabel'].unique())
]

# Concatenate once instead of growing a DataFrame inside the loop.
# The original row index of `df` is preserved.
df_weight = pd.concat(weight_lst)

df_weight.riskLabel.value_counts()

1     100
2     100
3     100
4     100
5     100
6     100
7     100
8     100
9     100
10    100
Name: riskLabel, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split

# Features: everything in the balanced sample except the target, the price,
# and the postcode / sector / local-authority identifier columns.
X = df_weight.drop(
    ['riskLabel', 'medianPrice', 'sector', 'postcode', 'localAuthority'],
    axis=1,
)
# Target: the risk label.
y = df_weight.riskLabel

# 70 / 30 train-test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [8]:
from sklearn import set_config

# Show estimators/pipelines as diagrams when they are displayed in a cell.
set_config(display="diagram")

In [9]:
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_union
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Split the feature columns by dtype: non-numeric columns are one-hot encoded,
# numeric columns are standardised.
cat_cols = X_train.select_dtypes(exclude=np.number).columns
num_cols = X_train.select_dtypes(include=np.number).columns

# handle_unknown='ignore': the encoder only learns the categories present in
# X_train, so a category that appears only in X_test (or in new data) would
# otherwise make transform() raise. Ignored categories encode as all zeros.
cat_pipe = make_column_transformer((OneHotEncoder(handle_unknown='ignore'), cat_cols),
                                   remainder='drop')
num_pipe = make_column_transformer((StandardScaler(), num_cols),
                                   remainder='drop')

# Concatenate the encoded categorical block and the scaled numeric block
# (categorical columns first).
preproc = make_union(cat_pipe, num_pipe)

preproc

In [10]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformer to both splits so the test data does not influence the fit.
X_train_t = preproc.fit(X_train).transform(X_train)
X_test_t = preproc.transform(X_test)