In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import Pipeline

df = pd.read_csv('postcodes_sampled.csv') # read the data from a csv file
y = df.riskLabel # define you target as the column you want to predit
X = df.drop(columns=['postcode', 'sector', 'localAuthority', 'riskLabel', 'medianPrice']) # this if you feature column

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42) #Train_test_split

preproc = ColumnTransformer([
    ('num_transformer', MinMaxScaler(), X_train.select_dtypes(include=np.number).columns),
    ('cat_transformer', OneHotEncoder(sparse=False), X_train.select_dtypes(exclude=np.number).columns) 
]) # Column transformer performing pipeline on numerical and categorical data


X_train = preproc.fit_transform(X_train) # fit & transform 

svc = SVC(C=0.03, kernel='rbf')
svc_reg = SVR()

knn = KNeighborsClassifier()
knn_reg = KNeighborsRegressor()

tree = DecisionTreeClassifier()
tree_reg = DecisionTreeRegressor()

svc.fit(X_train, y_train)
knn.fit(X_train, y_train)
tree.fit(X_train, y_train)

X_test = preproc.transform(X_test)

y_predsvc = svc.predict(X_test)
y_predtree = tree.predict(X_test)
y_predknn = knn.predict(X_test)

print(f'accuracy of SVC {accuracy_score(y_test, y_predsvc)}')
print(f'accuracy of Decision Tree {accuracy_score(y_test, y_predtree)}')
print(f'accuracy of KNN {accuracy_score(y_test, y_predknn)}')


accuracy of SVC 0.9048
accuracy of Decision Tree 0.9019
accuracy of KNN 0.9264


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, max_error
import math

In [3]:
df = pd.read_csv('postcodes_sampled.csv') # read the data from a csv file
y = df.riskLabel # define you target as the column you want to predit
X = df.drop(columns=['postcode', 'sector', 'localAuthority', 'riskLabel', 'medianPrice']) # this if you feature column

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42) #Train_test_split


In [4]:
preproc = ColumnTransformer([
    ('num_transformer', MinMaxScaler(), X_train.select_dtypes(include=np.number).columns),
    ('cat_transformer', OneHotEncoder(sparse=False), X_train.select_dtypes(exclude=np.number).columns) 
]) # Column transformer performing pipeline on numerical and categorical data

In [5]:
final_risklabel_pipeline_regression = Pipeline([
    ('preprocessing', preproc),
    ('classifier', LinearRegression())
])
final_risklabel_pipeline_regression

In [6]:
# Train pipeline
final_pipe_trained = final_risklabel_pipeline_regression.fit(X_train,y_train)

In [7]:
# Make predictions
y_pred = final_pipe_trained.predict(X_test)
pd.DataFrame(y_pred.astype(int)).value_counts()

1    8856
0     846
2     291
3       7
dtype: int64

In [8]:
# Score model
final_pipe_trained.score(X_test, y_test)

0.06104264040462126

In [9]:
y_test.value_counts()

1     9048
4      359
6      237
8      104
5       69
7       59
10      35
3       32
2       31
9       26
Name: riskLabel, dtype: int64

In [10]:
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
rsquared = r2_score(y_test, y_pred)

print('MSE =', round(mse, 2))
print('RMSE =', round(rmse, 2))
print('MAE =', round(mae, 2))
print('R2 =', round(rsquared, 2))

MSE = 1.91
RMSE = 1.38
MAE = 0.75
R2 = 0.06


In [11]:
#knn_reg.fit(X_train,y_train)
#tree_reg.fit(X_train,y_train)
#svc_reg.fit(X_train,y_train)

In [12]:
#knn_reg.score(X_test,y_test)
#tree_reg.score(X_test,y_test)
#svc_reg.score(X_test,y_test)

In [13]:
#y_predsvc_reg = svc.predict(X_test)
#y_predtree_reg = tree.predict(X_test)
#y_predknn_reg = knn.predict(X_test)

In [14]:
#print(f'Regression of knn {knn_reg.score(X_test,y_test)}')
#print(f'Regression of Dec_tree {tree_reg.score(X_test,y_test)}')
#print(f'Regression of SVC {svc_reg.score(X_test,y_test)}')