In [93]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import SGDClassifier
from sklearn import svm

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Data Preprocessing

In [94]:
data = pd.read_csv("resources/postcodes_sampled.csv")

In [95]:
X = data[["easting","northing"]]
y = data['riskLabel']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42) # Holdout

In [96]:
features

['easting', 'northing', 'riskLabel']

In [97]:
features = ["easting","northing","riskLabel"]
data_train, data_test = train_test_split(data, test_size=.3)

data_train = pd.DataFrame(data_train, columns=features)
data_test = pd.DataFrame(data_test, columns=features)

In [110]:
data_cv = data_train.sample(frac=.05)

In [111]:
X_train = data_train.drop('riskLabel', axis=1)
X_test = data_test.drop('riskLabel', axis=1)
X_cv = data_cv.drop('riskLabel', axis=1)
y_train = data_train['riskLabel'].copy()
y_test = data_test['riskLabel'].copy()
y_cv = data_cv['riskLabel'].copy()

# SVC

In [None]:
clf = svm.SVC()
params = {'kernel':['linear','poly', 'rbf', 'sigmoid'],
         'gamma': ['scale','auto']}

# , 'poly', 'rbf', 'sigmoid', 'precomputed'

svc_gs = GridSearchCV(clf, params, scoring="precision", cv=5, n_jobs=-1)
svc_gs.fit(X_cv, y_cv)



In [103]:
svc_gs.best_params_

{'gamma': 'scale', 'kernel': 'linear'}

In [104]:
svc_gs.best_score_

nan

In [105]:
svc_gs.best_estimator_

SVC(kernel='linear')

In [77]:
clf = svm.SVC()
clf.fit(X_train, y_train)
cv_results = cross_validate(clf, X_train, y_train, scoring=['accuracy'])
pd.DataFrame(cv_results).mean()

fit_time         1.721393
score_time       0.688344
test_accuracy    0.947286
dtype: float64

In [85]:
y_pred = clf.predict(X_test)
accuracy_svc = clf.score(X_test, y_test)
# accuracy2 = accuracy_score(y_test, y_pred)
print(accuracy_svc)

0.9423333333333334


# Random Forest

In [79]:
forest_classifier = RandomForestClassifier()
forest_classifier.fit(X_train, y_train)
cv_results = cross_validate(forest_classifier, X_train, y_train, scoring=['accuracy'])

In [86]:
y_pred = forest_classifier.predict(X_test)
accuracy_rf = forest_classifier.score(X_test, y_test)
# accuracy2 = accuracy_score(y_test, y_pred)
print(accuracy_rf)

0.9438333333333333


# KNN classifier

In [81]:
neigh = KNeighborsClassifier(n_neighbors=50)
neigh.fit(X_train, y_train)
cv_results = cross_validate(neigh, X_train, y_train, scoring=['accuracy'])
pd.DataFrame(cv_results).mean()

fit_time         0.003917
score_time       0.055523
test_accuracy    0.947286
dtype: float64

In [87]:
y_pred = neigh.predict(X_test)
accuracy_neigh = neigh.score(X_test, y_test)
# accuracy2 = accuracy_score(y_test, y_pred)
print(accuracy_neigh)

0.9423333333333334


# SGD

In [None]:
svc_bis = SGDClassifier()
params = {'penalty':['l2', 'l1', 'elasticnet'],
         'alpha': [1/10,5/10, 8/10]}

# , 'poly', 'rbf', 'sigmoid', 'precomputed'

svc_gs = GridSearchCV(clf, params, scoring="precision", cv=5, n_jobs=-1)
svc_gs.fit(X_cv, y_cv)

In [83]:
svc_bis = SGDClassifier(loss='hinge', penalty='l1', alpha=1/10)
svc_bis.fit(X_train, y_train)
cv_results = cross_validate(svc_bis, X_train, y_train, scoring=['accuracy'])
pd.DataFrame(cv_results).mean()



fit_time         2.749445
score_time       0.001220
test_accuracy    0.933429
dtype: float64

In [88]:
y_pred = svc_bis.predict(X_test)
accuracy_svc_bis = svc_bis.score(X_test, y_test)
accuracy2 = accuracy_score(y_test, y_pred)
print(accuracy_svc_bis)

0.9328333333333333


In [91]:
print("rf", accuracy_rf)
print("svc", accuracy_svc)
print("knn", accuracy_neigh)
print("sgd", accuracy_svc_bis)

rf 0.9438333333333333
svc 0.9423333333333334
knn 0.9423333333333334
sgd 0.9328333333333333
