Importing libraries and data

In [34]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Load the lecture dataset; the 'Country' column becomes the row index
data = pd.read_csv(
    'Lecture6.csv',
    index_col='Country',
)

Display all columns and rows

In [None]:
# Lift pandas' display limits so tables are shown in full: no cap on the number of
# rows or columns, and a display width of 1000 characters.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = 1000


Visualizing the data

In [None]:
# Happiness against GDP per capita, raw (left) and log-transformed (right).
# Each scatterplot gets its own axes: two seaborn calls in one cell would otherwise
# draw on the same axes and superimpose the raw and the log-scaled points.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(14, 5))

# `data` is passed by keyword; older seaborn releases are understood to reject it positionally.
sns.scatterplot(data=data, y="Happiness", x="GDP_pc", hue='Regime', ax=ax_raw)
ax_raw.set_title("Happiness vs. GDP per capita")

# Log transform GDP_pc and plot again. Note that np.log takes the natural logarithm; i.e. base = Euler's number
data['GDP_log'] = np.log(data['GDP_pc'])
sns.scatterplot(data=data, y="Happiness", x="GDP_log", hue='Regime', ax=ax_log)
ax_log.set_title("Happiness vs. log GDP per capita")

fig.tight_layout()
plt.show()


Training K-NN classifier

In [None]:
# Define outcome (the regime label) and predictors
y = data['Regime']
x = data[['Happiness', 'GDP_log']]

# Decide k (no. of neighbours) using the simple rule of thumb k ~ sqrt(n).
# int() guarantees a plain integer: n_neighbors must be an integer, and depending on
# the NumPy version round() applied to a NumPy float may give back a float.
optimal_k = int(round(np.sqrt(len(data))))

# Create a kNN classifier and fit it to the data. If n_neighbors is not specified, the default value is 5
knn = KNeighborsClassifier(n_neighbors=optimal_k)
knn.fit(x, y)

# Predict the labels once, add them to the dataset and show them next to the actual labels.
# display() (provided by IPython/Jupyter) is needed because only the last expression of a
# cell is rendered automatically.
data['predicted'] = knn.predict(x)
display(data[['Regime', 'predicted']])

# Evaluate accuracy. Note: this is computed on the same data the model was fitted on
# (in-sample accuracy), so it is an optimistic estimate of out-of-sample performance.
knn.score(x, y)


Visualizing outcomes: confusion matrix

In [None]:
# Confusion matrix of raw counts: rows are the actual labels, columns the predicted labels
cnf_matrix = metrics.confusion_matrix(
    y_true=data['Regime'],
    y_pred=data['predicted'],
)
cnf_matrix


Visualizing outcomes: heatmap

In [None]:
# Tick labels must follow the row/column order of the confusion matrix.
# confusion_matrix orders the classes by sorted label value, whereas Series.unique()
# returns them in order of first appearance, so the labels are sorted explicitly.
labels = np.sort(data['Regime'].unique())

# One axes per heatmap; drawing both in the same cell without separate axes would
# stack the second heatmap (and its colour bar) on top of the first.
fig, (ax_counts, ax_share) = plt.subplots(1, 2, figsize=(16, 6))

# Heatmap with actual numbers (rows=actual labels, columns=predicted labels).
# fmt='d' prints the integer counts in full instead of the default two significant digits.
sns.heatmap(cnf_matrix, annot=True, fmt='d', cmap="Blues", yticklabels=labels,
            xticklabels=labels, annot_kws={"size": 15}, ax=ax_counts)
ax_counts.set_title('Counts')
ax_counts.set_ylabel('Actual label')
ax_counts.set_xlabel('Predicted label')

# Heatmap with percentages of all observations (rows=actual labels, columns=predicted labels)
sns.heatmap(cnf_matrix / cnf_matrix.sum(), annot=True, fmt='.2%', cmap="Blues", yticklabels=labels,
            xticklabels=labels, annot_kws={"size": 15}, ax=ax_share)
ax_share.set_title('Share of all observations')
ax_share.set_ylabel('Actual label')
ax_share.set_xlabel('Predicted label')

fig.tight_layout()
plt.show()

Visualizing outcomes: classification report

In [None]:
# Per-class precision, recall and F1-score of the predicted labels.
# classification_report is imported in the notebook's import cell at the top;
# the function returns a formatted string, hence print().
print(classification_report(data['Regime'], data['predicted']))


Visualizing outcomes: scatterplots

In [None]:
# Scatterplots of actual vs. predicted labels, side by side.
# Separate axes are required: two scatterplot calls in one cell would otherwise be
# drawn on top of each other and the second title would replace the first.
regime_order = sorted(data['Regime'].unique())   # same colour per regime in both panels
fig, (ax_actual, ax_predicted) = plt.subplots(1, 2, figsize=(16, 6), sharex=True, sharey=True)

sns.scatterplot(data=data, y='Happiness', x='GDP_log', hue='Regime',
                hue_order=regime_order, s=80, ax=ax_actual)
ax_actual.set_title("Actual labels")

sns.scatterplot(data=data, y='Happiness', x='GDP_log', hue='predicted',
                hue_order=regime_order, s=80, ax=ax_predicted)
ax_predicted.set_title("Predicted labels")

fig.tight_layout()
plt.show()


Extracting 'boundary' countries

In [None]:

# Countries on the Flawed/Hybrid boundary, i.e. those the classifier confuses between
# the two regimes. Both tables are passed to display() (provided by IPython/Jupyter)
# because a cell only renders its last expression automatically.
flawed_as_hybrid = data.loc[(data['Regime'] == 'Flawed') & (data['predicted'] == 'Hybrid')]
hybrid_as_flawed = data.loc[(data['Regime'] == 'Hybrid') & (data['predicted'] == 'Flawed')]
display(flawed_as_hybrid, hybrid_as_flawed)

SEMINAR 6: A MORE PRINCIPLED APPROACH TO DECIDE THE BEST K

In [None]:
# Candidate values of k and arrays to store the accuracy reached with each of them
neighbors = np.arange(1, 16)
accuracy = np.empty(len(neighbors))        # accuracy on the data the model was fitted on
cv_accuracy = np.empty(len(neighbors))     # mean accuracy over 5 cross-validation folds

# Loop over different values of k, fit model, and compute accuracy.
# Training accuracy alone cannot pick k: with k=1 every point is its own nearest
# neighbour, so the smallest k trivially looks best. Cross-validation scores each
# model on observations it was not fitted on.
# The loop uses its own variable so that the classifier fitted earlier in the
# notebook (knn) is not overwritten.
for i, k in enumerate(neighbors):
    candidate = KNeighborsClassifier(n_neighbors=k)
    cv_accuracy[i] = cross_val_score(candidate, x, y, cv=5).mean()
    candidate.fit(x, y)
    accuracy[i] = candidate.score(x, y)

# k with the highest cross-validated accuracy (the smallest such k in case of ties)
best_k = neighbors[np.argmax(cv_accuracy)]

# Generate plot
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, accuracy, label='Training accuracy')
plt.plot(neighbors, cv_accuracy, label='Cross-validated accuracy (5-fold)')
plt.axvline(best_k, color='grey', linestyle='--', label=f'Best k = {best_k}')
plt.xticks(neighbors)
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
