In [13]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import mode

In [14]:
# Calculating Euclidean distance
def distance(v1, v2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    v1, v2 : array-like of numbers
        Coordinates of the two points. Lists, tuples and NumPy arrays are
        accepted; their shapes must be equal or broadcastable.

    Returns
    -------
    numpy.float64
        Square root of the sum of squared coordinate differences.
    """
    # Cast to float before subtracting: plain lists do not support `-`, and
    # unsigned-integer arrays would silently wrap around on negative
    # differences (e.g. uint8 0 - 20 == 236), giving a wrong distance.
    diff = np.asarray(v1, dtype=float) - np.asarray(v2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

In [15]:
# Calculating KNN with majority vote method
def predict(x_train, y_train, x_test, k):
    """Classify each row of ``x_test`` by majority vote of its k nearest
    training points, using Euclidean distance.

    Parameters
    ----------
    x_train : array-like, shape (n_train, n_features)
        Training feature rows (NumPy array, DataFrame or nested list).
    y_train : array-like, shape (n_train,)
        Training labels, aligned by position with the rows of ``x_train``.
    x_test : array-like, shape (n_test, n_features)
        Rows to classify.
    k : int
        Number of neighbours that vote. Must be at least 1; a value larger
        than ``n_train`` simply lets every training point vote.

    Returns
    -------
    list
        One predicted label per row of ``x_test``. When several labels tie
        for the most votes, the smallest label wins.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, the training set is empty, or
        ``x_train`` and ``y_train`` have different lengths.
    """
    # Work on positional NumPy arrays so pandas inputs are not indexed by
    # label (y_train[index] on a Series would look up index *labels*).
    train_points = np.asarray(x_train, dtype=float)
    train_labels = np.asarray(y_train)
    test_points = np.asarray(x_test, dtype=float)

    if k < 1:
        raise ValueError("k must be at least 1")
    if len(train_points) == 0:
        raise ValueError("x_train must contain at least one row")
    if len(train_points) != len(train_labels):
        raise ValueError("x_train and y_train must have the same length")

    y_pred = []
    for x in test_points:
        # Distance from this test row to every training row in one shot
        distances = np.sqrt(np.sum((train_points - x) ** 2, axis=1))

        # Positions of the k closest training rows
        nearest = np.argsort(distances)[:k]

        # Majority vote. np.unique returns labels sorted ascending, so
        # argmax picks the smallest label among ties. This avoids
        # scipy.stats.mode, whose result for 1-D input is a scalar in newer
        # SciPy releases and cannot be indexed with [0].
        values, counts = np.unique(train_labels[nearest], return_counts=True)
        y_pred.append(values[np.argmax(counts)])

    return y_pred

In [25]:
def knn(x_train, x_test, y_train, y_test, k):
    """Run the hand-written KNN classifier and score it on the test split.

    Labels for ``x_test`` are produced by ``predict`` from ``x_train`` /
    ``y_train`` with ``k`` neighbours, then compared against ``y_test``
    with ``accuracy_score``.

    Returns
    -------
    tuple
        ``(y_pred, accuracy)``: the predicted labels and their accuracy.
    """
    predicted_labels = predict(x_train, y_train, x_test, k)
    return predicted_labels, accuracy_score(y_test, predicted_labels)

In [36]:
# Load the survey table; the first CSV column is used as the row index
data = pd.read_csv(
    "Data/data_AgeGroup.csv",
    index_col=[0],
)

data

Unnamed: 0,score,Influence,Modesty,Daring,Confidence,Ruling,Responsibility,IntrovExtro,Success,Humble,...,MakeBelieve,Leadership Origin,Biography,Public Opinion,Capability,Extraordinary,elapse,gender,age,Age_Group
0,18,0,0,0,1,0,0,0,0,1,...,0,0,0,0,1,1,211,1,50,5.0
1,6,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,149,1,40,4.0
2,27,1,0,0,0,1,1,1,1,1,...,1,1,1,0,1,1,168,1,28,3.0
3,29,1,1,0,1,1,1,1,1,0,...,1,1,0,0,1,0,230,1,37,4.0
4,19,1,0,0,0,1,1,0,1,1,...,0,1,0,0,0,1,361,1,27,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10413,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,243,1,24,2.0
10414,10,0,0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,197,1,39,4.0
10415,6,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,447,2,33,3.0
10416,12,0,0,1,0,0,1,0,1,0,...,1,0,0,0,0,0,167,1,24,2.0


### KNN Setup

In [37]:
# Work on a fixed random subset of 1000 rows to keep the hand-written KNN
# fast; seeding makes the same row positions come out on every run
random.seed(123)
num_list = random.sample(range(len(data)), 1000)

# Features are column positions 1-40; the two targets are picked by name.
# NOTE(review): the preview above shows 'Unnamed: 0' at position 0 and
# 'score' at position 1, so this slice appears to start at 'score' -
# confirm that is the intended feature set.
x_knn = data.iloc[num_list, 1:41]
y_gender = data['gender'].iloc[num_list]
y_ageg = data['Age_Group'].iloc[num_list]

# 50/50 train/test splits; both calls share random_state=1
# Gender target
x_train_knn1, x_test_knn1, y_train_knn1, y_test_knn1 = train_test_split(
    x_knn,
    y_gender,
    test_size=0.5,
    random_state=1,
)

# Age-group target
x_train_knn2, x_test_knn2, y_train_knn2, y_test_knn2 = train_test_split(
    x_knn,
    y_ageg,
    test_size=0.5,
    random_state=1,
)

#### Predicting Gender using KNN

In [39]:
# Gender: convert the train/test split to NumPy arrays, because the
# hand-written KNN indexes rows and labels by position.
# np.asarray is used instead of .to_numpy() so this cell can be re-run:
# an ndarray has no .to_numpy(), but np.asarray returns it as-is.

x_train_knn1 = np.asarray(x_train_knn1)
x_test_knn1 = np.asarray(x_test_knn1)
y_train_knn1 = np.asarray(y_train_knn1)
y_test_knn1 = np.asarray(y_test_knn1)

In [40]:
# James' code: score the hand-written KNN on gender for K = 1, 3, 5, 7, 9.
# After the loop, y_pred_knn1 and accuracy1 hold the values of the last K.

for k in range(1, 11, 2):
    y_pred_knn1, accuracy1 = knn(
        x_train=x_train_knn1,
        x_test=x_test_knn1,
        y_train=y_train_knn1,
        y_test=y_test_knn1,
        k=k,
    )
    print('For K =', k, ', Accuracy =', accuracy1)

For K = 1 , Accuracy = 0.584
For K = 3 , Accuracy = 0.546
For K = 5 , Accuracy = 0.568
For K = 7 , Accuracy = 0.59
For K = 9 , Accuracy = 0.626


In [21]:
# Library code: scikit-learn's KNN (K = 3) on the same gender split,
# as a baseline for the hand-written implementation

model = KNeighborsClassifier(n_neighbors=3).fit(x_train_knn1, y_train_knn1)
y_pred1 = model.predict(x_test_knn1)
print("Accuracy:", accuracy_score(y_test_knn1, y_pred1))

Accuracy: 0.57


In [48]:
# Confusion matrix for the hand-written KNN gender predictions
# (y_pred_knn1 comes from the last K tried in the K loop)

from sklearn.metrics import confusion_matrix
import plotly.express as px

conmat = confusion_matrix(y_test_knn1, y_pred_knn1)
display(conmat)

# Heat map of each cell as a percentage of all test samples.
# NOTE(review): the axis names are hard-coded and rely on the class order
# that confusion_matrix produces - confirm the three gender codes present
# map to Male, Female, Other in that order.
fig = px.imshow(
    np.round(conmat / conmat.sum() * 100, 3),
    x=["Male", "Female", "Other"],
    y=["Male", "Female", "Other"],
    labels={
        "x": "Predicted Values",
        "y": "True Values",
        "color": "Percentage Classified",
    },
)
fig.update_xaxes(side="top")
fig.show()

array([[206,  90,   0],
       [ 96, 107,   0],
       [  1,   0,   0]])

#### Predicting Age Group using KNN

In [49]:
# Age group: convert the train/test split to NumPy arrays, because the
# hand-written KNN indexes rows and labels by position.
# np.asarray is used instead of .to_numpy() so this cell can be re-run:
# an ndarray has no .to_numpy(), but np.asarray returns it as-is.

x_train_knn2 = np.asarray(x_train_knn2)
x_test_knn2 = np.asarray(x_test_knn2)
y_train_knn2 = np.asarray(y_train_knn2)
y_test_knn2 = np.asarray(y_test_knn2)

In [50]:
# James' code: score the hand-written KNN on age group for K = 1, 3, 5, 7, 9.
# After the loop, y_pred_knn2 and accuracy2 hold the values of the last K.

for k in range(1, 11, 2):
    y_pred_knn2, accuracy2 = knn(
        x_train=x_train_knn2,
        x_test=x_test_knn2,
        y_train=y_train_knn2,
        y_test=y_test_knn2,
        k=k,
    )
    print('For K =', k, ', Accuracy =', accuracy2)

For K = 1 , Accuracy = 0.22
For K = 3 , Accuracy = 0.236
For K = 5 , Accuracy = 0.268
For K = 7 , Accuracy = 0.256
For K = 9 , Accuracy = 0.256


In [51]:
# Library code: scikit-learn's KNN (K = 3) on the same age-group split,
# as a baseline for the hand-written implementation

model = KNeighborsClassifier(n_neighbors=3).fit(x_train_knn2, y_train_knn2)
y_pred2 = model.predict(x_test_knn2)
print("Accuracy:",accuracy_score(y_test_knn2, y_pred2))

Accuracy: 0.234


In [52]:
# Confusion matrix for the hand-written KNN age-group predictions
# (y_pred_knn2 comes from the last K tried in the K loop)

conmat = confusion_matrix(y_test_knn2, y_pred_knn2)
display(conmat)

# Heat map of each cell as a percentage of all test samples.
# NOTE(review): the axis names are hard-coded and rely on the class order
# that confusion_matrix produces - confirm the Age_Group codes present map
# to these seven bands in this order.
fig = px.imshow(
    np.round(conmat / conmat.sum() * 100, 3),
    x=["Under 18", "18-24", "25-34", "35-44", "45-54", "55-64", "65+"],
    y=["Under 18", "18-24", "25-34", "35-44", "45-54", "55-64", "65+"],
    labels={
        "x": "Predicted Values",
        "y": "True Values",
        "color": "Percentage Classified",
    },
)
fig.update_xaxes(side="top")
fig.show()

array([[ 0, 18,  6,  5,  1,  0,  0],
       [ 3, 56, 38, 22,  4,  1,  0],
       [ 1, 69, 40, 35,  8,  2,  0],
       [ 1, 33, 17, 28,  2,  2,  0],
       [ 2, 19, 22, 21,  3,  2,  0],
       [ 0,  9,  6,  9,  4,  1,  0],
       [ 0,  0,  3,  5,  2,  0,  0]])