In [104]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import mode

In [105]:
# Calculating Euclidean distance
def distance(v1, v2):
    temp = np.sum((v1-v2)**2)
    dist = np.sqrt(temp)
    return dist

In [106]:
# Calculating KNN with majority vote method
def predict(x_train, y_train , x_test, k):
    # Array to store predicted classification
    y_pred = []
     
    # Loop through the Datapoints to be classified
    for x in x_test:
        # Array to store distances
        distances = []
         
        # Checking distance against each other point
        for i in range(len(x_train)): 
            # Calculating distance between points
            temp = distance(np.array(x_train[i]) , x) 
            distances.append(temp)
        
        # Convering to numpy array
        distances = np.array(distances) 
         
        # Sorting and keeping the K nearest datapoints
        index = np.argsort(distances)[:k] 
        labels = y_train[index] 
        
        # Majority voting method
        pred = mode(labels) 
        pred = pred.mode[0]
        y_pred.append(pred)
 
    return y_pred

In [107]:
def knn(x_train, x_test, y_train, y_test, k):
    # Predicting classification
    y_pred = predict(x_train, y_train, x_test , k)
    
    #Checking the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [108]:
data = pd.read_csv("Data/data_clean.csv", index_col = [0])

data.loc[(data['age'] > 14) & (data['age'] <= 24), 'Age_Group'] = 1 #'Youth (15-24)'
data.loc[(data['age'] > 25) & (data['age'] <= 64), 'Age_Group'] = 2 #'Adults (25-64)'
data.loc[(data['age'] >= 65), 'Age_Group'] = 3 # 'Seniors (65+)'

# Find NAs and fix
data = data.dropna()
data['Age_Group'] = data['Age_Group'].astype('int')

Unnamed: 0,score,Influence,Modesty,Daring,Confidence,Ruling,Responsibility,IntrovExtro,Success,Humble,...,MakeBelieve,Leadership Origin,Biography,Public Opinion,Capability,Extraordinary,elapse,gender,age,Age_Group
0,18,2,2,2,2,1,2,1,2,2,...,1,2,2,2,1,2,211,1,50,2
1,6,2,2,2,1,2,2,1,2,1,...,1,2,2,2,2,1,149,1,40,2
2,27,1,2,2,1,2,1,2,1,2,...,2,1,1,2,1,2,168,1,28,2
3,29,1,1,2,2,2,1,2,1,1,...,2,1,2,2,1,1,230,1,37,2
4,6,1,2,1,1,1,2,1,2,1,...,1,2,2,2,0,1,389,1,50,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11238,1,2,2,2,1,1,2,1,2,1,...,1,2,2,2,2,1,243,1,24,1
11239,10,2,2,1,1,1,2,1,1,1,...,1,2,1,2,2,1,197,1,39,2
11240,6,1,2,2,1,1,2,1,2,1,...,1,2,2,2,2,1,447,2,33,2
11241,12,2,2,1,1,1,1,1,1,1,...,2,2,2,2,2,1,167,1,24,1


### KNN Setup

In [109]:
x_knn = data.iloc[:,1:41]
y_gender = data.iloc[:,42]
y_ageg = data.iloc[:,44]

# Choosing 1000 random samples because I don't want my poor computer to explode
random.seed(123)
num_list = random.sample(range(10714), 1000)

x_knn = x_knn.iloc[num_list,:]
y_gender = y_gender.iloc[num_list]
y_ageg = y_ageg.iloc[num_list]

# Split data
# Gender group
x_train_knn1, x_test_knn1, y_train_knn1, y_test_knn1 = train_test_split(
    x_knn, y_gender, test_size=0.5, random_state=10)

# Age group
x_train_knn2, x_test_knn2, y_train_knn2, y_test_knn2 = train_test_split(
    x_knn, y_ageg, test_size=0.5, random_state=10)

#### Predicting Gender using KNN

In [110]:
# Applying our function for different K values
# Gender 

x_train_knn1 = x_train_knn1.to_numpy()
x_test_knn1 = x_test_knn1.to_numpy()
y_train_knn1 = y_train_knn1.to_numpy()
y_test_knn1 = y_test_knn1.to_numpy()

In [111]:
# James' code

for k in range(1,11,2):
    accuracy = knn(x_train_knn1, x_test_knn1, y_train_knn1, y_test_knn1, k)
    print('For K =', k, ', Accuracy =', accuracy)

For K = 1 , Accuracy = 0.544
For K = 3 , Accuracy = 0.538
For K = 5 , Accuracy = 0.554
For K = 7 , Accuracy = 0.536
For K = 9 , Accuracy = 0.548


In [112]:
# Library code

model = KNeighborsClassifier(n_neighbors=3)
model.fit(x_train_knn1, y_train_knn1)
y_pred1 = model.predict(x_test_knn1)
print("Accuracy:", accuracy_score(y_test_knn1, y_pred1))

Accuracy: 0.546


#### Predicting age group using KNN

In [113]:
# Applying our function for different K values
# Age group

x_train_knn2 = x_train_knn2.to_numpy()
x_test_knn2 = x_test_knn2.to_numpy()
y_train_knn2 = y_train_knn2.to_numpy()
y_test_knn2 = y_test_knn2.to_numpy()

In [114]:
# James' code

for k in range(1,11,2):
    accuracy = knn(x_train_knn2, x_test_knn2, y_train_knn2, y_test_knn2, k)
    print('For K =', k, ', Accuracy =', accuracy)

For K = 1 , Accuracy = 0.588
For K = 3 , Accuracy = 0.64
For K = 5 , Accuracy = 0.668
For K = 7 , Accuracy = 0.676
For K = 9 , Accuracy = 0.674


In [115]:
# Library code

model = KNeighborsClassifier(n_neighbors=3)
model.fit(x_train_knn2, y_train_knn2)
y_pred2 = model.predict(x_test_knn2)
print("Accuracy:",accuracy_score(y_test_knn2, y_pred2))

Accuracy: 0.628
