# KNN Notebook

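This notebook trains a k-nearest-neighbours (KNN) classifier that predicts each node's label (left or right) from a bag-of-hashtags vector built from its tweets. With a 70/30 split over nodes, the recorded run reaches roughly 83% accuracy on the held-out nodes.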

### Setup

In [1]:
import csv
import heapq
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

%load_ext autoreload
%autoreload 2

### Parse node labels

In [2]:
# Map each node id (column 0) to a binary label taken from column 1:
# 1 for 'left', 0 for 'right'. Rows with any other label value are left out.
label_codes = {'left': 1, 'right': 0}
node_label = {}
with open('data/icwsm_polarization/all.nodes') as nodes_file:
    for fields in csv.reader(nodes_file, delimiter='\t'):
        code = label_codes.get(fields[1])
        if code is not None:
            node_label[fields[0]] = code

### Parse Most Frequent Hashtags

In [6]:
# Count how often each token occurs across all rows of the edge list, then
# keep the N_TOP_HASHTAGS most frequent tokens as the feature vocabulary.
N_TOP_HASHTAGS = 200  # size of the bag-of-hashtags vocabulary

word_freq = {}
with open('data/icwsm_polarization/all.edgelist') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter='\t')
    for row in csv_reader:
        # every column from index 5 onwards is counted as one token
        # (presumably the hashtags of the tweet - confirm against the data format)
        for token in row[5:]:
            word_freq[token] = word_freq.get(token, 0) + 1

# Tokens ordered from most to least frequent. This order defines the feature
# columns of every bag-of-hashtags vector built from most_freq.
most_freq = heapq.nlargest(N_TOP_HASHTAGS, word_freq, key=word_freq.get)

### Build bag of hashtags

In [8]:
#build dictionary where every node has a list of (bag of hashtags, label) tuples
node_tweets = {}

#iterate over every row in all.edgelist
with open('data/icwsm_polarization/all.edgelist') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter='\t')
    for row in csv_reader:
        #if retweet, take label from B node (column 1), else (mention) take it from A node (column 0)
        node = row[1] if row[2] == 'retweet' else row[0]
        if node not in node_label:
            #node has no left/right label -> skip this row
            continue
        #construct bag of hashtags: one 0/1 entry per token of most_freq, in most_freq order
        #the row's tokens are put in a set once so each of the lookups below is O(1)
        row_tokens = set(row[5:])
        datapoint = [1 if token in row_tokens else 0 for token in most_freq]
        #add (data, label) tuple to node dictionary
        node_tweets.setdefault(node, []).append((datapoint, node_label[node]))

### Split data into train and test based on individual nodes

In [17]:
SPLIT_SEED = 42  # fixed seed so the node split is the same on every run
test_size = 0.3  # fraction of nodes held out for testing


def first_tweet_dataset(node_keys):
    """Build feature and label arrays from the first stored tweet of each node.

    Only the first (bag of hashtags, label) tuple of every node is used, so
    each node contributes exactly one sample.

    Args:
        node_keys: iterable of keys into node_tweets.

    Returns:
        (X, y) as numpy arrays with one entry per key, in node_keys order.
    """
    features = []
    labels = []
    for key in node_keys:
        datapoint, label = node_tweets[key][0]
        features.append(datapoint)
        labels.append(label)
    return np.asarray(features), np.asarray(labels)


# Split on nodes rather than on tweets, so no node appears in both sets.
keys = list(node_tweets.keys())
# A dedicated, seeded generator keeps the shuffle reproducible without
# touching the global random state.
random.Random(SPLIT_SEED).shuffle(keys)
n_train = int((1 - test_size) * len(keys))
train_keys = keys[:n_train]
test_keys = keys[n_train:]

X_train, y_train = first_tweet_dataset(train_keys)
X_test, y_test = first_tweet_dataset(test_keys)

### Create KNN Classifier and Run

In [18]:
# Fit a 5-nearest-neighbour classifier on the training samples
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict a label for every held-out sample
y_pred = knn.predict(X_test)

# Accuracy = fraction of held-out samples whose predicted label matches the true one
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8308106987352271
