# Label Propagation (No Network) Notebook

This notebook runs Label Propagation on per-node bag-of-hashtags features, assuming no existing network structure is available.

### Setup

In [1]:
import csv
import heapq
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.semi_supervised import LabelPropagation

%load_ext autoreload
%autoreload 2

### Parse node labels

In [9]:
# Numeric class codes: 1 for left, 2 for right.
# Nodes whose label column holds any other value are left out of node_label.
label_codes = {'left': 1, 'right': 2}

# Map node id (first column) -> numeric class code (from the second column).
node_label = {}
with open('data/icwsm_polarization/all.nodes') as nodes_file:
    for fields in csv.reader(nodes_file, delimiter='\t'):
        code = label_codes.get(fields[1])
        if code is not None:
            node_label[fields[0]] = code

### Parse Most Frequent Hashtags

In [10]:
# Tally how often each token from column index 5 onward (the hashtags, per
# this notebook's section headings) appears across all rows of the edge list.
word_freq = {}
line_count = 0  # number of edge-list rows read
with open('data/icwsm_polarization/all.edgelist') as edge_file:
    for fields in csv.reader(edge_file, delimiter='\t'):
        for hashtag in fields[5:]:
            word_freq[hashtag] = word_freq.get(hashtag, 0) + 1
        line_count += 1

# The 200 most frequent tokens, most frequent first; these define the
# feature columns used when the per-node vectors are built.
most_freq = heapq.nlargest(200, word_freq, key=word_freq.get)

### Build bag of hashtags

In [11]:
# Build a dictionary mapping every labeled node to its bag-of-hashtags vector.
# Entry i of a node's vector is the number of edge-list rows attributed to that
# node whose hashtag columns (index 5 onward) contain most_freq[i].

# Vector length follows most_freq instead of a separate hard-coded 200, so the
# two cells cannot drift apart (a larger top-N would otherwise index past the
# end of the vector).
n_features = len(most_freq)
# token -> column index, so each row costs one lookup per hashtag instead of
# one scan of the row per feature.
hashtag_index = {token: i for i, token in enumerate(most_freq)}

node_tweets = {}

# iterate over every row in all.edgelist
with open('data/icwsm_polarization/all.edgelist') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter='\t')
    for row in csv_reader:
        # A retweet is attributed to the B node (row[1]); any other edge
        # type (mention) is attributed to the A node (row[0]).
        node = row[1] if row[2] == 'retweet' else row[0]

        if node not in node_label:
            # node has no left/right label (label is '-'), skip the row
            continue
        if node not in node_tweets:
            node_tweets[node] = np.zeros(n_features)

        counts = node_tweets[node]
        # set(): a hashtag repeated within one row still counts once for that
        # row, matching a plain membership test on the row's hashtags.
        for token in set(row[5:]):
            i = hashtag_index.get(token)
            if i is not None:
                counts[i] += 1

### Split data into train and test based on individual nodes

In [12]:
# Split at the node level: every node's feature vector lands in exactly one of
# the train / test sets.
SPLIT_SEED = 42   # fixed seed so the split (and the reported accuracy) is reproducible
test_size = 0.3   # fraction of nodes held out for testing

keys = list(node_tweets.keys())
# A dedicated Random instance keeps the shuffle deterministic without
# touching the global `random` module state.
random.Random(SPLIT_SEED).shuffle(keys)
n_train = int((1 - test_size) * len(keys))
train_keys = keys[:n_train]
test_keys = keys[n_train:]


def build_xy(node_keys):
    """Stack feature vectors and labels for the given node ids.

    Parameters
    ----------
    node_keys : list
        Node ids present in both ``node_tweets`` and ``node_label``.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        Feature matrix with one row per node, and the matching label vector,
        both in the order of ``node_keys``.
    """
    features = np.asarray([node_tweets[key] for key in node_keys])
    targets = np.asarray([node_label[key] for key in node_keys])
    return features, targets


X_train, y_train = build_xy(train_keys)
X_test, y_test = build_xy(test_keys)

### Set random labels as unknown

In [13]:
# Hide a random subset of the training labels by replacing them with -1.
p = 0.3  # proportion of labels that are set as unobserved
rng = np.random.RandomState(42)  # fixed seed: the same labels are hidden on every run

# One uniform draw in [0, 1) per training label; draws below p mark labels to hide.
draws = rng.rand(len(y_train))
unlabel_points = draws < p

# New array (y_train is left untouched) with -1 wherever the mask is set.
labels = np.where(unlabel_points, -1, y_train)

In [14]:
# Sanity check: show the first 20 training labels; -1 marks a label that was hidden.
print(labels[:20])

[ 1  1  2  1 -1 -1 -1  2  1  1 -1  2  1 -1 -1 -1  2  1  1 -1]


### Create Prop Model and Test

In [16]:
# Create the propagation model. kernel='rbf' and gamma=20 are spelled out so
# the settings are visible here; they equal the values shown in the fitted
# model's repr when only max_iter was passed.
# NOTE(review): the recorded run emitted a warning at
# `self.label_distributions_ /= normalizer` during fit — possibly rows whose
# kernel weights sum to zero; confirm and consider a different kernel/gamma.
label_prop_model = LabelPropagation(
    kernel='rbf',
    gamma=20,
    max_iter=2000,
)

# Fit on the training features with the partially hidden labels.
# Kept as the cell's last expression so the fitted estimator is displayed.
label_prop_model.fit(X_train, labels)

  self.label_distributions_ /= normalizer


LabelPropagation(gamma=20, kernel='rbf', max_iter=2000, n_jobs=None,
                 n_neighbors=7, tol=0.001)

In [17]:
# Accuracy on the held-out nodes: fraction of test labels the fitted model
# predicts correctly.
acc = metrics.accuracy_score(y_test, label_prop_model.predict(X_test))
print(f'Accuracy: {acc}')

Accuracy: 0.6162139747045408
