In [1]:
import csv
import os
import networkx as nx
import numpy as np
from sklearn.linear_model import LogisticRegression
import community as community_louvain
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
import pickle
from sklearn.ensemble import VotingClassifier

import parent_modules
import preprocessor

%load_ext autoreload
%load_ext nb_black
%autoreload 2

from definitions import *

<IPython.core.display.Javascript object>

### Load the retweet network as a directed graph

In [2]:
G = nx.read_weighted_edgelist(
    os.path.join(DATA_DIR, "retweet_weighted.edgelist"),
    create_using=nx.DiGraph(),
    nodetype=int,
)

print("Number of nodes:", G.number_of_nodes())
print("Number of edges:", G.number_of_edges())

Number of nodes: 784898
Number of edges: 7401920


<IPython.core.display.Javascript object>

### Store the ID of the user that posted each message, and initialize for each user a 15-dimensional vector that will store the number of messages of each class posted by the user

In [3]:
posted_by = dict()
posts_per_class = dict()
users = list()
with open(os.path.join(DATA_DIR, "posts.tsv"), "r") as f:
    for line in f:
        t = line.split("\t")
        posted_by[int(t[0])] = int(t[1])
        posts_per_class[int(t[1])] = np.zeros(15)
        users.append(int(t[1]))
users = set(users)

<IPython.core.display.Javascript object>

### Read training data. Given a message posted by user A that belongs to class B, increase the number of posts of class B posted by user A by 1 

In [4]:
train_index = list()
y_train = list()
with open(os.path.join(DATA_DIR, "train.csv"), "r") as f:
    for line in f:
        t = line.split(",")
        train_index.append(int(t[0]))
        y_train.append(int(t[1]))
        posts_per_class[posted_by[int(t[0])]][int(t[1][:-1])] += 1

<IPython.core.display.Javascript object>

### Read test data

In [5]:
test_index = list()
with open(os.path.join(DATA_DIR, "test.csv"), "r") as f:
    for line in f:
        t = line.split(",")
        test_index.append(int(t[0]))

<IPython.core.display.Javascript object>

## Community Detection

In [6]:
G_un = G.copy().to_undirected()
for node in G:
    for ngbr in nx.neighbors(G, node):
        if node in nx.neighbors(G, ngbr):
            G_un.edges[node, ngbr]["weight"] = (
                G.edges[node, ngbr]["weight"] + G.edges[ngbr, node]["weight"]
            )

<IPython.core.display.Javascript object>

In [7]:
print("Number of edges:", G.number_of_edges())
print("Number of undirected edges:", G_un.number_of_edges())

Number of edges: 7401920
Number of undirected edges: 7383985


<IPython.core.display.Javascript object>

In [8]:
initial_partition = dict()
for user in G.nodes():
    initial_partition[user] = -1
    has_succ = False
    has_pred = False
    succ_posts_per_class = np.zeros((1, 15))
    pred_posts_per_class = np.zeros((1, 15))
    for successor in G.successors(user):
        if successor in posts_per_class:
            has_succ = True
            succ_posts_per_class[0,:15] += posts_per_class[successor]
    for predecessor in G.predecessors(user):
        if predecessor in posts_per_class:
            has_pred = True
            pred_posts_per_class[0,:15] += posts_per_class[predecessor]

    if user in posts_per_class:
        initial_partition[user] = posts_per_class[user].tolist().index(max(posts_per_class[user]))
    elif has_pred:
        maxValue = np.max(pred_posts_per_class)
        index_of_maximum = (np.where(pred_posts_per_class[0] == maxValue)[0])[0]
        initial_partition[user] = index_of_maximum+15
    elif has_succ:
        maxValue = np.max(succ_posts_per_class)
        index_of_maximum = (np.where(succ_posts_per_class[0] == maxValue)[0])[0]
        initial_partition[user] = index_of_maximum+30


<IPython.core.display.Javascript object>

In [9]:
for comm in set(initial_partition.values()):
    times = list(initial_partition.values()).count(comm)
    print("{} has occurred {} times".format(comm, times))

0 has occurred 6994 times
1 has occurred 458 times
2 has occurred 1855 times
3 has occurred 811 times
4 has occurred 829 times
5 has occurred 105 times
6 has occurred 78 times
7 has occurred 14 times
8 has occurred 75 times
9 has occurred 94 times
10 has occurred 416 times
11 has occurred 230 times
12 has occurred 246 times
13 has occurred 143 times
14 has occurred 292 times
15 has occurred 13780 times
16 has occurred 2293 times
17 has occurred 8314 times
18 has occurred 1070 times
19 has occurred 2743 times
20 has occurred 209 times
21 has occurred 383 times
22 has occurred 26 times
23 has occurred 104 times
24 has occurred 503 times
25 has occurred 1249 times
26 has occurred 795 times
27 has occurred 935 times
28 has occurred 211 times
29 has occurred 420 times
30 has occurred 354732 times
31 has occurred 71960 times
32 has occurred 132642 times
33 has occurred 101924 times
34 has occurred 22967 times
35 has occurred 3978 times
36 has occurred 4476 times
37 has occurred 264 times
38 

<IPython.core.display.Javascript object>

 ### Create the training matrix. Each row corresponds to a message. Use the following 15-dimensional vector of the user that posted the message and concatenate to that vector the following three features:<br/>(1) in-degree of user <br/> (2) out-degree of user <br/> (3) community user belongs to

In [10]:
X_train = np.zeros((len(train_index), 16))
for i, idx in enumerate(train_index):
    for successor in G.successors(posted_by[idx]):
        if successor in posts_per_class:
            X_train[i, :15] += posts_per_class[successor]

    for predecessor in G.predecessors(posted_by[idx]):
        if predecessor in posts_per_class:
            X_train[i, :15] += posts_per_class[predecessor]
    if np.sum(X_train[i, :15]) > 0:
        X_train[i, :15] /= np.sum(X_train[i, :15])

    X_train[i, 15] = G.degree(posted_by[idx])

<IPython.core.display.Javascript object>

### Create the test matrix. Each row corresponds to a message.Use the following 15-dimensional vector of the user that posted the message and concatenate to that vector the following three features: <br/>(1) in-degree of user<br/>(2) out-degree of user <br/> (3) community user belongs to

In [11]:
X_test = np.zeros((len(test_index), 16))
for i, idx in enumerate(test_index):
    for successor in G.successors(posted_by[idx]):
        if successor in posts_per_class:
            X_test[i, :15] += posts_per_class[successor]

    for predecessor in G.predecessors(posted_by[idx]):
        if predecessor in posts_per_class:
            X_test[i, :15] += posts_per_class[predecessor]

    if np.sum(X_test[i, :15]) > 0:
        X_test[i, :15] /= np.sum(X_test[i, :15])

    X_test[i, 15] = G.degree(posted_by[idx])

<IPython.core.display.Javascript object>

# skip

In [None]:
# compute the best partition
low_partition = community_louvain.best_partition(G_un, resolution=0.8)

In [None]:
low_res_train = np.zeros((len(train_index), 2))
for i,idx in enumerate(train_index):
    low_res_train[i,0] = idx
    low_res_train[i,1] = low_partition[posted_by[idx]]
    
low_res_test = np.zeros((len(test_index), 2))
for i,idx in enumerate(test_index):
    low_res_test[i,0] = idx
    low_res_test[i,1] = low_partition[posted_by[idx]]  
low_res = {"low_res_train":low_res_train,"low_res_test":low_res_test}

In [None]:
# compute the best partition
high_partition = community_louvain.best_partition(G_un,resolution = 3)

In [None]:
high_res_train = np.zeros((len(train_index), 2))
for i,idx in enumerate(train_index):
    high_res_train[i,0] = idx
    high_res_train[i,1] = high_partition[posted_by[idx]]
    
high_res_test = np.zeros((len(test_index), 2))
for i,idx in enumerate(test_index):
    high_res_test[i,0] = idx
    high_res_test[i,1] = high_partition[posted_by[idx]]  
high_res = {"high_res_train":high_res_train,"high_res_test":high_res_test}

In [None]:
# compute the best partition
initial_partition = community_louvain.best_partition(G_un,partition = initial_partition)

In [None]:
initial_train = np.zeros((len(train_index), 2))
for i,idx in enumerate(train_index):
    initial_train[i,0] = idx
    initial_train[i,1] = initial_partition[posted_by[idx]]
    
initial_test = np.zeros((len(test_index), 2))
for i,idx in enumerate(test_index):
    initial_test[i,0] = idx
    initial_test[i,1] = initial_partition[posted_by[idx]] 
initial = {"initial_train":initial_train,"initial_test":initial_test}

In [None]:
dict_to_export = {"low_res":low_res,"high_res":high_res,"initial":initial}

In [None]:
with open('communities.pickle', 'wb') as handle:
    pickle.dump(dict_to_export, handle, protocol=pickle.HIGHEST_PROTOCOL)

# stop skip

In [12]:
y_train_pred_lin = list()
y_pred_lin = list()

with open("text_train_predictions.csv", "r") as f:
    for line in f:
        t = line.split(",")
        t = [float(i) for i in t]
        y_train_pred_lin.append(t)

with open("text_predictions.csv", "r") as f:
    for line in f:
        t = line.split(",")
        t = [float(i) for i in t]
        y_pred_lin.append(t)

<IPython.core.display.Javascript object>

In [13]:
with (open("communities.pickle", "rb")) as openfile:
    dict_to_export = pickle.load(openfile)

low_res = dict_to_export["low_res"]
low_res_train = low_res["low_res_train"]
low_res_test = low_res["low_res_test"]

high_res = dict_to_export["high_res"]
high_res_train = high_res["high_res_train"]
high_res_test = high_res["high_res_test"]

initial = dict_to_export["initial"]
initial_train = initial["initial_train"]
initial_test = initial["initial_test"]

<IPython.core.display.Javascript object>

In [14]:
neighbour_weight = 0.2
prediction_weight = 1 - neighbour_weight

<IPython.core.display.Javascript object>

In [15]:
X_train_with_comm = np.zeros((len(train_index), 17))
for i, idx in enumerate(train_index):
    n = np.zeros(15)
    n += X_train[i, :15]
    n *= neighbour_weight
    X_train_with_comm[i, :15] += n

    p = np.zeros(15)
    p += y_train_pred_lin[i]
    p *= prediction_weight
    X_train_with_comm[i, :15] += p
    X_train_with_comm[i, :15] /= 2

    X_train_with_comm[i, 15] = low_res_train[i][1]
    X_train_with_comm[i, 16] = G.degree(posted_by[idx])

<IPython.core.display.Javascript object>

In [16]:
X_test_with_comm = np.zeros((len(test_index), 17))
for i, idx in enumerate(test_index):
    n = np.zeros(15)
    n += X_test[i, :15]
    n *= neighbour_weight
    X_test_with_comm[i, :15] += n

    p = np.zeros(15)
    p += y_pred_lin[i]
    p *= prediction_weight
    X_test_with_comm[i, :15] += p
    X_test_with_comm[i, :15] /= 2
    X_test_with_comm[i, 15] = low_res_train[i][1]
    X_test_with_comm[i, 16] = G.degree(posted_by[idx])

<IPython.core.display.Javascript object>

In [17]:
X_train_dev, X_test_dev, y_train_dev, y_test_dev = train_test_split(
    X_train_with_comm, y_train, test_size=0.2
)

<IPython.core.display.Javascript object>

## Voting ML Classifiers

In [27]:
total_entries = len(y_train)
class_counts = {i: len(list(filter(lambda x: x == i, y_train))) for i in range(15)}
class_weights = {
    cls: np.round((1 / class_counts[cls]) * total_entries / 2, 4) for cls in range(15)
}

<IPython.core.display.Javascript object>

In [20]:
clf_lin = LogisticRegression(
    solver="newton-cg",
    multi_class="multinomial",
    class_weight=class_weight,
    max_iter=10000,
)
clf_rd_frst = RandomForestClassifier(
    max_depth=150, criterion="entropy", class_weight=class_weight
)
models = list()
models.append(("clf_lin", clf_lin))
models.append(("clf_rd_frst", clf_rd_frst))
ensemble = VotingClassifier(estimators=models, voting="soft")

<IPython.core.display.Javascript object>

## Evalutation

In [21]:
dev_ensemble = ensemble.fit(X_train_dev, y_train_dev)
vot_pred_dev = ensemble.predict_proba(X_test_dev)
log = log_loss(y_test_dev, vot_pred_dev)
print("log loss lin:", log)

log loss lin: 0.08614925959452087


<IPython.core.display.Javascript object>

### Prediction

In [25]:
full_ensemble = ensemble.fit(X_train_with_comm, y_train)
vot_pred_test = full_ensemble.predict_proba(X_test_with_comm)

<IPython.core.display.Javascript object>

In [26]:
with open("voting_last_chance.csv", "w") as csvfile:
    writer = csv.writer(csvfile, delimiter=",")
    lst = ["id"]
    for i in range(15):
        lst.append("class_" + str(i))
    writer.writerow(lst)
    for i, idx in enumerate(test_index):
        lst = vot_pred_test[i, :].tolist()
        lst.insert(0, idx)
        writer.writerow(lst)

<IPython.core.display.Javascript object>