## Imports

### Imports

In [79]:
import random, gensim, nltk, re, json, os, torch, time, pickle
import numpy as np
import pandas as pd
import networkx as nx
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import scipy.sparse as sp
import plotly.graph_objects as go
from pandarallel import pandarallel
from math import exp
from multiprocessing import Process
from multiprocessing import Manager
from statistics import mode, mean, stdev
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from typing import Dict, Tuple, Sequence, Any, List, Set, Union, Collection, Optional
from sklearn import metrics
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, MeanShift, DBSCAN
from sklearn.metrics import recall_score, f1_score, precision_score, accuracy_score, roc_auc_score, average_precision_score
from collections import defaultdict, Counter

## Defined Functions

### PID

In [80]:
class PIDControl():
    """PI controller producing a weight in [0, 1] for a KL loss term.

    The controller keeps its state (previous integral term, previous raw
    output and previous error) between calls, so a single instance must be
    created once and reused for every step.
    """

    def __init__(self, Kp=0.001, Ki=-0.001):
        """Create the controller; define it outside of the training loop.

        Args:
            Kp: proportional gain.
            Ki: integral gain.
        """
        self.I_k1 = 0.0  # integral term of the previous step
        self.W_k1 = 0.0  # raw (unclamped) output of the previous step
        self.e_k1 = 0.0  # error of the previous step (recorded only)
        self.Kp = Kp
        self.Ki = Ki

    def _Kp_fun(self, Err, scale=1):
        """Return 1 / (1 + scale * e^Err), a decreasing sigmoid of the error."""
        try:
            return 1.0 / (1.0 + float(scale) * exp(Err))
        except OverflowError:
            # e^Err is too large for a float: the expression tends to 0.
            return 0.0

    def pid(self, exp_KL, KL_loss):
        """Run one step of the position-form PI algorithm.

        Args:
            exp_KL: desired (set-point) KL value.
            KL_loss: KL value observed at the current step.

        Returns:
            The weight for the KL loss (beta), clamped to [0, 1].
        """
        error_k = exp_KL - KL_loss
        # Compute the proportional and integral contributions.
        Pk = self.Kp * self._Kp_fun(error_k)
        Ik = self.I_k1 + self.Ki * error_k

        # Anti-windup. The previous test (`W < 0 and W >= 1`) could never be
        # true, so the integral term grew without bound while the output was
        # clamped. Integration is now skipped only when the previous raw
        # output was saturated AND this update would push it further into
        # saturation; updates that move back towards [0, 1] are always kept,
        # so the controller cannot get stuck.
        winding_down = self.W_k1 < 0 and Ik < self.I_k1
        winding_up = self.W_k1 >= 1 and Ik > self.I_k1
        if winding_down or winding_up:
            Ik = self.I_k1

        Wk = Pk + Ik
        self.W_k1 = Wk
        self.I_k1 = Ik
        self.e_k1 = error_k

        # Clamp the returned weight to [0, 1].
        if Wk >= 1:
            Wk = 1.0
        if Wk < 0:
            Wk = 0.0

        return Wk

### Discriminator

In [81]:
def kaiming_init(m):
    """Initialise module `m` in place with the Kaiming-normal scheme.

    Linear / Conv2d layers get Kaiming-normal weights, BatchNorm1d /
    BatchNorm2d layers get unit weights; in both cases an existing bias is
    zeroed. Any other module is left untouched.
    """
    is_weight_layer = isinstance(m, (nn.Linear, nn.Conv2d))
    is_batch_norm = isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d))
    if not (is_weight_layer or is_batch_norm):
        return

    if is_weight_layer:
        init.kaiming_normal_(m.weight)
    else:
        m.weight.data.fill_(1)

    # Both supported layer families share the same bias treatment.
    if m.bias is not None:
        m.bias.data.fill_(0)

In [82]:
def normal_init(m):
    """Initialise module `m` in place with N(0, 0.02) weights.

    Linear / Conv2d layers get normally distributed weights (mean 0,
    std 0.02), BatchNorm1d / BatchNorm2d layers get unit weights; in both
    cases an existing bias is zeroed. Any other module is left untouched.
    """
    is_weight_layer = isinstance(m, (nn.Linear, nn.Conv2d))
    is_batch_norm = isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d))
    if not (is_weight_layer or is_batch_norm):
        return

    if is_weight_layer:
        init.normal_(m.weight, 0, 0.02)
    else:
        m.weight.data.fill_(1)

    # Both supported layer families share the same bias treatment.
    if m.bias is not None:
        m.bias.data.fill_(0)

In [83]:
class Discriminator(nn.Module):
    """Three-layer MLP mapping a latent vector of size `z_dim` to 2 logits."""

    def __init__(self, z_dim):
        """Build the network and initialise its weights (default 'normal' mode).

        Args:
            z_dim: size of the input latent vector.
        """
        super(Discriminator, self).__init__()
        self.z_dim = z_dim
        self.net = nn.Sequential(
            nn.Linear(z_dim, 1000),
            nn.LeakyReLU(0.2, True),
            nn.Linear(1000, 1000),
            nn.LeakyReLU(0.2, True),
            nn.Linear(1000, 2),
        )
        self.weight_init()

    def weight_init(self, mode='normal'):
        """(Re-)initialise every sub-module.

        Args:
            mode: 'kaiming' or 'normal'.

        Raises:
            ValueError: if `mode` is not one of the supported names
                (previously this surfaced as an UnboundLocalError).
        """
        if mode == 'kaiming':
            initializer = kaiming_init
        elif mode == 'normal':
            initializer = normal_init
        else:
            raise ValueError(
                "Unknown weight init mode {!r}; expected 'kaiming' or 'normal'".format(mode))

        # modules() walks the whole tree, so this keeps working if a
        # non-iterable sub-module is ever added next to `net`. The
        # initialisers ignore module types they do not handle.
        for m in self.modules():
            initializer(m)

    def forward(self, z):
        """Return the logits for `z` with all size-1 dimensions squeezed out.

        NOTE(review): squeeze() without a dim also drops the batch dimension
        when the batch has a single row — confirm callers never pass that.
        """
        return self.net(z).squeeze()

### Model

In [84]:
def glorot_init(input_dim, output_dim):
    """Return an (input_dim, output_dim) Parameter drawn with Glorot-uniform bounds.

    Values are uniform in [-b, b) with b = sqrt(6 / (input_dim + output_dim)).
    """
    bound = np.sqrt(6.0 / (input_dim + output_dim))
    unit_uniform = torch.rand(input_dim, output_dim)
    # Rescale U[0, 1) to U[-bound, bound).
    weights = unit_uniform * 2 * bound - bound
    return nn.Parameter(weights)

In [85]:
class GraphConvSparse(nn.Module):
    """Single graph-convolution layer computing activation(adj @ (x @ W))."""

    def __init__(self, input_dim, output_dim, adj, activation=F.relu, **kwargs):
        """Create the layer.

        Args:
            input_dim: number of input features per node.
            output_dim: number of output features per node.
            adj: adjacency matrix multiplied on the left in forward().
            activation: callable applied to the aggregated features.
            **kwargs: forwarded to nn.Module.__init__.
        """
        super().__init__(**kwargs)
        self.weight = glorot_init(input_dim, output_dim)
        self.adj = adj
        self.activation = activation

    def forward(self, inputs):
        """Project the node features, aggregate over `adj`, then activate."""
        projected = torch.mm(inputs, self.weight)
        aggregated = torch.mm(self.adj, projected)
        return self.activation(aggregated)

In [86]:
class InfoVGAE(nn.Module):
    """Variational graph auto-encoder over a joint user/assertion graph.

    A shared GCN layer feeds two linear GCN heads that produce the mean and
    the log-std of the latent Gaussian; the decoder is the sigmoid of the
    inner product of the sampled latent codes.
    """

    def __init__(self, args, adj):
        """Build the encoder layers and the user / assertion row masks.

        Args:
            args: configuration object; reads input_dim, hidden1_dim,
                hidden2_dim, num_user, num_assertion and use_cuda.
            adj: adjacency matrix handed to every GCN layer.
        """
        super().__init__()
        self.input_dim = args.input_dim
        self.adj_matrix = adj
        self.hidden1_dim = args.hidden1_dim
        self.hidden2_dim = args.hidden2_dim
        # Layer creation order is kept: it fixes the order of random draws.
        self.base_gcn = GraphConvSparse(self.input_dim, self.hidden1_dim, adj, activation=F.relu)
        self.gcn_mean = GraphConvSparse(self.hidden1_dim, self.hidden2_dim, adj, activation=lambda x: x)
        self.gcn_logstddev = GraphConvSparse(self.hidden1_dim, self.hidden2_dim, adj, activation=lambda x: x)
        self.args = args

        assert args.num_user is not None
        assert args.num_assertion is not None
        self.num_user = args.num_user
        self.num_assertion = args.num_assertion

        # Row masks: users occupy the first num_user rows, assertions the rest.
        mask_shape = (self.num_user + self.num_assertion, self.hidden2_dim)
        user_mask = torch.zeros(mask_shape)
        user_mask[:self.num_user, :] = 1.0
        asser_mask = torch.zeros(mask_shape)
        asser_mask[self.num_user:, :] = 1.0
        if self.args.use_cuda:
            user_mask = user_mask.cuda()
            asser_mask = asser_mask.cuda()
        self.user_nodes_mask = user_mask
        self.asser_nodes_mask = asser_mask

    def _sample_latent(self, x):
        """Run the encoder, store mean / logstd on self, return mean + noise * std."""
        hidden = self.base_gcn(x)
        self.mean = self.gcn_mean(hidden)
        self.logstd = self.gcn_logstddev(hidden)
        noise = torch.randn(x.size(0), self.hidden2_dim)
        if self.args.use_cuda:
            noise = noise.cuda()
        return noise * torch.exp(self.logstd) + self.mean

    def encode(self, x):
        """Sample a latent code and rectify it (non-negative latent space)."""
        return F.relu(self._sample_latent(x))

    def encode_normal(self, x):
        """Sample a latent code without the ReLU rectification."""
        return self._sample_latent(x)

    def decode(self, z):
        """Reconstruct edge probabilities as sigmoid(z @ z^T)."""
        return torch.sigmoid(torch.matmul(z, z.t()))

    def forward(self, x):
        """Encode `x` with the rectified sampler and decode the result."""
        return self.decode(self.encode(x))

### Feature Builder

In [87]:
class FeatureBuilderBase():
    """Shared helpers that index users / tweets and count user-tweet pairs.

    `data` arguments are expected to be DataFrames with `name` and
    `postTweet` columns (these are the only columns accessed here).
    """

    def __init__(self):
        self.module_name = "FeatureBuilderBase"

    def initialize(self, params):
        """Copy every (key, value) pair of `params` onto the instance.

        Args:
            params: a mapping (e.g. dict) or an iterable of (key, value)
                pairs. Iterating a dict directly yields only its keys, so
                mappings are read through `.items()`.
        """
        pairs = params.items() if hasattr(params, "items") else params
        for key, value in pairs:
            setattr(self, key, value)

    # user-index map: user_name --> i
    def get_user2index(self, data):
        """Map each distinct user name to its order of first appearance."""
        return {user: i for i, user in enumerate(data.name.unique())}

    # tweet-index map: tweet_text --> j
    def get_tweet2index(self, data):
        """Map each distinct processed tweet text to its order of first appearance."""
        return {tweet: i for i, tweet in enumerate(data.postTweet.unique())}

    # construct user-tweet matrix: userTweet[i, j] = cnt
    def get_bimatrix(self, user2index, tweet2index, data):
        """Return the dense count matrix userTweet[i, j] = times user i posted tweet j."""
        userTweet = np.zeros((len(user2index), len(tweet2index)))
        # Counting does not depend on row order, so rows are read as stored.
        for user, tweet in data[['name', 'postTweet']].values:
            userTweet[user2index[user], tweet2index[tweet]] += 1
        return userTweet

In [88]:
class MFFeatureBuilder(FeatureBuilderBase):
    """Builds user-by-tweet feature matrices from keyword counts.

    Two modes are supported:
      * "multiply": product of the L2-normalised user-keyword and
        tweet-keyword count matrices.
      * "m_smooth": the user-tweet count matrix smoothed with a Gaussian
        kernel over tweet-keyword vectors, computed in `num_process`
        worker processes.

    The keyword matrices are built with `Series.parallel_apply`, so
    pandarallel must have been initialised before an instance is created.
    """

    # Kernel width passed to phi() by interpolation().
    SMOOTHING_R = 2

    def __init__(self, processed_data, mode, num_process=40):
        """Validate `mode`, then build the index maps and keyword matrices.

        Args:
            processed_data: DataFrame with `name` and `postTweet` columns.
            mode: "multiply" or "m_smooth".
            num_process: number of worker processes used by "m_smooth".

        Raises:
            NotImplementedError: if `mode` is not supported.
        """
        # The two-argument form is required: super(MFFeatureBuilder) alone is
        # an unbound super object and never ran the base initialiser.
        super(MFFeatureBuilder, self).__init__()

        # Validate before the expensive matrix construction below.
        self.supporting_modes = ["multiply", "m_smooth"]
        if mode not in self.supporting_modes:
            raise NotImplementedError("Only support modes: {}".format(self.supporting_modes))

        self.data = processed_data
        self.mode = mode
        self.num_process = num_process
        self.build_key_matrix(self.data)

    # construct keyword list {keywords}
    def get_keywordList(self, data):
        """Return the set of all whitespace-separated tokens in `data.postTweet`."""
        keywords = set()
        for tweet in data.postTweet:
            keywords.update(tweet.split())
        return keywords

    # get user-key matrix: user --> keywords count
    def get_user2keywords(self, data, keyword_list, user):
        """Return, for one user, the count of every keyword (in `keyword_list` order)."""
        counts = Counter()
        for tweet in data[data.name == user].postTweet:
            counts.update(tweet.split())
        # Counter returns 0 for keywords the user never used.
        return [counts[word] for word in keyword_list]

    # construct user-keyword Matrix: for every user
    def get_users2keywords(self, user2index, data, keyword_list):
        """Return the (num_users, num_keywords) count matrix, rows in `user2index` order."""
        userKey = pd.DataFrame(list(user2index.keys()), columns=['name'])
        userKey['dist'] = userKey.name.parallel_apply(lambda x: self.get_user2keywords(data, keyword_list, x))
        return np.array(userKey.dist.values.tolist())

    # get tweet-key matrix: tweet --> keywords count
    def get_tweet2keywords(self, keyword_list, tweet):
        """Return, for one tweet, the count of every keyword (in `keyword_list` order)."""
        counts = Counter(tweet.split())
        return [counts[word] for word in keyword_list]

    # construct tweet-keyword Matrix: for every tweet
    def get_tweets2keywords(self, tweet2index, keyword_list):
        """Return the (num_tweets, num_keywords) count matrix, rows in `tweet2index` order."""
        tweets2keywords = pd.DataFrame(list(tweet2index.keys()), columns=['tweet'])
        tweets2keywords['dist'] = tweets2keywords.tweet.parallel_apply(lambda x: self.get_tweet2keywords(keyword_list, x))
        return np.array(tweets2keywords.dist.values.tolist())

    # interpolation function
    def phi(self, nz_index, tweet2index, index, r):
        """Smoothed weight of tweet `index` for a user who posted tweets `nz_index`.

        Args:
            nz_index: indices of the tweets the user actually posted.
            tweet2index: tweet-keyword matrix, one row per tweet.
            index: index of the tweet being scored.
            r: Gaussian kernel width.

        Returns:
            1 if the user posted the tweet; otherwise a quarter of the summed
            kernel similarity to the posted tweets, or 0 when that is below 0.2.
        """
        posted = np.asarray(nz_index, dtype=int)
        if np.any(posted == index):
            return 1
        vectors = np.asarray(tweet2index)
        diffs = vectors[posted] - vectors[index]
        s = np.exp(-r * (diffs ** 2).sum(axis=1)).sum() / 4
        if s < 0.2:
            return 0
        return s

    # son-process function
    def interpolation(self, result, bimatrix, tweets2keywords, k, num_process):
        """Worker body: fill `result[i]` for every user row i with i % num_process == k.

        Each row is evaluated with an explicit loop into a float array. The
        previous np.vectorize call excluded a non-existent parameter name
        ('tweetKey'), so the tweet matrix was broadcast element-wise instead
        of being passed whole, and the output dtype was inferred from the
        first element, which could truncate fractional weights.
        """
        num_tweets = bimatrix.shape[1]
        for i in range(bimatrix.shape[0]):
            if i % num_process != k:
                continue
            print('process', k, '{} / {}'.format(i, bimatrix.shape[0]))
            nz_index = np.where(bimatrix[i, :] > 0)[0]
            result[i] = np.array(
                [self.phi(nz_index, tweets2keywords, j, self.SMOOTHING_R) for j in range(num_tweets)],
                dtype=float)

    # Message Similarity (M-Module): Second Step
    def Mmodule(self, bimatrix, tweets2keywords, num_process):
        """Smooth every row of `bimatrix` using `num_process` worker processes.

        Returns:
            Array with one smoothed row per user, ordered by user index.
        """
        manager = Manager()
        result = manager.dict()

        workers = [
            Process(target=self.interpolation,
                    args=(result, bimatrix, tweets2keywords, k, num_process))
            for k in range(num_process)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        # Copy out of the managed dict once, then order rows by user index.
        rows = dict(result)
        return np.array([rows[i] for i in sorted(rows)])

    # First Step Process
    def build_key_matrix(self, data):
        """Build index maps, the user-tweet count matrix and both keyword matrices."""
        # get Maps
        self.user2index, self.tweet2index = self.get_user2index(data), self.get_tweet2index(data)
        # get biMatrix (built once; `bimatrix` is an independent copy)
        self.userTweet = self.get_bimatrix(self.user2index, self.tweet2index, data)
        # get keyList
        self.keyword_list = self.get_keywordList(data)
        # userKey
        self.users2keywords = self.get_users2keywords(self.user2index, data, self.keyword_list)
        # tweetKey
        self.tweets2keywords = self.get_tweets2keywords(self.tweet2index, self.keyword_list)
        # bimatrix
        self.bimatrix = self.userTweet.copy()

    def build_index_mapping_only(self):
        """Rebuild the index maps and keyword matrices without the user-tweet matrix."""
        # get Maps
        self.user2index, self.tweet2index = self.get_user2index(self.data), self.get_tweet2index(self.data)
        # get keyList
        self.keyword_list = self.get_keywordList(self.data)
        # userKey
        self.users2keywords = self.get_users2keywords(self.user2index, self.data, self.keyword_list)
        # tweetKey
        self.tweets2keywords = self.get_tweets2keywords(self.tweet2index, self.keyword_list)

    @staticmethod
    def _l2_normalize_rows(matrix):
        """Divide each row by its L2 norm; all-zero rows are returned unchanged."""
        norms = (matrix ** 2).sum(axis=1).reshape(-1, 1) ** 0.5
        # A row without any keyword has norm 0; dividing would yield NaN.
        norms[norms == 0] = 1.0
        return matrix / norms

    def build(self):
        """Rebuild the key matrices from `self.data` and return the feature matrix.

        Returns:
            A (num_users, num_tweets) array for either mode.

        Raises:
            NotImplementedError: if `self.mode` was changed to an unsupported value.
        """
        # Rebuilt on every call so later changes to self.data are picked up.
        self.build_key_matrix(self.data)
        if self.mode == "multiply":
            # normalize by 2-norm
            users2keywords = self._l2_normalize_rows(self.users2keywords)
            tweets2keywords = self._l2_normalize_rows(self.tweets2keywords)
            return users2keywords @ tweets2keywords.T
        elif self.mode == "m_smooth":
            return self.Mmodule(self.bimatrix, self.tweets2keywords, self.num_process)
        else:
            raise NotImplementedError(self.module_name)

### Dataset

In [89]:
class DatasetBase():
    """Base class with the text cleaning and adjacency helpers shared by datasets."""

    # Every character below is treated as a word separator by tokenize().
    _WORD_SEPARATORS = u'#' \
                       + u" ,.?:;'\"/<>`!$%^&*()-=+~[]\\|{}()\n\t" \
                       + u"©℗®℠™،、⟨⟩‒–—―…„“”–――»«›‹‘’：（）！？=【】　・" \
                       + u"⁄·† ‡°″¡¿÷№ºª‰¶′″‴§|‖¦⁂❧☞‽⸮◊※⁀「」﹁﹂『』﹃﹄《》―—" \
                       + u"“”‘’、，一。►…¿«「」ー⋘▕▕▔▏┈⋙一ー।;!؟"
    # Built once instead of on every tokenize() call.
    _TRANSLATE_TABLE = {ord(char): u' ' for char in _WORD_SEPARATORS}
    # File read by textProcess() when `keyword_path` is not an existing file.
    _LEGACY_KEYWORD_FILE = 'processed/keyword.txt'

    def __init__(self):
        self.name = "DatasetBase"

    # tokenize filter
    def lenFilter(self, word):
        """Keep only tokens of at least two characters."""
        return len(word) >= 2

    # tokenize
    def tokenize(self, text, stopwords=(), keyword=()):
        """Lower-case `text` and return its kept tokens, sorted and space-joined.

        Args:
            text: raw text; converted with str().
            stopwords: collection of words to drop. URL-like tokens (starting
                with "http") and @mentions are dropped only when this
                collection is non-empty and `keyword` is empty.
            keyword: if non-empty, only tokens contained in it are kept and
                `stopwords` is ignored.

        Returns:
            The surviving tokens (length >= 2 after separators are replaced
            by spaces), sorted and joined with single spaces.
        """
        use_keywords = len(keyword) > 0
        use_stopwords = len(stopwords) > 0

        kept = []
        for x in str(text).lower().split(' '):
            if use_keywords:
                if x not in keyword:
                    continue
            elif use_stopwords:
                if len(x) == 0 or x.startswith('http') or x[0] == '@' or x in stopwords:
                    continue
            kept.append(x)

        # Each kept token is preceded by a space, as in the original text build-up.
        text = u''.join(u' ' + x for x in kept)
        tokens = text.translate(self._TRANSLATE_TABLE).split(' ')
        return ' '.join(sorted(filter(self.lenFilter, tokens)))

    @staticmethod
    def _read_word_list(path):
        """Read one word per line; only the line terminator is stripped."""
        with open(path, 'r') as infile:
            # rstrip keeps the last word intact when the file lacks a final
            # newline (slicing with [:-1] dropped its last character).
            return [line.rstrip('\n') for line in infile]

    # from rawTweet to clean keyword text
    def textProcess(self, data, keyword_path, stopword_path, kthreshold, uthreshold):
        """Add a `postTweet` column holding the tokenized `rawTweet` and return `data`.

        Args:
            data: DataFrame with a `rawTweet` column; modified in place.
            keyword_path: 'N' to filter by stop words; otherwise the path of
                a keyword file (one per line). If that path does not exist,
                the legacy 'processed/keyword.txt' file is read instead.
            stopword_path: stop-word file (one per line), used when
                keyword_path == 'N'.
            kthreshold: accepted for compatibility; currently unused.
            uthreshold: accepted for compatibility; currently unused.

        Requires pandarallel to be initialised (uses Series.parallel_apply).
        """
        if keyword_path == 'N':
            # Sets make the per-token membership test constant time.
            stopwords = frozenset(self._read_word_list(stopword_path))
            data['postTweet'] = data.rawTweet.parallel_apply(
                lambda x: self.tokenize(x, stopwords=stopwords, keyword=[]))
        else:
            if isinstance(keyword_path, str) and os.path.exists(keyword_path):
                keyword_file = keyword_path
            else:
                keyword_file = self._LEGACY_KEYWORD_FILE
            keyword = frozenset(self._read_word_list(keyword_file))
            data['postTweet'] = data.rawTweet.parallel_apply(
                lambda x: self.tokenize(x, stopwords=[], keyword=keyword))

        # NOTE: filtering tweets with fewer than `kthreshold` keywords and
        # users with fewer than `uthreshold` tweets is currently disabled,
        # which is why both thresholds are unused.
        return data

    def getAdjMatrix(self, nameList, user2id, csv_path, add_self_loop=True, directed=True):
        """Build a user adjacency matrix from a tab-separated edge list.

        Args:
            nameList: names of the users to keep.
            user2id: mapping from user name to row / column index.
            csv_path: tab-separated file with a header row and two columns.
            add_self_loop: add 1 to every diagonal entry.
            directed: if False, every edge is also counted in reverse.

        Returns:
            A scipy.sparse.lil_matrix of shape (len(nameList), len(nameList))
            whose entries count the edges between users.
        """
        friend = pd.read_csv(csv_path, sep='\t')

        num_users = len(nameList)
        adjTable = sp.lil_matrix((num_users, num_users))
        known_users = set(nameList)  # constant-time membership per edge

        for u1, u2 in friend.values:
            if (u1 in known_users) and (u2 in known_users):
                adjTable[user2id[u1], user2id[u2]] += 1
                if not directed:
                    adjTable[user2id[u2], user2id[u1]] += 1
        if add_self_loop:
            adjTable.setdiag(adjTable.diagonal() + 1.0)
        return adjTable

    def build(self):
        """Build the dataset; must be implemented by subclasses."""
        raise NotImplementedError(self.name)

    def get_feature_similarity_matrix(self):
        """Return a feature similarity matrix; must be implemented by subclasses."""
        raise NotImplementedError(self.name)

In [90]:
class MFDataset(DatasetBase):
    """Dataset that returns a user adjacency matrix plus keyword-based features."""

    # Label stored for assertions that have no entry in the annotation file.
    UNKNOWN_ASSERTION_LABEL = 4096

    def __init__(self, csv_path, friend_path, keyword_path, stopword_path,
                 mode="multiply", kthreshold=5, uthreshold=3, num_process=40,
                 add_self_loop=True, directed=True, args=None):
        """Store the configuration; nothing is read until build() is called.

        Args:
            csv_path: tab-separated tweet table.
            friend_path: tab-separated edge list for the adjacency matrix.
            keyword_path: 'N' or a keyword file, see textProcess().
            stopword_path: stop-word file, see textProcess().
            mode: feature mode passed to MFFeatureBuilder.
            kthreshold: forwarded to textProcess().
            uthreshold: forwarded to textProcess().
            num_process: worker count passed to MFFeatureBuilder.
            add_self_loop: forwarded to getAdjMatrix().
            directed: forwarded to getAdjMatrix().
            args: configuration object; `output_path` is read when labels
                are dumped.
        """
        # Two-argument form: super(MFDataset) alone is an unbound super
        # object and never ran the base initialiser.
        super(MFDataset, self).__init__()
        self.name = "MFDataset"
        self.csv_path = csv_path
        self.kthreshold = kthreshold
        self.uthreshold = uthreshold
        self.mode = mode
        self.keyword_path = keyword_path
        self.stopword_path = stopword_path
        self.num_process = num_process
        self.friend_path = friend_path
        self.add_self_loop = add_self_loop
        self.directed = directed
        self.args = args

        self.feature_builder = None

    def build(self):
        """Read and process the tweets, dump assertion labels, build features.

        Returns:
            (adj_matrix, features, None, name_list).
        """
        print("{} Building...".format(self.name))
        data = pd.read_csv(self.csv_path, sep='\t')
        processed_data = self.textProcess(data, self.keyword_path, self.stopword_path, self.kthreshold, self.uthreshold)
        # Dump tweet label for eval
        self.dump_assertion_label(processed_data)
        self.feature_builder = MFFeatureBuilder(processed_data=processed_data, mode=self.mode,
                                                num_process=self.num_process)
        features = self.feature_builder.build()
        user2index = self.feature_builder.user2index
        name_list = processed_data.name.unique().tolist()
        adj_matrix = self.getAdjMatrix(name_list, user2index, self.friend_path,
                                       add_self_loop=self.add_self_loop, directed=self.directed)
        print("{} Processing Done".format(self.name))
        return adj_matrix, features, None, name_list

    def dump_assertion_label(self, processed_data,
                             label_path="dataset/eurovision/annotations_asser_label.pkl"):
        """Write one int32 label per distinct assertion to <output_path>/MF_feature_dim1_label.pkl.

        Args:
            processed_data: DataFrame with a `postTweet` column.
            label_path: pickle holding a dict from assertion text to label.
                SECURITY: pickle.load executes code embedded in the file;
                only point this at trusted files.

        Assertions missing from the dict get UNKNOWN_ASSERTION_LABEL.
        """
        with open(label_path, "rb") as fin:
            tweet2label = pickle.load(fin)

        labels = []
        for assertion in processed_data.postTweet.unique():
            # Drop a few filler tokens before the lookup.
            asser = assertion.replace(" rt ", " ").replace(" ht ", " ").replace(" in ", " ")
            labels.append(tweet2label.get(asser, self.UNKNOWN_ASSERTION_LABEL))

        with open(self.args.output_path + "/MF_feature_dim1_label.pkl", "wb") as fout:
            pickle.dump(np.array(labels).astype("int32"), fout)
            print("Assertion Label Dump Success.")

    def get_feature_similarity_matrix(self):
        """Return tweet-tweet similarity from row-normalised keyword counts.

        Requires build() to have created `self.feature_builder`.
        """
        tweets2keywords = self.feature_builder.tweets2keywords
        row_sums = tweets2keywords.sum(axis=1).reshape(-1, 1)
        # A tweet without keywords sums to 0; avoid 0/0 = NaN rows.
        row_sums = np.where(row_sums == 0, 1, row_sums)
        normed_t2k = tweets2keywords / row_sums
        return normed_t2k @ normed_t2k.transpose()

In [91]:
class TwitterDataset(MFDataset):
    """Heterogeneous user/assertion graph built from a tab-separated tweet table.

    Users occupy the first `num_user` nodes and distinct processed tweets
    ("assertions") the following `num_assertion` nodes.
    """

    # Cache file looked up next to args.data_path.
    CACHE_FILE_NAME = "TwitterDataset.pkl"

    def __init__(self, csv_path, keyword_path, stopword_path,
                     mode="multiply", kthreshold=5, uthreshold=3, num_process=40,
                     add_self_loop=True, directed=True, args=None):
        """Load the cached dataset if present, otherwise preprocess the CSV.

        Args:
            csv_path: tab-separated tweet table.
            keyword_path: 'N' or a keyword file, see textProcess().
            stopword_path: stop-word file, see textProcess().
            mode, kthreshold, uthreshold, num_process, add_self_loop,
                directed: forwarded to MFDataset.
            args: configuration object; `data_path` is read here, and
                `use_follow`, `follow_path`, `output_path` by other methods.

        Raises:
            ValueError: if `args` is None.
        """
        super(TwitterDataset, self).__init__(csv_path, None, keyword_path, stopword_path,
                                            mode, kthreshold, uthreshold, num_process, add_self_loop, directed, args)
        self.name = "TwitterDataset"
        self.processed_data = None
        self.user_label = None
        self.asser_label = None
        self.asserlist = None
        self.name_list = None

        if args is None:
            raise ValueError("TwitterDataset requires `args` (args.data_path locates the dataset cache)")

        # os.path.join avoids producing "/TwitterDataset.pkl" (filesystem
        # root) when data_path has no directory component.
        pkl_file = os.path.join(os.path.dirname(args.data_path), self.CACHE_FILE_NAME)
        if not os.path.exists(pkl_file):
            print("Preprocess and dump dataset...")
            self.preprocessing()
            # NOTE: writing the cache is intentionally disabled; it used to
            # pickle [self.data, self.processed_data, self.name_list,
            # self.feature_builder] to `pkl_file`.
        else:
            print("Use existing dataset pkl {} (Remove file to re-build) ...".format(pkl_file))
            # SECURITY: pickle.load executes code embedded in the file; only
            # use cache files produced locally.
            with open(pkl_file, "rb") as fin:
                self.data, self.processed_data, self.name_list, self.feature_builder = pickle.load(fin)

    def preprocessing(self):
        """Read the CSV, tokenize the tweets and build the index mappings."""
        # Preprocessing
        self.data = pd.read_csv(self.csv_path, sep='\t')
        self.processed_data = self.textProcess(self.data, self.keyword_path, self.stopword_path, self.kthreshold, self.uthreshold)
        self.name_list = self.processed_data.name.unique().tolist()

        # Feature builder for index mapping
        self.feature_builder = MFFeatureBuilder(processed_data=self.processed_data, mode=self.mode,
                                                num_process=self.num_process)
        self.feature_builder.build_index_mapping_only()

    def build(self):
        """Assemble and return the heterogeneous adjacency matrix (lil_matrix).

        The user-assertion blocks hold tweeting counts; when args.use_follow
        is set, the user-user block holds the following matrix.
        """
        print("{} Building...".format(self.name))

        self.num_user = self.feature_builder.users2keywords.shape[0]
        print('Num users:', self.num_user)
        self.num_assertion = self.feature_builder.tweets2keywords.shape[0]
        self.num_nodes = self.num_user + self.num_assertion
        # Heterogeneous adjacent matrix
        self.het_matrix = sp.lil_matrix((self.num_nodes, self.num_nodes))

        # Get tweeting matrix
        tweeting_matrix = self.get_tweeting_matrix(self.processed_data, self.num_user, self.num_assertion)
        self.het_matrix[:self.num_user, self.num_user:self.num_user + self.num_assertion] = tweeting_matrix
        self.het_matrix[self.num_user:self.num_user + self.num_assertion, :self.num_user] = tweeting_matrix.transpose()

        # Get following matrix
        if self.args.use_follow:
            following_matrix = self.get_following_matrix()
            self.het_matrix[:self.num_user, :self.num_user] = following_matrix

        print("{} Processing Done".format(self.name))
        # Return adj matrix
        return self.het_matrix

    def cosine_similarity(self, a, b):
        """Cosine similarity of two vectors, with a small epsilon in the denominator."""
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-07)

    @staticmethod
    def _first_index_map(names):
        """Map each name to the position of its first occurrence in `names`."""
        index_of = {}
        for position, name in enumerate(names):
            index_of.setdefault(name, position)
        return index_of

    @staticmethod
    def _pairwise_cosine_similarity(vectors):
        """Return the matrix of cosine similarities between all rows of `vectors`."""
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # all-zero rows stay zero instead of NaN
        unit = vectors / norms
        return unit @ unit.T

    def get_following_matrix(self):
        """Return the symmetric 0/1 user-user matrix read from args.follow_path.

        The file holds one tab-separated "follower<TAB>followee" pair per
        line; pairs involving unknown users and malformed lines are skipped.
        Requires build() to have set `self.num_user`.
        """
        assert self.args.use_follow
        assert self.args.follow_path is not None
        following_matrix = np.zeros([self.num_user, self.num_user])
        # Dict lookup replaces two list scans per line.
        index_of = self._first_index_map(str(k) for k in self.name_list)
        with open(self.args.follow_path, "r") as fin:
            for line in fin:
                splts = line.strip().split("\t")
                if len(splts) < 2:
                    continue
                fr = index_of.get(splts[0])
                to = index_of.get(splts[1])
                if fr is None or to is None:
                    continue
                following_matrix[fr][to] = 1
                following_matrix[to][fr] = 1
        return following_matrix

    def get_tweet_similarity_matrix(self, processed_data):
        """Return pairwise cosine similarity between embedded distinct tweets.

        NOTE(review): relies on `TfidfEmbeddingVectorizer`, which must be
        defined elsewhere in the notebook.
        """
        X = [tweet.split(" ") for tweet in processed_data.postTweet.unique()]
        builder = TfidfEmbeddingVectorizer(X=X)
        X_emb = np.array(builder.transform(X))
        # The previous call passed (X_emb, dense_output=True) to the two-vector
        # cosine_similarity method above, which raised a TypeError.
        return self._pairwise_cosine_similarity(X_emb)

    def get_mention_matrix(self, processed_data, num_user):
        """Return the symmetric user-user retweet-mention count matrix.

        A tweet counts when its raw text starts with "RT" followed by a
        handle that is also a known user.
        """
        assert self.name_list is not None
        mention_matrix = np.zeros((num_user, num_user))
        index_of = self._first_index_map(self.name_list)
        for user_name, init_tweet in zip(processed_data["name"], processed_data["rawTweet"]):
            splits = str(init_tweet).split(" ")
            # A bare "RT" without a following handle carries no mention.
            if splits[0] != "RT" or len(splits) < 2:
                continue
            mention_name = splits[1].strip().replace("@", "").replace(":", "")
            user_index = index_of.get(user_name)
            mention_index = index_of.get(mention_name)
            if user_index is None or mention_index is None:
                continue
            mention_matrix[user_index][mention_index] += 1
            mention_matrix[mention_index][user_index] += 1
        return mention_matrix

    def get_tweeting_matrix(self, processed_data, num_user, num_assertion):
        """Return the (num_user, num_assertion) matrix of posting counts."""
        tweeting_matrix = np.zeros((num_user, num_assertion))
        tweet2index = self.feature_builder.tweet2index
        user2index = self.feature_builder.user2index
        for postTweet, user_name in zip(processed_data["postTweet"], processed_data["name"]):
            tweeting_matrix[user2index[user_name]][tweet2index[postTweet]] += 1
        return tweeting_matrix

    def dump_label(self):
        """Derive user / assertion labels and write the evaluation files.

        Assumes a `label` column in the data: 0 for unknown, 1 for trump,
        2 for biden. Writes label.bin, asserlist.json, namelist.json and
        tweet_to_assertion_id_map.json under args.output_path.
        """
        assert self.processed_data is not None
        data = self.processed_data
        tweet2index = self.feature_builder.tweet2index
        user2index = self.feature_builder.user2index
        num_assertion = self.feature_builder.tweets2keywords.shape[0]
        num_user = self.feature_builder.users2keywords.shape[0]

        # Assertion label: first non-zero label seen for the assertion.
        # Representative text: raw text of the first tweet of the assertion.
        self.asser_label = np.zeros(num_assertion).astype("int32")
        self.asserlist = [None for _ in range(num_assertion)]
        user_label_candidate = [[] for _ in range(num_user)]
        for label, postTweet, rawTweet, user_name in zip(
                data["label"], data["postTweet"], data["rawTweet"], data["name"]):
            assertion_id = tweet2index[postTweet]
            if self.asserlist[assertion_id] is None:
                self.asserlist[assertion_id] = rawTweet
            if self.asser_label[assertion_id] == 0:
                self.asser_label[assertion_id] = label
            if label != 0:
                user_label_candidate[user2index[user_name]].append(label)

        # User label: most common non-zero label among the user's tweets.
        self.user_label = np.zeros(num_user).astype("int32")
        for i, candidates in enumerate(user_label_candidate):
            if candidates:
                self.user_label[i] = Counter(candidates).most_common(1)[0][0]

        # dump label to label.bin
        label_path = self.args.output_path + "/label.bin"
        with open(label_path, "wb") as fout:
            pickle.dump({"user_label": self.user_label, "assertion_label": self.asser_label}, fout)

        # dump the representative assertion list
        with open(self.args.output_path + "/asserlist.json", "w", encoding="utf-8") as fout:
            json.dump(self.asserlist, fout, indent=2)

        # dump namelist for evaluation
        with open(self.args.output_path + "/namelist.json", 'w') as fout:
            json.dump(self.name_list, fout, indent=2)

        # dump tweet to assertion id mapping
        id_column = "id" if "id" in data.columns else "tweet_id"
        tweet2asserid = {}
        for tweet_id, postTweet in zip(data[id_column], data["postTweet"]):
            tweet2asserid[tweet_id] = tweet2index[postTweet]
        with open(self.args.output_path + "/tweet_to_assertion_id_map.json", 'w') as fout:
            json.dump(tweet2asserid, fout, indent=2)

        # Report the label file (the message used to show the asserlist path).
        print("Dump Label file success {}".format(label_path))

    def dump_processed_data(self):
        """Write the processed tweet table to <output_path>/processed_data.csv (tab-separated)."""
        out_file = self.args.output_path + "/" + "processed_data.csv"
        self.processed_data.to_csv(out_file, sep="\t", encoding="utf-8", index=False)
        print("Dump processed data success {}".format(out_file))

### Evaluator

In [92]:
class Evaluator:
    """Cluster, score and plot node embeddings against stance labels.

    The embedding matrix is expected to stack the user rows first and the
    assertion rows after them: ``run_clustering`` splits it at ``num_user``.
    Label ``0`` marks an unlabeled node. Assertion labels are shifted by
    ``ASSER_LABEL_OFFSET`` during ``initialize`` so that user classes and
    assertion classes stay distinguishable once both are concatenated.

    Plot configs are lists of ``[label, color, legend_text]`` entries.
    """

    # Shift applied to assertion labels and to assertion plot-config labels.
    ASSER_LABEL_OFFSET = 1000

    def __init__(self, user_plot_config=None, asser_plot_config=None):
        """Create an empty evaluator; call ``init_from_dir`` or ``init_from_value`` next.

        Args:
            user_plot_config: optional list of ``[label, color, legend_text]``
                for users; defaults to the two classes 1 ("Con") and 2 ("Pro").
            asser_plot_config: same for assertions, given with *unshifted*
                labels; the offset is added by ``initialize``.
        """
        self.initialized = False
        self.output_dir = None
        self.embedding = None
        self.labels = None
        self.user_label = None
        self.asser_label = None
        self.user_clustering_pred = None
        self.asser_clustering_pred = None
        self.namelist = None
        self.asserlist = None
        self.num_user = -1
        self.num_asser = -1
        self.user_plot_config = [
            [1, "#178bff", "User Con"],
            # [0, "#3b3b3b", "User Neu"],
            [2, "#ff5c1c", "User Pro"]
        ] if user_plot_config is None else user_plot_config
        self.asser_plot_config = [
            [1, "#30a5ff", "Assertion Con"],
            # [0, "#4d4d4d", "Assertion Neu"],
            [2, "#fc8128", "Assertion Pro"]
        ] if asser_plot_config is None else asser_plot_config
        # Guards against shifting the assertion plot config more than once.
        self._asser_offset_applied = False

    def initialize(self):
        """Derive sizes from the labels and shift the assertion label space.

        Neither the caller's label arrays nor a caller-supplied plot config
        are modified: the shifted versions are fresh objects owned by this
        evaluator. The plot config is shifted only once per instance.

        Note: when labels were supplied through ``init_from_value``, calling
        ``initialize`` again without re-supplying them shifts them again.
        """
        if self.labels is not None:
            self.user_label = self.labels["user_label"]
            self.asser_label = self.labels["assertion_label"]
        self.num_user = self.user_label.shape[0]
        self.num_asser = self.asser_label.shape[0]

        offset = self.ASSER_LABEL_OFFSET
        if not self._asser_offset_applied:
            self.asser_plot_config = [
                [item[0] + offset] + list(item[1:]) for item in self.asser_plot_config
            ]
            self._asser_offset_applied = True
        # Out-of-place addition: the input array stays untouched.
        self.asser_label = self.asser_label + offset

        self.initialized = True

    def init_from_dir(self, dir):
        """Load embedding, labels, name list and assertion list from ``dir``.

        NOTE: ``pickle.load`` executes arbitrary code from the file; only use
        this on directories produced by a trusted run.
        """
        with open(dir + "/embedding.bin", 'rb') as fin:
            self.embedding = pickle.load(fin)
        with open(dir + "/label.bin", 'rb') as fin:
            self.labels = pickle.load(fin)
        with open(dir + "/namelist.json", 'r') as fin:
            self.namelist = json.load(fin)
        with open(dir + "/asserlist.json", 'r') as fin:
            self.asserlist = json.load(fin)
        self.output_dir = dir
        self.initialize()

    def init_from_value(self, embedding, user_label, asser_label, namelist, asserlist, output_dir="."):
        """Initialise from in-memory values instead of dumped files."""
        self.embedding = embedding
        self.user_label = user_label
        self.asser_label = asser_label
        self.namelist = namelist
        self.asserlist = asserlist
        self.output_dir = output_dir
        self.initialize()

    def run_clustering(self, n_clusters=2):
        """K-means users and assertions separately and map clusters to plot labels.

        Cluster 0 is mapped to the first plot-config label and every other
        cluster to the second one, so only two clusters are supported.
        """
        assert n_clusters == 2

        user_raw, _ = self.k_means(self.embedding[:self.num_user], n_clusters=n_clusters)
        asser_raw, _ = self.k_means(self.embedding[self.num_user:], n_clusters=n_clusters)

        self.user_clustering_pred = np.where(
            user_raw == 0, self.user_plot_config[0][0], self.user_plot_config[1][0]
        ).astype(user_raw.dtype)
        self.asser_clustering_pred = np.where(
            asser_raw == 0, self.asser_plot_config[0][0], self.asser_plot_config[1][0]
        ).astype(asser_raw.dtype)

    def plot_clustering(self, permulate=None, show=False, save=True, tag=""):
        """Plot the clustering prediction for labeled nodes only (1-D, 2-D or 3-D)."""
        print("Evaluator plot clustering prediction with config:")
        print("user_plot_config: " + str(self.user_plot_config))
        print("asser_plot_config: " + str(self.asser_plot_config))
        assert self.user_clustering_pred is not None
        assert self.asser_clustering_pred is not None
        pred = np.concatenate([self.user_clustering_pred, self.asser_clustering_pred], axis=0)
        label = np.concatenate([self.user_label, self.asser_label], axis=0)
        # Only plot labeled data: unlabeled nodes get ids no plot config uses.
        pred[label == 0] = -1
        pred[label == self.ASSER_LABEL_OFFSET] = -2
        if self.embedding.shape[1] == 1:
            self.plot_1d(self.embedding, pred, self.user_plot_config, self.asser_plot_config, show, save)
        elif self.embedding.shape[1] == 2:
            self.plot_2d(self.embedding, pred, self.user_plot_config, self.asser_plot_config, show, save)
        elif self.embedding.shape[1] == 3:
            self.plot_3d(self.embedding, pred, self.user_plot_config, self.asser_plot_config, permulate, show, save, tag=tag)

    def dump_topk_json(self, K=50):
        """Dump the K assertions with the largest value on each embedding axis.

        Writes ``top_tweet_table.json`` into ``output_dir`` with the keys
        ``reprA``/``reprB``/``reprC`` for axes 0/1/2. Axes the embedding does
        not have (e.g. ``reprC`` for a 2-D embedding) are left as empty lists.
        """
        save_path = self.output_dir + "/top_tweet_table.json"
        res = {"reprA": [], "reprB": [], "reprC": []}

        asser_embedding = self.embedding[self.num_user:self.num_user + self.num_asser]
        num_dims = asser_embedding.shape[1]
        for dim, key in enumerate(("reprA", "reprB", "reprC")):
            if dim >= num_dims:
                break
            # sorted() is stable, so ties keep their original assertion order.
            ranking = sorted(range(self.num_asser),
                             key=lambda idx: asser_embedding[idx][dim], reverse=True)
            res[key] = [self.asserlist[idx] for idx in ranking[:K]]

        with open(save_path, "w") as fout:
            json.dump(res, fout, indent=2)

    def plot(self, permulate=None, show=False, save=True, tag=""):
        """Plot the embedding coloured by ground-truth label (1-D, 2-D or 3-D)."""
        print("Evaluator plot label with config:")
        print("user_plot_config: " + str(self.user_plot_config))
        print("asser_plot_config: " + str(self.asser_plot_config))
        label = np.concatenate([self.user_label, self.asser_label], axis=0)
        if self.embedding.shape[1] == 1:
            self.plot_1d(self.embedding, label, self.user_plot_config, self.asser_plot_config, show, save)
        elif self.embedding.shape[1] == 2:
            self.plot_2d(self.embedding, label, self.user_plot_config, self.asser_plot_config, show, save)
        elif self.embedding.shape[1] == 3:
            self.plot_3d(self.embedding, label, self.user_plot_config, self.asser_plot_config, permulate, show, save, tag=tag)

    def purity_score(self, y_true, y_pred):
        """Return cluster purity: the sum of per-cluster majority counts over all samples."""
        # compute contingency matrix (also called confusion matrix)
        contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
        # return purity
        return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix)

    def numerical_evaluate(self, verbose=False):
        """Score the clustering on labeled nodes under both cluster-to-class mappings.

        Cluster ids carry no class meaning, so precision/recall/F1 are
        computed for the prediction as-is and for the prediction with its two
        classes swapped.

        Returns:
            ``(log_content, best_user_f1, best_asser_f1, best_user_pred,
            user_label)``, where ``best_user_pred`` is the user prediction
            variant with the higher F1 and ``user_label`` holds the labeled
            users only.
        """
        offset = self.ASSER_LABEL_OFFSET
        user_pos, asser_pos = 1, offset + 1

        labeled_users = self.user_label != 0
        user_pred = self.user_clustering_pred[labeled_users]
        user_pred_reverse = 3 - user_pred  # 1 --> 2; 2 --> 1
        user_label = self.user_label[labeled_users]

        labeled_assers = self.asser_label != offset
        asser_pred = self.asser_clustering_pred[labeled_assers]
        # offset+1 --> offset+2; offset+2 --> offset+1
        asser_pred_reverse = (2 * offset + 3) - asser_pred
        asser_label = self.asser_label[labeled_assers]

        user_pre = [precision_score(user_label, user_pred, pos_label=user_pos),
                    precision_score(user_label, user_pred_reverse, pos_label=user_pos)]
        user_recall = [recall_score(user_label, user_pred, pos_label=user_pos),
                       recall_score(user_label, user_pred_reverse, pos_label=user_pos)]
        user_f1 = [f1_score(user_label, user_pred, pos_label=user_pos),
                   f1_score(user_label, user_pred_reverse, pos_label=user_pos)]

        asser_pre = [precision_score(asser_label, asser_pred, pos_label=asser_pos),
                     precision_score(asser_label, asser_pred_reverse, pos_label=asser_pos)]
        asser_recall = [recall_score(asser_label, asser_pred, pos_label=asser_pos),
                        recall_score(asser_label, asser_pred_reverse, pos_label=asser_pos)]
        asser_f1 = [f1_score(asser_label, asser_pred, pos_label=asser_pos),
                    f1_score(asser_label, asser_pred_reverse, pos_label=asser_pos)]

        log_content = ""
        log_content += "#User: {}, #Assertion: {}".format(self.num_user, self.num_asser) + "\n"
        log_content += "User precision: {:.4f}  User recall: {:.4f}  User F1: {:.4f}".format(user_pre[0], user_recall[0], user_f1[0]) + "\n"
        log_content += "User precision: {:.4f}  User recall: {:.4f}  User F1: {:.4f}".format(user_pre[1], user_recall[1], user_f1[1]) + "\n"
        log_content += "User Purity: {:.4f}".format(self.purity_score(user_label, user_pred)) + "\n"
        log_content += "Assertion precision: {:.4f}  Assertion recall: {:.4f}  Assertion F1: {:.4f}".format(
            asser_pre[0], asser_recall[0], asser_f1[0]) + "\n"
        log_content += "Assertion precision: {:.4f}  Assertion recall: {:.4f}  Assertion F1: {:.4f}".format(
            asser_pre[1], asser_recall[1], asser_f1[1]) + "\n"
        log_content += "Assertion Purity: {:.4f}".format(self.purity_score(asser_label, asser_pred)) + "\n"
        if verbose:
            print(log_content)

        # Ties resolve to the unswapped prediction (argmax returns the first maximum).
        best = user_pred if np.argmax(user_f1) == 0 else user_pred_reverse
        return log_content, max(user_f1[0], user_f1[1]), max(asser_f1[0], asser_f1[1]), best, user_label

    # -------------------------------- Function Utils --------------------------------

    @staticmethod
    def _maybe_cosine_normalize(embedding, cosine_norm):
        """Return ``embedding`` with rows scaled to unit L2 norm when ``cosine_norm`` is set."""
        if not cosine_norm:
            return embedding
        length = np.sqrt((embedding ** 2).sum(axis=1))[:, None]
        return embedding / length

    def dbscan(self, embedding, cosine_norm=False, eps=0.5, min_samples=5):
        """Run DBSCAN; returns ``(labels, fitted_estimator)``.

        Noise points (label ``-1``) are folded into cluster 0. The returned
        labels are the estimator's own ``labels_`` array, modified in place.
        """
        db = DBSCAN(eps=eps, min_samples=min_samples)
        clustering = db.fit(self._maybe_cosine_normalize(embedding, cosine_norm))
        pred = clustering.labels_
        pred[pred < 0] = 0
        return pred, clustering

    def mean_shift(self, embedding, cosine_norm=False, bandwidth=None):
        """Run MeanShift; returns ``(labels, fitted_estimator)``."""
        ms = MeanShift(bandwidth=bandwidth)
        clustering = ms.fit(self._maybe_cosine_normalize(embedding, cosine_norm))
        return clustering.labels_, clustering

    def k_means(self, embedding, cosine_norm=False, n_clusters=2, n_init=10):
        """Run KMeans; returns ``(labels, estimator)``."""
        km = KMeans(
            n_clusters=n_clusters, n_init=n_init
        )
        km_result = km.fit_predict(self._maybe_cosine_normalize(embedding, cosine_norm))
        return km_result, km

    def time_tag(self):
        """Return a ``YYYYmmddHHMMSS_<fraction>`` tag used to keep output file names distinct."""
        return time.strftime("%Y%m%d%H%M%S_", time.localtime()) + str(time.time()).split(".")[1]

    def plot_1d(self, embedding, label, user_plot_config, asser_plot_config, show=False, save=True):
        """Scatter a 1-D embedding; users jitter around y=0 and assertions around y=1.

        Draws on a fresh figure that is closed afterwards, so repeated calls
        neither overlay each other nor accumulate open figures.
        """
        assert embedding.shape[1] == 1
        assert embedding.shape[0] == label.shape[0]

        fig, ax = plt.subplots()

        for l, c, t in user_plot_config:
            xs = embedding[label == l][:, 0].reshape(-1)
            ax.scatter(xs, np.zeros(xs.shape) + 0.15 * np.random.random(size=xs.shape),
                       marker="o", color=c, label=t, s=10)

        for l, c, t in asser_plot_config:
            xs = embedding[label == l][:, 0].reshape(-1)
            ax.scatter(xs, np.ones(xs.shape) + 0.15 * np.random.random(size=xs.shape),
                       marker="^", color=c, label=t, s=10)

        ax.tick_params(labelsize=14)
        ax.legend(loc='best', prop={'size': 14})
        if save:
            fig.savefig(self.output_dir + "/1d_evaluation_{}.jpg".format(self.time_tag()), dpi=500)
        if show:
            plt.show()
        plt.close(fig)

    def plot_2d(self, embedding, label, user_plot_config, asser_plot_config, show=False, save=True):
        """Scatter a 2-D embedding (users as circles, assertions as triangles).

        Draws on a fresh figure that is closed afterwards.
        """
        assert embedding.shape[1] == 2
        assert embedding.shape[0] == label.shape[0]

        fig, ax = plt.subplots()

        for l, c, t in user_plot_config:
            emb = embedding[label == l]
            ax.scatter(emb[:, 0].reshape(-1), emb[:, 1].reshape(-1), marker="o", color=c, label=t, s=10)

        for l, c, t in asser_plot_config:
            emb = embedding[label == l]
            ax.scatter(emb[:, 0].reshape(-1), emb[:, 1].reshape(-1), marker="^", color=c, label=t, s=10)

        ax.tick_params(labelsize=16)
        ax.legend(loc='best', prop={'size': 14})
        if save:
            fig.savefig(self.output_dir + "/2d_evaluation_{}.jpg".format(self.time_tag()), dpi=500)
        if show:
            plt.show()
        plt.close(fig)

    def plot_3d(self, embedding, label, user_plot_config, asser_plot_config, permulate=None, show=False, save=True, tag=""):
        """Scatter a 3-D embedding; ``permulate`` selects which embedding axis maps to x, y, z.

        The figure is closed afterwards so that per-epoch plotting does not
        accumulate open figures.
        """
        if permulate is None:
            permulate = [0, 1, 2]
        assert embedding.shape[1] == 3
        assert embedding.shape[0] == label.shape[0]

        fig = plt.figure(figsize=(8, 6))
        ax = fig.add_subplot(111, projection='3d')

        for l, c, t in user_plot_config:
            emb = embedding[label == l]
            ax.scatter(emb[:, permulate[0]].reshape(-1), emb[:, permulate[1]].reshape(-1), emb[:, permulate[2]].reshape(-1),
                       marker="o", color=c, label=t)

        for l, c, t in asser_plot_config:
            emb = embedding[label == l]
            ax.scatter(emb[:, permulate[0]].reshape(-1), emb[:, permulate[1]].reshape(-1), emb[:, permulate[2]].reshape(-1),
                       marker="^", color=c, label=t)

        ax.legend(loc='upper right', prop={'size': 14})
        ax.set_xlabel("Dim 1")
        ax.set_ylabel("Dim 2")
        ax.set_zlabel("Dim 3")
        ax.view_init(20, 120)

        if save:
            fig.savefig(self.output_dir + "/3d_evaluation_cluster_{}_{}.pdf".format(tag, self.time_tag()), bbox_inches='tight')
        if show:
            plt.show()
        plt.close(fig)

### Trainer

In [93]:
def sp_sparse_to_torch_longtensor(coo_matrix):
    """Convert a SciPy sparse matrix into a sparse COO ``torch`` tensor of dtype int64.

    Args:
        coo_matrix: a ``scipy.sparse`` COO matrix; any other input that
            ``scipy.sparse.coo_matrix`` accepts is converted first.

    Returns:
        A sparse COO tensor with the same shape; values are cast to
        ``torch.long`` (non-integer values are truncated by the cast).
    """
    coo = sp.coo_matrix(coo_matrix)
    indices = torch.as_tensor(np.vstack((coo.row, coo.col)), dtype=torch.long)
    values = torch.as_tensor(coo.data, dtype=torch.long)
    # torch.sparse.LongTensor(indices, values, size) is deprecated in favour
    # of torch.sparse_coo_tensor.
    return torch.sparse_coo_tensor(indices, values, torch.Size(coo.shape))

In [94]:
def sparse_to_tuple(sparse_mx):
    """Break a SciPy sparse matrix into ``(coords, values, shape)``.

    ``coords`` has one ``[row, col]`` pair per stored entry, ``values`` is
    the matching data array and ``shape`` is the matrix shape. Input that is
    not already in COO format is converted with ``tocoo()``.
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    index_pairs = np.vstack((coo.row, coo.col)).T
    return index_pairs, coo.data, coo.shape

In [95]:
def preprocess_graph(adj):
    """Add self-loops to ``adj`` and scale it by the inverse square root of the degrees.

    With ``A_ = adj + I`` and ``D`` the diagonal matrix of the row sums of
    ``A_``, this computes ``(A_ D^-1/2)^T D^-1/2`` and returns it as the
    ``(coords, values, shape)`` triple produced by ``sparse_to_tuple``.
    """
    graph = sp.coo_matrix(adj)
    graph_with_loops = graph + sp.eye(graph.shape[0])
    degrees = np.array(graph_with_loops.sum(1))
    inv_sqrt_degree = sp.diags(np.power(degrees, -0.5).flatten())
    normalized = graph_with_loops.dot(inv_sqrt_degree).transpose().dot(inv_sqrt_degree)
    return sparse_to_tuple(normalized.tocoo())

In [96]:
def permute_dims(z):
    """Shuffle every column of the 2-D tensor ``z`` independently along dim 0.

    One fresh random permutation of the rows is drawn per column, so each
    output column holds the same values as the input column in a different
    order.
    """
    assert z.dim() == 2

    batch_size = z.size(0)
    shuffled_columns = [
        column[torch.randperm(batch_size).to(z.device)]
        for column in z.split(1, 1)
    ]
    return torch.cat(shuffled_columns, 1)

In [97]:
class TrainerBase:
    """Minimal trainer interface: subclasses set ``name`` and implement ``train``."""

    def __init__(self):
        self.name = "TrainerBase"

    def train(self):
        """Run training; the base class only raises ``NotImplementedError`` carrying ``name``."""
        raise NotImplementedError(self.name)

In [98]:
class InfoVGAETrainer(TrainerBase):
    """Trains an ``InfoVGAE`` model together with a total-correlation ``Discriminator``.

    Each epoch performs one VAE step (reconstruction + PID-weighted KL term +
    total-correlation term), then one discriminator step, then evaluates the
    current embedding and appends the result to ``<output_path>/log.txt``.
    """

    def __init__(self, adj_matrix, features, args, dataset):
        """Store the training inputs; the model and optimizers are created in ``train``.

        Args:
            adj_matrix: SciPy sparse adjacency matrix of the graph.
            features: node feature matrix (anything ``sp.coo_matrix`` accepts).
            args: settings object; this class reads ``pos_weight_lambda``,
                ``use_cuda``, ``learning_rate``, ``hidden2_dim``, ``lr_D``,
                ``beta1_D``, ``beta2_D``, ``gamma``, ``epochs`` and
                ``output_path`` from it.
            dataset: provides ``user_label``, ``asser_label``, ``name_list``
                and ``asserlist`` for the per-epoch evaluation.
        """
        # Zero-argument super(): ``super(InfoVGAETrainer).__init__()`` only
        # initialised an unbound super object and never ran the base initialiser.
        super().__init__()
        self.name = "InfoVGAETrainer"
        self.adj_matrix = adj_matrix
        self.features = features
        self.args = args
        self.dataset = dataset

        self.model = None
        self.optimizer = None
        self.D = None
        self.optimizer_D = None

        self.result_embedding = None

    @staticmethod
    def _to_sparse_float_tensor(coords, values, shape):
        """Build a float32 sparse COO tensor from a ``sparse_to_tuple`` triple."""
        # torch.sparse.FloatTensor(indices, values, size) is deprecated in
        # favour of torch.sparse_coo_tensor.
        return torch.sparse_coo_tensor(torch.as_tensor(coords.T, dtype=torch.long),
                                       torch.as_tensor(values, dtype=torch.float32),
                                       torch.Size(shape))

    def train(self):
        """Run the training loop and store the final embedding in ``result_embedding``."""
        print("Training using {}".format(self.name))

        adj = self.adj_matrix

        # Some preprocessing
        adj_norm = self._to_sparse_float_tensor(*preprocess_graph(adj))
        features = self._to_sparse_float_tensor(*sparse_to_tuple(sp.coo_matrix(self.features)))

        # Positive entries are rare, so they are up-weighted in the reconstruction loss.
        num_entries = adj.shape[0] * adj.shape[0]
        num_edges = adj.sum()
        pos_weight = self.args.pos_weight_lambda * float(num_entries - num_edges) / num_edges
        print("Positive sample weight: {}".format(pos_weight))

        norm = num_entries / float((num_entries - num_edges) * 2)

        # Reconstruction target: adjacency plus identity, flattened once up
        # front because it does not change between epochs.
        adj_label = self._to_sparse_float_tensor(*sparse_to_tuple(adj + sp.eye(adj.shape[0])))
        recon_target = adj_label.to_dense().view(-1)

        weight_tensor = torch.ones(recon_target.size(0))
        weight_tensor[recon_target == 1] = pos_weight
        ones = torch.ones(self.adj_matrix.shape[0], dtype=torch.long)
        zeros = torch.zeros(self.adj_matrix.shape[0], dtype=torch.long)

        if self.args.use_cuda:
            adj_norm = adj_norm.cuda()
            recon_target = recon_target.cuda()
            features = features.cuda()
            weight_tensor = weight_tensor.cuda()
            ones = ones.cuda()
            zeros = zeros.cuda()

        # init model and optimizer
        self.model = InfoVGAE(self.args, adj_norm)
        if self.args.use_cuda:
            self.model = self.model.cuda()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.learning_rate)
        self.D = Discriminator(self.args.hidden2_dim)
        if self.args.use_cuda:
            self.D = self.D.cuda()
        self.optimizer_D = torch.optim.Adam(self.D.parameters(), lr=self.args.lr_D,
                                            betas=(self.args.beta1_D, self.args.beta2_D))

        # train model
        Kp = 0.001
        Ki = -0.001
        PID = PIDControl(Kp, Ki)
        Exp_KL = 0.005  # KL set point handed to the PI controller
        eval_interval = 1  # evaluate and log every ``eval_interval`` epochs
        for epoch in range(self.args.epochs):
            # Train VAE
            z = self.model.encode(features)
            A_pred = self.model.decode(z)
            D_z = self.D(z)

            vae_recon_loss = norm * F.binary_cross_entropy(A_pred.view(-1), recon_target,
                                                           weight=weight_tensor)
            kl_divergence = 0.5 / A_pred.size(0) * (1 + 2 * self.model.logstd - self.model.mean ** 2 -
                                                    torch.exp(self.model.logstd) ** 2).sum(1).mean()
            vae_tc_loss = (D_z[:, :1] - D_z[:, 1:]).mean() * self.args.gamma
            weight = PID.pid(Exp_KL, kl_divergence.item())  # get the weight on KL term with PI module
            vae_loss = vae_recon_loss - weight * kl_divergence + vae_tc_loss

            self.optimizer.zero_grad()
            vae_loss.backward()
            self.optimizer.step()

            # Train Discriminator
            # Both inputs are detached: only the discriminator is updated in
            # this step, so back-propagating into the encoder is wasted work.
            z = self.model.encode(features)
            D_z = self.D(z.detach())
            z_prime = self.model.encode(features)
            z_pperm = permute_dims(z_prime).detach()
            D_z_pperm = self.D(z_pperm)
            D_tc_loss = 0.5 * (F.cross_entropy(D_z, zeros) + F.cross_entropy(D_z_pperm, ones))

            self.optimizer_D.zero_grad()
            D_tc_loss.backward()
            self.optimizer_D.step()

            if epoch % eval_interval == 0:
                evaluator = Evaluator()
                embedding = self.model.encode(features).detach().cpu().numpy()
                evaluator.init_from_value(embedding, self.dataset.user_label.copy(), self.dataset.asser_label.copy(),
                                          self.dataset.name_list.copy(), self.dataset.asserlist.copy(),
                                          output_dir=self.args.output_path)
                # evaluator.plot(show=False, save=True, tag=str(epoch))
                evaluator.run_clustering()
                # evaluator.plot_clustering(show=False, tag=str(epoch))
                eval_log, user_f1, asser_f1, _, _ = evaluator.numerical_evaluate()
                log = "Epoch: {}, loss_recon: {:.5f}, loss_kl: {:.5f}, loss_tc: {:.5f}, loss_VAE: {:.5f}, loss_D: {:.5f}, user_f1: {:.5f}, asser_f1: {:.5f}".format(
                        epoch,
                        vae_recon_loss.item(),
                        (-weight * kl_divergence).item(),
                        vae_tc_loss.item(),
                        vae_loss.item(),
                        D_tc_loss.item(),
                        user_f1,
                        asser_f1)
                print(log)
                with open(self.args.output_path + "/log.txt", "a") as fout:
                    fout.write("Epoch: {}\n".format(epoch))
                    fout.write(eval_log)
                    fout.write(log + "\n\n")

        self.result_embedding = self.model.encode(features).detach().cpu().numpy()

    def save(self, path=None):
        """Write ``args.json`` and the pickled ``embedding.bin`` to ``path`` (default: ``args.output_path``)."""
        path = self.args.output_path if path is None else path
        # Save result embedding of nodes
        with open(path + "/args.json", 'w') as fout:
            json.dump(vars(self.args), fout)
        with open(path + "/embedding.bin", 'wb') as fout:
            pickle.dump(self.result_embedding, fout)
            print("Embedding and dependencies are saved in {}".format(path))

    def get_scores(self, adj_orig, edges_pos, edges_neg, adj_rec):
        """Return ``(roc_auc, average_precision)`` for held-out positive and negative edges.

        Args:
            adj_orig: kept for interface compatibility; not used in the score.
            edges_pos: iterable of ``(row, col)`` pairs that are true edges.
            edges_neg: iterable of ``(row, col)`` pairs that are non-edges.
            adj_rec: reconstructed adjacency logits; elements must support ``.item()``.
        """
        def sigmoid(x):
            return 1 / (1 + np.exp(-x))

        # Predict on test set of edges; both edge sets are read the same way.
        preds = [sigmoid(adj_rec[e[0], e[1]].item()) for e in edges_pos]
        preds_neg = [sigmoid(adj_rec[e[0], e[1]].item()) for e in edges_neg]

        preds_all = np.hstack([preds, preds_neg])
        labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])
        roc_score = roc_auc_score(labels_all, preds_all)
        ap_score = average_precision_score(labels_all, preds_all)

        return roc_score, ap_score

    def get_acc(self, adj_rec, adj_label):
        """Return the fraction of entries where ``adj_rec`` thresholded at 0.5 equals ``adj_label``."""
        labels_all = adj_label.to_dense().view(-1).long()
        preds_all = (adj_rec > 0.5).view(-1).long()
        accuracy = (preds_all == labels_all).sum().float() / labels_all.size(0)
        return accuracy

### Run All

In [99]:
class arguments():
    """Plain settings container standing in for a parsed command-line namespace.

    Every constructor parameter is stored under the longer attribute name
    that the rest of the notebook reads (e.g. ``lr`` -> ``learning_rate``).
    Attributes are set in declaration order, which is also the key order of
    ``vars(args)``.
    """

    def __init__(self, model, epochs, lr, device, n_p, seed, dataset, asl, directed, data_p, data_jp, stop_p, key_p,
                 kthresh, uthresh, hidden1, hidden2, use_feat, n_u, n_a, lam, gamma, lrD,
                 beta1, beta2, output_p, use_foll, foll_p, use_cuda):
        settings = (
            ("model", model),
            ("epochs", epochs),
            ("learning_rate", lr),
            ("device", device),
            ("num_process", n_p),
            ("seed", seed),
            ("dataset", dataset),
            ("add_self_loop", asl),
            ("directed", directed),
            ("data_path", data_p),
            ("data_json_path", data_jp),
            ("stopword_path", stop_p),
            ("keyword_path", key_p),
            ("kthreshold", kthresh),
            ("uthreshold", uthresh),
            ("hidden1_dim", hidden1),
            ("hidden2_dim", hidden2),
            ("use_feature", use_feat),
            ("num_user", n_u),
            ("num_assertion", n_a),
            ("pos_weight_lambda", lam),
            ("gamma", gamma),
            ("lr_D", lrD),
            ("beta1_D", beta1),
            ("beta2_D", beta2),
            ("output_path", output_p),
            ("use_follow", use_foll),
            ("follow_path", foll_p),
            ("use_cuda", use_cuda),
        )
        for attribute, value in settings:
            setattr(self, attribute, value)

In [100]:
def run_InfoVGAE(data, data_path, stopword_path, seed):
    """Build the dataset, train InfoVGAE once and evaluate the user clustering.

    Args:
        data: dataset name, stored on the settings object.
        data_path: path of the processed CSV handed to ``TwitterDataset``.
        stopword_path: path of the stopword list.
        seed: seed applied to Python's ``random``, NumPy and PyTorch.

    Returns:
        ``(preds, true, user2index)``: the better-scoring user prediction
        and the matching labels from ``Evaluator.numerical_evaluate``
        (labeled users only), plus the dataset's user-name-to-index mapping.
    """
    args = arguments(
        model='InfoVGAE', epochs=300, lr=0.01, device=0, n_p=40, seed=seed, dataset=data,
        asl=True, directed=False, data_p=data_path, data_jp=None, stop_p=stopword_path,
        key_p='N', kthresh=5, uthresh=3, hidden1=32, hidden2=2, use_feat=True,
        n_u=None, n_a=None, lam=1.0, gamma=1e-3, lrD=1e-3, beta1=0.5, beta2=0.9,
        output_p='./output', use_foll=False, foll_p=None, use_cuda=False)
    if torch.cuda.is_available() and args.device != "":
        device = "cuda:{}".format(args.device)
    else:
        device = "cpu"
    args.device = device
    os.makedirs(args.output_path, exist_ok=True)

    # Seed every RNG source the notebook imports so a run is repeatable.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    pandarallel.initialize()

    dataset = TwitterDataset(csv_path=args.data_path,
                             keyword_path=args.keyword_path, stopword_path=args.stopword_path,
                             mode="multiply", kthreshold=args.kthreshold,
                             uthreshold=args.uthreshold, num_process=args.num_process,
                             add_self_loop=args.add_self_loop, directed=args.directed, args=args)
    adj_matrix = dataset.build()
    args.num_user = dataset.num_user
    args.num_assertion = dataset.num_assertion
    # dump label and namelist for evaluation
    dataset.dump_label()

    feature = sp.diags([1.0], shape=(dataset.num_nodes, dataset.num_nodes))
    args.input_dim = dataset.num_nodes
    trainer = InfoVGAETrainer(adj_matrix, feature, args, dataset)
    trainer.train()
    trainer.save()

    print("Running Evaluation ...")
    evaluator = Evaluator()
    # Pass copies, as the trainer does: Evaluator.initialize shifts the
    # assertion labels and must not alter the dataset's own arrays.
    evaluator.init_from_value(trainer.result_embedding, dataset.user_label.copy(), dataset.asser_label.copy(),
                              dataset.name_list.copy(), dataset.asserlist.copy(),
                              output_dir=args.output_path)
    # evaluator.plot(show=False, save=True)
    evaluator.run_clustering()
    # evaluator.plot_clustering(show=False)
    _, _, _, preds, true = evaluator.numerical_evaluate(verbose=False)
    # evaluator.dump_topk_json()
    return preds, true, dataset.feature_builder.user2index

## Run Model

In [101]:
# Dataset to run on; options : 'euro', 'timme', 'cd', 'conref'
dat = 'timme'
# Topic used when dat == 'cd'; options 'all', 'abortion', 'marijuana', 'gayRights', or 'obama'
top = 'all'
# Folder holding the processed '<name>_data.csv' files
path = './Processed/'
# With TIMME: True runs TIMME-All, False runs TIMME-Pure
t_all = False

# English stopwords from InfoVGAE
# Italian stopwords from : https://github.com/stopwords-iso/stopwords-it
# Chosen from the base dataset name, before any suffix is appended below.
stopwords_path = (
    './Datasets/Stopwords/stopwords_en.txt'
    if dat in ['euro', 'timme', 'cd']
    else './Datasets/Stopwords/stopwords_it.txt'
)

# Append the topic (CD) or the '_all' marker (TIMME-All) to the dataset name.
if dat == 'cd':
    dat += top
elif dat == 'timme' and t_all:
    dat += '_all'

full_path = path + dat + '_data.csv'

In [102]:
# One full build/train/evaluate run per seed; predictions and labels are
# collected per run, in seed order, for the summary cell below.
seeds = [0, 7, 21, 42, 69, 118, 130, 287, 574, 1024]
res, truth = [], []

for seed in seeds:
    preds, true, mapping = run_InfoVGAE(dat, full_path, stopwords_path, seed)
    res += [preds]
    truth += [true]

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/
Preprocess and dump dataset...
TwitterDataset Building...
Num users: 430
TwitterDataset Processing Done
Dump Label file success ./output/asserlist.json
Training using InfoVGAETrainer
Positive sample weight: 221.99753119913186
Epoch: 0, loss_recon: 0.74218, loss_kl: 0.00000, loss_tc: -0.00000, loss_VAE: 0.74218, loss_D: 0.69314, user_f1: 0.61483, asser_f1: 0.70949
Epoch: 1, loss_recon: 0.74189, loss_kl: 0.00000, loss_tc: 0.00005, loss_VAE: 0.74194, loss_D: 0.69339, user_f1: 0.57955, asser_f1: 0.70733
Epoch: 2, loss_recon: 0.73305, loss_kl: 0.00000, loss_tc: -0.00001, loss_VAE: 0.73304, loss_D: 0.69273, user_f1: 0.59273, asser_f1: 0.71264
Epoch: 3, loss_recon: 0.74026, loss_kl: 0.00000, loss_tc: -0.00005, loss_VAE: 0.74021, loss_D: 0.69416, user_f1: 0.59528, asser_f1

In [103]:
# Per-run accuracy and weighted F1 over the labeled users, then mean and
# sample standard deviation across the seeded runs.
accs = []
fs = []

for y_true, y_pred in zip(truth, res):
    acc = metrics.accuracy_score(y_true, y_pred)
    _, _, f, _ = metrics.precision_recall_fscore_support(y_true, y_pred, average='weighted')
    accs.append(acc)
    fs.append(f)

acc_m = mean(accs)
f_m = mean(fs)
# Scale to percent *before* rounding: round(x, 4) * 100 can print float
# artefacts such as 56.28999999999999.
if len(accs) > 1:
    # stdev needs at least two runs.
    acc_st = stdev(accs)
    f_st = stdev(fs)
    print("Mean Accuracy :", round(acc_m * 100, 2), "\tSt. dev :", round(acc_st * 100, 2))
    print("Mean Weighted F1 :", round(f_m * 100, 2), "\tSt. dev :", round(f_st * 100, 2))
else:
    print("Accuracy :", round(acc_m * 100, 2))
    print("Weighted F1 :", round(f_m * 100, 2))

Mean Accuracy : 70.49 	St. dev : 1.76
Mean Weighted F1 : 67.69 	St. dev : 2.3
