# Regression Analysis

*Use the "1 - Create Network and Backbone WhatsApps" notebook to download and uncompress the data*

In [1]:
%load_ext autoreload
%autoreload 2
from dateutil import rrule
from datetime import date, datetime, timedelta
import pandas as pd
import pickle as pkl
import random
import matplotlib.pyplot as plt
import fastplot
import seaborn as sns
import scipy
import numpy as np
import statsmodels.api as sm
from sklearn import linear_model
import itertools
from sklearn.model_selection import train_test_split
import sklearn.metrics
import networkx as nx
pd.options.mode.chained_assignment = None 

In [2]:
from collections import defaultdict, Counter

def compute_regression(X, Y): 
    # with sklearn
    regr = linear_model.LinearRegression()
    regr.fit(X, Y)
    # with statsmodels
    X_new = sm.add_constant(X) # adding a constant
    model = sm.OLS(Y, X_new).fit()
    return model, regr



def gini(x):
    mad = np.abs(np.subtract.outer(x, x)).mean()
    rmad = mad/np.mean(x)
    g = 0.5 * rmad
    return g

def extract_variables(df_full, edge_list):
    
    # "Timestamp", "Snapshot_ID", "Group_ID", "User_ID", "Message_ID", "Media_Type"
    user_windows_msg = defaultdict(set)
    user_window = defaultdict(set)
    msg_first = {}
    u2g = defaultdict(set)
    g2u = defaultdict(set)
    teste = 1
    for tup in df_full.itertuples():
        t = [1]
        win = tup[1].split(" ")[0]
        #win = str(tup[0])
        g = tup[2]
        u = tup[3]
        msg = tup[4]

        
        user_windows_msg[u].add(win + " " + msg)
        user_window[u].add(win)
        
        if msg not in msg_first or msg_first[msg][0]>t:
            msg_first[msg] = (t,u)            
        u2g[u].add(g)
        g2u[g].add(u)    
        
    
    # Number of messages they were the first to post (fresh content)
    user_first = Counter()
    for msg in msg_first:
        user_first[msg_first[msg][1]]+=1    
    edge_list["u_first_nb"] = edge_list['u'].apply(lambda e: user_first[e])
    edge_list["v_first_nb"] = edge_list['v'].apply(lambda e: user_first[e]) 
  

    #Gini on message to group distribution - Count 
    df_temp = df_full
    df_temp = df_temp.groupby(['User_ID', 'Group_ID'])["Group_ID"].count().reset_index(name="Count")
    dict_user_groups = {}
    for i, row in df_temp.iterrows():
        if row['User_ID'] not in dict_user_groups:
            dict_user_groups[row['User_ID']] = {}
        dict_user_groups[row['User_ID']][row['Group_ID']] = row['Count']

    dict_user_gini = {}
    for user, dict_dist in dict_user_groups.items():
        dict_user_gini[user] = gini(list(dict_user_groups[user].values()))
        
    edge_list["u_n_gini_count"] = edge_list['u'].map(dict_user_gini)
    edge_list["v_n_gini_count"] = edge_list['v'].map(dict_user_gini)      
        

    
    
    #Number of distinct groups u and v have posted
    df_temp = df_full
    df_temp = df_temp.groupby(['User_ID'])["Group_ID"].nunique().reset_index(name="nunique")
    dict_user2nmessages = dict(zip(df_temp['User_ID'],df_temp['nunique']))
    
    edge_list["u_n_unique_group"] = edge_list['u'].map(dict_user2nmessages)
    edge_list["v_n_unique_group"] = edge_list['v'].map(dict_user2nmessages)
    
    
    #Common number of groups u and v have posted
    df_temp = df_full
    #df_temp = df_temp.groupby(['User_ID','Message_ID'])['Message_ID'].count().reset_index(name="count")
    dict_user2messages = df_temp.groupby('User_ID')['Group_ID'].apply(lambda x: list(np.unique(x)))
    dict_user2messages = dict_user2messages.to_frame().reset_index()
    dict_user2messages = dict(zip(dict_user2messages['User_ID'],dict_user2messages['Group_ID']))
 
    dict_index_value = {}
    edge_list['list_u'] = edge_list['u'].map(dict_user2messages)
    edge_list['list_v'] = edge_list['v'].map(dict_user2messages)
        
    for row in edge_list.loc[edge_list.list_u.isnull(), 'list_u'].index:
        edge_list.at[row, 'list_u'] = []
    for row in edge_list.loc[edge_list.list_v.isnull(), 'list_v'].index:
        edge_list.at[row, 'list_v'] = []

    edge_list['Common_n_group'] = edge_list[['list_u', 'list_v']].apply(lambda r: len(set(r['list_u']) & set(r['list_v'])), axis=1)
    edge_list["Common_n_group"] = edge_list["Common_n_group"].fillna(0)
    edge_list["Common_n_group"] = edge_list["Common_n_group"].fillna(0)
    del edge_list['list_u']
    del edge_list['list_v']
    

    #Number of messages unique shared by u and v
    df_temp = df_full
    df_temp = df_temp.groupby(['User_ID'])["Message_ID"].nunique().reset_index(name="nunique")
    dict_user2nmessages = dict(zip(df_temp['User_ID'],df_temp['nunique']))
    edge_list["u_n_unique_msg"] = edge_list['u'].map(dict_user2nmessages)
    edge_list["v_n_unique_msg"] = edge_list['v'].map(dict_user2nmessages)
    
    
    #Number of messages total shared by u and v
    df_temp = df_full
    df_temp = df_temp.groupby(['User_ID'])["Message_ID"].count().reset_index(name="count")
    dict_user2nmessages = dict(zip(df_temp['User_ID'],df_temp['count']))
    edge_list["u_n_total_msg"] = edge_list['u'].map(dict_user2nmessages)
    edge_list["v_n_total_msg"] = edge_list['v'].map(dict_user2nmessages)

    return edge_list

def sort_edges(edge_list):
    tmp = edge_list.apply(lambda r:(r['u'],r['v'],r['w']) if (r['u'] < r['v']) else (r['v'], r['u'], r['w']), axis=1)       
    edge_list = pd.DataFrame(list(tmp), columns=['u', 'v', 'w'])
    del tmp
    return edge_list

# Read the data complete network

In [3]:
PATH_Data = 'WhatsApp/data/'
PATH_Networks = 'WhatsApp/networks/'

k = 'October'

df_full = pd.read_csv('WhatsApp/data/'+'whatsapp_messages.csv', names=["Timestamp", "Snapshot_ID", "Group_ID", "User_ID", 

                                                                       "Message_ID", "Media_Type", "Misinformation"])
df_full = df_full[df_full['Snapshot_ID'].isin([40,41,42,43])]

df_full['Timestamp'] =  pd.to_datetime(df_full['Timestamp'], errors='coerce')
df_full['Timestamp'] = df_full['Timestamp'].apply(lambda x: x.strftime('%Y-%m-%d %H')if not pd.isnull(x) else '')

edge_list = pd.read_csv(PATH_Networks+k+'.edgelist', 
                                 names=['u', 'v', 'w', 'u_id', 'v_id'], header=None, delimiter=' ')


edge_list = sort_edges(edge_list)
edge_list = extract_variables(df_full[["Timestamp", "Group_ID", "User_ID", "Message_ID", "Media_Type", "Misinformation"]], 
                              edge_list[['u', 'v', 'w']])

X = np.sqrt(edge_list[['u_first_nb', 'v_first_nb', 'u_n_gini_count',
       'v_n_gini_count', 'u_n_unique_group', 'v_n_unique_group',
       'Common_n_group', 'u_n_unique_msg', 'v_n_unique_msg', 'u_n_total_msg',
       'v_n_total_msg']])  

Y = np.sqrt(edge_list['w'])

model, model_2 = compute_regression(X, Y)

edge_list['y'] = Y
edge_list['y_hat'] = model_2.predict(X)
edge_list['res'] = (Y-edge_list['y_hat'])

print("From Sklearn: \n")
print('Intercept: \n', model_2.intercept_)
print('Coefficients: \n', model_2.coef_)
print('R^2: {0}'.format(model_2.score(X, Y)))
RMSE = round(sklearn.metrics.mean_squared_error((edge_list['y'])**2, (edge_list['y_hat'])**2, squared=False),3)
print('RMSE', RMSE)
print('NRMSE', RMSE/np.mean(edge_list['y']))
PATH_Regression = 'WhatsApp/regression/'+k+'-'
pkl.dump(edge_list, open(PATH_Regression+'edge_list_Original.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)
pkl.dump(model, open(PATH_Regression+'stats_Original.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)
pkl.dump(model_2, open(PATH_Regression+'sklearn_Original.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)

From Sklearn: 

Intercept: 
 0.7655192525036638
Coefficients: 
 [-0.17043706 -0.21367018 -0.00809859  0.16898989  0.0072726  -0.10446354
  0.0256984   0.16157639  0.19108931  0.00655042  0.02170815]
R^2: 0.21739345039601154
RMSE 2.399
NRMSE 1.9801804321118794


# Threshold

In [4]:
k = 'October'

for percentile in [0.995, 0.99, 0.95, 0.9, 0.8]:
    print('---------------------', percentile*100, '---------------------')
    PATH_Networks = 'WhatsApp/networks/'
    PATH_Regression = 'WhatsApp/regression/'+k+'-'
    edge_list = pd.read_csv(PATH_Networks+k+'.edgelist', 
                                     names=['u', 'v', 'w', 'u_id', 'v_id'], header=None, delimiter=' ')
    edge_list = sort_edges(edge_list)
    dist_weights = list(edge_list['w'])
    threshold = np.percentile(dist_weights, (percentile)*100)
    edge_list = edge_list[edge_list['w'] > threshold]
    
    df_full = pd.read_csv('WhatsApp/data/'+'whatsapp_messages.csv', names=["Timestamp", "Snapshot_ID", "Group_ID", "User_ID", 
                                                                           "Message_ID", "Media_Type", "Misinformation"])

    df_full = df_full[df_full['Snapshot_ID'].isin([40,41,42,43])]

    df_full['Timestamp'] =  pd.to_datetime(df_full['Timestamp'], errors='coerce')
    df_full['Timestamp'] = df_full['Timestamp'].apply(lambda x: x.strftime('%Y-%m-%d %H')if not pd.isnull(x) else '')
    edge_list = extract_variables(df_full[["Timestamp", "Group_ID", "User_ID", "Message_ID", "Media_Type", "Misinformation"]], edge_list[['u', 'v', 'w']])
    del df_full
    X = np.sqrt(edge_list[['u_first_nb', 'v_first_nb', 'u_n_gini_count',
           'v_n_gini_count', 'u_n_unique_group', 'v_n_unique_group',
           'Common_n_group', 'u_n_unique_msg', 'v_n_unique_msg', 'u_n_total_msg',
           'v_n_total_msg']])

    Y = np.sqrt(edge_list['w'])
    model, model_2 = compute_regression(X, Y)

    edge_list['y'] = Y
    edge_list['y_hat'] = model_2.predict(X)
    edge_list['res'] = (Y-edge_list['y_hat'])

    print("From Sklearn: \n")
    print('Intercept: \n', model_2.intercept_)
    print('Coefficients: \n', model_2.coef_)
    print('R^2: {0}'.format(model_2.score(X, Y)))
    RMSE = round(sklearn.metrics.mean_squared_error((edge_list['y'])**2, (edge_list['y_hat'])**2, squared=False),3)
    print('RMSE', RMSE)
    print('NRMSE', RMSE/np.mean(edge_list['y']))
    PATH_Regression = 'WhatsApp/regression/'+k+'-'+str(percentile)+'-'
    pkl.dump(edge_list, open(PATH_Regression+'edge_list_threshold.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)
    pkl.dump(model, open(PATH_Regression+'stats_threshold.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)
    pkl.dump(model_2, open(PATH_Regression+'sklearn_threshold.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)

--------------------- 99.5 ---------------------
From Sklearn: 

Intercept: 
 3.339794458553567
Coefficients: 
 [-0.13705417 -0.11937215  0.06864148  0.06106281  0.23484356 -0.13940911
  0.03079975  0.21249077  0.11330023 -0.05741293  0.02888879]
R^2: 0.38263520721467703
RMSE 4.181
NRMSE 0.8769231570296459
--------------------- 99.0 ---------------------
From Sklearn: 

Intercept: 
 3.544265241486751
Coefficients: 
 [-0.112705   -0.11723914  0.19414995 -0.04846251 -0.08420458 -0.0029114
  0.0063541   0.12802759  0.09617298 -0.00177959  0.03650933]
R^2: 0.31186327963310834
RMSE 3.693
NRMSE 0.8215644595702493
--------------------- 95.0 ---------------------
From Sklearn: 

Intercept: 
 2.941448532740647
Coefficients: 
 [-0.25045638 -0.3004889   0.27639624  0.2451982  -0.40588008 -0.30101465
 -0.03190937  0.15183221  0.16166222  0.08509942  0.12565884]
R^2: 0.22343987998731252
RMSE 4.686
NRMSE 1.3626698586867532
--------------------- 90.0 ---------------------
From Sklearn: 

Intercept: 


# DF

In [5]:
k = 'October'
backbone = 'DF'

for confidence in [0.999, 0.995, 0.99, 0.95, 0.9]:
    print('alpha', 1-confidence)
    
    PATH_Data = 'WhatsApp/data/'
    type_network = 'DF'
    Path_Networks = 'WhatsApp/backbones/df/'+str(k)+'.edgelist'
    Path_Communities = 'WhatsApp/communities/'+type_network+str(k)+'-'+str(confidence)+'.pkl'
    edge_list = pd.read_csv(Path_Networks, sep=',', names=['u', 'v', 'w','score','var'])
    edge_list = edge_list[edge_list['score'] > confidence]
    edge_list = sort_edges(edge_list)

    df_full = pd.read_csv(PATH_Data+'whatsapp_messages.csv', names=["Timestamp", "Snapshot_ID", "Group_ID", 
                                                                    "User_ID","Message_ID", "Media_Type", "Misinformation"])

    df_full = df_full[df_full['Snapshot_ID'].isin([40,41,42,43])]

    edge_list = extract_variables(df_full[["Timestamp", "Group_ID", "User_ID", "Message_ID", "Media_Type", "Misinformation"]], edge_list[['u', 'v', 'w']])
    del df_full

    X = np.sqrt(edge_list[['u_first_nb', 'v_first_nb', 'u_n_gini_count',
           'v_n_gini_count', 'u_n_unique_group', 'v_n_unique_group',
           'Common_n_group', 'u_n_unique_msg', 'v_n_unique_msg', 'u_n_total_msg',
           'v_n_total_msg']])

    Y = np.sqrt(edge_list['w'])

    model, model_2 = compute_regression(X, Y)

    edge_list['y'] = Y
    edge_list['y_hat'] = model_2.predict(X)
    edge_list['res'] = (Y-edge_list['y_hat'])

    print("From Sklearn: \n")
    print('Intercept: \n', model_2.intercept_)
    print('Coefficients: \n', model_2.coef_)
    print('R^2: {0}'.format(model_2.score(X, Y)))
    RMSE = round(sklearn.metrics.mean_squared_error((edge_list['y'])**2, (edge_list['y_hat'])**2, squared=False),3)
    print('RMSE', RMSE)
    print('NRMSE', RMSE/np.mean(edge_list['y']))
    PATH_Regression = 'WhatsApp/regression/'+k+'-'+str(confidence)+'-'
    pkl.dump(edge_list, open(PATH_Regression+'edge_list_df.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)
    pkl.dump(model, open(PATH_Regression+'stats_df.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)
    pkl.dump(model_2, open(PATH_Regression+'sklearn_df.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)

alpha 0.0010000000000000009
From Sklearn: 

Intercept: 
 3.0981496536930653
Coefficients: 
 [-0.28203103 -0.3224524   0.16855976  0.3203038   0.09934009 -0.44503093
 -0.13280925  0.26228783  0.20393021  0.0187813   0.10968801]
R^2: 0.33472203757331
RMSE 6.04
NRMSE 1.319845483878308
alpha 0.0050000000000000044
From Sklearn: 

Intercept: 
 2.8612421427661836
Coefficients: 
 [-0.31127179 -0.35621129  0.14209009  0.33118934 -0.11175876 -0.28436353
 -0.10614317  0.25960325  0.25680709  0.03792291  0.08836963]
R^2: 0.30445781345315015
RMSE 5.403
NRMSE 1.3074742113023952
alpha 0.010000000000000009
From Sklearn: 

Intercept: 
 2.7981449543557053
Coefficients: 
 [-0.31541262 -0.39971827  0.11718093  0.25455782 -0.15169991 -0.30828171
 -0.09944905  0.25427428  0.26446251  0.0441881   0.11370294]
R^2: 0.31560006175323196
RMSE 5.159
NRMSE 1.3177102966783056
alpha 0.050000000000000044
From Sklearn: 

Intercept: 
 2.280696455097207
Coefficients: 
 [-0.38340385 -0.44353304  0.22032228  0.25246542 -0.

## Polya Urn

In [6]:
k = 'October' 
backbone = 'Polya'

for alpha in [0.001, 0.005, 0.01, 0.05, 0.1]:
    print("Alpha", alpha)
    PATH_Data = 'WhatsApp/data/'
    type_network = 'polya'
    Path_Networks = 'WhatsApp/backbones/polya/'+str(k)+'.edgelist'
    Path_Communities = 'WhatsApp/communities/'+type_network+str(k)+'-'+str(alpha)+'.pkl'
    edge_list = pd.read_csv(Path_Networks, sep=',', names=['u', 'v', 'w', 'p_value'])
    edge_list = edge_list[edge_list['p_value'] < alpha]
    edge_list = sort_edges(edge_list)

    df_full = pd.read_csv(PATH_Data+'whatsapp_messages.csv', names=["Timestamp", "Snapshot_ID", "Group_ID", 
                                                                    "User_ID","Message_ID", "Media_Type", "Misinformation"])
    df_full = df_full[df_full['Snapshot_ID'].isin([40,41,42,43])]


    edge_list = extract_variables(df_full[["Timestamp", "Group_ID", "User_ID", "Message_ID", "Media_Type", "Misinformation"]], 
                                  edge_list[['u', 'v', 'w']])
    del df_full
    X = np.sqrt(edge_list[['u_first_nb', 'v_first_nb', 'u_n_gini_count',
           'v_n_gini_count', 'u_n_unique_group', 'v_n_unique_group',
           'Common_n_group', 'u_n_unique_msg', 'v_n_unique_msg', 'u_n_total_msg',
           'v_n_total_msg']])

    Y = np.sqrt(edge_list['w'])


    model, model_2 = compute_regression(X, Y)

    edge_list['y'] = Y
    edge_list['y_hat'] = model_2.predict(X)
    edge_list['res'] = (Y-edge_list['y_hat'])

    print("From Sklearn: \n")
    print('Intercept: \n', model_2.intercept_)
    print('Coefficients: \n', model_2.coef_)
    print('R^2: {0}'.format(model_2.score(X, Y)))
    RMSE = round(sklearn.metrics.mean_squared_error((edge_list['y'])**2, (edge_list['y_hat'])**2, squared=False),3)
    print('RMSE', RMSE)
    print('NRMSE', RMSE/np.mean(edge_list['y']))
    PATH_Regression = 'WhatsApp/regression/'+k+'-'+str(alpha)+'-'
    pkl.dump(edge_list, open(PATH_Regression+'edge_list_polya.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)
    pkl.dump(model, open(PATH_Regression+'stats_polya.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)
    pkl.dump(model_2, open(PATH_Regression+'sklearn_polya.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)

Alpha 0.001
From Sklearn: 

Intercept: 
 3.4207195399274744
Coefficients: 
 [-0.23148195 -0.26847242  0.1748846   0.25917278 -0.11035613 -0.3888202
 -0.12842782  0.19133267  0.15265519  0.03292685  0.10476087]
R^2: 0.231992398796787
RMSE 4.705
NRMSE 1.1159698704639909
Alpha 0.005
From Sklearn: 

Intercept: 
 3.1008199860469485
Coefficients: 
 [-0.27916311 -0.32914996  0.30213925  0.22871096 -0.29132845 -0.26900803
 -0.09773326  0.20106832  0.19815515  0.06321392  0.11240386]
R^2: 0.2782400051883025
RMSE 4.696
NRMSE 1.204500696947734
Alpha 0.01
From Sklearn: 

Intercept: 
 3.0312916217581076
Coefficients: 
 [-0.29229263 -0.36733658  0.32275128  0.22161364 -0.39077849 -0.2469688
 -0.04857034  0.18741545  0.22418516  0.08487184  0.11910071]
R^2: 0.2941535732895162
RMSE 4.72
NRMSE 1.2639619226453136
Alpha 0.05
From Sklearn: 

Intercept: 
 2.5265990597613195
Coefficients: 
 [-0.34204177 -0.38563717  0.26352859  0.22989494 -0.33545515 -0.24039888
 -0.04399401  0.24045692  0.23976882  0.07730

# HSS

In [7]:
k = 'October' 
backbone = 'HSS'

for percentile in [0.995, 0.99, 0.95, 0.9, 0.8]:
    percentile = percentile*100
    print('---------------------', percentile, '---------------------')
    PATH_Data = 'WhatsApp/data/'
    type_network = 'HSS'
    Path_Networks = 'WhatsApp/backbones/hss/'+str(k)+'.edgelist'
    edge_list = pd.read_csv(Path_Networks, sep=',',  names=['u', 'v', 'w', 'score'])
    value = np.percentile(list(edge_list['score']),percentile)
    edge_list = edge_list[edge_list['score'] > value]
    edge_list = sort_edges(edge_list)

    df_full = pd.read_csv(PATH_Data+'whatsapp_messages.csv', names=["Timestamp", "Snapshot_ID", "Group_ID", 
                                                                "User_ID","Message_ID", "Media_Type", "Misinformation"])

    df_full = df_full[df_full['Snapshot_ID'].isin([40,41,42,43])]


    edge_list = extract_variables(df_full[["Timestamp", "Group_ID", "User_ID", "Message_ID", "Media_Type", "Misinformation"]], 
                                  edge_list[['u', 'v', 'w']])
    del df_full
    
    X = np.sqrt(edge_list[['u_first_nb', 'v_first_nb', 'u_n_gini_count',
       'v_n_gini_count', 'u_n_unique_group', 'v_n_unique_group',
       'Common_n_group', 'u_n_unique_msg', 'v_n_unique_msg', 'u_n_total_msg',
       'v_n_total_msg']])

    Y = np.sqrt(edge_list['w'])


    model, model_2 = compute_regression(X, Y)

    edge_list['y'] = Y
    edge_list['y_hat'] = model_2.predict(X)
    edge_list['res'] = (Y-edge_list['y_hat'])

    print("From Sklearn: \n")
    print('Intercept: \n', model_2.intercept_)
    print('Coefficients: \n', model_2.coef_)
    print('R^2: {0}'.format(model_2.score(X, Y)))
    RMSE = round(sklearn.metrics.mean_squared_error((edge_list['y'])**2, (edge_list['y_hat'])**2, squared=False),3)
    print('RMSE', RMSE)
    print('NRMSE', RMSE/np.mean(edge_list['y']))
    PATH_Regression = 'WhatsApp/regression/'+k+'-'+str(percentile)+'-'
    pkl.dump(edge_list, open(PATH_Regression+'edge_list_hss.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)
    pkl.dump(model, open(PATH_Regression+'stats_hss.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)
    pkl.dump(model_2, open(PATH_Regression+'sklearn_hss.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)

--------------------- 99.5 ---------------------
From Sklearn: 

Intercept: 
 0.8251187365523356
Coefficients: 
 [-0.06145252 -0.08940052  0.19615715  0.12753689  0.00071718  0.00102439
  0.07827933  0.08856515  0.11840221 -0.01279835 -0.00966493]
R^2: 0.36776678594658607
RMSE 1.142
NRMSE 0.8970918327779656
--------------------- 99.0 ---------------------
From Sklearn: 

Intercept: 
 0.9112705364705904
Coefficients: 
 [-0.0908705  -0.12158469  0.24130821 -0.02485327 -0.1312388   0.0093325
  0.08966923  0.07347271  0.15534758  0.02644845 -0.01252932]
R^2: 0.2970414804704392
RMSE 1.867
NRMSE 1.3614866895962991
--------------------- 95.0 ---------------------
From Sklearn: 

Intercept: 
 -0.9204596780422007
Coefficients: 
 [-0.37077603 -0.50254835  0.15694942 -0.00848594  0.29301019  0.32306233
  0.38252477  0.41332994  0.57253629 -0.03858256 -0.05416981]
R^2: 0.4404995269556562
RMSE 4.157
NRMSE 2.717633828968926
--------------------- 90.0 ---------------------
From Sklearn: 

Intercept: 

# Recast

In [8]:
PATH_Networks = 'WhatsApp/networks/'
PATH_Backbones = 'WhatsApp/backbones/recast/' 
k ='October'
for alpha in [0.001, 0.005,0.01, 0.05, 0.1]:
    print('--------', alpha, '--------')
    edge_list = pd.read_csv('WhatsApp/backbones/recast/October-'+str(alpha)+'.edgelist', delimiter=' ')
    edge_list = edge_list[['u','v', 'w']]
    edge_list = sort_edges(edge_list)
    PATH_Data = 'WhatsApp/data/'

    df_full = pd.read_csv(PATH_Data+'whatsapp_messages.csv', names=["Timestamp", "Snapshot_ID", "Group_ID", 
                                                                    "User_ID","Message_ID", "Media_Type", "Misinformation"])



    df_full = df_full[df_full['Snapshot_ID'].isin([40,41,42,43])]
    edge_list = extract_variables(df_full[["Timestamp", "Group_ID", "User_ID", "Message_ID", "Media_Type", "Misinformation"]], edge_list[['u', 'v', 'w']])
    del df_full
    
    X = np.sqrt(edge_list[['u_first_nb', 'v_first_nb', 'u_n_gini_count',
       'v_n_gini_count', 'u_n_unique_group', 'v_n_unique_group',
       'Common_n_group', 'u_n_unique_msg', 'v_n_unique_msg', 'u_n_total_msg',
       'v_n_total_msg']])

    Y = np.sqrt(edge_list['w'])


    model, model_2 = compute_regression(X, Y)

    edge_list['y'] = Y
    edge_list['y_hat'] = model_2.predict(X)
    edge_list['res'] = (Y-edge_list['y_hat'])

    print("From Sklearn: \n")
    print('Intercept: \n', model_2.intercept_)
    print('Coefficients: \n', model_2.coef_)
    print('R^2: {0}'.format(model_2.score(X, Y)))
    RMSE = round(sklearn.metrics.mean_squared_error((edge_list['y'])**2, (edge_list['y_hat'])**2, squared=False),3)
    print('RMSE', RMSE)
    print('NRMSE', RMSE/np.mean(edge_list['y']))
    
    PATH_Regression = 'WhatsApp/regression/'+k+'-'+str(alpha)+'-'
    pkl.dump(edge_list, open(PATH_Regression+'edge_list_recast.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)
    pkl.dump(model, open(PATH_Regression+'stats_recast.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)
    pkl.dump(model_2, open(PATH_Regression+'sklearn_recast.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)

-------- 0.001 --------
From Sklearn: 

Intercept: 
 2.551735137800221
Coefficients: 
 [-0.56071445 -0.53260566  0.53741481  1.00036152 -1.2176865  -0.34825718
  0.56928499  0.3943077   0.42544481  0.13951629  0.09462757]
R^2: 0.34435207873276763
RMSE 8.778
NRMSE 2.0695175371637005
-------- 0.005 --------
From Sklearn: 

Intercept: 
 2.551735137800221
Coefficients: 
 [-0.56071445 -0.53260566  0.53741481  1.00036152 -1.2176865  -0.34825718
  0.56928499  0.3943077   0.42544481  0.13951629  0.09462757]
R^2: 0.34435207873276763
RMSE 8.778
NRMSE 2.0695175371637005
-------- 0.01 --------
From Sklearn: 

Intercept: 
 2.551735137800221
Coefficients: 
 [-0.56071445 -0.53260566  0.53741481  1.00036152 -1.2176865  -0.34825718
  0.56928499  0.3943077   0.42544481  0.13951629  0.09462757]
R^2: 0.34435207873276763
RMSE 8.778
NRMSE 2.0695175371637005
-------- 0.05 --------
From Sklearn: 

Intercept: 
 2.8088672623436657
Coefficients: 
 [-0.38298833 -0.47365429  0.06143915  0.19812889 -0.56995411 -0.3

# Analysis of the 5 models
- Complete Network
- DF
- Polya
- HSS
- Threshold
- Recast

In [9]:
k = 'October'
#Paramters defined as explained at the paper
threshold = 0.950
df = 0.95 
polya = 0.05
hss = 95.0
recast = 0.05

dict_models = {'Original':'',  'threshold':threshold, 'df':df, 'hss':hss, 'polya':polya, 'recast':recast}

In [10]:
dict_models_user_data = dict()
for model, parameter in dict_models.items():
    print(model)
    if model == "Original":
        PATH_Regression = 'WhatsApp/regression/'+k+'-'+str(parameter)
    else:
        PATH_Regression = 'WhatsApp/regression/'+k+'-'+str(parameter)+'-'
    dict_models_user_data[model] = pkl.load(open(PATH_Regression+'edge_list_'+model+'.pickle', 'rb'))
    dict_models_user_data[model] = dict_models_user_data[model].set_index(['u','v'])

df_full = pd.read_csv('WhatsApp/data/'+'whatsapp_messages.csv', names=["Timestamp", "Snapshot_ID", "Group_ID", "User_ID", 
                                                                       "Message_ID", "Media_Type", "Misinformation"])

df_full = df_full[df_full['Snapshot_ID'].isin([40,41,42,43])]

Original
threshold
df
hss
polya
recast


In [11]:
idx = list(set(dict_models_user_data['threshold'].index) & 
           set(dict_models_user_data['polya'].index) & 
           set(dict_models_user_data['df'].index) & 
                set(dict_models_user_data['hss'].index) &
                set(dict_models_user_data['recast'].index))
#del dict_models['hss']

In [12]:
PATH_Regression = 'WhatsApp/regression/'+k+'-'
idx_train, idx_test = train_test_split(idx, test_size=0.2, random_state=2021)
pkl.dump(idx, open(PATH_Regression+'set_common_edge.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)
pkl.dump(idx_train, open(PATH_Regression+'set_common_edge_train.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)
pkl.dump(idx_test, open(PATH_Regression+'set_common_edge_test.pickle', 'wb'), protocol=pkl.HIGHEST_PROTOCOL)

In [13]:
dict_results = {}
dict_misinformation = {}
dict_msg = {}
list_features = ['u_first_nb', 'v_first_nb', 'u_n_gini_count',
       'v_n_gini_count', 'u_n_unique_group', 'v_n_unique_group',
       'Common_n_group', 'u_n_unique_msg', 'v_n_unique_msg', 'u_n_total_msg',
       'v_n_total_msg']


for model, parameter in dict_models.items():
    if model == "Original":
        PATH_Regression = 'WhatsApp/regression/'+k+'-'+str(parameter)
    else:
        PATH_Regression = 'WhatsApp/regression/'+k+'-'+str(parameter)+'-'

    print("------------------")
    print("Model", model)
    dict_results[model] = []
    edge_list = pkl.load(open(PATH_Regression+'edge_list_'+model+'.pickle', 'rb'))
    X = np.sqrt(edge_list[list_features]) 
    Y = np.sqrt(edge_list['w'])
    model_1, model_2 = compute_regression(X, Y)
    edge_list['y'] = Y
    edge_list['y_hat'] = model_2.predict(X) 
    edge_list['res'] = (Y-edge_list['y_hat'])
    print('R^2 Complete:', model_2.score(X, Y))
    dict_results[model].append(("R^2 Train", model_2.score(X, Y)))
    RMSE = round(sklearn.metrics.mean_squared_error(edge_list['y']**2, edge_list['y_hat']**2, squared=False),3)
    print("RMSE Complete:", RMSE)
    print("NRMSE Complete:", RMSE/np.mean(edge_list['w']))
    
    def count_msg_mis(df_full, list_nodes):
        df_temp = df_full[df_full['User_ID'].isin(list_nodes)]
        n_miss = len(df_temp[df_temp['Misinformation'] == 1])
        n_msg = len(df_temp)
        return n_miss, n_msg
    
    list_nodes = set(edge_list['u']).union(edge_list['v'])

    #Train and test 
    idx_test = pkl.load(open('WhatsApp/regression/'+k+'-'+'set_common_edge_test.pickle', 'rb'))
    idx_test = pd.MultiIndex.from_tuples(idx_test, names=['u', 'v'])
    edge_list = edge_list.set_index(['u','v'])
    edge_list_test = edge_list.loc[idx_test].reset_index(drop=False)
    edge_list = edge_list.drop(idx_test)


    X = np.sqrt(edge_list[list_features]) 
    Y = np.sqrt(edge_list['w'])
    model_1, model_2 = compute_regression(X, Y)
    edge_list['y'] = Y
    edge_list['y_hat'] = model_2.predict(X) 
    edge_list['res'] = (Y-edge_list['y_hat'])
    print('R^2 Train:', model_2.score(X, Y))
    dict_results[model].append(("R^2 Train", model_2.score(X, Y)))
    RMSE = round(sklearn.metrics.mean_squared_error(edge_list['y']**2, edge_list['y_hat']**2, squared=False),3)
    print("RMSE Train:", RMSE)
    print("NRMSE Train:", RMSE/np.mean(edge_list['w']))

    X = np.sqrt(edge_list_test[list_features])
    Y = np.sqrt(edge_list_test['w'])
    edge_list_test['y'] = Y
    edge_list_test['y_hat'] = model_2.predict(X)
    edge_list_test['res'] = (Y-edge_list_test['y_hat'])
    RMSE = round(sklearn.metrics.mean_squared_error(edge_list_test['y']**2, edge_list_test['y_hat']**2, squared=False),3)
    print("RMSE Test:", RMSE)
    print("NRMSE Test:", RMSE/np.mean(edge_list_test['w']))

------------------
Model Original
R^2 Complete: 0.21739345039601154
RMSE Complete: 2.399
NRMSE Complete: 1.3428153838360815
R^2 Train: 0.2147627076428824
RMSE Train: 2.367
NRMSE Train: 1.3313998304152534
RMSE Test: 19.311
NRMSE Test: 0.9261210826210826
------------------
Model threshold
R^2 Complete: 0.22343987998731252
RMSE Complete: 4.686
NRMSE Complete: 0.3806435615618184
R^2 Train: 0.21561548741643843
RMSE Train: 4.591
NRMSE Train: 0.37572869722756924
RMSE Test: 10.741
NRMSE Test: 0.5151191832858499
------------------
Model df
R^2 Complete: 0.35005215672583756
RMSE Complete: 4.795
NRMSE Complete: 0.42486640661791497
R^2 Train: 0.34717491988702487
RMSE Train: 4.701
NRMSE Train: 0.4201813794289957
RMSE Test: 11.137
NRMSE Test: 0.534110636277303
------------------
Model hss
R^2 Complete: 0.4404995269556562
RMSE Complete: 4.157
NRMSE Complete: 1.2367470386659831
R^2 Train: 0.41909334421220956
RMSE Train: 4.012
NRMSE Train: 1.2539293377199254
RMSE Test: 14.928
NRMSE Test: 0.715920227920