In [1]:
%load_ext autoreload
%autoreload 2
from dateutil import rrule
from datetime import date, datetime, timedelta
import pandas as pd
import math
import pickle as pkl
import random
import matplotlib.pyplot as plt
import fastplot
import seaborn as sns
import scipy
import numpy as np
import instagram_utils as utils
import statsmodels.api as sm
from sklearn import linear_model
import json
import itertools
from sklearn.model_selection import train_test_split
import sklearn.metrics

In [2]:
def compute_regression(X, Y):
    """Fit the same linear regression of ``Y`` on ``X`` with two libraries.

    Parameters
    ----------
    X : feature matrix passed unchanged to scikit-learn; statsmodels receives
        the same matrix with a constant (intercept) column added.
    Y : target values.

    Returns
    -------
    tuple
        ``(model, regr)`` where ``model`` is the fitted statsmodels OLS result
        and ``regr`` is the fitted scikit-learn ``LinearRegression``.
    """
    # scikit-learn estimator (``fit`` returns the estimator itself).
    sk_regressor = linear_model.LinearRegression().fit(X, Y)

    # statsmodels does not add an intercept on its own, so add it explicitly.
    design_matrix = sm.add_constant(X)
    ols_results = sm.OLS(Y, design_matrix).fit()

    return ols_results, sk_regressor

def extract_variables(dict_data, edge_list):
    """Attach per-user activity counts to both endpoints of every edge.

    Parameters
    ----------
    dict_data : dict
        Nested mapping ``influencer -> post -> iterable of users``.
    edge_list : pandas.DataFrame
        Must contain the columns ``u_n`` and ``v_n`` holding the user found
        at each endpoint. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``edge_list`` object with four added columns:
        ``u_n_comments`` / ``v_n_comments`` (number of distinct post keys the
        user appears under) and ``u_n_influencers`` / ``v_n_influencers``
        (number of distinct influencers the user appears under). Users that
        are absent from ``dict_data`` get NaN.
    """
    posts_by_user = {}
    influencers_by_user = {}

    # Collect, for every user, the distinct posts and influencers seen.
    for influencer, dict_post2users in dict_data.items():
        for post, list_user in dict_post2users.items():
            for user in list_user:
                posts_by_user.setdefault(user, set()).add(post)
                influencers_by_user.setdefault(user, set()).add(influencer)

    user_n_comments = {user: len(posts) for user, posts in posts_by_user.items()}
    user_n_influencers = {user: len(infl) for user, infl in influencers_by_user.items()}

    # (new column, endpoint column, lookup table) -- order fixes column order.
    column_plan = (
        ("u_n_comments", 'u_n', user_n_comments),
        ("v_n_comments", 'v_n', user_n_comments),
        ("u_n_influencers", 'u_n', user_n_influencers),
        ("v_n_influencers", 'v_n', user_n_influencers),
    )
    for target_column, endpoint_column, counts in column_plan:
        edge_list[target_column] = edge_list[endpoint_column].map(counts)

    return edge_list

def sort_edges(edge_list):
    """Return the edges with endpoints ordered so that ``u <= v`` on each row.

    Parameters
    ----------
    edge_list : pandas.DataFrame
        Must contain the columns ``u``, ``v`` and ``w``. Not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly the columns ``['u', 'v', 'w']`` and a fresh
        0..n-1 index. On every row ``u`` and ``v`` are swapped when ``u < v``
        does not hold; ``w`` is carried over unchanged.

    Notes
    -----
    The swap is vectorised instead of using a row-wise ``apply``: this avoids
    a Python-level call per edge, keeps the original dtype of the node ids
    (row-wise ``apply`` upcasts integer ids to float when ``w`` is float), and
    also works on an empty edge list.
    """
    u = edge_list['u'].to_numpy()
    v = edge_list['v'].to_numpy()
    already_ordered = u < v
    return pd.DataFrame(
        {
            'u': np.where(already_ordered, u, v),
            'v': np.where(already_ordered, v, u),
            'w': edge_list['w'].to_numpy(),
        },
        columns=['u', 'v', 'w'],
    )

# Compute the regression model for each network (Original and Backbones)

**Original Network**

In [42]:
# Load the complete (original) network and attach the per-user covariates
# used by the regression to every edge.
PATH_Networks = 'Instagram-BR/networks/network.edgelist'

# The context manager closes the file on exit; no explicit close() is needed.
with open('Instagram-BR/data/instagram_data.json') as fp:
    dict_influencer2media2commenter = json.load(fp)

# Invert the project's node -> id mapping so the ids stored in the edge list
# can be translated back (presumably to commenter names -- confirm in utils).
node2id = utils.get_node2id(dict_influencer2media2commenter)
node2id_reverse =  {v: k for k, v in node2id.items()}

# Space-separated file without header: one "u v w" triple per line.
edge_list = pd.read_csv(PATH_Networks, names=['u', 'v', 'w'], header=None, delimiter=' ')
edge_list['u_n'] = edge_list['u'].map(node2id_reverse)
edge_list['v_n'] = edge_list['v'].map(node2id_reverse)
edge_list = extract_variables(dict_influencer2media2commenter, edge_list)
print(len(edge_list))

189813505


In [None]:
# Optional shortcut: reload the previously saved original-network edge list
# instead of rebuilding it from the raw files.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
PATH_Regression = 'Instagram-BR/regression/'
with open(PATH_Regression+'edge_list_original.pickle', 'rb') as fp:
    edge_list = pkl.load(fp)

In [43]:
# Log-log regression on the original network: log edge weight explained by
# the log of the comment / influencer counts of both endpoints.
X = np.log(edge_list[['u_n_comments','v_n_comments', 'u_n_influencers', 'v_n_influencers']]) 
Y = np.log(edge_list['w'])
model, model_2 = compute_regression(X, Y)

# Keep target, prediction and residual (all in log space) next to each edge.
edge_list['y'] = Y
edge_list['y_hat'] = model_2.predict(X)
edge_list['res'] = (Y-edge_list['y_hat'])

print("From Sklearn: \n")
print('Intercept: \n', model_2.intercept_)
print('Coefficients: \n', model_2.coef_)
print('R^2: {0}'.format(model_2.score(X, Y)))
# RMSE on the original weight scale (exp undoes the log). np.sqrt(MSE) is
# used instead of `squared=False`, which was deprecated in scikit-learn 1.4
# and removed in 1.6.
print('RMSE', round(np.sqrt(sklearn.metrics.mean_squared_error(np.exp(edge_list['y']), np.exp(edge_list['y_hat']))),3))

From Sklearn: 

Intercept: 
 -0.664643679923767
Coefficients: 
 [ 0.32319041  0.3225865  -0.21036166 -0.21783238]
R^2: 0.2595505264008887
RMSE 0.702


In [44]:
# Persist the annotated edge list and both fitted models for the original
# network. Each file is opened in a `with` block so it is flushed and closed.
PATH_Regression = 'Instagram-BR/regression/'
with open(PATH_Regression+'edge_list_original.pickle', 'wb') as fp:
    pkl.dump(edge_list, fp, protocol=pkl.HIGHEST_PROTOCOL)
with open(PATH_Regression+'stats_original.pickle', 'wb') as fp:
    pkl.dump(model, fp, protocol=pkl.HIGHEST_PROTOCOL)
with open(PATH_Regression+'sklearn_original.pickle', 'wb') as fp:
    pkl.dump(model_2, fp, protocol=pkl.HIGHEST_PROTOCOL)

# TriBE

In [None]:
# TriBE backbones: one edge-list file per threshold. For each one, attach the
# covariates, fit the log-log regression, report the fit and save everything.
with open('Instagram-BR/data/instagram_data.json') as fp:
    dict_influencer2media2commenter = json.load(fp)

node2id = utils.get_node2id(dict_influencer2media2commenter)
node2id_reverse =  {v: k for k, v in node2id.items()}

for parameter in ['0.9', '0.95', '0.99', '0.995', '0.999', '0.9995', '0.9999']:
    PATH_backbone = 'Instagram-BR/backbones/tribe/'+str(parameter)+'.edgelist'
    print(parameter)
    edge_list = pd.read_csv(PATH_backbone, 
                                     names=['u', 'v', 'w'], header=None, delimiter=' ')

    edge_list['u_n'] = edge_list['u'].map(node2id_reverse)
    edge_list['v_n'] = edge_list['v'].map(node2id_reverse)
    edge_list = extract_variables(dict_influencer2media2commenter, edge_list)
    X = np.log(edge_list[['u_n_comments','v_n_comments', 'u_n_influencers', 'v_n_influencers']])
    Y = np.log(edge_list['w'])
    model, model_2 = compute_regression(X, Y)

    edge_list['y'] = Y
    edge_list['y_hat'] = model_2.predict(X)
    edge_list['res'] = (Y-edge_list['y_hat']) 

    print("From Sklearn: \n")
    print('Intercept: \n', model_2.intercept_)
    print('Coefficients: \n', model_2.coef_)
    print('R^2: {0}'.format(model_2.score(X, Y)))
    # RMSE on the original weight scale. np.sqrt(MSE) replaces
    # `squared=False` (deprecated in scikit-learn 1.4, removed in 1.6).
    RMSE = round(np.sqrt(sklearn.metrics.mean_squared_error(np.exp(edge_list['y']), np.exp(edge_list['y_hat']))),3)
    print('RMSE', RMSE) 
    # RMSE normalised by the mean edge weight of this backbone.
    print("NRMSE:", RMSE/np.mean(edge_list['w']))

    # Output files are prefixed with the threshold, e.g. "0.95-...".
    PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
    with open(PATH_Regression+'edge_list_tribe.pickle', 'wb') as fp:
        pkl.dump(edge_list, fp, protocol=pkl.HIGHEST_PROTOCOL)
    with open(PATH_Regression+'stats_tribe.pickle', 'wb') as fp:
        pkl.dump(model, fp, protocol=pkl.HIGHEST_PROTOCOL)
    with open(PATH_Regression+'sklearn_tribe.pickle', 'wb') as fp:
        pkl.dump(model_2, fp, protocol=pkl.HIGHEST_PROTOCOL)

In [None]:
# Reload the saved TriBE edge list for a single threshold.
# The TriBE cells write '<parameter>-edge_list_tribe.pickle'; no cell in this
# notebook writes an 'edge_list_insta_model' file, so load the tribe file.
parameter = 0.95
PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
with open(PATH_Regression+'edge_list_tribe.pickle', 'rb') as fp:
    edge_list = pkl.load(fp)

In [None]:
# Refit the log-log regression on the currently loaded TriBE edge list.
X = np.log(edge_list[['u_n_comments','v_n_comments', 'u_n_influencers', 'v_n_influencers']])
Y = np.log(edge_list['w'])
model, model_2 = compute_regression(X, Y)

# Target, prediction and residual are stored in log space.
edge_list['y'] = Y
edge_list['y_hat'] = model_2.predict(X)
edge_list['res'] = (Y-edge_list['y_hat'])

print("From Sklearn: \n")
print('Intercept: \n', model_2.intercept_)
print('Coefficients: \n', model_2.coef_)
print('R^2: {0}'.format(model_2.score(X, Y)))
# RMSE on the original weight scale. np.sqrt(MSE) replaces `squared=False`
# (deprecated in scikit-learn 1.4, removed in 1.6).
RMSE = round(np.sqrt(sklearn.metrics.mean_squared_error(np.exp(edge_list['y']), np.exp(edge_list['y_hat']))),3)
print('RMSE', RMSE) 
# RMSE normalised by the mean edge weight.
print("NRMSE:", RMSE/np.mean(edge_list['w']))

In [None]:
# Save the TriBE edge list and both fitted models for the current
# `parameter`. `with` guarantees every file is flushed and closed.
PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
with open(PATH_Regression+'edge_list_tribe.pickle', 'wb') as fp:
    pkl.dump(edge_list, fp, protocol=pkl.HIGHEST_PROTOCOL)
with open(PATH_Regression+'stats_tribe.pickle', 'wb') as fp:
    pkl.dump(model, fp, protocol=pkl.HIGHEST_PROTOCOL)
with open(PATH_Regression+'sklearn_tribe.pickle', 'wb') as fp:
    pkl.dump(model_2, fp, protocol=pkl.HIGHEST_PROTOCOL)

# Noise Corrected

In [None]:
# Noise Corrected backbones: a single file holds every edge with a
# confidence value; each threshold keeps the edges whose confidence exceeds
# 1 - threshold, then the regression is fitted, reported and saved.
PATH_backbone = 'Instagram-BR/backbones/nc/all_p_values.edgelist'

with open('Instagram-BR/data/instagram_data.json') as fp:
    dict_influencer2media2commenter = json.load(fp)

node2id = utils.get_node2id(dict_influencer2media2commenter)
node2id_reverse =  {v: k for k, v in node2id.items()}

# The input file does not depend on the threshold, so parse it only once.
nc_all_edges = pd.read_csv(PATH_backbone, 
                                 names=['u', 'v', 'w', 'confidence'], header=None, delimiter=',')

for parameter in [0.10, 
                  0.05, 0.01, 
                  0.005, 0.001,
                  0.0005, 0.0001,
                  0.00005, 0.00001, 
                  0.000005, 0.000001,
                  0.0000005, 0.0000001,
                  0.00000005, 0.00000001,
                  0.000000005, 0.000000001,
                  0.0000000005, 0.0000000001,
                  0.00000000005, 0.00000000001,
                  0.000000000005, 0.000000000001]:
    
    print(parameter)
    # From here on `parameter` is the confidence cut-off (1 - threshold);
    # its str() is also the prefix of the output file names.
    parameter = 1-parameter

    #edge_list = sort_edges(edge_list)

    # .copy() gives an independent frame, so adding columns below does not
    # write into a slice of the full table (SettingWithCopyWarning).
    edge_list = nc_all_edges.loc[nc_all_edges['confidence'] > parameter, ['u', 'v', 'w']].copy()
    edge_list['u_n'] = edge_list['u'].map(node2id_reverse)
    edge_list['v_n'] = edge_list['v'].map(node2id_reverse)
    edge_list = extract_variables(dict_influencer2media2commenter, edge_list)
    
    X = np.log(edge_list[['u_n_comments','v_n_comments', 'u_n_influencers', 'v_n_influencers']]) 
    Y = np.log(edge_list['w'])
    model, model_2 = compute_regression(X, Y)

    edge_list['y'] = Y
    edge_list['y_hat'] = model_2.predict(X)
    edge_list['res'] = (Y-edge_list['y_hat'])

    print("From Sklearn: \n")
    print('Intercept: \n', model_2.intercept_)
    print('Coefficients: \n', model_2.coef_)
    print('R^2: {0}'.format(model_2.score(X, Y)))
    # RMSE on the original weight scale. np.sqrt(MSE) replaces
    # `squared=False` (deprecated in scikit-learn 1.4, removed in 1.6).
    RMSE = round(np.sqrt(sklearn.metrics.mean_squared_error(np.exp(edge_list['y']), np.exp(edge_list['y_hat']))),3)
    print('RMSE', RMSE) 
    print("NRMSE:", RMSE/np.mean(edge_list['w']))

    
    PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
    with open(PATH_Regression+'edge_list_nc.pickle', 'wb') as fp:
        pkl.dump(edge_list, fp, protocol=pkl.HIGHEST_PROTOCOL)
    with open(PATH_Regression+'stats_nc.pickle', 'wb') as fp:
        pkl.dump(model, fp, protocol=pkl.HIGHEST_PROTOCOL)
    with open(PATH_Regression+'sklearn_nc.pickle', 'wb') as fp:
        pkl.dump(model_2, fp, protocol=pkl.HIGHEST_PROTOCOL)

In [None]:
# Reload the saved Noise Corrected edge list for a single confidence cut-off.
# The files are written as '<parameter>-edge_list_nc.pickle', so the prefix
# must end with '-' to match.
parameter = 0.9999995
PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
with open(PATH_Regression+'edge_list_nc.pickle', 'rb') as fp:
    edge_list = pkl.load(fp)

In [None]:
# Refit the log-log regression on the currently loaded Noise Corrected
# edge list.
X = np.log(edge_list[['u_n_comments','v_n_comments', 'u_n_influencers', 'v_n_influencers']]) 
Y = np.log(edge_list['w'])
model, model_2 = compute_regression(X, Y)

# Target, prediction and residual are stored in log space.
edge_list['y'] = Y
edge_list['y_hat'] = model_2.predict(X)
edge_list['res'] = (Y-edge_list['y_hat'])

print("From Sklearn: \n")
print('Intercept: \n', model_2.intercept_)
print('Coefficients: \n', model_2.coef_)
print('R^2: {0}'.format(model_2.score(X, Y)))
# RMSE on the original weight scale. np.sqrt(MSE) replaces `squared=False`
# (deprecated in scikit-learn 1.4, removed in 1.6).
RMSE = round(np.sqrt(sklearn.metrics.mean_squared_error(np.exp(edge_list['y']), np.exp(edge_list['y_hat']))),3)
print('RMSE', RMSE) 
# RMSE normalised by the mean edge weight.
print("NRMSE:", RMSE/np.mean(edge_list['w']))

# print_model = model.summary()
# print(print_model)

In [None]:
# Save the Noise Corrected edge list and both fitted models for the current
# `parameter`. `with` guarantees every file is flushed and closed.
PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
with open(PATH_Regression+'edge_list_nc.pickle', 'wb') as fp:
    pkl.dump(edge_list, fp, protocol=pkl.HIGHEST_PROTOCOL)
with open(PATH_Regression+'stats_nc.pickle', 'wb') as fp:
    pkl.dump(model, fp, protocol=pkl.HIGHEST_PROTOCOL)
with open(PATH_Regression+'sklearn_nc.pickle', 'wb') as fp:
    pkl.dump(model_2, fp, protocol=pkl.HIGHEST_PROTOCOL)

# MLF

In [None]:
# MLF backbones: a single file holds every edge with a significance score;
# each threshold keeps the edges whose converted score is below it, then the
# regression is fitted, reported and saved.
PATH_backbone = 'Instagram-BR/backbones/mlf/all_p_values.edgelist'

with open('Instagram-BR/data/instagram_data.json') as fp:
    dict_influencer2media2commenter = json.load(fp)

node2id = utils.get_node2id(dict_influencer2media2commenter)
node2id_reverse =  {v: k for k, v in node2id.items()}

# The input file and the score conversion do not depend on the threshold,
# so do both once instead of on every iteration.
mlf_all_edges = pd.read_csv(PATH_backbone,
                                 names=['u', 'v', 'w','significance'], header=None, delimiter=',')
# Stored score s is turned into e**(-s) before thresholding
# (presumably s = -ln(p) -- confirm against the MLF export).
mlf_all_edges['significance'] = math.e**(-mlf_all_edges['significance'])

for parameter in [0.10, 0.05, 0.01, 0.005, 0.001]:
    # Label the output of each iteration, as the other backbone loops do.
    print(parameter)
    edge_list = mlf_all_edges[mlf_all_edges['significance'] < parameter]

    # Returns a new ['u', 'v', 'w'] frame with the smaller id first.
    edge_list = sort_edges(edge_list)

    edge_list['u_n'] = edge_list['u'].map(node2id_reverse)
    edge_list['v_n'] = edge_list['v'].map(node2id_reverse)
    edge_list = extract_variables(dict_influencer2media2commenter, edge_list)
    X = np.log(edge_list[['u_n_comments','v_n_comments', 'u_n_influencers', 'v_n_influencers']]) 
    Y = np.log(edge_list['w'])
    model, model_2 = compute_regression(X, Y)

    edge_list['y'] = Y
    edge_list['y_hat'] = model_2.predict(X)
    edge_list['res'] = (Y-edge_list['y_hat'])

    print("From Sklearn: \n")
    print('Intercept: \n', model_2.intercept_)
    print('Coefficients: \n', model_2.coef_)
    print('R^2: {0}'.format(model_2.score(X, Y)))
    # RMSE on the original weight scale. np.sqrt(MSE) replaces
    # `squared=False` (deprecated in scikit-learn 1.4, removed in 1.6).
    RMSE = round(np.sqrt(sklearn.metrics.mean_squared_error(np.exp(edge_list['y']), np.exp(edge_list['y_hat']))),3)
    print('RMSE', RMSE) 
    print("NRMSE:", RMSE/np.mean(edge_list['w']))
    
    PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
    with open(PATH_Regression+'edge_list_mlf.pickle', 'wb') as fp:
        pkl.dump(edge_list, fp, protocol=pkl.HIGHEST_PROTOCOL)
    with open(PATH_Regression+'stats_mlf.pickle', 'wb') as fp:
        pkl.dump(model, fp, protocol=pkl.HIGHEST_PROTOCOL)
    with open(PATH_Regression+'sklearn_mlf.pickle', 'wb') as fp:
        pkl.dump(model_2, fp, protocol=pkl.HIGHEST_PROTOCOL)

In [None]:
# Reload the saved MLF edge list for a single threshold.
# `parameter` is a float, so it must go through str() before concatenation.
parameter = 0.0005
PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
with open(PATH_Regression+'edge_list_mlf.pickle', 'rb') as fp:
    edge_list = pkl.load(fp)

In [None]:
# Refit the log-log regression on the currently loaded MLF edge list.
X = np.log(edge_list[['u_n_comments','v_n_comments', 'u_n_influencers', 'v_n_influencers']]) 
Y = np.log(edge_list['w'])
model, model_2 = compute_regression(X, Y)

# Target, prediction and residual are stored in log space.
edge_list['y'] = Y
edge_list['y_hat'] = model_2.predict(X)
edge_list['res'] = (Y-edge_list['y_hat'])

print("From Sklearn: \n")
print('Intercept: \n', model_2.intercept_)
print('Coefficients: \n', model_2.coef_)
print('R^2: {0}'.format(model_2.score(X, Y)))
# RMSE on the original weight scale. np.sqrt(MSE) replaces `squared=False`
# (deprecated in scikit-learn 1.4, removed in 1.6).
print('RMSE', round(np.sqrt(sklearn.metrics.mean_squared_error(np.exp(edge_list['y']), np.exp(edge_list['y_hat']))),3))

In [None]:
# Save the MLF edge list and both fitted models for the current `parameter`.
# `with` guarantees every file is flushed and closed.
PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
with open(PATH_Regression+'edge_list_mlf.pickle', 'wb') as fp:
    pkl.dump(edge_list, fp, protocol=pkl.HIGHEST_PROTOCOL)
with open(PATH_Regression+'stats_mlf.pickle', 'wb') as fp:
    pkl.dump(model, fp, protocol=pkl.HIGHEST_PROTOCOL)
with open(PATH_Regression+'sklearn_mlf.pickle', 'wb') as fp:
    pkl.dump(model_2, fp, protocol=pkl.HIGHEST_PROTOCOL)

# GloSS

In [None]:
# GloSS backbones: a single file holds every edge with a p-value; each
# threshold keeps the edges with p below it, then the regression is fitted,
# reported and saved.
with open('Instagram-BR/data/instagram_data.json') as fp:
    dict_influencer2media2commenter = json.load(fp)

node2id = utils.get_node2id(dict_influencer2media2commenter)
node2id_reverse =  {v: k for k, v in node2id.items()}


PATH_backbone = 'Instagram-BR/backbones/gloss/all_p_values.edgelist'

# The input file does not depend on the threshold, so parse it only once.
# Note the column order in this file: u, v, p, w.
gloss_all_edges = pd.read_csv(PATH_backbone,
                             names=['u', 'v','p','w'], header=None,sep = ' ')

for parameter in [0.10, 0.05, 0.01, 0.005, 0.001]:
    print(parameter)
    # .copy() gives an independent frame, so adding columns below does not
    # write into a slice of the full table (SettingWithCopyWarning).
    edge_list = gloss_all_edges.loc[gloss_all_edges['p'] < parameter, ['u', 'v', 'w']].copy()

    #edge_list = sort_edges(edge_list)

    edge_list['u_n'] = edge_list['u'].map(node2id_reverse)
    edge_list['v_n'] = edge_list['v'].map(node2id_reverse)
    edge_list = extract_variables(dict_influencer2media2commenter, edge_list)
    
    X = np.log(edge_list[['u_n_comments','v_n_comments', 'u_n_influencers', 'v_n_influencers']])
    Y = np.log(edge_list['w'])
    model, model_2 = compute_regression(X, Y)

    edge_list['y'] = Y
    edge_list['y_hat'] = model_2.predict(X)
    edge_list['res'] = (Y-edge_list['y_hat'])

    print("From Sklearn: \n")
    print('Intercept: \n', model_2.intercept_)
    print('Coefficients: \n', model_2.coef_)
    print('R^2: {0}'.format(model_2.score(X, Y)))
    # RMSE on the original weight scale. np.sqrt(MSE) replaces
    # `squared=False` (deprecated in scikit-learn 1.4, removed in 1.6).
    RMSE = round(np.sqrt(sklearn.metrics.mean_squared_error(np.exp(edge_list['y']), np.exp(edge_list['y_hat']))),3)
    print('RMSE', RMSE) 
    print("NRMSE:", RMSE/np.mean(edge_list['w']))
    
    PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
    with open(PATH_Regression+'edge_list_gloss.pickle', 'wb') as fp:
        pkl.dump(edge_list, fp, protocol=pkl.HIGHEST_PROTOCOL)
    with open(PATH_Regression+'stats_gloss.pickle', 'wb') as fp:
        pkl.dump(model, fp, protocol=pkl.HIGHEST_PROTOCOL)
    with open(PATH_Regression+'sklearn_gloss.pickle', 'wb') as fp:
        pkl.dump(model_2, fp, protocol=pkl.HIGHEST_PROTOCOL)

In [None]:
# Reload a saved GloSS edge list. `parameter` is not set in this cell: it is
# whatever value an earlier cell left behind, so set it explicitly before
# running this cell on its own.
PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
with open(PATH_Regression+'edge_list_gloss.pickle', 'rb') as fp:
    edge_list = pkl.load(fp)

In [None]:
# Refit the log-log regression on the currently loaded GloSS edge list.
X = np.log(edge_list[['u_n_comments','v_n_comments', 'u_n_influencers', 'v_n_influencers']])
Y = np.log(edge_list['w'])
model, model_2 = compute_regression(X, Y)

# Target, prediction and residual are stored in log space.
edge_list['y'] = Y
edge_list['y_hat'] = model_2.predict(X)
edge_list['res'] = (Y-edge_list['y_hat'])

print("From Sklearn: \n")
print('Intercept: \n', model_2.intercept_)
print('Coefficients: \n', model_2.coef_)
print('R^2: {0}'.format(model_2.score(X, Y)))
# RMSE on the original weight scale. np.sqrt(MSE) replaces `squared=False`
# (deprecated in scikit-learn 1.4, removed in 1.6).
print('RMSE', round(np.sqrt(sklearn.metrics.mean_squared_error(np.exp(edge_list['y']), np.exp(edge_list['y_hat']))),3))

In [None]:
# Save the GloSS edge list and both fitted models for the current
# `parameter`. `with` guarantees every file is flushed and closed.
PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
with open(PATH_Regression+'edge_list_gloss.pickle', 'wb') as fp:
    pkl.dump(edge_list, fp, protocol=pkl.HIGHEST_PROTOCOL)
with open(PATH_Regression+'stats_gloss.pickle', 'wb') as fp:
    pkl.dump(model, fp, protocol=pkl.HIGHEST_PROTOCOL)
with open(PATH_Regression+'sklearn_gloss.pickle', 'wb') as fp:
    pkl.dump(model_2, fp, protocol=pkl.HIGHEST_PROTOCOL)

# SDSM

In [None]:
# SDSM backbones: one edge-list file per threshold. For each one, attach the
# covariates, fit the log-log regression, report the fit and save everything.
with open('Instagram-BR/data/instagram_data.json') as fp:
    dict_influencer2media2commenter = json.load(fp)

node2id = utils.get_node2id(dict_influencer2media2commenter)
node2id_reverse =  {v: k for k, v in node2id.items()}

# Thresholds are kept as strings because they appear verbatim in the input
# and output file names (e.g. '0.10', not 0.1).
for parameter in ['0.10', '0.05', '0.01', '0.005', '0.001']:
    print(parameter)
    PATH_backbone = "Instagram-BR/backbones/sdsm/backbone_edgelist_"+parameter+".csv"

    edge_list = pd.read_csv(PATH_backbone,
                                     names=['u', 'v','w'], header=None,sep = ' ')

    edge_list['u_n'] = edge_list['u'].map(node2id_reverse)
    edge_list['v_n'] = edge_list['v'].map(node2id_reverse)
    edge_list = extract_variables(dict_influencer2media2commenter, edge_list)

    X = np.log(edge_list[['u_n_comments','v_n_comments', 'u_n_influencers', 'v_n_influencers']])
    Y = np.log(edge_list['w'])
    model, model_2 = compute_regression(X, Y)

    edge_list['y'] = Y
    edge_list['y_hat'] = model_2.predict(X)
    edge_list['res'] = (Y-edge_list['y_hat'])

    print("From Sklearn: \n")
    print('Intercept: \n', model_2.intercept_)
    print('Coefficients: \n', model_2.coef_)
    print('R^2: {0}'.format(model_2.score(X, Y)))
    # RMSE on the original weight scale. np.sqrt(MSE) replaces
    # `squared=False` (deprecated in scikit-learn 1.4, removed in 1.6).
    RMSE = round(np.sqrt(sklearn.metrics.mean_squared_error(np.exp(edge_list['y']), np.exp(edge_list['y_hat']))),3)
    print('RMSE', RMSE) 
    print("NRMSE:", RMSE/np.mean(edge_list['w']))

    PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
    with open(PATH_Regression+'edge_list_sdsm.pickle', 'wb') as fp:
        pkl.dump(edge_list, fp, protocol=pkl.HIGHEST_PROTOCOL)
    with open(PATH_Regression+'stats_sdsm.pickle', 'wb') as fp:
        pkl.dump(model, fp, protocol=pkl.HIGHEST_PROTOCOL)
    with open(PATH_Regression+'sklearn_sdsm.pickle', 'wb') as fp:
        pkl.dump(model_2, fp, protocol=pkl.HIGHEST_PROTOCOL)

In [None]:
# Reload a saved SDSM edge list for the current `parameter` (left behind by
# an earlier cell). The prefix is '<parameter>-', built by concatenation.
PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
with open(PATH_Regression+'edge_list_sdsm.pickle', 'rb') as fp:
    edge_list = pkl.load(fp)

In [None]:
# Reload the saved SDSM edge list for the current `parameter` and refit the
# log-log regression on it.
PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
with open(PATH_Regression+'edge_list_sdsm.pickle', 'rb') as fp:
    edge_list = pkl.load(fp)

X = np.log(edge_list[['u_n_comments','v_n_comments', 'u_n_influencers', 'v_n_influencers']])
Y = np.log(edge_list['w'])
model, model_2 = compute_regression(X, Y)

# Target, prediction and residual are stored in log space.
edge_list['y'] = Y
edge_list['y_hat'] = model_2.predict(X)
edge_list['res'] = (Y-edge_list['y_hat'])

print("From Sklearn: \n")
print('Intercept: \n', model_2.intercept_)
print('Coefficients: \n', model_2.coef_)
print('R^2: {0}'.format(model_2.score(X, Y)))
# RMSE on the original weight scale. np.sqrt(MSE) replaces `squared=False`
# (deprecated in scikit-learn 1.4, removed in 1.6).
print('RMSE', round(np.sqrt(sklearn.metrics.mean_squared_error(np.exp(edge_list['y']), np.exp(edge_list['y_hat']))),3))

In [None]:
# Save the SDSM edge list and both fitted models for the current
# `parameter`. `with` guarantees every file is flushed and closed.
PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
with open(PATH_Regression+'edge_list_sdsm.pickle', 'wb') as fp:
    pkl.dump(edge_list, fp, protocol=pkl.HIGHEST_PROTOCOL)
with open(PATH_Regression+'stats_sdsm.pickle', 'wb') as fp:
    pkl.dump(model, fp, protocol=pkl.HIGHEST_PROTOCOL)
with open(PATH_Regression+'sklearn_sdsm.pickle', 'wb') as fp:
    pkl.dump(model_2, fp, protocol=pkl.HIGHEST_PROTOCOL)

# Analysis of the 6 models
- Complete Network
- TriBE
- Noise Corrected
- MLF
- GloSS
- SDSM

In [5]:
# Threshold selected for each backbone method, based on the most aggressive
# method (GloSS), as explained in the paper.
# str() of each value is used below as the prefix of that model's pickle
# files, so the expressions must produce exactly the value that was used
# when the files were written (e.g. keep `1-0.00001` rather than a literal).
# The original network has no threshold, hence the empty string.
tribe = 1-0.05
sdsm= 0.001 
gloss = 0.1
mlf = 0.001
nc = 1-0.00001
dict_models = {'original':'', 'nc':nc, 'tribe':tribe, 'mlf':mlf, 'gloss':gloss, 'sdsm':sdsm}

In [6]:
# Load the saved edge list of every model and index it by the (u, v) pair so
# the edge sets of different models can be compared.
dict_models_sklearn = dict()
dict_models_stats = dict()
dict_models_user_data = dict()
dict_number_edges_model = dict()
for model, parameter in dict_models.items():
    # The original network's files have no '<parameter>-' prefix.
    if model == "original":
        PATH_Regression = 'Instagram-BR/regression/'
    else:
        PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'
    print(model)
    # `with` closes the pickle file as soon as it has been read.
    with open(PATH_Regression+'edge_list_'+model+'.pickle', 'rb') as fp:
        dict_models_user_data[model] = pkl.load(fp)
    dict_models_user_data[model] = dict_models_user_data[model].set_index(['u','v'])

original
nc
tribe
mlf
gloss
sdsm


In [7]:
# Get the set of (u, v) edges that is common to all five backbones (the
# original network is not part of the intersection).
# NOTE: the order of the resulting list comes from set iteration order and
# this list is what gets split into train/test below, so changing how the
# intersection is built may change the split even with a fixed random_state.
idx = list(set(dict_models_user_data['tribe'].index) & 
           set(dict_models_user_data['nc'].index) & 
           set(dict_models_user_data['mlf'].index) & 
           set(dict_models_user_data['gloss'].index) &
          set(dict_models_user_data['sdsm'].index))

In [8]:
# Split the common edges 80/20 into train and test (fixed seed) and save the
# full set and both parts. `with` guarantees every file is flushed and closed.
PATH_Regression = 'Instagram-BR/regression/'
idx_train, idx_test = train_test_split(idx, test_size=0.2, random_state=2022)
with open(PATH_Regression+'set_common_edge.pickle', 'wb') as fp:
    pkl.dump(idx, fp, protocol=pkl.HIGHEST_PROTOCOL)
with open(PATH_Regression+'set_common_edge_train.pickle', 'wb') as fp:
    pkl.dump(idx_train, fp, protocol=pkl.HIGHEST_PROTOCOL)
with open(PATH_Regression+'set_common_edge_test.pickle', 'wb') as fp:
    pkl.dump(idx_test, fp, protocol=pkl.HIGHEST_PROTOCOL)

In [9]:
# For every model: (1) fit on all of its edges, (2) refit after removing the
# common held-out test edges, (3) score the refitted model on those edges.
dict_results = {}
list_features = ['u_n_comments','v_n_comments', 'u_n_influencers', 'v_n_influencers']

# The held-out edges are the same for every model, so load them only once.
with open('Instagram-BR/regression/set_common_edge_test.pickle', 'rb') as fp:
    idx_test = pkl.load(fp)
idx_test = pd.MultiIndex.from_tuples(idx_test, names=['u', 'v'])

for model, parameter in dict_models.items():
    # parameter is '' for the original network, so both branches differ only
    # in the trailing '-' of the file-name prefix.
    if model == "original":
        PATH_Regression = 'Instagram-BR/regression/'+str(parameter)
    else:
        PATH_Regression = 'Instagram-BR/regression/'+str(parameter)+'-'

    print("------------------")
    print("Model", model)
    dict_results[model] = []
    with open(PATH_Regression+'edge_list_'+model+'.pickle', 'rb') as fp:
        edge_list = pkl.load(fp)

    # --- Fit on the complete edge list of this model ---
    X = np.log(edge_list[list_features]) 
    Y = np.log(edge_list['w'])
    model_1, model_2 = compute_regression(X, Y)
    edge_list['y'] = Y
    edge_list['y_hat'] = model_2.predict(X) 
    edge_list['res'] = (Y-edge_list['y_hat'])
    print('R^2 Complete:', model_2.score(X, Y))
    # Stored under its own label; it used to be recorded as "R^2 Train",
    # which made it indistinguishable from the train-only score below.
    dict_results[model].append(("R^2 Complete", model_2.score(X, Y)))
    # RMSE on the original weight scale. np.sqrt(MSE) replaces
    # `squared=False` (deprecated in scikit-learn 1.4, removed in 1.6).
    RMSE = round(np.sqrt(sklearn.metrics.mean_squared_error(np.exp(edge_list['y']), np.exp(edge_list['y_hat']))),3)
    print("RMSE Complete:", RMSE)
    print("NRMSE Complete:", RMSE/np.mean(edge_list['w']))

    # --- Train / test split: hold out the common test edges ---
    edge_list = edge_list.set_index(['u','v'])
    edge_list_test = edge_list.loc[idx_test].reset_index(drop=False)
    edge_list = edge_list.drop(idx_test)


    # --- Refit on the remaining (train) edges ---
    X = np.log(edge_list[list_features]) 
    Y = np.log(edge_list['w'])
    model_1, model_2 = compute_regression(X, Y)
    edge_list['y'] = Y
    edge_list['y_hat'] = model_2.predict(X) 
    edge_list['res'] = (Y-edge_list['y_hat'])
    print('R^2 Train:', model_2.score(X, Y))
    dict_results[model].append(("R^2 Train", model_2.score(X, Y)))
    RMSE =  round(np.sqrt(sklearn.metrics.mean_squared_error(np.exp(edge_list['y']), np.exp(edge_list['y_hat']))),3)

    print("RMSE Train:", RMSE)
    print("NRMSE Train:", RMSE/np.mean(edge_list['w']))



    # --- Evaluate the train-only model on the held-out edges ---
    X = np.log(edge_list_test[list_features])
    Y = np.log(edge_list_test['w'])
    edge_list_test['y'] = Y
    edge_list_test['y_hat'] = model_2.predict(X)
    edge_list_test['res'] = (Y-edge_list_test['y_hat'])
    RMSE =  round(np.sqrt(sklearn.metrics.mean_squared_error(np.exp(edge_list_test['y']), np.exp(edge_list_test['y_hat']))),3)
    print("RMSE Test:", RMSE)
    print("NRMSE Test:", RMSE/np.mean(edge_list_test['w']))
    


------------------
Model original
R^2 Complete: 0.2595505264008887
RMSE Complete: 0.702
NRMSE Complete: 0.48151494728833044
R^2 Train: 0.259577722197635
RMSE Train: 0.702
NRMSE Train: 0.48152961640826974
RMSE Test: 2.367
NRMSE Test: 0.6944656846106849
------------------
Model nc
R^2 Complete: 0.5201176644615584
RMSE Complete: 0.881
NRMSE Complete: 0.4500208720367939
R^2 Train: 0.5211624066427123
RMSE Train: 0.88
NRMSE Train: 0.4498465906818159
RMSE Test: 1.993
NRMSE Test: 0.5847359989138552
------------------
Model tribe
R^2 Complete: 0.8229544332064933
RMSE Complete: 0.632
NRMSE Complete: 0.18724125988802345
R^2 Train: 0.8231076498921683
RMSE Train: 0.632
NRMSE Train: 0.18724317017048203
RMSE Test: 0.698
NRMSE Test: 0.20478962731654332
------------------
Model mlf
R^2 Complete: 0.4985517787881326
RMSE Complete: 0.919
NRMSE Complete: 0.35456916899086344
R^2 Train: 0.500784015767743
RMSE Train: 0.917
NRMSE Train: 0.3540173344755732
RMSE Test: 1.753
NRMSE Test: 0.5143212273436969
-------