In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from datetime import datetime
import matplotlib.pyplot as plt
from random import random
import warnings

In [2]:
# Data reading
DATA_FOLDER = "Data/"

epinions = pd.read_table(DATA_FOLDER + 'soc-sign-epinions.txt', 
                         names=['Source','Target','Weight'],comment='#',header=None).rename_axis('Epinions',axis=1)
slashdot = pd.read_table(DATA_FOLDER + 'soc-sign-Slashdot090221.txt', 
                         names=['Source','Target','Weight'],comment='#',header=None).rename_axis('Slashdot',axis=1)
wikielec = pd.read_csv(DATA_FOLDER + 'wikipedia.csv').rename_axis('Wikipedia',axis=1)

In [3]:
# Display of the same structures datasets
display(epinions.head())
display(slashdot.head())
display(wikielec.head())

Epinions,Source,Target,Weight
0,0,1,-1
1,1,128552,-1
2,2,3,1
3,4,5,-1
4,4,155,-1


Slashdot,Source,Target,Weight
0,0,1,1
1,0,2,1
2,0,3,1
3,0,4,1
4,0,5,1


Wikipedia,Source,Target,Weight
0,3,30,1
1,25,30,-1
2,4,30,1
3,5,30,1
4,6,30,1


# PAUL

In [30]:
def weighted_average_predict(dataframe, alpha):
    G = nx.from_pandas_edgelist(dataframe, source="Source", target="Target", 
                                edge_attr=["Weight"], create_using=nx.DiGraph())

    for e in G.edges:
        G.edges[e]['Modified Weight'] = (1-alpha) if G.edges[e]['Weight'] > 0 else -alpha

    for node in G.nodes:
        out_edges_weight = [G.get_edge_data(*e)['Modified Weight'] for e in G.out_edges(node)]
        in_edges_weight  = [G.get_edge_data(*e)['Modified Weight'] for e in G.in_edges(node)]
        G.nodes[node]["Source score"] = np.mean(out_edges_weight) if len(out_edges_weight) > 0 else np.nan
        G.nodes[node]["Target score"] = np.mean(in_edges_weight)  if len(in_edges_weight)  > 0 else np.nan


    for e in G.edges:
        s = G.nodes[e[0]]['Source score']
        t = G.nodes[e[1]]['Target score']
        if np.isnan(s) or np.isnan(t):
            raise ValueError("nan cannot exist")

        if s*t > 0:
            if s > 0:
                G.edges[e]["Predict"] = 1
            else:
                G.edges[e]["Predict"] = -1

        elif s*t < 0:
            if s > 0:
                if s > abs(t):
                    G.edges[e]["Predict"] = 1
                elif s < abs(t):
                    G.edges[e]["Predict"] = -1
                else:
                    G.edges[e]["Predict"] = (1 if random() < 0.5 else -1)
            else:
                if abs(s) > t:
                    G.edges[e]["Predict"] = -1
                elif abs(s) < t:
                    G.edges[e]["Predict"] = 1
                else:
                    G.edges[e]["Predict"] = (1 if random() < 0.5 else -1)

        else:
            if s + t > 0:
                G.edges[e]["Predict"] = 1
            elif s + t < 0:
                G.edges[e]["Predict"] = -1
            else:
                G.edges[e]["Predict"] = (1 if random() < 0.5 else -1)

    TP = TN = FP = FN = 0
    for e in G.edges:
        if G.edges[e]['Weight'] == G.edges[e]['Predict']:
            if G.edges[e]['Weight'] > 0:
                TP += 1
            else:
                TN += 1
        else:
            if G.edges[e]['Predict'] > 0:
                FP += 1
            else:
                FN += 1
    
    precision_pos = TP/(TP+FP)
    recall_pos    = TP/(TP+FN)
    precision_neg = TN/(TN+FN)
    recall_neg    = TN/(TN+FP)
    F_score_pos = (2*precision_pos*recall_pos/(precision_pos + recall_pos)) if precision_pos + recall_pos != 0 else 0
    F_score_neg = (2*precision_neg*recall_neg/(precision_neg + recall_neg)) if precision_neg + recall_neg != 0 else 0
    Youden = F_score_pos + F_score_neg - 1
    
    print("TP: %d\nTN: %d\nFP: %d\nFN: %d\n"%(TP, TN, FP, FN))
    print("Precision on positive: %.3f"% precision_pos)
    print("Recall on positive: %.3f"% recall_pos)
    print("Precision on negative: %.3f"% precision_neg)
    print("Recall on negative: %.3f\n"% recall_neg)
    print("F1 score on positive: %.3f"  % F_score_pos)
    print("F1 score on negative: %.3f\n"% F_score_neg)
    print("Accuracy: %.3f"%((TP+TN)/(TP+TN+FP+FN)))
    print("Youden's J statistic: %.3f" %Youden)
    
    return Youden

In [31]:
weighted_average_predict(epinions, 0.63)

TP: 704151
TN: 101582
FP: 22123
FN: 13516

Precision on positive: 0.970
Recall on positive: 0.981
Precision on negative: 0.883
Recall on negative: 0.821

F1 score on positive: 0.975
F1 score on negative: 0.851

Accuracy: 0.958
Youden's J statistic: 0.826


0.8260780749378489

In [32]:
weighted_average_predict(slashdot, 0.63)

TP: 397498
TN: 95929
FP: 28201
FN: 27574

Precision on positive: 0.934
Recall on positive: 0.935
Precision on negative: 0.777
Recall on negative: 0.773

F1 score on positive: 0.934
F1 score on negative: 0.775

Accuracy: 0.898
Youden's J statistic: 0.709


0.7092093167460303

In [33]:
weighted_average_predict(wikielec, 0.66)

TP: 76660
TN: 15677
FP: 6332
FN: 5078

Precision on positive: 0.924
Recall on positive: 0.938
Precision on negative: 0.755
Recall on negative: 0.712

F1 score on positive: 0.931
F1 score on negative: 0.733

Accuracy: 0.890
Youden's J statistic: 0.664


0.6639219349834695

alpha = 0 finds the percentage of positives

In [174]:
# Gradient Descent
delta = np.inf
alpha_prec = 0.5
alpha = 0.55
gamma = 0.6
f = 1
while delta > 1e-4:
    f_prec = weighted_average_predict(epinions, alpha_prec)
    f = weighted_average_predict(epinions, alpha)
    alpha_new = alpha + gamma*(f-f_prec)
    delta = abs(alpha_new - alpha)
    alpha_prec = alpha
    alpha = alpha_new
    print("alpha_new:",alpha_new)
    print("Delta:",delta,"\n")
print(f)

alpha_new: 0.5697640145520472
Delta: 0.019764014552047127 

alpha_new: 0.5740852201160805
Delta: 0.00432120556403337 

alpha_new: 0.5748827121148449
Delta: 0.0007974919987643325 

alpha_new: 0.5750433621949559
Delta: 0.00016065008011101334 

alpha_new: 0.5750853057346298
Delta: 4.1943539673905406e-05 

0.8198333433391702


In [77]:
def lvo_weighted_average_predict(dataframe, alpha):
    G = nx.from_pandas_edgelist(dataframe, source="Source", target="Target", 
                                edge_attr=["Weight"], create_using=nx.DiGraph())
    
    for e in G.edges:
        G.edges[e]['Modified Weight'] = (1-alpha) if G.edges[e]['Weight'] > 0 else -alpha

    for e in G.edges:
        out_edges_weight = [G.get_edge_data(*edge)['Modified Weight'] for edge in G.out_edges(e[0]) if edge != e]
        in_edges_weight  = [G.get_edge_data(*edge)['Modified Weight'] for edge in G.in_edges(e[1])  if edge != e]
        
        len_out = len(out_edges_weight)
        len_in  = len(in_edges_weight)
        
        if 0 == len_out == len_in:
            G.edges[e]["Predict"] = (1 if random() < 0.5 else -1)
        elif len_out == 0:
            t = np.mean(in_edges_weight)
            if t > 0:
                G.edges[e]["Predict"] = 1
            elif t < 0:
                G.edges[e]["Predict"] = -1
            else:
                 G.edges[e]["Predict"] = (1 if random() < 0.5 else -1)
        elif len_in == 0:
            s = np.mean(out_edges_weight)
            if s > 0:
                G.edges[e]["Predict"] = 1
            elif s < 0:
                G.edges[e]["Predict"] = -1
            else:
                 G.edges[e]["Predict"] = (1 if random() < 0.5 else -1)
        else:
            s = np.mean(out_edges_weight)
            t = np.mean(in_edges_weight)
            if s*t > 0:
                if s > 0:
                    G.edges[e]["Predict"] = 1
                else:
                    G.edges[e]["Predict"] = -1

            elif s*t < 0:
                if s > 0:
                    if s > abs(t):
                        G.edges[e]["Predict"] = 1
                    elif s < abs(t):
                        G.edges[e]["Predict"] = -1
                    else:
                        G.edges[e]["Predict"] = (1 if random() < 0.5 else -1)
                else:
                    if abs(s) > t:
                        G.edges[e]["Predict"] = -1
                    elif abs(s) < t:
                        G.edges[e]["Predict"] = 1
                    else:
                        G.edges[e]["Predict"] = (1 if random() < 0.5 else -1)

            else:
                if s + t > 0:
                    G.edges[e]["Predict"] = 1
                elif s + t < 0:
                    G.edges[e]["Predict"] = -1
                else:
                    G.edges[e]["Predict"] = (1 if random() < 0.5 else -1)

    TP = TN = FP = FN = 0
    for e in G.edges:
        if G.edges[e]['Weight'] == G.edges[e]['Predict']:
            if G.edges[e]['Weight'] > 0:
                TP += 1
            else:
                TN += 1
        else:
            if G.edges[e]['Predict'] > 0:
                FP += 1
            else:
                FN += 1
    
    precision_pos = TP/(TP+FP)
    recall_pos    = TP/(TP+FN)
    precision_neg = TN/(TN+FN)
    recall_neg    = TN/(TN+FP)
    F_score_pos = (2*precision_pos*recall_pos/(precision_pos + recall_pos)) if precision_pos + recall_pos != 0 else 0
    F_score_neg = (2*precision_neg*recall_neg/(precision_neg + recall_neg)) if precision_neg + recall_neg != 0 else 0
    Youden = F_score_pos + F_score_neg - 1
    
    print("TP: %d\nTN: %d\nFP: %d\nFN: %d\n"%(TP, TN, FP, FN))
    print("Precision on positive: %.3f"% precision_pos)
    print("Recall on positive: %.3f"% recall_pos)
    print("Precision on negative: %.3f"% precision_neg)
    print("Recall on negative: %.3f\n"% recall_neg)
    print("F1 score on positive: %.3f"  % F_score_pos)
    print("F1 score on negative: %.3f\n"% F_score_neg)
    print("Accuracy: %.3f"%((TP+TN)/(TP+TN+FP+FN)))
    print("Youden's J statistic: %.3f" %Youden)
    
    return Youden

In [74]:
lvo_weighted_average_predict(epinions, 0.85)

TP: 690224
TN: 91555
FP: 32150
FN: 27443

Precision on positive: 0.955
Recall on positive: 0.962
Precision on negative: 0.769
Recall on negative: 0.740

F1 score on positive: 0.959
F1 score on negative: 0.754

Accuracy: 0.929
Youden's J statistic: 0.713


0.7130783643955632

In [75]:
lvo_weighted_average_predict(slashdot, 0.64)

TP: 382272
TN: 87730
FP: 36400
FN: 42800

Precision on positive: 0.913
Recall on positive: 0.899
Precision on negative: 0.672
Recall on negative: 0.707

F1 score on positive: 0.906
F1 score on negative: 0.689

Accuracy: 0.856
Youden's J statistic: 0.595


0.5951297599970591

In [74]:
np.mean([-0.64,0.36, -0.64 ,0.36 ,0.36])

-0.040000000000000015

In [None]:
[-1 1 -1 1 1] -> np.mean([-0.64 0.36 -0.64 0.36 0.36])

In [79]:
lvo_weighted_average_predict(wikielec, 0)

TP: 81711
TN: 34
FP: 21975
FN: 27

Precision on positive: 0.788
Recall on positive: 1.000
Precision on negative: 0.557
Recall on negative: 0.002

F1 score on positive: 0.881
F1 score on negative: 0.003

Accuracy: 0.788
Youden's J statistic: -0.116


-0.11557667335511779

In [89]:
wikielec.sample(10)

Wikipedia,Source,Target,Weight
32815,826,1814,1
81346,457,6170,1
15466,1483,1791,1
54215,1542,4661,1
64508,1097,5452,1
48511,2174,4269,1
18890,231,2097,1
80854,4967,6682,-1
17489,1352,1973,1
44336,3643,3800,1


In [64]:
def wpo_weighted_average_predict(dataframe, alpha, test_percent):
    
    df = dataframe.copy()
    df.loc[df.sample(round(test_percent*df.shape[0])).index,"Weight"] = np.nan

    G = nx.from_pandas_edgelist(dataframe, source="Source", target="Target", 
                                edge_attr=["Weight"], create_using=nx.DiGraph())
    
    G_unknown = nx.from_pandas_edgelist(df[df["Weight"].isnull()].copy(), source="Source", target="Target", 
                                edge_attr=["Weight"], create_using=nx.DiGraph())

    for e in G.edges:
        G.edges[e]['Modified Weight'] = (1-alpha) if G.edges[e]['Weight'] > 0 else -alpha
        
    for node in G_unknown.nodes:
        out_edges_weight = [G.get_edge_data(*e)['Modified Weight'] for e in G.out_edges(node)]
        in_edges_weight  = [G.get_edge_data(*e)['Modified Weight'] for e in G.in_edges(node)]
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            G.nodes[node]["Source score"] = np.nanmean(out_edges_weight)
            G.nodes[node]["Target score"] = np.nanmean(in_edges_weight)
    n = 0
    for e in G_unknown.edges:
        s = G.nodes[e[0]]['Source score']
        t = G.nodes[e[1]]['Target score']
        if np.nan == s == t:
            G.edges[e]["Predict"] = (1 if random() < 0.5 else -1)
            n+=1
        elif s == np.nan:
            if t > 0:
                G.edges[e]["Predict"] = 1
            elif t < 0:
                G.edges[e]["Predict"] = -1
            else:
                G.edges[e]["Predict"] = (1 if random() < 0.5 else -1)
                n+=1
        elif t == np.nan:
            if s > 0:
                G.edges[e]["Predict"] = 1
            elif s < 0:
                G.edges[e]["Predict"] = -1
            else:
                G.edges[e]["Predict"] = (1 if random() < 0.5 else -1)
                n+=1
        else:
            if s*t > 0:
                if s > 0:
                    G.edges[e]["Predict"] = 1
                else:
                    G.edges[e]["Predict"] = -1

            elif s*t < 0:
                if s > 0:
                    if s > abs(t):
                        G.edges[e]["Predict"] = 1
                    elif s < abs(t):
                        G.edges[e]["Predict"] = -1
                    else:
                        G.edges[e]["Predict"] = (1 if random() < 0.5 else -1)
                        n+=1
                else:
                    if abs(s) > t:
                        G.edges[e]["Predict"] = -1
                    elif abs(s) < t:
                        G.edges[e]["Predict"] = 1
                    else:
                        G.edges[e]["Predict"] = (1 if random() < 0.5 else -1)
                        n+=1
            else:
                if s + t > 0:
                    G.edges[e]["Predict"] = 1
                elif s + t < 0:
                    G.edges[e]["Predict"] = -1
                else:
                    G.edges[e]["Predict"] = (1 if random() < 0.5 else -1)
                    n+=1

    TP = TN = FP = FN = 0
    for e in G_unknown.edges:
        if G.edges[e]['Weight'] == G.edges[e]['Predict']:
            if G.edges[e]['Weight'] > 0:
                TP += 1
            else:
                TN += 1
        else:
            if G.edges[e]['Predict'] > 0:
                FP += 1
            else:
                FN += 1
    
    print(n)
    precision_pos = TP/(TP+FP)
    recall_pos    = TP/(TP+FN)
    precision_neg = TN/(TN+FN)
    recall_neg    = TN/(TN+FP)
    F_score_pos = (2*precision_pos*recall_pos/(precision_pos + recall_pos)) if precision_pos + recall_pos != 0 else 0
    F_score_neg = (2*precision_neg*recall_neg/(precision_neg + recall_neg)) if precision_neg + recall_neg != 0 else 0
    Youden = F_score_pos + F_score_neg - 1
    
    print("TP: %d\nTN: %d\nFP: %d\nFN: %d\n"%(TP, TN, FP, FN))
    print("Precision on positive: %.3f"% precision_pos)
    print("Recall on positive: %.3f"% recall_pos)
    print("Precision on negative: %.3f"% precision_neg)
    print("Recall on negative: %.3f\n"% recall_neg)
    print("F1 score on positive: %.3f"  % F_score_pos)
    print("F1 score on negative: %.3f\n"% F_score_neg)
    print("Accuracy: %.3f"%((TP+TN)/(TP+TN+FP+FN)))
    print("Youden's J statistic: %.3f" %Youden)
    
    return Youden

In [95]:
wpo_weighted_average_predict(epinions, 0.63, 0.15)

TP: 704151
TN: 101583
FP: 22122
FN: 13516

Precision on positive: 0.970
Recall on positive: 0.981
Precision on negative: 0.883
Recall on negative: 0.821

F1 score on positive: 0.975
F1 score on negative: 0.851

Accuracy: 0.958
Youden's J statistic: 0.826


0.8260835628769705

In [94]:
wpo_weighted_average_predict(slashdot, 0.63, 0.15)

TP: 397499
TN: 95929
FP: 28201
FN: 27573

Precision on positive: 0.934
Recall on positive: 0.935
Precision on negative: 0.777
Recall on negative: 0.773

F1 score on positive: 0.934
F1 score on negative: 0.775

Accuracy: 0.898
Youden's J statistic: 0.709


0.7092136979114518

In [82]:
wpo_weighted_average_predict(wikielec, 0.65, 0.7)

3
TP: 54168
TN: 10580
FP: 4681
FN: 3194

Precision on positive: 0.920
Recall on positive: 0.944
Precision on negative: 0.768
Recall on negative: 0.693

F1 score on positive: 0.932
F1 score on negative: 0.729

Accuracy: 0.892
Youden's J statistic: 0.661


0.6610109461988123

Extension: use dates exponential