In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm

from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from utils.data_processing import *
from models.logistic_regression import LogisticRegression

In [2]:
# Load the article table (a single 'article_name' column, one row per article)
# and preview the first rows.
articles_df = load_article_df()
articles_df.head()

Loaded 4604 articles in df of shape (4604, 1)


Unnamed: 0,article_name
0,Áedán_mac_Gabráin
1,Åland
2,Édouard_Manet
3,Éire
4,Óengus_I_of_the_Picts


In [3]:
# Load the source/target hyperlink pairs. The loader receives articles_df and
# logs that it adds "missing links"; what counts as missing is defined in
# utils.data_processing — TODO confirm.
links_df = load_links_df(articles_df=articles_df)
links_df.head()

Loaded 119882 links in df of shape (119882, 2)
After adding missing links, there are 124492 links in df


Unnamed: 0,source,target
0,Áedán_mac_Gabráin,Bede
1,Áedán_mac_Gabráin,Columba
2,Áedán_mac_Gabráin,Dál_Riata
3,Áedán_mac_Gabráin,Great_Britain
4,Áedán_mac_Gabráin,Ireland


In [4]:
# Load the finished game paths (one row per completed game) and preview them.
finished_df = load_finished_df()
finished_df.head()

Loaded 51318 finished paths in df of shape (51318, 7)


Unnamed: 0,hashIP,timestamp,duration,path,difficulty_rating,path_length,num_backward
0,6a3701d319fc3754,2011-02-15 03:26:49,166,"[14th_century, 15th_century, 16th_century, Pac...",,9,0
1,3824310e536af032,2012-08-12 06:36:52,88,"[14th_century, Europe, Africa, Atlantic_slave_...",3.0,5,0
2,415612e93584d30e,2012-10-03 21:10:40,138,"[14th_century, Niger, Nigeria, British_Empire,...",,8,0
3,64dd5cd342e3780c,2010-02-08 07:25:25,37,"[14th_century, Renaissance, Ancient_Greece, Gr...",,4,0
4,015245d773376aab,2013-04-23 15:27:08,175,"[14th_century, Italy, Roman_Catholic_Church, H...",3.0,7,0


In [5]:
# Load the unfinished game paths (these carry 'target_article' and 'type_end').
# NOTE(review): `unifinished_df` looks like a typo for `unfinished_df`. The
# concatenation cell references this exact spelling, so rename both together.
unifinished_df = load_unfinished_df()
unifinished_df.head()

Loaded 24875 unfinished paths in df of shape (24875, 8)


Unnamed: 0,hashIP,timestamp,duration,path,target_article,type_end,path_length,num_backward
0,2426091a53125110,2011-02-07 05:02:15,1804,[Obi-Wan_Kenobi],Microsoft,timeout,1,0
1,26141fd878806294,2011-02-07 05:14:11,1805,[Julius_Caesar],Caracas,timeout,1,0
2,2b015fb8181c48f2,2011-02-07 15:00:19,1818,"[Malawi, Democracy, Alexander_the_Great]",First_Crusade,timeout,3,0
3,53a53bc244e08a6a,2011-02-07 16:06:01,49,[Paraguay],Mount_St._Helens,restart,1,0
4,53a53bc244e08a6a,2011-02-07 17:18:25,1808,"[Paraguay, Bolivia]",Mount_St._Helens,timeout,2,0


In [6]:
# Combine unfinished and finished games into a single frame with a 'finished'
# flag. The helper's second return value is discarded. Its log shows that
# finished paths are filtered by timestamp before concatenation; the exact rule
# lives in utils.data_processing.
all_games_df, _ = preprocess_and_concat_unfinished_and_finished(unifinished_df, finished_df)
all_games_df.head()

After filtering all paths after 2011-02-07 05:02:15
we kept 23245 paths out of 51318 finished paths
There are 24875 unfinished paths


Unnamed: 0,difficulty_rating,duration,finished,hashIP,num_backward,path,path_length,source,target,timestamp,type_end
0,,166,True,6a3701d319fc3754,0,"[14th_century, 15th_century, 16th_century, Pac...",9,14th_century,African_slave_trade,2011-02-15 03:26:49,
1,3.0,88,True,3824310e536af032,0,"[14th_century, Europe, Africa, Atlantic_slave_...",5,14th_century,African_slave_trade,2012-08-12 06:36:52,
2,,138,True,415612e93584d30e,0,"[14th_century, Niger, Nigeria, British_Empire,...",8,14th_century,African_slave_trade,2012-10-03 21:10:40,
3,3.0,175,True,015245d773376aab,0,"[14th_century, Italy, Roman_Catholic_Church, H...",7,14th_century,John_F._Kennedy,2013-04-23 15:27:08,
4,,110,True,5295bca242be81fe,0,"[14th_century, Europe, North_America, United_S...",6,14th_century,John_F._Kennedy,2013-07-03 22:26:54,


In [7]:
# Remove games the helper considers invalid, using articles_df as the reference
# table. The validity criteria are defined in utils.data_processing —
# presumably games referencing unknown articles; TODO confirm.
valid_games_df = prune_invalid_games(all_games_df, articles_df)
valid_games_df.head()

Pruning invalid games. Initially we have 48120 games
Pruned invalid games. Now we have 48092 valid games


Unnamed: 0,difficulty_rating,duration,finished,hashIP,num_backward,path,path_length,source,target,timestamp,type_end
0,,166,True,6a3701d319fc3754,0,"[14th_century, 15th_century, 16th_century, Pac...",9,14th_century,African_slave_trade,2011-02-15 03:26:49,
1,3.0,88,True,3824310e536af032,0,"[14th_century, Europe, Africa, Atlantic_slave_...",5,14th_century,African_slave_trade,2012-08-12 06:36:52,
2,,138,True,415612e93584d30e,0,"[14th_century, Niger, Nigeria, British_Empire,...",8,14th_century,African_slave_trade,2012-10-03 21:10:40,
3,3.0,175,True,015245d773376aab,0,"[14th_century, Italy, Roman_Catholic_Church, H...",7,14th_century,John_F._Kennedy,2013-04-23 15:27:08,
4,,110,True,5295bca242be81fe,0,"[14th_century, Europe, North_America, United_S...",6,14th_century,John_F._Kennedy,2013-07-03 22:26:54,


In [8]:
# Write the adjacency list to disk for the graph-statistics step.
# A trailing semicolon suppresses the cell output without rebinding `_`
# (IPython's last-output variable) and without keeping the return value alive.
dump_adjacency_list(links_df, articles_df);

Dumped adjacency list with 4604 articles and 124492 entries


In [9]:
# Write the distinct (source, target) pairs found in the valid games to disk.
# A trailing semicolon suppresses the cell output without rebinding `_`
# (IPython's last-output variable) and without keeping the return value alive.
dump_unique_source_target_pairs(valid_games_df);

Dumped 29834 unique source-target pairs


In [10]:
# Load (or compute, per the helper's name) path statistics for each unique
# (source, target) pair; the result is indexed by source and target.
# Presumably this consumes the files dumped by the two cells above — TODO confirm.
unique_game_stats_df = load_or_compute_unique_source_target_pair_stats()
unique_game_stats_df.head()

Loaded 29834 unique source-target pair stats


Unnamed: 0_level_0,Unnamed: 1_level_0,shortest_path_length,shortest_path_count,max_sp_pagerank,max_sp_avg_pagerank,avg_sp_avg_pagerank,one_longer_path_count,max_ol_pagerank,max_ol_avg_pagerank,avg_ol_avg_pagerank,two_longer_pagerank,max_tl_pagerank,max_tl_avg_pagerank
source,target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
14th_century,African_slave_trade,3,3,0.001523,0.000559,0.000414,85,0.007202,0.002545,0.000857,300,0.005009,0.001814
14th_century,John_F._Kennedy,3,18,0.007202,0.00317,0.001738,1758,0.007202,0.003519,0.001507,300,0.007202,0.003492
14th_century,Rainbow,3,5,0.003356,0.00099,0.000457,198,0.005009,0.001775,0.000671,300,0.005009,0.002008
14th_century,Sodium,3,4,0.002184,0.000765,0.000519,548,0.007202,0.002669,0.000756,300,0.005009,0.002164
14th_century,Elizabeth_I_of_England,2,1,0.000644,0.000371,0.000371,61,0.005009,0.00159,0.000683,300,0.005009,0.001854


In [11]:
# Load (or compute, per the helper's name) per-article graph statistics:
# degree, closeness, betweenness and pagerank, indexed by article_name.
node_stats_df = load_or_compute_node_stats()
node_stats_df.head()

Loaded 4604 node stats


Unnamed: 0_level_0,degree,closeness,betweenness,pagerank
article_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10th_century,27,0.323966,5575.09,0.0005
11th_century,49,0.336178,6653.51,0.000435
12th_century,46,0.338251,14201.0,0.000561
13th_century,35,0.32163,8544.39,0.000586
14th_century,32,0.325238,5366.38,0.00047


In [12]:
# Attach the per-(source, target) path statistics to every game.
MERGE_KEYS = ['source', 'target']

# Re-run safety: if this cell has already been executed, the statistic columns
# are present in valid_games_df and a second merge would produce '_x'/'_y'
# suffixed duplicates (and then a KeyError in dropna). Dropping them first
# makes the cell idempotent; on the first run nothing is dropped.
stat_columns = [col for col in unique_game_stats_df.columns if col not in MERGE_KEYS]

valid_games_df = (
    valid_games_df
    .drop(columns=stat_columns, errors='ignore')
    .merge(unique_game_stats_df, on=MERGE_KEYS, how='left')
    # With a left join, pairs that have no statistics come back as NaN; drop them.
    .dropna(subset=['shortest_path_length'])
)
print("Number of valid games after shortest path length filtering: ", len(valid_games_df))

Number of valid games after shortest path length filtering:  48092


In [13]:
# Load one embedding vector per article (single 'embedding' column, indexed by
# article_name) and preview it.
embeddings_df = load_embeddings()
embeddings_df.head()

Loaded 4604 embeddings in df of shape (4604, 1)


Unnamed: 0_level_0,embedding
article_name,Unnamed: 1_level_1
10th_century,"[-0.050105415, 0.09074813, 0.008811204, -0.051..."
11th_century,"[-0.042490385, 0.061723217, -0.020333063, -0.0..."
12th_century,"[-0.032364372, 0.037580367, -0.052070234, -0.1..."
13th_century,"[-0.07914139, 0.059444286, 0.020845776, -0.100..."
14th_century,"[-0.040906686, 0.122400954, -0.04961793, 0.007..."


In [14]:
# Add the source/target embedding similarity; it shows up downstream as the
# 'cosine_sim_source_target' column. Note that valid_games_df is rebound here.
valid_games_df = compute_cosine_similarity(valid_games_df, embeddings_df)


In [15]:
# Attach the node statistics for both endpoints of each game; the resulting
# columns are suffixed '_source' and '_target' (e.g. degree_source, pagerank_target).
valid_games_df = merge_with_node_data(valid_games_df, node_stats_df)
valid_games_df.head()

Unnamed: 0,difficulty_rating,duration,finished,hashIP,num_backward,path,path_length,source,target,timestamp,...,max_tl_avg_pagerank,cosine_sim_source_target,degree_source,closeness_source,betweenness_source,pagerank_source,degree_target,closeness_target,betweenness_target,pagerank_target
0,,166,True,6a3701d319fc3754,0,"[14th_century, 15th_century, 16th_century, Pac...",9,14th_century,African_slave_trade,2011-02-15 03:26:49,...,0.001814,0.202444,32,0.325238,5366.38,0.00047,25,0.335511,102.483,2.8e-05
1,3.0,88,True,3824310e536af032,0,"[14th_century, Europe, Africa, Atlantic_slave_...",5,14th_century,African_slave_trade,2012-08-12 06:36:52,...,0.001814,0.202444,32,0.325238,5366.38,0.00047,25,0.335511,102.483,2.8e-05
2,,138,True,415612e93584d30e,0,"[14th_century, Niger, Nigeria, British_Empire,...",8,14th_century,African_slave_trade,2012-10-03 21:10:40,...,0.001814,0.202444,32,0.325238,5366.38,0.00047,25,0.335511,102.483,2.8e-05
3,3.0,175,True,015245d773376aab,0,"[14th_century, Italy, Roman_Catholic_Church, H...",7,14th_century,John_F._Kennedy,2013-04-23 15:27:08,...,0.003492,0.079502,32,0.325238,5366.38,0.00047,69,0.348326,16585.8,0.000243
4,,110,True,5295bca242be81fe,0,"[14th_century, Europe, North_America, United_S...",6,14th_century,John_F._Kennedy,2013-07-03 22:26:54,...,0.003492,0.079502,32,0.325238,5366.38,0.00047,69,0.348326,16585.8,0.000243


In [16]:
# Inspect the columns available at this point, before feature selection.
valid_games_df.columns

Index(['difficulty_rating', 'duration', 'finished', 'hashIP', 'num_backward',
       'path', 'path_length', 'source', 'target', 'timestamp', 'type_end',
       'shortest_path_length', 'shortest_path_count', 'max_sp_pagerank',
       'max_sp_avg_pagerank', 'avg_sp_avg_pagerank', 'one_longer_path_count',
       'max_ol_pagerank', 'max_ol_avg_pagerank', 'avg_ol_avg_pagerank',
       'two_longer_pagerank', 'max_tl_pagerank', 'max_tl_avg_pagerank',
       'cosine_sim_source_target', 'degree_source', 'closeness_source',
       'betweenness_source', 'pagerank_source', 'degree_target',
       'closeness_target', 'betweenness_target', 'pagerank_target'],
      dtype='object')

In [17]:
# Discard games that ended by timeout. Rows with a missing 'type_end' (the
# finished games in the previews above) are kept, because a missing value
# compares as "not equal" to 'timeout'.
valid_games_df = valid_games_df.loc[valid_games_df['type_end'].ne('timeout')]

In [18]:
fame_df = load_fame()
# Sanity check: display every article whose fame data contains a missing value.
# An empty frame means there is nothing to impute or drop.
fame_df.loc[fame_df.isna().any(axis='columns')]

Unnamed: 0_level_0,fame_score
article_name,Unnamed: 1_level_1


In [19]:
# Attach the fame score of both endpoints of each game; the resulting columns
# are 'fame_score_source' and 'fame_score_target'.
valid_games_df = merge_with_fame_data(valid_games_df, fame_df)
valid_games_df.head()

Unnamed: 0,difficulty_rating,duration,finished,hashIP,num_backward,path,path_length,source,target,timestamp,...,degree_source,closeness_source,betweenness_source,pagerank_source,degree_target,closeness_target,betweenness_target,pagerank_target,fame_score_source,fame_score_target
0,,166,True,6a3701d319fc3754,0,"[14th_century, 15th_century, 16th_century, Pac...",9,14th_century,African_slave_trade,2011-02-15 03:26:49,...,32,0.325238,5366.38,0.00047,25,0.335511,102.483,2.8e-05,8.0,6.0
1,3.0,88,True,3824310e536af032,0,"[14th_century, Europe, Africa, Atlantic_slave_...",5,14th_century,African_slave_trade,2012-08-12 06:36:52,...,32,0.325238,5366.38,0.00047,25,0.335511,102.483,2.8e-05,8.0,6.0
2,,138,True,415612e93584d30e,0,"[14th_century, Niger, Nigeria, British_Empire,...",8,14th_century,African_slave_trade,2012-10-03 21:10:40,...,32,0.325238,5366.38,0.00047,25,0.335511,102.483,2.8e-05,8.0,6.0
3,3.0,175,True,015245d773376aab,0,"[14th_century, Italy, Roman_Catholic_Church, H...",7,14th_century,John_F._Kennedy,2013-04-23 15:27:08,...,32,0.325238,5366.38,0.00047,69,0.348326,16585.8,0.000243,8.0,9.0
4,,110,True,5295bca242be81fe,0,"[14th_century, Europe, North_America, United_S...",6,14th_century,John_F._Kennedy,2013-07-03 22:26:54,...,32,0.325238,5366.38,0.00047,69,0.348326,16585.8,0.000243,8.0,9.0


In [20]:
# Prepare the modelling frame: drop per-game outcome/identity columns, keep
# games with a positive shortest path, log-transform the path counts and cast
# the label to float.

# Offset added before taking the log; presumably guards against log(0) for
# zero path counts — TODO confirm that zero counts can occur.
LOG_OFFSET = 1e-2

# Drop some variables before logistic regression.
# The drop is deliberately strict (no errors='ignore'): if this cell is re-run,
# it fails here with a KeyError instead of silently log-transforming the
# already-transformed counts a second time.
valid_games_df = valid_games_df.drop(
    columns=[
        'hashIP', 'timestamp', 'path', 'difficulty_rating', 'type_end',
        'duration', 'path_length', 'num_backward'
    ]
)

# .copy() makes the filtered frame independent of its parent, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
valid_games_df = valid_games_df[valid_games_df['shortest_path_length'] > 0].copy()

# The number of paths is better when log-transformed (vectorized, same values
# as applying np.log element by element).
for count_column in ('shortest_path_count', 'one_longer_path_count'):
    valid_games_df[count_column] = np.log(valid_games_df[count_column] + LOG_OFFSET)

# Change the type of finished to a float for logistic regression
valid_games_df['finished'] = valid_games_df['finished'].astype(float)

print(valid_games_df.shape)
print(valid_games_df.columns)

(38762, 26)
Index(['finished', 'source', 'target', 'shortest_path_length',
       'shortest_path_count', 'max_sp_pagerank', 'max_sp_avg_pagerank',
       'avg_sp_avg_pagerank', 'one_longer_path_count', 'max_ol_pagerank',
       'max_ol_avg_pagerank', 'avg_ol_avg_pagerank', 'two_longer_pagerank',
       'max_tl_pagerank', 'max_tl_avg_pagerank', 'cosine_sim_source_target',
       'degree_source', 'closeness_source', 'betweenness_source',
       'pagerank_source', 'degree_target', 'closeness_target',
       'betweenness_target', 'pagerank_target', 'fame_score_source',
       'fame_score_target'],
      dtype='object')


In [21]:
# Model 1: shortest-path length and (log-transformed) shortest-path count only.
features_1 = ['shortest_path_length', 'shortest_path_count']

# LogisticRegression is the project wrapper imported from
# models.logistic_regression, not scikit-learn's estimator. fit() prints the
# class distribution, the optimizer status and the evaluation metrics.
model_1 = LogisticRegression(valid_games_df, features_1)
model_1.fit()

Class distribution: finished
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
Total number of samples: 31052
Optimization terminated successfully.
         Current function value: 0.647197
         Iterations 5
Training Set Metrics:
Threshold:   0.5030
F1 Score:    0.6220
Precision:   0.6221
Accuracy:    0.6220
              precision    recall  f1-score   support

         0.0     0.6205    0.6211    0.6208      3106
         1.0     0.6206    0.6200    0.6203      3105

    accuracy                         0.6205      6211
   macro avg     0.6205    0.6205    0.6205      6211
weighted avg     0.6205    0.6205    0.6205      6211



In [22]:
# Coefficient table for model 1. Rows x1, x2 presumably follow the order of
# features_1 — confirm in the wrapper.
model_1.summary()

0,1,2,3
Dep. Variable:,finished,No. Observations:,24841.0
Model:,Logit,Df Residuals:,24839.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 16 Dec 2024",Pseudo R-squ.:,0.06629
Time:,00:14:37,Log-Likelihood:,-16077.0
converged:,True,LL-Null:,-17218.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,-0.7771,0.018,-43.757,0.000,-0.812,-0.742
x2,0.3592,0.016,22.608,0.000,0.328,0.390


In [23]:
# Model 2: PageRank of the target article as the single predictor.
features_2 = ['pagerank_target']

# NOTE(review): this is the only model built with has_constant_term=True; the
# other models are constructed without that argument. Confirm the difference is
# intentional, since it changes how the coefficients compare across models.
model_2 = LogisticRegression(valid_games_df, features_2, has_constant_term=True)
model_2.fit()

Class distribution: finished
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
Total number of samples: 31052
Optimization terminated successfully.
         Current function value: 0.660634
         Iterations 6


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training Set Metrics:
Threshold:   0.4470
F1 Score:    0.6375
Precision:   0.6375
Accuracy:    0.6375
              precision    recall  f1-score   support

         0.0     0.6367    0.6330    0.6348      3106
         1.0     0.6350    0.6386    0.6368      3105

    accuracy                         0.6358      6211
   macro avg     0.6358    0.6358    0.6358      6211
weighted avg     0.6358    0.6358    0.6358      6211



In [24]:
# Coefficient table for model 2: an intercept ('const') plus x1, which
# presumably corresponds to 'pagerank_target' — confirm in the wrapper.
model_2.summary()

0,1,2,3
Dep. Variable:,finished,No. Observations:,24841.0
Model:,Logit,Df Residuals:,24839.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 16 Dec 2024",Pseudo R-squ.:,0.04691
Time:,00:14:42,Log-Likelihood:,-16411.0
converged:,True,LL-Null:,-17218.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0592,0.014,4.381,0.000,0.033,0.086
x1,0.7877,0.024,32.301,0.000,0.740,0.835


In [26]:
# Model 3: every engineered feature, listed one per line and grouped by the
# column-name family it belongs to. Order matters: it fixes the x1..xN labels
# shown by the model summary.
features_3 = [
    # shortest-path (sp) statistics
    'shortest_path_length',
    'shortest_path_count',
    'max_sp_pagerank',
    'max_sp_avg_pagerank',
    'avg_sp_avg_pagerank',
    # one-longer (ol) statistics
    'one_longer_path_count',
    'max_ol_pagerank',
    'max_ol_avg_pagerank',
    'avg_ol_avg_pagerank',
    # two-longer (tl) statistics
    'two_longer_pagerank',
    'max_tl_pagerank',
    'max_tl_avg_pagerank',
    # source/target similarity
    'cosine_sim_source_target',
    # source node statistics
    'degree_source',
    'closeness_source',
    'betweenness_source',
    'pagerank_source',
    # target node statistics
    'degree_target',
    'closeness_target',
    'betweenness_target',
    'pagerank_target',
    # fame scores
    'fame_score_source',
    'fame_score_target',
]

model_3 = LogisticRegression(valid_games_df, features_3)
model_3.fit()

Class distribution: finished
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
Total number of samples: 31052
Optimization terminated successfully.
         Current function value: 0.629736
         Iterations 5
Training Set Metrics:
Threshold:   0.4930
F1 Score:    0.6398
Precision:   0.6408
Accuracy:    0.6402
              precision    recall  f1-score   support

         0.0     0.6454    0.5889    0.6158      3106
         1.0     0.6219    0.6763    0.6479      3105

    accuracy                         0.6326      6211
   macro avg     0.6336    0.6326    0.6319      6211
weighted avg     0.6336    0.6326    0.6319      6211



In [27]:
# Coefficient table for model 3. Rows x1..xN presumably follow the order of
# features_3 — confirm in the wrapper.
model_3.summary()

0,1,2,3
Dep. Variable:,finished,No. Observations:,24841.0
Model:,Logit,Df Residuals:,24818.0
Method:,MLE,Df Model:,22.0
Date:,"Mon, 16 Dec 2024",Pseudo R-squ.:,0.09148
Time:,00:24:33,Log-Likelihood:,-15643.0
converged:,True,LL-Null:,-17218.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,-0.8386,0.033,-25.365,0.000,-0.903,-0.774
x2,0.0337,0.042,0.809,0.419,-0.048,0.115
x3,-0.0052,0.048,-0.108,0.914,-0.100,0.089
x4,-0.0218,0.061,-0.360,0.719,-0.141,0.097
x5,0.0161,0.034,0.476,0.634,-0.050,0.082
x6,0.3542,0.048,7.362,0.000,0.260,0.448
x7,-0.1352,0.028,-4.851,0.000,-0.190,-0.081
x8,0.2154,0.044,4.865,0.000,0.129,0.302
x9,-0.0836,0.035,-2.415,0.016,-0.151,-0.016


In [28]:
# Majority-vote ensemble of the three fitted models, scored against 'finished'.
#
# NOTE(review): predictions are made for every row of valid_games_df, which
# presumably includes the rows the models were fitted on, so this report is not
# a held-out estimate — confirm against the split done inside the wrapper.
model_inputs = {
    'model_1_pred': (model_1, features_1),
    'model_2_pred': (model_2, features_2),
    'model_3_pred': (model_3, features_3),
}

# Select the label column first and copy only that, instead of duplicating the
# whole (wide) frame just to keep one column.
combined_pred = valid_games_df[['finished']].copy()

for pred_column, (fitted_model, feature_names) in model_inputs.items():
    combined_pred[pred_column] = fitted_model.predict(valid_games_df[feature_names])

# Strict majority (2 of 3 here), derived from the number of models so that
# adding or removing a model cannot leave a stale threshold behind.
votes_needed = len(model_inputs) // 2 + 1
votes = combined_pred[list(model_inputs)].sum(axis=1)
combined_pred['pred'] = (votes >= votes_needed).astype(int)

print(classification_report(combined_pred['finished'], combined_pred['pred'], digits=4))

              precision    recall  f1-score   support

         0.0     0.5484    0.6202    0.5821     15526
         1.0     0.7219    0.6588    0.6889     23236

    accuracy                         0.6433     38762
   macro avg     0.6352    0.6395    0.6355     38762
weighted avg     0.6524    0.6433    0.6461     38762

