In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm

from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from src.data_processing import *
from src.logistic_regression import LogisticRegression

In [2]:
# Load the article table through the project helper; the preview below shows
# a single `article_name` column.
articles_df = load_article_df()
articles_df.head()

Loaded 4604 articles in df of shape (4604, 1)


Unnamed: 0,article_name
0,Áedán_mac_Gabráin
1,Åland
2,Édouard_Manet
3,Éire
4,Óengus_I_of_the_Picts


In [3]:
# Load the hyperlink table: one row per link, with `source` and `target`
# article-name columns.
links_df = load_links_df()
links_df.head()

Loaded 119882 links in df of shape (119882, 2)


Unnamed: 0,source,target
0,Áedán_mac_Gabráin,Bede
1,Áedán_mac_Gabráin,Columba
2,Áedán_mac_Gabráin,Dál_Riata
3,Áedán_mac_Gabráin,Great_Britain
4,Áedán_mac_Gabráin,Ireland


In [4]:
# Load the finished game paths (one row per game; `path` holds the list of
# visited articles, `difficulty_rating` may be missing).
finished_df = load_finished_df()
finished_df.head()

Loaded 51318 finished paths in df of shape (51318, 7)


Unnamed: 0,hashIP,timestamp,duration,path,difficulty_rating,path_length,num_backward
0,6a3701d319fc3754,2011-02-15 03:26:49,166,"[14th_century, 15th_century, 16th_century, Pac...",,9,0
1,3824310e536af032,2012-08-12 06:36:52,88,"[14th_century, Europe, Africa, Atlantic_slave_...",3.0,5,0
2,415612e93584d30e,2012-10-03 21:10:40,138,"[14th_century, Niger, Nigeria, British_Empire,...",,8,0
3,64dd5cd342e3780c,2010-02-08 07:25:25,37,"[14th_century, Renaissance, Ancient_Greece, Gr...",,4,0
4,015245d773376aab,2013-04-23 15:27:08,175,"[14th_century, Italy, Roman_Catholic_Church, H...",3.0,7,0


In [5]:
# Load the unfinished game paths; unlike the finished table these rows carry
# the intended `target_article` and a `type_end` (e.g. timeout / restart).
# NOTE(review): the variable name is misspelled ("unifinished"); it is kept
# as-is because the concatenation cell further down refers to it by this name.
unifinished_df = load_unfinished_df()
unifinished_df.head()

Loaded 24875 unfinished paths in df of shape (24875, 8)


Unnamed: 0,hashIP,timestamp,duration,path,target_article,type_end,path_length,num_backward
0,2426091a53125110,2011-02-07 05:02:15,1804,[Obi-Wan_Kenobi],Microsoft,timeout,1,0
1,26141fd878806294,2011-02-07 05:14:11,1805,[Julius_Caesar],Caracas,timeout,1,0
2,2b015fb8181c48f2,2011-02-07 15:00:19,1818,"[Malawi, Democracy, Alexander_the_Great]",First_Crusade,timeout,3,0
3,53a53bc244e08a6a,2011-02-07 16:06:01,49,[Paraguay],Mount_St._Helens,restart,1,0
4,53a53bc244e08a6a,2011-02-07 17:18:25,1808,"[Paraguay, Bolivia]",Mount_St._Helens,timeout,2,0


In [6]:
# Combine unfinished and finished games into one frame with a boolean
# `finished` column; the helper's second return value is discarded.
# NOTE(review): per its log output the helper keeps only finished paths after
# a cutoff timestamp — confirm the exact rule in src.data_processing.
all_games_df, _ = preprocess_and_concat_unfinished_and_finished(unifinished_df, finished_df)
all_games_df.head()

After filtering all paths after 2011-02-07 05:02:15
we kept 23245 paths out of 51318 finished paths
There are 24875 unfinished paths


Unnamed: 0,difficulty_rating,duration,finished,hashIP,num_backward,path,path_length,source,target,timestamp,type_end
0,,166,True,6a3701d319fc3754,0,"[14th_century, 15th_century, 16th_century, Pac...",9,14th_century,African_slave_trade,2011-02-15 03:26:49,
1,3.0,88,True,3824310e536af032,0,"[14th_century, Europe, Africa, Atlantic_slave_...",5,14th_century,African_slave_trade,2012-08-12 06:36:52,
2,,138,True,415612e93584d30e,0,"[14th_century, Niger, Nigeria, British_Empire,...",8,14th_century,African_slave_trade,2012-10-03 21:10:40,
3,3.0,175,True,015245d773376aab,0,"[14th_century, Italy, Roman_Catholic_Church, H...",7,14th_century,John_F._Kennedy,2013-04-23 15:27:08,
4,,110,True,5295bca242be81fe,0,"[14th_century, Europe, North_America, United_S...",6,14th_century,John_F._Kennedy,2013-07-03 22:26:54,


In [7]:
# Remove invalid games, using the article table as reference.
# NOTE(review): what counts as "invalid" is defined inside the helper and is
# not visible from this notebook.
valid_games_df = prune_invalid_games(all_games_df, articles_df)
valid_games_df.head()

Pruning invalid games. Initially we have 48120 games
Pruned invalid games. Now we have 48092 valid games


Unnamed: 0,difficulty_rating,duration,finished,hashIP,num_backward,path,path_length,source,target,timestamp,type_end
0,,166,True,6a3701d319fc3754,0,"[14th_century, 15th_century, 16th_century, Pac...",9,14th_century,African_slave_trade,2011-02-15 03:26:49,
1,3.0,88,True,3824310e536af032,0,"[14th_century, Europe, Africa, Atlantic_slave_...",5,14th_century,African_slave_trade,2012-08-12 06:36:52,
2,,138,True,415612e93584d30e,0,"[14th_century, Niger, Nigeria, British_Empire,...",8,14th_century,African_slave_trade,2012-10-03 21:10:40,
3,3.0,175,True,015245d773376aab,0,"[14th_century, Italy, Roman_Catholic_Church, H...",7,14th_century,John_F._Kennedy,2013-04-23 15:27:08,
4,,110,True,5295bca242be81fe,0,"[14th_century, Europe, North_America, United_S...",6,14th_century,John_F._Kennedy,2013-07-03 22:26:54,


In [8]:
# Dump the link graph as an adjacency list. The return value is bound to `_`
# so that it is not echoed as the cell output.
# NOTE(review): the output location is decided inside the helper — presumably
# a file consumed by the stats computation below; confirm.
_ = dump_adjacency_list(links_df, articles_df)

Dumped adjacency list with 4604 articles and 119882 entries


In [9]:
# Dump the distinct (source, target) pairs found in the valid games. The
# return value is bound to `_` so that it is not echoed as the cell output.
_ = dump_unique_source_target_pairs(valid_games_df)

Dumped 29834 unique source-target pairs


In [10]:
# Per-pair path statistics, indexed by (source, target): shortest-path length
# and count plus PageRank aggregates over shortest / longer paths.
# NOTE(review): the helper name suggests results are cached and only computed
# when missing — confirm in src.data_processing.
unique_game_stats_df = load_or_compute_unique_source_target_pair_stats()
unique_game_stats_df.head()

Loaded 29834 unique source-target pair stats


Unnamed: 0_level_0,Unnamed: 1_level_0,shortest_path_length,shortest_path_count,max_sp_pagerank,max_sp_avg_pagerank,avg_sp_avg_pagerank,one_longer_path_count,max_ol_pagerank,max_ol_avg_pagerank,avg_ol_avg_pagerank,two_longer_pagerank,max_tl_pagerank,max_tl_avg_pagerank
source,target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
14th_century,African_slave_trade,3,3,0.00207,0.000759,0.000562,85,0.009774,0.003463,0.001167,300,0.006847,0.00247
14th_century,John_F._Kennedy,3,18,0.009774,0.004316,0.002362,1758,0.009774,0.004792,0.00205,300,0.009774,0.004751
14th_century,Rainbow,3,5,0.004548,0.001342,0.000626,198,0.006847,0.002413,0.000922,300,0.006847,0.002735
14th_century,Sodium,3,4,0.003013,0.001054,0.000713,548,0.009774,0.003634,0.001034,300,0.006847,0.00296
14th_century,Elizabeth_I_of_England,2,1,0.000882,0.000508,0.000508,61,0.006847,0.002172,0.000932,300,0.006847,0.002537


In [11]:
# Per-article graph statistics, indexed by `article_name`: degree, closeness,
# betweenness and PageRank.
node_stats_df = load_or_compute_node_stats()
node_stats_df.head()

Loaded 4604 node stats


Unnamed: 0_level_0,degree,closeness,betweenness,pagerank
article_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10th_century,26,0.323931,5575.16,0.000684
11th_century,48,0.336125,6652.27,0.000595
12th_century,45,0.338172,14188.4,0.000769
13th_century,34,0.321593,8543.72,0.00081
14th_century,31,0.325205,5366.15,0.000642


In [12]:
# Attach the per-(source, target) path statistics to every game, then discard
# games for which no shortest-path information is available.
# `validate='many_to_one'` makes the merge fail loudly if the stats table ever
# contains a duplicated (source, target) key, instead of silently duplicating
# game rows. The chain builds a new frame rather than mutating one in place.
valid_games_df = (
    valid_games_df
    .merge(
        unique_game_stats_df,
        on=['source', 'target'],
        how='left',
        validate='many_to_one',
    )
    .dropna(subset=['shortest_path_length'])
)
print("Number of valid games after shortest path length filtering: ", len(valid_games_df))

Number of valid games after shortest path length filtering:  48092


In [13]:
# Load one embedding vector per article, indexed by `article_name`.
embeddings_df = load_embeddings()
embeddings_df.head()

Loaded 4604 embeddings in df of shape (4604, 1)


Unnamed: 0_level_0,embedding
article_name,Unnamed: 1_level_1
10th_century,"[-0.050105415, 0.09074813, 0.008811204, -0.051..."
11th_century,"[-0.042490385, 0.061723217, -0.020333063, -0.0..."
12th_century,"[-0.032364372, 0.037580367, -0.052070234, -0.1..."
13th_century,"[-0.07914139, 0.059444286, 0.020845776, -0.100..."
14th_century,"[-0.040906686, 0.122400954, -0.04961793, 0.007..."


In [14]:
# Enrich each game with embedding-based and graph-based features.
# NOTE(review): judging by the column listing printed in the next cell, these
# helpers add `cosine_similarity` and the `*_source` / `*_target` node
# statistics — confirm against src.data_processing.
valid_games_df = compute_cosine_similarity(valid_games_df, embeddings_df)
valid_games_df = merge_with_node_data(valid_games_df, node_stats_df)

In [15]:
# Prepare the modelling frame as a single non-mutating chain: nothing is
# modified in place, so a failure part-way through leaves the previous
# `valid_games_df` untouched, and no column is assigned onto a filtered slice.
valid_games_df = (
    valid_games_df
    # Drop some variables before logistic regression
    # NOTE(review): these look like per-play columns that are only known once
    # a game has been played — confirm that is why they are excluded.
    .drop(columns=[
        'hashIP', 'timestamp', 'path', 'difficulty_rating', 'type_end',
        'duration', 'path_length', 'num_backward'
    ])
    # Keep only games with a strictly positive shortest path length
    .loc[lambda games: games['shortest_path_length'] > 0]
    .assign(
        # The number of paths is better when log-transformed; the 1e-2 offset
        # keeps the logarithm finite when a count is 0. Vectorised np.log
        # replaces the row-wise apply(lambda ...).
        shortest_path_count=lambda games: np.log(games['shortest_path_count'] + 1e-2),
        one_longer_path_count=lambda games: np.log(games['one_longer_path_count'] + 1e-2),
        # Change the type of finished to a float for logistic regression
        finished=lambda games: games['finished'].astype(float),
    )
)

print(valid_games_df.shape)
print(valid_games_df.columns)

(48074, 24)
Index(['finished', 'source', 'target', 'shortest_path_length',
       'shortest_path_count', 'max_sp_pagerank', 'max_sp_avg_pagerank',
       'avg_sp_avg_pagerank', 'one_longer_path_count', 'max_ol_pagerank',
       'max_ol_avg_pagerank', 'avg_ol_avg_pagerank', 'two_longer_pagerank',
       'max_tl_pagerank', 'max_tl_avg_pagerank', 'cosine_similarity',
       'degree_source', 'closeness_source', 'betweenness_source',
       'pagerank_source', 'degree_target', 'closeness_target',
       'betweenness_target', 'pagerank_target'],
      dtype='object')


In [16]:
# Model 1: shortest-path structure only (length and log-count of shortest paths).
features_1 = ['shortest_path_length', 'shortest_path_count']

# NOTE(review): `has_constant_term` is not passed here and the summary of this
# model has no `const` row, so it appears to be fit without an intercept —
# confirm that this is intended.
model_1 = LogisticRegression(valid_games_df, features_1)
model_1.fit()

Class distribution: finished
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
Total number of samples: 46470
Optimization terminated successfully.
         Current function value: 0.649173
         Iterations 5
Training Set Metrics:
Threshold:   0.5030
F1 Score:    0.6181
Precision:   0.6182
Accuracy:    0.6182
              precision    recall  f1-score   support

         0.0     0.6277    0.6451    0.6363      4647
         1.0     0.6350    0.6174    0.6261      4647

    accuracy                         0.6313      9294
   macro avg     0.6314    0.6313    0.6312      9294
weighted avg     0.6314    0.6313    0.6312      9294



In [17]:
# Coefficient table for model 1.
# NOTE(review): the rows are labelled x1, x2 rather than by feature name —
# presumably in the order of `features_1`; confirm in src.logistic_regression.
model_1.summary()

0,1,2,3
Dep. Variable:,finished,No. Observations:,37176.0
Model:,Logit,Df Residuals:,37174.0
Method:,MLE,Df Model:,1.0
Date:,"Thu, 05 Dec 2024",Pseudo R-squ.:,0.06344
Time:,23:06:05,Log-Likelihood:,-24134.0
converged:,True,LL-Null:,-25768.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,-0.7574,0.014,-52.498,0.000,-0.786,-0.729
x2,0.3402,0.013,26.288,0.000,0.315,0.366


In [18]:
# Model 2: the target article's PageRank as the only feature, fit with an
# explicit constant (intercept) term.
features_2 = ['pagerank_target']

# NOTE(review): fitting emits many sklearn `_warn_prf` warnings (see the cell
# output) — presumably from metrics evaluated at thresholds where one class is
# never predicted; confirm and consider silencing them inside the wrapper.
model_2 = LogisticRegression(valid_games_df, features_2, has_constant_term=True)
model_2.fit()

Class distribution: finished
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
Total number of samples: 46470
Optimization terminated successfully.
         Current function value: 0.664087
         Iterations 6


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Training Set Metrics:
Threshold:   0.4470
F1 Score:    0.6333
Precision:   0.6334
Accuracy:    0.6334
              precision    recall  f1-score   support

         0.0     0.6514    0.6383    0.6448      4647
         1.0     0.6454    0.6585    0.6519      4647

    accuracy                         0.6484      9294
   macro avg     0.6484    0.6484    0.6483      9294
weighted avg     0.6484    0.6484    0.6483      9294



In [19]:
# Coefficient table for model 2 (`const` is the intercept, x1 the single
# feature in `features_2`).
model_2.summary()

0,1,2,3
Dep. Variable:,finished,No. Observations:,37176.0
Model:,Logit,Df Residuals:,37174.0
Method:,MLE,Df Model:,1.0
Date:,"Thu, 05 Dec 2024",Pseudo R-squ.:,0.04192
Time:,23:06:09,Log-Likelihood:,-24688.0
converged:,True,LL-Null:,-25768.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0508,0.011,4.634,0.000,0.029,0.072
x1,0.7266,0.019,37.442,0.000,0.689,0.765


In [None]:
# Model 3: every available pre-game feature — shortest-path statistics,
# PageRank aggregates over shortest / one-longer / two-longer paths, embedding
# cosine similarity, and graph statistics of the source and target articles.
features_3 = [
    'shortest_path_length',
    'shortest_path_count', 'max_sp_pagerank', 'max_sp_avg_pagerank',
    'avg_sp_avg_pagerank', 'one_longer_path_count', 'max_ol_pagerank',
    'max_ol_avg_pagerank', 'avg_ol_avg_pagerank', 'two_longer_pagerank',
    'max_tl_pagerank', 'max_tl_avg_pagerank', 'cosine_similarity',
    'degree_source', 'closeness_source', 'betweenness_source',
    'pagerank_source', 'degree_target', 'closeness_target',
    'betweenness_target', 'pagerank_target'
]

# Fit with an explicit constant term. The summary recorded for this model
# reports a `const` coefficient and 22 estimated parameters (21 features plus
# the intercept), which the default (no constant) cannot produce; passing the
# flag keeps the code and the reported results in agreement on a fresh run.
model_3 = LogisticRegression(valid_games_df, features_3, has_constant_term=True)
model_3.fit()

Class distribution: finished
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
Total number of samples: 46470
Optimization terminated successfully.
         Current function value: 0.635179
         Iterations 5
Training Set Metrics:
Threshold:   0.4730
F1 Score:    0.6333
Precision:   0.6368
Accuracy:    0.6347
              precision    recall  f1-score   support

         0.0     0.6616    0.5793    0.6177      4647
         1.0     0.6258    0.7037    0.6625      4647

    accuracy                         0.6415      9294
   macro avg     0.6437    0.6415    0.6401      9294
weighted avg     0.6437    0.6415    0.6401      9294



In [22]:
# Coefficient table for model 3.
# NOTE(review): the rows are labelled x1..xN rather than by feature name —
# presumably in the order of `features_3`; confirm in src.logistic_regression.
model_3.summary()

0,1,2,3
Dep. Variable:,finished,No. Observations:,37176.0
Model:,Logit,Df Residuals:,37154.0
Method:,MLE,Df Model:,21.0
Date:,"Thu, 05 Dec 2024",Pseudo R-squ.:,0.08363
Time:,23:06:50,Log-Likelihood:,-23613.0
converged:,True,LL-Null:,-25768.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0097,0.011,-0.874,0.382,-0.032,0.012
x1,-0.8232,0.027,-30.423,0.000,-0.876,-0.770
x2,0.0757,0.034,2.222,0.026,0.009,0.142
x3,-0.0688,0.039,-1.770,0.077,-0.145,0.007
x4,0.0444,0.049,0.911,0.362,-0.051,0.140
x5,0.0108,0.027,0.396,0.692,-0.043,0.064
x6,0.3082,0.039,7.839,0.000,0.231,0.385
x7,-0.0903,0.023,-4.004,0.000,-0.135,-0.046
x8,0.1412,0.036,3.894,0.000,0.070,0.212


In [23]:
# Majority-vote ensemble: each of the three fitted models casts one vote per
# game, and a game is predicted as finished when at least two models agree.
# NOTE(review): predictions are made on every row of `valid_games_df`, which
# presumably includes the rows the models were fit on — confirm whether a
# held-out evaluation is intended instead.
combined_pred = valid_games_df[['finished']].assign(
    model_1_pred=model_1.predict(valid_games_df[features_1]),
    model_2_pred=model_2.predict(valid_games_df[features_2]),
    model_3_pred=model_3.predict(valid_games_df[features_3]),
)

# Count the positive votes per game and apply the two-out-of-three rule.
combined_pred['pred'] = (
    combined_pred[['model_1_pred', 'model_2_pred', 'model_3_pred']]
    .sum(axis=1)
    .ge(2)
    .astype(int)
)

print(classification_report(combined_pred['finished'], combined_pred['pred'], digits=4))

              precision    recall  f1-score   support

         0.0     0.6629    0.6020    0.6310     24839
         1.0     0.6126    0.6726    0.6412     23235

    accuracy                         0.6362     48074
   macro avg     0.6377    0.6373    0.6361     48074
weighted avg     0.6385    0.6362    0.6359     48074

