In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm

from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from utils.data_processing import *
from utils.graph_processing import *
from models.logistic_regression import LogisticRegression

We are first loading all possible games: games played before 2011, that were winnable, where there was no timeout. We remove timeouted games as we do not know the reason why the player gave up and it may not be linked with the Wikispeedia game.

In [2]:
games = load_preprocessed_games()
games.head()

Loaded 51318 finished paths in df of shape (51318, 7)
Loaded 24875 unfinished paths in df of shape (24875, 8)
After filtering all paths after 2011-02-07 05:02:15
we kept 23245 paths out of 51318 finished paths
There are 24875 unfinished paths
Loaded 4604 articles in df of shape (4604, 1)
Pruning invalid games. Initially we have 48120 games
Pruned invalid games. Now we have 48092 valid games
After removing timeouted games, there are 38775 games left


Unnamed: 0,difficulty_rating,duration,finished,hashIP,num_backward,path,path_length,source,target,timestamp,type_end
0,,166,True,6a3701d319fc3754,0,"[14th_century, 15th_century, 16th_century, Pac...",9,14th_century,African_slave_trade,2011-02-15 03:26:49,
1,3.0,88,True,3824310e536af032,0,"[14th_century, Europe, Africa, Atlantic_slave_...",5,14th_century,African_slave_trade,2012-08-12 06:36:52,
2,,138,True,415612e93584d30e,0,"[14th_century, Niger, Nigeria, British_Empire,...",8,14th_century,African_slave_trade,2012-10-03 21:10:40,
3,3.0,175,True,015245d773376aab,0,"[14th_century, Italy, Roman_Catholic_Church, H...",7,14th_century,John_F._Kennedy,2013-04-23 15:27:08,
4,,110,True,5295bca242be81fe,0,"[14th_century, Europe, North_America, United_S...",6,14th_century,John_F._Kennedy,2013-07-03 22:26:54,


Let's see how long games are.

In [3]:
games[(games['path_length'] == 1)].shape

(3204, 11)

In [4]:
print(games[(games['path_length'] == 1) & (games['finished'] == True)].shape)
print(games[(games['path_length'] == 1) & (games['finished'] == True)])

(9, 11)
       difficulty_rating  duration  finished            hashIP  num_backward  \
3908                 NaN         0      True  43f864c75b2571b5             0   
5405                 NaN         0      True  43f864c75b2571b5             0   
11137                NaN         0      True  43f864c75b2571b5             0   
12391                NaN         0      True  43f864c75b2571b5             0   
14685                NaN         0      True  43f864c75b2571b5             0   
15737                NaN         0      True  43f864c75b2571b5             0   
15911                NaN         0      True  43f864c75b2571b5             0   
17716                NaN         0      True  43f864c75b2571b5             0   
22649                NaN         0      True  43f864c75b2571b5             0   

                         path  path_length                source  \
3908                [Lesotho]            1               Lesotho   
5405                   [Coal]            1             

So there are 9 games where the source and target article are the same, so the player automatically won. These games are not useful when analysing games so we will remove them. We still keep the games of path length 1 since the nature of the first article may impact if the player gives up.

In [5]:
condition = (games['path_length'] == 1) & (games['finished'] == True)
games = games[~condition]
games.shape

(38766, 11)

In [6]:
games[(games['path_length'] == 2)].shape

(2233, 11)

In [7]:
games[(games['path_length'] == 3)].shape

(3354, 11)

In [8]:
games[(games['path_length'] > 4)].shape

(24641, 11)

Most of the games have a path length bigger than 4 (at least 3 clicks before the final click), so we will first try to analyse these.

In [9]:
games_longer_than_4 = games.copy()
games_longer_than_4 = games_longer_than_4[(games_longer_than_4['path_length'] > 4)]

The features we will take into account will be the inherent difficulty of the game and informations from the first 3 clicks. The information from the first 3 clicks will be: the shortest path length from the article to the goal, the fame of the article, if the click was a backward click or not, the pagerank of the article, the duration (approximated), the probabilities of the link used being known to people, the pagerank of the target. The target variable is whether the games was won or lost. We do not use information from the first article as it is contained in the inherent difficulty.

In [10]:
games_longer_than_4.head()

Unnamed: 0,difficulty_rating,duration,finished,hashIP,num_backward,path,path_length,source,target,timestamp,type_end
0,,166,True,6a3701d319fc3754,0,"[14th_century, 15th_century, 16th_century, Pac...",9,14th_century,African_slave_trade,2011-02-15 03:26:49,
1,3.0,88,True,3824310e536af032,0,"[14th_century, Europe, Africa, Atlantic_slave_...",5,14th_century,African_slave_trade,2012-08-12 06:36:52,
2,,138,True,415612e93584d30e,0,"[14th_century, Niger, Nigeria, British_Empire,...",8,14th_century,African_slave_trade,2012-10-03 21:10:40,
3,3.0,175,True,015245d773376aab,0,"[14th_century, Italy, Roman_Catholic_Church, H...",7,14th_century,John_F._Kennedy,2013-04-23 15:27:08,
4,,110,True,5295bca242be81fe,0,"[14th_century, Europe, North_America, United_S...",6,14th_century,John_F._Kennedy,2013-07-03 22:26:54,


In [11]:
games_longer_than_4['duration'] = (3/(games_longer_than_4['path_length']-1))* games_longer_than_4['duration']

In [12]:
games_longer_than_4.drop(columns = ["difficulty_rating", 'hashIP', 'num_backward', 'path_length', 'type_end', 'timestamp'], inplace = True)

In [13]:
games_longer_than_4.head()

Unnamed: 0,duration,finished,path,source,target
0,62.25,True,"[14th_century, 15th_century, 16th_century, Pac...",14th_century,African_slave_trade
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade
3,87.5,True,"[14th_century, Italy, Roman_Catholic_Church, H...",14th_century,John_F._Kennedy
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy


In [14]:
games_longer_than_4['first_click'] = games_longer_than_4['path'].apply(lambda x: x[1])
games_longer_than_4['second_click'] = games_longer_than_4['path'].apply(lambda x: x[2])
games_longer_than_4['third_click'] = games_longer_than_4['path'].apply(lambda x: x[3])
games_longer_than_4['num_back'] = games_longer_than_4.apply(lambda a: int(a['first_click'] == '<') + int(a['second_click'] == '<') + int(a['third_click'] == '<'), axis = 1)


#games_longer_than_4.drop(columns=['path'], inplace=True)

games_longer_than_4.head()


Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back
0,62.25,True,"[14th_century, 15th_century, 16th_century, Pac...",14th_century,African_slave_trade,15th_century,16th_century,Pacific_Ocean,0
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0
3,87.5,True,"[14th_century, Italy, Roman_Catholic_Church, H...",14th_century,John_F._Kennedy,Italy,Roman_Catholic_Church,HIV,0
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0


In [15]:
print((games_longer_than_4['first_click'] == '<').sum())
print((games_longer_than_4['second_click'] == '<').sum())
print((games_longer_than_4['third_click'] == '<').sum())
games_longer_than_4[games_longer_than_4['num_back'] > 0].shape



0
1235
1303


(2538, 9)

In [16]:
games_longer_than_4['second_is_<'] = games_longer_than_4['second_click'] == '<'
games_longer_than_4['third_is_<'] = games_longer_than_4['third_click'] == '<'

In [17]:
games_longer_than_4['second_click'] = games_longer_than_4.apply(
    lambda row: row['source'] if row['second_click'] == '<' else row['second_click'], axis=1
)

games_longer_than_4['third_click'] = games_longer_than_4.apply(
    lambda row: row['first_click'] if row['third_click'] == '<' else row['third_click'], axis=1
)
print((games_longer_than_4['first_click'] == '<').sum())
print((games_longer_than_4['second_click'] == '<').sum())
print((games_longer_than_4['third_click'] == '<').sum())

0
0
0


In [18]:
node_stats_df = load_or_compute_node_stats()
games_longer_than_4 = merge_with_node_data(games_longer_than_4, node_stats_df, columns = ['source', 'target', 'first_click', 'second_click', 'third_click'], data = ['pagerank'])
games_longer_than_4.head()

Loaded 4604 node stats


Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back,second_is_<,third_is_<,pagerank_source,pagerank_target,pagerank_first_click,pagerank_second_click,pagerank_third_click
0,62.25,True,"[14th_century, 15th_century, 16th_century, Pac...",14th_century,African_slave_trade,15th_century,16th_century,Pacific_Ocean,0,False,False,0.000642,3e-05,0.001024,0.001223,0.001408
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0,False,False,0.000642,3e-05,0.006698,0.003321,0.000145
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0,False,False,0.000642,3e-05,0.000408,0.00063,0.001197
3,87.5,True,"[14th_century, Italy, Roman_Catholic_Church, H...",14th_century,John_F._Kennedy,Italy,Roman_Catholic_Church,HIV,0,False,False,0.000642,0.000315,0.003975,0.002173,0.000357
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0,False,False,0.000642,0.000315,0.006698,0.002751,0.009774


In [19]:
fame_df = load_fame()
games_longer_than_4 = merge_with_fame_data(games_longer_than_4, fame_df, columns = ['source', 'first_click', 'second_click', 'third_click'])

Dropped 5055 games without fame statistics


In [20]:
games_longer_than_4.head()

Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back,second_is_<,third_is_<,pagerank_source,pagerank_target,pagerank_first_click,pagerank_second_click,pagerank_third_click,fame_score_source,fame_score_first_click,fame_score_second_click,fame_score_third_click
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0,False,False,0.000642,3e-05,0.006698,0.003321,0.000145,8.0,8.0,9.0,6.0
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0,False,False,0.000642,3e-05,0.000408,0.00063,0.001197,8.0,6.0,8.0,8.0
3,87.5,True,"[14th_century, Italy, Roman_Catholic_Church, H...",14th_century,John_F._Kennedy,Italy,Roman_Catholic_Church,HIV,0,False,False,0.000642,0.000315,0.003975,0.002173,0.000357,8.0,9.0,9.0,9.0
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0,False,False,0.000642,0.000315,0.006698,0.002751,0.009774,8.0,8.0,9.0,10.0
5,82.928571,True,"[14th_century, Europe, Republic_of_Ireland, <,...",14th_century,Rainbow,Europe,Republic_of_Ireland,Europe,1,False,True,0.000642,0.000144,0.006698,0.001045,0.006698,8.0,8.0,8.0,8.0


In [21]:
link_proba = load_link_proba()
link_proba.head()
links_df = load_links_df()
link_proba.isna().sum()
#print(print(games_longer_than_4.loc[3167]))
#links[(links['source'] == 'Finland') & (links['target'] == 'Åland')]

Loaded 119882 links in df of shape (119882, 2)


link_probability    5140
dtype: int64

In [22]:
combs = []
seen_tuples = set() 

for i, row in games_longer_than_4.iterrows():
    pair1 = (row['source'], row['first_click'])
    if pair1 not in link_proba.index and pair1 not in seen_tuples:
        combs.append((i, *pair1))
        seen_tuples.add(pair1)

    pair2 = (row['first_click'], row['second_click'])
    sec_is_b = row['second_is_<']
    if pair2 not in link_proba.index and not sec_is_b and pair2 not in seen_tuples:
        combs.append((i, *pair2))
        seen_tuples.add(pair2)
    
    pair3 = (row['second_click'], row['third_click'])
    third_is_b = row['third_is_<']
    if pair3 not in link_proba.index and not third_is_b and pair3 not in seen_tuples:
        combs.append((i, *pair3))
        seen_tuples.add(pair3)
indexes = []
for i in combs:
    print(i)
    indexes.append(i[0])
print(indexes)

(935, 'Batman', 'Wikipedia_Text_of_the_GNU_Free_Documentation_License')
(3167, 'Finland', 'Åland')
(8055, 'Programming_language', 'Wikipedia_Text_of_the_GNU_Free_Documentation_License')
(15331, 'Consolation_of_Philosophy', 'Wikipedia_Text_of_the_GNU_Free_Documentation_License')
(15730, 'Abbasid', 'Wikipedia_Text_of_the_GNU_Free_Documentation_License')
(22757, 'Yttrium', 'Wikipedia_Text_of_the_GNU_Free_Documentation_License')
(28130, 'Company_(law)', 'Wikipedia_Text_of_the_GNU_Free_Documentation_License')
(32236, 'Rabbit', 'Wikipedia_Text_of_the_GNU_Free_Documentation_License')
(32412, 'Aircraft', 'Wikipedia_Text_of_the_GNU_Free_Documentation_License')
(34184, 'Communication', 'Wikipedia_Text_of_the_GNU_Free_Documentation_License')
(38446, 'Railway_post_office', 'Wikipedia_Text_of_the_GNU_Free_Documentation_License')
(42069, 'Electron_beam_welding', 'Wikipedia_Text_of_the_GNU_Free_Documentation_License')
(46398, 'Actinium', 'Wikipedia_Text_of_the_GNU_Free_Documentation_License')
[935, 3

In [23]:
games_longer_than_4.drop(index = indexes, inplace = True)

In [24]:
games_longer_than_4['second_click'] = games_longer_than_4.apply(lambda row: '<' if row['second_is_<'] else row['second_click'], axis = 1)
games_longer_than_4['third_click'] = games_longer_than_4.apply(lambda row: '<' if row['third_is_<'] else row['third_click'], axis = 1)

In [25]:
print((games_longer_than_4['first_click'] == '<').sum())
print((games_longer_than_4['second_click'] == '<').sum())
print((games_longer_than_4['third_click'] == '<').sum())

0
1031
1112


In [26]:
games_longer_than_4 = add_link_proba_info(games_longer_than_4, link_proba, pairs = [['source', 'first_click'], ['first_click', 'second_click'], ['second_click', 'third_click']], names = ['first_link', 'second_link', 'third_link'])
games_longer_than_4.head()

Dropped 2132 games without link statistics


Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back,second_is_<,...,pagerank_first_click,pagerank_second_click,pagerank_third_click,fame_score_source,fame_score_first_click,fame_score_second_click,fame_score_third_click,first_link,second_link,third_link
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0,False,...,0.006698,0.003321,0.000145,8.0,8.0,9.0,6.0,0.9,0.3,0.7
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0,False,...,0.000408,0.00063,0.001197,8.0,6.0,8.0,8.0,0.1,0.9,0.3
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0,False,...,0.006698,0.002751,0.009774,8.0,8.0,9.0,10.0,0.9,0.1,0.9
5,82.928571,True,"[14th_century, Europe, Republic_of_Ireland, <,...",14th_century,Rainbow,Europe,Republic_of_Ireland,<,1,False,...,0.006698,0.001045,0.006698,8.0,8.0,8.0,8.0,0.9,0.7,0.0
6,59.666667,True,"[14th_century, English_peasants'_revolt_of_138...",14th_century,Rainbow,English_peasants'_revolt_of_1381,Archbishop_of_Canterbury,19th_century,0,False,...,6.8e-05,0.000469,0.002593,8.0,3.0,7.0,4.0,0.1,0.15,0.2


In [27]:
print((games_longer_than_4['first_click'] == '<').sum())
print((games_longer_than_4['second_click'] == '<').sum())
print((games_longer_than_4['third_click'] == '<').sum())

0
995
1029


In [28]:
articles_df = load_article_df()
adj_matrix = construct_adjecency_matrix(links_df, articles_df['article_name'].tolist())
adj_list = from_adjacency_matrix_to_list(adj_matrix)
index_mapping = generate_inverse_index_mapping(adj_list)
pair_data = load_pair_data_with_multiindex('../src/data/pair_stats.txt', index_mapping)
pair_data.head()

Loaded 4604 articles in df of shape (4604, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,shortest_path_length,shortest_path_count,max_sp_node_degree,max_sp_avg_node_degree,avg_sp_avg_node_degree,one_longer_path_count,max_ol_node_degree,max_ol_avg_node_degree,avg_ol_avg_node_degree,two_longer_path_count,max_tl_node_degree,max_tl_avg_node_degree
source,target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10th_century,10th_century,0,1,0,0,0,0,0,0,0,0,0,0
10th_century,11th_century,1,1,26,13,13,2,62,29,26,7,112,46
10th_century,12th_century,2,5,48,24,23,121,180,93,37,30,169,70
10th_century,13th_century,2,4,79,35,28,131,169,84,39,30,169,71
10th_century,14th_century,2,4,53,26,22,113,169,84,37,30,169,70


In [29]:
pair_data.columns

Index(['shortest_path_length', 'shortest_path_count', 'max_sp_node_degree',
       'max_sp_avg_node_degree', 'avg_sp_avg_node_degree',
       'one_longer_path_count', 'max_ol_node_degree', 'max_ol_avg_node_degree',
       'avg_ol_avg_node_degree', 'two_longer_path_count', 'max_tl_node_degree',
       'max_tl_avg_node_degree'],
      dtype='object')

In [30]:
games_longer_than_4 = add_pair_data(games_longer_than_4, pair_data, pairs =[['source', 'target'], ['first_click', 'target'], ['second_click', 'target'], ['third_click', 'target']], names = ["source", "first", "second", "third"], data = ['shortest_path_length', 'shortest_path_count']) 

Dropped 2 games without link statistics


In [31]:
games_longer_than_4.head()

Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back,second_is_<,...,second_link,third_link,shortest_path_length_source,shortest_path_count_source,shortest_path_length_first,shortest_path_count_first,shortest_path_length_second,shortest_path_count_second,shortest_path_length_third,shortest_path_count_third
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0,False,...,0.3,0.7,3.0,3.0,3.0,7.0,2.0,1.0,1.0,1.0
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0,False,...,0.9,0.3,3.0,3.0,3.0,4.0,2.0,1.0,3.0,7.0
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0,False,...,0.1,0.9,3.0,18.0,2.0,2.0,2.0,2.0,1.0,1.0
5,82.928571,True,"[14th_century, Europe, Republic_of_Ireland, <,...",14th_century,Rainbow,Europe,Republic_of_Ireland,<,1,False,...,0.7,0.0,3.0,5.0,3.0,6.0,3.0,12.0,0.0,0.0
6,59.666667,True,"[14th_century, English_peasants'_revolt_of_138...",14th_century,Rainbow,English_peasants'_revolt_of_1381,Archbishop_of_Canterbury,19th_century,0,False,...,0.15,0.2,3.0,5.0,3.0,2.0,3.0,4.0,3.0,17.0


In [32]:
games_longer_than_4.columns

Index(['duration', 'finished', 'path', 'source', 'target', 'first_click',
       'second_click', 'third_click', 'num_back', 'second_is_<', 'third_is_<',
       'pagerank_source', 'pagerank_target', 'pagerank_first_click',
       'pagerank_second_click', 'pagerank_third_click', 'fame_score_source',
       'fame_score_first_click', 'fame_score_second_click',
       'fame_score_third_click', 'first_link', 'second_link', 'third_link',
       'shortest_path_length_source', 'shortest_path_count_source',
       'shortest_path_length_first', 'shortest_path_count_first',
       'shortest_path_length_second', 'shortest_path_count_second',
       'shortest_path_length_third', 'shortest_path_count_third'],
      dtype='object')

In [33]:
games_longer_than_4.drop(columns = ['path', 'source', 'target', 'first_click',
       'second_click', 'third_click', 'num_back'], inplace = True)

In [34]:
games_longer_than_4.head()

Unnamed: 0,duration,finished,second_is_<,third_is_<,pagerank_source,pagerank_target,pagerank_first_click,pagerank_second_click,pagerank_third_click,fame_score_source,...,second_link,third_link,shortest_path_length_source,shortest_path_count_source,shortest_path_length_first,shortest_path_count_first,shortest_path_length_second,shortest_path_count_second,shortest_path_length_third,shortest_path_count_third
1,66.0,True,False,False,0.000642,3e-05,0.006698,0.003321,0.000145,8.0,...,0.3,0.7,3.0,3.0,3.0,7.0,2.0,1.0,1.0,1.0
2,59.142857,True,False,False,0.000642,3e-05,0.000408,0.00063,0.001197,8.0,...,0.9,0.3,3.0,3.0,3.0,4.0,2.0,1.0,3.0,7.0
4,66.0,True,False,False,0.000642,0.000315,0.006698,0.002751,0.009774,8.0,...,0.1,0.9,3.0,18.0,2.0,2.0,2.0,2.0,1.0,1.0
5,82.928571,True,False,True,0.000642,0.000144,0.006698,0.001045,0.006698,8.0,...,0.7,0.0,3.0,5.0,3.0,6.0,3.0,12.0,0.0,0.0
6,59.666667,True,False,False,0.000642,0.000144,6.8e-05,0.000469,0.002593,8.0,...,0.15,0.2,3.0,5.0,3.0,2.0,3.0,4.0,3.0,17.0


In [35]:
games_longer_than_4.columns

Index(['duration', 'finished', 'second_is_<', 'third_is_<', 'pagerank_source',
       'pagerank_target', 'pagerank_first_click', 'pagerank_second_click',
       'pagerank_third_click', 'fame_score_source', 'fame_score_first_click',
       'fame_score_second_click', 'fame_score_third_click', 'first_link',
       'second_link', 'third_link', 'shortest_path_length_source',
       'shortest_path_count_source', 'shortest_path_length_first',
       'shortest_path_count_first', 'shortest_path_length_second',
       'shortest_path_count_second', 'shortest_path_length_third',
       'shortest_path_count_third'],
      dtype='object')

In [38]:
features_1 = ['duration', 'second_is_<', 'third_is_<', 'pagerank_source',
       'pagerank_target', 'pagerank_first_click', 'pagerank_second_click',
       'pagerank_third_click', 'fame_score_source', 'fame_score_first_click',
       'fame_score_second_click', 'fame_score_third_click', 'first_link',
       'second_link', 'third_link', 'shortest_path_length_source',
       'shortest_path_count_source', 'shortest_path_length_first',
       'shortest_path_count_first', 'shortest_path_length_second',
       'shortest_path_count_second', 'shortest_path_length_third',
       'shortest_path_count_third']

In [39]:
model_1 = LogisticRegression(games_longer_than_4, features_1)
model_1.fit()

Class distribution: finished
False    0.5
True     0.5
Name: proportion, dtype: float64
Total number of samples: 9664
Optimization terminated successfully.
         Current function value: 0.537664
         Iterations 6
Training Set Metrics:
Threshold:   0.5130
F1 Score:    0.7246
Precision:   0.7247
Accuracy:    0.7246
              precision    recall  f1-score   support

       False     0.7305    0.7260    0.7282       967
        True     0.7274    0.7319    0.7296       966

    accuracy                         0.7289      1933
   macro avg     0.7289    0.7289    0.7289      1933
weighted avg     0.7289    0.7289    0.7289      1933



In [40]:
model_1.summary()

0,1,2,3
Dep. Variable:,finished,No. Observations:,7731.0
Model:,Logit,Df Residuals:,7708.0
Method:,MLE,Df Model:,22.0
Date:,"Thu, 12 Dec 2024",Pseudo R-squ.:,0.2243
Time:,17:11:18,Log-Likelihood:,-4156.7
converged:,True,LL-Null:,-5358.7
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,-0.5685,0.039,-14.715,0.000,-0.644,-0.493
x2,-0.1080,0.051,-2.138,0.033,-0.207,-0.009
x3,-0.8718,0.041,-21.475,0.000,-0.951,-0.792
x4,0.0142,0.029,0.486,0.627,-0.043,0.072
x5,0.2865,0.045,6.324,0.000,0.198,0.375
x6,0.0108,0.031,0.350,0.726,-0.050,0.071
x7,-0.0748,0.030,-2.466,0.014,-0.134,-0.015
x8,-0.0870,0.029,-2.997,0.003,-0.144,-0.030
x9,0.0172,0.029,0.600,0.548,-0.039,0.073


Based on p-value, we remove 'pagerank_source', 'pagerank_first_click','fame_score_source', 'fame_score_third_click', 'fame_score_first_click', 'fame_score_second_click', 'fame_score_third_click', 'first_link', 'second_link', 'third_link', shortest_path_count_source', 'shortest_path_count_first', 'shortest_path_length_first'.

In [41]:
features_2 = ['duration', 'second_is_<', 'third_is_<',
       'pagerank_target', 'pagerank_second_click',
       'pagerank_third_click',
        'shortest_path_length_source',
        'shortest_path_length_second',
       'shortest_path_count_second', 'shortest_path_length_third',
       'shortest_path_count_third']

In [42]:
model_2 = LogisticRegression(games_longer_than_4, features_2)
model_2.fit()

Class distribution: finished
False    0.5
True     0.5
Name: proportion, dtype: float64
Total number of samples: 9664
Optimization terminated successfully.
         Current function value: 0.538087
         Iterations 6
Training Set Metrics:
Threshold:   0.5570
F1 Score:    0.7234
Precision:   0.7260
Accuracy:    0.7240
              precision    recall  f1-score   support

       False     0.7167    0.7746    0.7445       967
        True     0.7545    0.6936    0.7228       966

    accuracy                         0.7341      1933
   macro avg     0.7356    0.7341    0.7336      1933
weighted avg     0.7356    0.7341    0.7337      1933



In [43]:
model_2.summary()

0,1,2,3
Dep. Variable:,finished,No. Observations:,7731.0
Model:,Logit,Df Residuals:,7720.0
Method:,MLE,Df Model:,10.0
Date:,"Thu, 12 Dec 2024",Pseudo R-squ.:,0.2237
Time:,17:18:23,Log-Likelihood:,-4160.0
converged:,True,LL-Null:,-5358.7
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,-0.5628,0.038,-14.662,0.000,-0.638,-0.488
x2,-0.0911,0.046,-1.994,0.046,-0.181,-0.002
x3,-0.8676,0.038,-22.770,0.000,-0.942,-0.793
x4,0.2770,0.043,6.449,0.000,0.193,0.361
x5,-0.0707,0.029,-2.447,0.014,-0.127,-0.014
x6,-0.0912,0.027,-3.402,0.001,-0.144,-0.039
x7,0.1309,0.035,3.714,0.000,0.062,0.200
x8,-0.2442,0.058,-4.232,0.000,-0.357,-0.131
x9,0.1187,0.031,3.851,0.000,0.058,0.179


Let's see if using an average of node values helps.

In [58]:
features_3 = ['duration', 'second_is_<', 'third_is_<',
       'pagerank_target', 'pagerank_second_click',
       'pagerank_third_click',
        'shortest_path_length_source',
        'shortest_path_length_second',
       'shortest_path_count_second', 'shortest_path_length_third',
       'shortest_path_count_third']

In [59]:
model_3 = LogisticRegression(games_longer_than_4, features_3)
model_3.fit()

Class distribution: finished
False    0.5
True     0.5
Name: proportion, dtype: float64
Total number of samples: 9664
Optimization terminated successfully.
         Current function value: 0.538087
         Iterations 6
Training Set Metrics:
Threshold:   0.5570
F1 Score:    0.7234
Precision:   0.7260
Accuracy:    0.7240
              precision    recall  f1-score   support

       False     0.7167    0.7746    0.7445       967
        True     0.7545    0.6936    0.7228       966

    accuracy                         0.7341      1933
   macro avg     0.7356    0.7341    0.7336      1933
weighted avg     0.7356    0.7341    0.7337      1933

