In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm

from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from utils.data_processing import *
from utils.graph_processing import *
from models.logistic_regression import LogisticRegression

We are first loading all possible games: games played before 2011, that were winnable, where there was no timeout. We remove timeouted games as we do not know the reason why the player gave up and it may not be linked with the Wikispeedia game.

In [2]:
games = load_preprocessed_games()
games.head()

Loaded 51318 finished paths in df of shape (51318, 7)
Loaded 24875 unfinished paths in df of shape (24875, 8)
After filtering all paths after 2011-02-07 05:02:15
we kept 23245 paths out of 51318 finished paths
There are 24875 unfinished paths
Loaded 4604 articles in df of shape (4604, 1)
Pruning invalid games. Initially we have 48120 games
Pruned invalid games. Now we have 48092 valid games
Removed 101 games that contained non existing links
After removing timeouted games, there are 38696 games left


Unnamed: 0,difficulty_rating,duration,finished,hashIP,num_backward,path,path_length,source,target,timestamp,type_end
0,,166,True,6a3701d319fc3754,0,"[14th_century, 15th_century, 16th_century, Pac...",9,14th_century,African_slave_trade,2011-02-15 03:26:49,
1,3.0,88,True,3824310e536af032,0,"[14th_century, Europe, Africa, Atlantic_slave_...",5,14th_century,African_slave_trade,2012-08-12 06:36:52,
2,,138,True,415612e93584d30e,0,"[14th_century, Niger, Nigeria, British_Empire,...",8,14th_century,African_slave_trade,2012-10-03 21:10:40,
3,3.0,175,True,015245d773376aab,0,"[14th_century, Italy, Roman_Catholic_Church, H...",7,14th_century,John_F._Kennedy,2013-04-23 15:27:08,
4,,110,True,5295bca242be81fe,0,"[14th_century, Europe, North_America, United_S...",6,14th_century,John_F._Kennedy,2013-07-03 22:26:54,


Let's see how long games are.

In [3]:
games[(games['path_length'] == 1)].shape

(3204, 11)

In [4]:
print(games[(games['path_length'] == 1) & (games['finished'] == True)].shape)
print(games[(games['path_length'] == 1) & (games['finished'] == True)])

(9, 11)
       difficulty_rating  duration  finished            hashIP  num_backward  \
3908                 NaN         0      True  43f864c75b2571b5             0   
5405                 NaN         0      True  43f864c75b2571b5             0   
11137                NaN         0      True  43f864c75b2571b5             0   
12391                NaN         0      True  43f864c75b2571b5             0   
14685                NaN         0      True  43f864c75b2571b5             0   
15737                NaN         0      True  43f864c75b2571b5             0   
15911                NaN         0      True  43f864c75b2571b5             0   
17716                NaN         0      True  43f864c75b2571b5             0   
22649                NaN         0      True  43f864c75b2571b5             0   

                         path  path_length                source  \
3908                [Lesotho]            1               Lesotho   
5405                   [Coal]            1             

So there are 9 games where the source and target article are the same, so the player automatically won. These games are not useful when analysing games so we will remove them. We still keep the lost games of path length 1 since the nature of the first article may impact if the player gives up.

In [5]:
condition = (games['path_length'] == 1) & (games['finished'] == True)
games = games[~condition]
games.shape

(38687, 11)

In [6]:
games[(games['path_length'] == 2)].shape

(2211, 11)

In [7]:
games[(games['path_length'] == 3)].shape

(3344, 11)

In [8]:
games[(games['path_length'] > 4)].shape

(24597, 11)

Most of the games have a path length bigger than 4 (at least 3 clicks before the final click), so we will first try to analyse these.

In [9]:
games_longer_than_4 = games.copy()
games_longer_than_4 = games_longer_than_4[(games_longer_than_4['path_length'] > 3)] 
condition = (games_longer_than_4['path_length'] == 4) & (games_longer_than_4['finished'] == True)
games_longer_than_4 = games_longer_than_4[~condition]

The features we will take into account will be the inherent difficulty of the game and informations from the first 3 clicks. The information from the first 3 clicks will be: the shortest path length from the article to the goal, the fame of the article, if the click was a backward click or not, the pagerank of the article, the duration (approximated), the probabilities of the link used being known to people, the pagerank of the target. The target variable is whether the games was won or lost. We do not use information from the first article as it is contained in the inherent difficulty.

In [10]:
games_longer_than_4.head()

Unnamed: 0,difficulty_rating,duration,finished,hashIP,num_backward,path,path_length,source,target,timestamp,type_end
0,,166,True,6a3701d319fc3754,0,"[14th_century, 15th_century, 16th_century, Pac...",9,14th_century,African_slave_trade,2011-02-15 03:26:49,
1,3.0,88,True,3824310e536af032,0,"[14th_century, Europe, Africa, Atlantic_slave_...",5,14th_century,African_slave_trade,2012-08-12 06:36:52,
2,,138,True,415612e93584d30e,0,"[14th_century, Niger, Nigeria, British_Empire,...",8,14th_century,African_slave_trade,2012-10-03 21:10:40,
3,3.0,175,True,015245d773376aab,0,"[14th_century, Italy, Roman_Catholic_Church, H...",7,14th_century,John_F._Kennedy,2013-04-23 15:27:08,
4,,110,True,5295bca242be81fe,0,"[14th_century, Europe, North_America, United_S...",6,14th_century,John_F._Kennedy,2013-07-03 22:26:54,


In [11]:
games_longer_than_4['duration'] = (3/(games_longer_than_4['path_length']-1))* games_longer_than_4['duration']

In [12]:
games_longer_than_4.drop(columns = ["difficulty_rating", 'hashIP', 'num_backward', 'path_length', 'type_end', 'timestamp'], inplace = True)

In [13]:
games_longer_than_4.head()

Unnamed: 0,duration,finished,path,source,target
0,62.25,True,"[14th_century, 15th_century, 16th_century, Pac...",14th_century,African_slave_trade
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade
3,87.5,True,"[14th_century, Italy, Roman_Catholic_Church, H...",14th_century,John_F._Kennedy
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy


In [14]:
games_longer_than_4['first_click'] = games_longer_than_4['path'].apply(lambda x: x[1])
games_longer_than_4['second_click'] = games_longer_than_4['path'].apply(lambda x: x[2])
games_longer_than_4['third_click'] = games_longer_than_4['path'].apply(lambda x: x[3])
games_longer_than_4['num_back'] = games_longer_than_4.apply(lambda a: int(a['first_click'] == '<') + int(a['second_click'] == '<') + int(a['third_click'] == '<'), axis = 1)


#games_longer_than_4.drop(columns=['path'], inplace=True)

games_longer_than_4.head()


Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back
0,62.25,True,"[14th_century, 15th_century, 16th_century, Pac...",14th_century,African_slave_trade,15th_century,16th_century,Pacific_Ocean,0
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0
3,87.5,True,"[14th_century, Italy, Roman_Catholic_Church, H...",14th_century,John_F._Kennedy,Italy,Roman_Catholic_Church,HIV,0
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0


In [15]:
print((games_longer_than_4['first_click'] == '<').sum())
print((games_longer_than_4['second_click'] == '<').sum())
print((games_longer_than_4['third_click'] == '<').sum())
games_longer_than_4[games_longer_than_4['num_back'] > 0].shape



0
1282
1435


(2717, 9)

In [16]:
games_longer_than_4['second_is_<'] = games_longer_than_4['second_click'] == '<'
games_longer_than_4['third_is_<'] = games_longer_than_4['third_click'] == '<'

In [17]:
games_longer_than_4['second_click'] = games_longer_than_4.apply(
    lambda row: row['source'] if row['second_click'] == '<' else row['second_click'], axis=1
)

games_longer_than_4['third_click'] = games_longer_than_4.apply(
    lambda row: row['first_click'] if row['third_click'] == '<' else row['third_click'], axis=1
)
print((games_longer_than_4['first_click'] == '<').sum())
print((games_longer_than_4['second_click'] == '<').sum())
print((games_longer_than_4['third_click'] == '<').sum())

0
0
0


In [18]:
node_stats_df = load_or_compute_node_stats()
games_longer_than_4 = merge_with_node_data(games_longer_than_4, node_stats_df, columns = ['source', 'target', 'first_click', 'second_click', 'third_click'], data = ['pagerank'])
games_longer_than_4.head()

Loaded 4604 node stats


Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back,second_is_<,third_is_<,pagerank_source,pagerank_target,pagerank_first_click,pagerank_second_click,pagerank_third_click
0,62.25,True,"[14th_century, 15th_century, 16th_century, Pac...",14th_century,African_slave_trade,15th_century,16th_century,Pacific_Ocean,0,False,False,0.00047,2.8e-05,0.000746,0.000888,0.001042
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0,False,False,0.00047,2.8e-05,0.004916,0.002445,0.000111
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0,False,False,0.00047,2.8e-05,0.000302,0.000467,0.000881
3,87.5,True,"[14th_century, Italy, Roman_Catholic_Church, H...",14th_century,John_F._Kennedy,Italy,Roman_Catholic_Church,HIV,0,False,False,0.00047,0.000243,0.002903,0.001582,0.000266
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0,False,False,0.00047,0.000243,0.004916,0.00203,0.007202


In [19]:
fame_df = load_fame()
games_longer_than_4 = merge_with_fame_data(games_longer_than_4, fame_df, columns = ['source', 'first_click', 'second_click', 'third_click'])

In [20]:
s = fame_df.isna()
s[s['fame_score'] == True]

Unnamed: 0_level_0,fame_score
article_name,Unnamed: 1_level_1


In [21]:
games_longer_than_4.head()

Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back,second_is_<,third_is_<,pagerank_source,pagerank_target,pagerank_first_click,pagerank_second_click,pagerank_third_click,fame_score_source,fame_score_first_click,fame_score_second_click,fame_score_third_click
0,62.25,True,"[14th_century, 15th_century, 16th_century, Pac...",14th_century,African_slave_trade,15th_century,16th_century,Pacific_Ocean,0,False,False,0.00047,2.8e-05,0.000746,0.000888,0.001042,8.0,8.0,8.0,9.0
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0,False,False,0.00047,2.8e-05,0.004916,0.002445,0.000111,8.0,8.0,9.0,6.0
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0,False,False,0.00047,2.8e-05,0.000302,0.000467,0.000881,8.0,6.0,8.0,8.0
3,87.5,True,"[14th_century, Italy, Roman_Catholic_Church, H...",14th_century,John_F._Kennedy,Italy,Roman_Catholic_Church,HIV,0,False,False,0.00047,0.000243,0.002903,0.001582,0.000266,8.0,9.0,9.0,9.0
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0,False,False,0.00047,0.000243,0.004916,0.00203,0.007202,8.0,8.0,9.0,10.0


In [22]:
embeddings_df = load_embeddings()
games_longer_than_4 = compute_cosine_similarity(games_longer_than_4, embeddings_df, pairs = [['source', 'target'], ['first_click', 'target'], ['second_click', 'target'], ['third_click', 'target']])
games_longer_than_4.head()

Loaded 4604 embeddings in df of shape (4604, 1)


Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back,second_is_<,...,pagerank_second_click,pagerank_third_click,fame_score_source,fame_score_first_click,fame_score_second_click,fame_score_third_click,cosine_sim_source_target,cosine_sim_first_click_target,cosine_sim_second_click_target,cosine_sim_third_click_target
0,62.25,True,"[14th_century, 15th_century, 16th_century, Pac...",14th_century,African_slave_trade,15th_century,16th_century,Pacific_Ocean,0,False,...,0.000888,0.001042,8.0,8.0,8.0,9.0,0.202444,0.187263,0.261171,0.015896
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0,False,...,0.002445,0.000111,8.0,8.0,9.0,6.0,0.202444,0.146602,0.387016,0.485815
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0,False,...,0.000467,0.000881,8.0,6.0,8.0,8.0,0.202444,0.309651,0.293009,0.172144
3,87.5,True,"[14th_century, Italy, Roman_Catholic_Church, H...",14th_century,John_F._Kennedy,Italy,Roman_Catholic_Church,HIV,0,False,...,0.001582,0.000266,8.0,9.0,9.0,9.0,0.079502,0.03784,0.108953,0.012576
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0,False,...,0.00203,0.007202,8.0,8.0,9.0,10.0,0.079502,-0.128314,-0.044935,0.170961


In [30]:
link_proba = load_link_proba()
link_proba.head()
links_df = load_links_df()
link_proba.isna().sum()
#print(print(games_longer_than_4.loc[3167]))
links_df[(links_df['source'] == 'Finland') & (links_df['target'] == 'Åland')]

Loaded 119882 links in df of shape (119882, 2)


Unnamed: 0,source,target


In [29]:
combs = []
seen_tuples = set() 

for i, row in games_longer_than_4.iterrows():
    pair1 = (row['source'], row['first_click'])
    if pair1 not in link_proba.index and pair1 not in seen_tuples:
        combs.append((i, *pair1))
        seen_tuples.add(pair1)

    pair2 = (row['first_click'], row['second_click'])
    sec_is_b = row['second_is_<']
    if pair2 not in link_proba.index and not sec_is_b and pair2 not in seen_tuples:
        combs.append((i, *pair2))
        seen_tuples.add(pair2)
    
    pair3 = (row['second_click'], row['third_click'])
    third_is_b = row['third_is_<']
    if pair3 not in link_proba.index and not third_is_b and pair3 not in seen_tuples:
        combs.append((i, *pair3))
        seen_tuples.add(pair3)
indexes = []
for i in combs:
    print(i)
    indexes.append(i[0])
print(indexes)

KeyError: 'first_click'

In [156]:
games_longer_than_4.drop(index = indexes, inplace = True)

In [28]:
articles_df = load_article_df()
adj_matrix = construct_adjecency_matrix(links_df, articles_df['article_name'].tolist())
adj_list = from_adjacency_matrix_to_list(adj_matrix)
index_mapping = generate_inverse_index_mapping(adj_list)
pair_data = load_pair_data_with_multiindex('../src/data/pair_stats.txt', index_mapping)
pair_data.head()

Loaded 4604 articles in df of shape (4604, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,shortest_path_length,shortest_path_count,max_sp_node_degree,max_sp_avg_node_degree,avg_sp_avg_node_degree,one_longer_path_count,max_ol_node_degree,max_ol_avg_node_degree,avg_ol_avg_node_degree,two_longer_path_count,max_tl_node_degree,max_tl_avg_node_degree
source,target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10th_century,10th_century,0,1,0,0,0,0,0,0,0,0,0,0
10th_century,11th_century,1,1,26,13,13,2,62,29,26,7,112,46
10th_century,12th_century,2,5,48,24,23,121,180,93,37,30,169,70
10th_century,13th_century,2,4,79,35,28,131,169,84,39,30,169,71
10th_century,14th_century,2,4,53,26,22,113,169,84,37,30,169,70


In [157]:
games_longer_than_4 = add_pair_data(games_longer_than_4, pair_data, pairs =[['source', 'target'], ['first_click', 'target'], ['second_click', 'target'], ['third_click', 'target']], names = ["source", "first", "second", "third"], data = ['shortest_path_length', 'shortest_path_count']) 

Dropped 4 games without link statistics


In [158]:
games_longer_than_4.head()

Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back,second_is_<,...,cosine_sim_second_click_target,cosine_sim_third_click_target,shortest_path_length_source,shortest_path_count_source,shortest_path_length_first,shortest_path_count_first,shortest_path_length_second,shortest_path_count_second,shortest_path_length_third,shortest_path_count_third
0,62.25,True,"[14th_century, 15th_century, 16th_century, Pac...",14th_century,African_slave_trade,15th_century,16th_century,Pacific_Ocean,0,False,...,0.261171,0.015896,3.0,3.0,3.0,3.0,3.0,6.0,3.0,3.0
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0,False,...,0.387016,0.485815,3.0,3.0,3.0,7.0,2.0,1.0,1.0,1.0
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0,False,...,0.293009,0.172144,3.0,3.0,3.0,4.0,2.0,1.0,3.0,7.0
3,87.5,True,"[14th_century, Italy, Roman_Catholic_Church, H...",14th_century,John_F._Kennedy,Italy,Roman_Catholic_Church,HIV,0,False,...,0.108953,0.012576,3.0,18.0,2.0,1.0,2.0,1.0,2.0,2.0
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0,False,...,-0.044935,0.170961,3.0,18.0,2.0,2.0,2.0,2.0,1.0,1.0


In [159]:
games_longer_than_4.columns

Index(['duration', 'finished', 'path', 'source', 'target', 'first_click',
       'second_click', 'third_click', 'num_back', 'second_is_<', 'third_is_<',
       'pagerank_source', 'pagerank_target', 'pagerank_first_click',
       'pagerank_second_click', 'pagerank_third_click', 'fame_score_source',
       'fame_score_first_click', 'fame_score_second_click',
       'fame_score_third_click', 'cosine_sim_source_target',
       'cosine_sim_first_click_target', 'cosine_sim_second_click_target',
       'cosine_sim_third_click_target', 'shortest_path_length_source',
       'shortest_path_count_source', 'shortest_path_length_first',
       'shortest_path_count_first', 'shortest_path_length_second',
       'shortest_path_count_second', 'shortest_path_length_third',
       'shortest_path_count_third'],
      dtype='object')

In [160]:
games_longer_than_4['zjb_factor'] = games_longer_than_4.apply(lambda row: ((row['shortest_path_length_first']  - row['shortest_path_length_source'] - 1) + (row['shortest_path_length_second'] - row['shortest_path_length_first'] - 1) + (row['shortest_path_length_third'] - row['shortest_path_length_second'] - 1))/3, axis = 1)

In [161]:
games_longer_than_4['cos_zjb_factor'] =  games_longer_than_4.apply(lambda row: ((row['cosine_sim_first_click_target'] - row['cosine_sim_source_target']) + (row['cosine_sim_second_click_target'] - row['cosine_sim_first_click_target']) + (row['cosine_sim_third_click_target'] - row['cosine_sim_second_click_target']))/3, axis = 1)

In [162]:
games_longer_than_4['short_diff'] = games_longer_than_4.apply(lambda row: row['shortest_path_length_third'] - row['shortest_path_length_source'], axis = 1)

In [163]:
games_longer_than_4.columns

Index(['duration', 'finished', 'path', 'source', 'target', 'first_click',
       'second_click', 'third_click', 'num_back', 'second_is_<', 'third_is_<',
       'pagerank_source', 'pagerank_target', 'pagerank_first_click',
       'pagerank_second_click', 'pagerank_third_click', 'fame_score_source',
       'fame_score_first_click', 'fame_score_second_click',
       'fame_score_third_click', 'cosine_sim_source_target',
       'cosine_sim_first_click_target', 'cosine_sim_second_click_target',
       'cosine_sim_third_click_target', 'shortest_path_length_source',
       'shortest_path_count_source', 'shortest_path_length_first',
       'shortest_path_count_first', 'shortest_path_length_second',
       'shortest_path_count_second', 'shortest_path_length_third',
       'shortest_path_count_third', 'zjb_factor', 'cos_zjb_factor',
       'short_diff'],
      dtype='object')

In [164]:
games_longer_than_4['second_click'] = games_longer_than_4.apply(lambda row: '<' if row['second_is_<'] else row['second_click'], axis = 1)
games_longer_than_4['third_click'] = games_longer_than_4.apply(lambda row: '<' if row['third_is_<'] else row['third_click'], axis = 1)

In [165]:
games_longer_than_4 = add_link_proba_info(games_longer_than_4, link_proba, pairs = [['source', 'first_click'], ['first_click', 'second_click'], ['second_click', 'third_click']], names = ['first_link', 'second_link', 'third_link'])
games_longer_than_4.head()

Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back,second_is_<,...,shortest_path_length_second,shortest_path_count_second,shortest_path_length_third,shortest_path_count_third,zjb_factor,cos_zjb_factor,short_diff,first_link,second_link,third_link
0,62.25,True,"[14th_century, 15th_century, 16th_century, Pac...",14th_century,African_slave_trade,15th_century,16th_century,Pacific_Ocean,0,False,...,3.0,6.0,3.0,3.0,-1.0,-0.062183,0.0,0.9,0.6,0.2
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0,False,...,2.0,1.0,1.0,1.0,-1.666667,0.094457,-2.0,0.9,0.3,0.7
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0,False,...,2.0,1.0,3.0,7.0,-1.0,-0.0101,0.0,0.1,0.9,0.3
3,87.5,True,"[14th_century, Italy, Roman_Catholic_Church, H...",14th_century,John_F._Kennedy,Italy,Roman_Catholic_Church,HIV,0,False,...,2.0,1.0,2.0,2.0,-1.333333,-0.022309,-1.0,0.4,0.25,0.2
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0,False,...,2.0,2.0,1.0,1.0,-1.666667,0.030486,-2.0,0.9,0.1,0.9


In [166]:
print((games_longer_than_4['first_click'] == '<').sum())
print((games_longer_than_4['second_click'] == '<').sum())
print((games_longer_than_4['third_click'] == '<').sum())

0
1280
1435


In [167]:
games_longer_than_4.head()

Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back,second_is_<,...,shortest_path_length_second,shortest_path_count_second,shortest_path_length_third,shortest_path_count_third,zjb_factor,cos_zjb_factor,short_diff,first_link,second_link,third_link
0,62.25,True,"[14th_century, 15th_century, 16th_century, Pac...",14th_century,African_slave_trade,15th_century,16th_century,Pacific_Ocean,0,False,...,3.0,6.0,3.0,3.0,-1.0,-0.062183,0.0,0.9,0.6,0.2
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0,False,...,2.0,1.0,1.0,1.0,-1.666667,0.094457,-2.0,0.9,0.3,0.7
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0,False,...,2.0,1.0,3.0,7.0,-1.0,-0.0101,0.0,0.1,0.9,0.3
3,87.5,True,"[14th_century, Italy, Roman_Catholic_Church, H...",14th_century,John_F._Kennedy,Italy,Roman_Catholic_Church,HIV,0,False,...,2.0,1.0,2.0,2.0,-1.333333,-0.022309,-1.0,0.4,0.25,0.2
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0,False,...,2.0,2.0,1.0,1.0,-1.666667,0.030486,-2.0,0.9,0.1,0.9


In [168]:
games_longer_than_4.columns

Index(['duration', 'finished', 'path', 'source', 'target', 'first_click',
       'second_click', 'third_click', 'num_back', 'second_is_<', 'third_is_<',
       'pagerank_source', 'pagerank_target', 'pagerank_first_click',
       'pagerank_second_click', 'pagerank_third_click', 'fame_score_source',
       'fame_score_first_click', 'fame_score_second_click',
       'fame_score_third_click', 'cosine_sim_source_target',
       'cosine_sim_first_click_target', 'cosine_sim_second_click_target',
       'cosine_sim_third_click_target', 'shortest_path_length_source',
       'shortest_path_count_source', 'shortest_path_length_first',
       'shortest_path_count_first', 'shortest_path_length_second',
       'shortest_path_count_second', 'shortest_path_length_third',
       'shortest_path_count_third', 'zjb_factor', 'cos_zjb_factor',
       'short_diff', 'first_link', 'second_link', 'third_link'],
      dtype='object')

In [169]:
games_longer_than_4.drop(columns = ['path', 'source', 'target', 'first_click',
       'second_click', 'third_click'], inplace = True)

In [170]:
games_longer_than_4.head()

Unnamed: 0,duration,finished,num_back,second_is_<,third_is_<,pagerank_source,pagerank_target,pagerank_first_click,pagerank_second_click,pagerank_third_click,...,shortest_path_length_second,shortest_path_count_second,shortest_path_length_third,shortest_path_count_third,zjb_factor,cos_zjb_factor,short_diff,first_link,second_link,third_link
0,62.25,True,0,False,False,0.00047,2.8e-05,0.000746,0.000888,0.001042,...,3.0,6.0,3.0,3.0,-1.0,-0.062183,0.0,0.9,0.6,0.2
1,66.0,True,0,False,False,0.00047,2.8e-05,0.004916,0.002445,0.000111,...,2.0,1.0,1.0,1.0,-1.666667,0.094457,-2.0,0.9,0.3,0.7
2,59.142857,True,0,False,False,0.00047,2.8e-05,0.000302,0.000467,0.000881,...,2.0,1.0,3.0,7.0,-1.0,-0.0101,0.0,0.1,0.9,0.3
3,87.5,True,0,False,False,0.00047,0.000243,0.002903,0.001582,0.000266,...,2.0,1.0,2.0,2.0,-1.333333,-0.022309,-1.0,0.4,0.25,0.2
4,66.0,True,0,False,False,0.00047,0.000243,0.004916,0.00203,0.007202,...,2.0,2.0,1.0,1.0,-1.666667,0.030486,-2.0,0.9,0.1,0.9


In [171]:
games_longer_than_4['num_back'] = games_longer_than_4.apply(lambda row: (row['second_is_<'] + row['third_is_<'])/2, axis = 1)

In [172]:
games_longer_than_4.columns

Index(['duration', 'finished', 'num_back', 'second_is_<', 'third_is_<',
       'pagerank_source', 'pagerank_target', 'pagerank_first_click',
       'pagerank_second_click', 'pagerank_third_click', 'fame_score_source',
       'fame_score_first_click', 'fame_score_second_click',
       'fame_score_third_click', 'cosine_sim_source_target',
       'cosine_sim_first_click_target', 'cosine_sim_second_click_target',
       'cosine_sim_third_click_target', 'shortest_path_length_source',
       'shortest_path_count_source', 'shortest_path_length_first',
       'shortest_path_count_first', 'shortest_path_length_second',
       'shortest_path_count_second', 'shortest_path_length_third',
       'shortest_path_count_third', 'zjb_factor', 'cos_zjb_factor',
       'short_diff', 'first_link', 'second_link', 'third_link'],
      dtype='object')

In [179]:
features_1 = ['duration', 'num_back', 'second_is_<', 'third_is_<',
       'pagerank_source', 'pagerank_target', 'pagerank_first_click',
       'pagerank_second_click', 'pagerank_third_click', 'fame_score_source',
       'fame_score_first_click', 'fame_score_second_click',
       'fame_score_third_click', 'cosine_sim_source_target',
       'cosine_sim_first_click_target', 'cosine_sim_second_click_target',
       'cosine_sim_third_click_target', 'shortest_path_length_source',
       'shortest_path_count_source', 'shortest_path_length_first',
       'shortest_path_count_first', 'shortest_path_length_second',
       'shortest_path_count_second', 'shortest_path_length_third',
       'shortest_path_count_third', 'zjb_factor', 'cos_zjb_factor',
       'first_link', 'second_link', 'third_link', 'short_diff']

In [180]:
model_1 = LogisticRegression(games_longer_than_4, features_1)
model_1.fit()

Class distribution: finished
False    0.5
True     0.5
Name: proportion, dtype: float64
Total number of samples: 17062
         Current function value: 0.548811
         Iterations: 35




Training Set Metrics:
Threshold:   0.5800
F1 Score:    0.7299
Precision:   0.7345
Accuracy:    0.7310
              precision    recall  f1-score   support

       False     0.7002    0.7961    0.7451      1707
        True     0.7636    0.6589    0.7074      1706

    accuracy                         0.7275      3413
   macro avg     0.7319    0.7275    0.7262      3413
weighted avg     0.7319    0.7275    0.7262      3413



In [181]:
model_1.summary()

0,1,2,3
Dep. Variable:,finished,No. Observations:,13649.0
Model:,Logit,Df Residuals:,13622.0
Method:,MLE,Df Model:,26.0
Date:,"Thu, 19 Dec 2024",Pseudo R-squ.:,0.2082
Time:,19:48:15,Log-Likelihood:,-7490.7
converged:,False,LL-Null:,-9460.8
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,-0.8372,0.064,-13.169,0.000,-0.962,-0.713
x2,0.0049,1.15e+06,4.28e-09,1.000,-2.26e+06,2.26e+06
x3,0.0438,8.03e+05,5.45e-08,1.000,-1.57e+06,1.57e+06
x4,-0.0339,8.67e+05,-3.91e-08,1.000,-1.7e+06,1.7e+06
x5,0.0030,0.021,0.143,0.887,-0.039,0.045
x6,0.2179,0.032,6.903,0.000,0.156,0.280
x7,0.0016,0.023,0.070,0.944,-0.044,0.047
x8,-0.0534,0.023,-2.321,0.020,-0.099,-0.008
x9,-0.0953,0.022,-4.301,0.000,-0.139,-0.052


In [182]:
features_2 = ['duration', 'second_is_<','third_is_<',
       'pagerank_target',  'pagerank_second_click',
       'pagerank_third_click', 'shortest_path_length_source', 
       'shortest_path_count_first', 'shortest_path_length_second',
       'shortest_path_count_second', 'shortest_path_length_third',
       'shortest_path_count_third',  'cosine_sim_source_target',
       'cosine_sim_second_click_target',
       'cosine_sim_third_click_target', 'zjb_factor', 'num_back', 'short_diff']


In [183]:
model_2 = LogisticRegression(games_longer_than_4, features_2)
model_2.fit()

Class distribution: finished
False    0.5
True     0.5
Name: proportion, dtype: float64
Total number of samples: 17062
         Current function value: 0.549651
         Iterations: 35




Training Set Metrics:
Threshold:   0.5780
F1 Score:    0.7288
Precision:   0.7320
Accuracy:    0.7295
              precision    recall  f1-score   support

       False     0.7051    0.7885    0.7445      1707
        True     0.7600    0.6700    0.7121      1706

    accuracy                         0.7293      3413
   macro avg     0.7325    0.7293    0.7283      3413
weighted avg     0.7325    0.7293    0.7283      3413



In [184]:
model_2.summary()

0,1,2,3
Dep. Variable:,finished,No. Observations:,13649.0
Model:,Logit,Df Residuals:,13634.0
Method:,MLE,Df Model:,14.0
Date:,"Thu, 19 Dec 2024",Pseudo R-squ.:,0.207
Time:,19:48:41,Log-Likelihood:,-7502.2
converged:,False,LL-Null:,-9460.8
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,-0.8303,0.063,-13.152,0.000,-0.954,-0.707
x2,0.0570,2.24e+05,2.54e-07,1.000,-4.4e+05,4.4e+05
x3,-0.0324,2.43e+05,-1.33e-07,1.000,-4.75e+05,4.75e+05
x4,0.2035,0.030,6.721,0.000,0.144,0.263
x5,-0.0550,0.022,-2.520,0.012,-0.098,-0.012
x6,-0.1077,0.021,-5.202,0.000,-0.148,-0.067
x7,-0.4064,,,,,
x8,0.0401,0.020,2.028,0.043,0.001,0.079
x9,0.0057,0.036,0.157,0.875,-0.065,0.077


Let's see if using an average of node values helps.

pagerank target, duration, shortest path length zadnji i shortest path count zadnji, cosine similarity zadnji, max pagerank dosad, brback/brukup,  

In [185]:
games_longer_than_4_bis = games_longer_than_4.copy()
games_longer_than_4_bis['max_pgrank'] = games_longer_than_4_bis.apply(lambda row: max(row['pagerank_source'], row['pagerank_first_click'], row['pagerank_second_click'], row['pagerank_third_click']), axis = 1)
games_longer_than_4_bis['back'] = games_longer_than_4_bis.apply(lambda row: (row['second_is_<'] + row['third_is_<'])/2, axis = 1)
games_longer_than_4_bis['cos_diff'] = games_longer_than_4_bis.apply(lambda row: (-row['cosine_sim_source_target'] + row['cosine_sim_third_click_target']), axis = 1)
games_longer_than_4_bis.head()

Unnamed: 0,duration,finished,num_back,second_is_<,third_is_<,pagerank_source,pagerank_target,pagerank_first_click,pagerank_second_click,pagerank_third_click,...,shortest_path_count_third,zjb_factor,cos_zjb_factor,short_diff,first_link,second_link,third_link,max_pgrank,back,cos_diff
0,62.25,True,0.0,False,False,0.00047,2.8e-05,0.000746,0.000888,0.001042,...,3.0,-1.0,-0.062183,0.0,0.9,0.6,0.2,0.001042,0.0,-0.186548
1,66.0,True,0.0,False,False,0.00047,2.8e-05,0.004916,0.002445,0.000111,...,1.0,-1.666667,0.094457,-2.0,0.9,0.3,0.7,0.004916,0.0,0.28337
2,59.142857,True,0.0,False,False,0.00047,2.8e-05,0.000302,0.000467,0.000881,...,7.0,-1.0,-0.0101,0.0,0.1,0.9,0.3,0.000881,0.0,-0.0303
3,87.5,True,0.0,False,False,0.00047,0.000243,0.002903,0.001582,0.000266,...,2.0,-1.333333,-0.022309,-1.0,0.4,0.25,0.2,0.002903,0.0,-0.066926
4,66.0,True,0.0,False,False,0.00047,0.000243,0.004916,0.00203,0.007202,...,1.0,-1.666667,0.030486,-2.0,0.9,0.1,0.9,0.007202,0.0,0.091459


In [186]:
games_longer_than_4_bis.columns

Index(['duration', 'finished', 'num_back', 'second_is_<', 'third_is_<',
       'pagerank_source', 'pagerank_target', 'pagerank_first_click',
       'pagerank_second_click', 'pagerank_third_click', 'fame_score_source',
       'fame_score_first_click', 'fame_score_second_click',
       'fame_score_third_click', 'cosine_sim_source_target',
       'cosine_sim_first_click_target', 'cosine_sim_second_click_target',
       'cosine_sim_third_click_target', 'shortest_path_length_source',
       'shortest_path_count_source', 'shortest_path_length_first',
       'shortest_path_count_first', 'shortest_path_length_second',
       'shortest_path_count_second', 'shortest_path_length_third',
       'shortest_path_count_third', 'zjb_factor', 'cos_zjb_factor',
       'short_diff', 'first_link', 'second_link', 'third_link', 'max_pgrank',
       'back', 'cos_diff'],
      dtype='object')

In [187]:
games_longer_than_4_bis.drop(columns = ['second_is_<', 'third_is_<', 'pagerank_source',
      'pagerank_first_click', 'pagerank_second_click',
        'fame_score_source', 'fame_score_first_click',
       'fame_score_second_click', 'fame_score_third_click',
       'cosine_sim_source_target', 'cosine_sim_first_click_target',
       'cosine_sim_second_click_target', 'pagerank_third_click',
       'first_link', 'second_link', 'third_link',
       'shortest_path_length_source', 'shortest_path_count_source',
       'shortest_path_length_first', 'shortest_path_count_first',
       'shortest_path_length_second', 'shortest_path_count_second',
      ], inplace=True)

In [188]:
games_longer_than_4_bis.head()

Unnamed: 0,duration,finished,num_back,pagerank_target,cosine_sim_third_click_target,shortest_path_length_third,shortest_path_count_third,zjb_factor,cos_zjb_factor,short_diff,max_pgrank,back,cos_diff
0,62.25,True,0.0,2.8e-05,0.015896,3.0,3.0,-1.0,-0.062183,0.0,0.001042,0.0,-0.186548
1,66.0,True,0.0,2.8e-05,0.485815,1.0,1.0,-1.666667,0.094457,-2.0,0.004916,0.0,0.28337
2,59.142857,True,0.0,2.8e-05,0.172144,3.0,7.0,-1.0,-0.0101,0.0,0.000881,0.0,-0.0303
3,87.5,True,0.0,0.000243,0.012576,2.0,2.0,-1.333333,-0.022309,-1.0,0.002903,0.0,-0.066926
4,66.0,True,0.0,0.000243,0.170961,1.0,1.0,-1.666667,0.030486,-2.0,0.007202,0.0,0.091459


In [190]:
games_longer_than_4_bis.columns

Index(['duration', 'finished', 'num_back', 'pagerank_target',
       'cosine_sim_third_click_target', 'shortest_path_length_third',
       'shortest_path_count_third', 'zjb_factor', 'cos_zjb_factor',
       'short_diff', 'max_pgrank', 'back', 'cos_diff'],
      dtype='object')

In [197]:
features_3 = ['duration', 'pagerank_target',
       'cosine_sim_third_click_target', 'shortest_path_length_third',
       'shortest_path_count_third', 'max_pgrank','back', 'cos_diff', 'zjb_factor']

In [198]:
model_3 = LogisticRegression(games_longer_than_4_bis, features_3)
model_3.fit()

Class distribution: finished
False    0.5
True     0.5
Name: proportion, dtype: float64
Total number of samples: 17062
Optimization terminated successfully.
         Current function value: 0.551730
         Iterations 7
Training Set Metrics:
Threshold:   0.5870
F1 Score:    0.7308
Precision:   0.7355
Accuracy:    0.7318
              precision    recall  f1-score   support

       False     0.7013    0.7950    0.7452      1707
        True     0.7632    0.6612    0.7085      1706

    accuracy                         0.7281      3413
   macro avg     0.7322    0.7281    0.7269      3413
weighted avg     0.7322    0.7281    0.7269      3413



In [199]:
model_3.summary()

0,1,2,3
Dep. Variable:,finished,No. Observations:,13649.0
Model:,Logit,Df Residuals:,13640.0
Method:,MLE,Df Model:,8.0
Date:,"Thu, 19 Dec 2024",Pseudo R-squ.:,0.204
Time:,19:50:26,Log-Likelihood:,-7530.6
converged:,True,LL-Null:,-9460.8
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,-0.8364,0.063,-13.246,0.000,-0.960,-0.713
x2,0.2152,0.030,7.215,0.000,0.157,0.274
x3,-0.0735,0.047,-1.559,0.119,-0.166,0.019
x4,-1.0929,0.039,-28.031,0.000,-1.169,-1.016
x5,0.2046,0.022,9.172,0.000,0.161,0.248
x6,-0.0830,0.020,-4.053,0.000,-0.123,-0.043
x7,0.0173,0.020,0.869,0.385,-0.022,0.056
x8,0.1527,0.046,3.324,0.001,0.063,0.243
x9,-0.1288,0.031,-4.154,0.000,-0.190,-0.068
