In [112]:
import numpy as np
import pandas as pd

import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm

from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from utils.data_processing import *
from utils.graph_processing import *
from models.logistic_regression import LogisticRegression

We first load all candidate games: games played after the 2011-02-07 cutoff reported in the log below, that were winnable, and that did not end in a timeout. We remove timed-out games because we do not know why the player gave up, and the reason may not be linked to the Wikispeedia game itself.

In [113]:
# Load the preprocessed games through the project helper; the filtering steps it
# applies are reported in the log it prints. Then preview the first rows.
games = load_preprocessed_games()
games.head()

Loaded 51318 finished paths in df of shape (51318, 7)
Loaded 24875 unfinished paths in df of shape (24875, 8)
After filtering all paths after 2011-02-07 05:02:15
we kept 23245 paths out of 51318 finished paths
There are 24875 unfinished paths
Loaded 4604 articles in df of shape (4604, 1)
Pruning invalid games. Initially we have 48120 games
Pruned invalid games. Now we have 48092 valid games
After removing timeouted games, there are 38775 games left


Unnamed: 0,difficulty_rating,duration,finished,hashIP,num_backward,path,path_length,source,target,timestamp,type_end
0,,166,True,6a3701d319fc3754,0,"[14th_century, 15th_century, 16th_century, Pac...",9,14th_century,African_slave_trade,2011-02-15 03:26:49,
1,3.0,88,True,3824310e536af032,0,"[14th_century, Europe, Africa, Atlantic_slave_...",5,14th_century,African_slave_trade,2012-08-12 06:36:52,
2,,138,True,415612e93584d30e,0,"[14th_century, Niger, Nigeria, British_Empire,...",8,14th_century,African_slave_trade,2012-10-03 21:10:40,
3,3.0,175,True,015245d773376aab,0,"[14th_century, Italy, Roman_Catholic_Church, H...",7,14th_century,John_F._Kennedy,2013-04-23 15:27:08,
4,,110,True,5295bca242be81fe,0,"[14th_century, Europe, North_America, United_S...",6,14th_century,John_F._Kennedy,2013-07-03 22:26:54,


Let's see how long games are.

In [114]:
# Number of games (finished or not) whose path contains a single article.
games.loc[games['path_length'].eq(1)].shape

(3204, 11)

In [115]:
# Finished games of path length 1: build the filter once, then print its shape
# and its rows.
trivial_wins = games[(games['path_length'] == 1) & (games['finished'] == True)]
print(trivial_wins.shape)
print(trivial_wins)

(9, 11)
       difficulty_rating  duration  finished            hashIP  num_backward  \
3908                 NaN         0      True  43f864c75b2571b5             0   
5405                 NaN         0      True  43f864c75b2571b5             0   
11137                NaN         0      True  43f864c75b2571b5             0   
12391                NaN         0      True  43f864c75b2571b5             0   
14685                NaN         0      True  43f864c75b2571b5             0   
15737                NaN         0      True  43f864c75b2571b5             0   
15911                NaN         0      True  43f864c75b2571b5             0   
17716                NaN         0      True  43f864c75b2571b5             0   
22649                NaN         0      True  43f864c75b2571b5             0   

                         path  path_length                source  \
3908                [Lesotho]            1               Lesotho   
5405                   [Coal]            1             

So there are 9 finished games whose source and target articles are the same, meaning the player won automatically. These games are not useful for our analysis, so we remove them. We still keep the unfinished games of path length 1, since the nature of the first article may influence whether the player gives up.

In [116]:
# Flag finished games of path length 1 and keep everything else.
condition = games['path_length'].eq(1) & games['finished'].eq(True)
games = games.loc[~condition]
games.shape

(38766, 11)

In [117]:
# Number of games whose path contains exactly 2 articles.
games.loc[games['path_length'].eq(2)].shape

(2233, 11)

In [118]:
# Number of games whose path contains exactly 3 articles.
games.loc[games['path_length'].eq(3)].shape

(3354, 11)

In [119]:
# Number of games whose path contains more than 4 articles.
games.loc[games['path_length'].gt(4)].shape

(24641, 11)

Most of the games have a path length greater than 4 (i.e. at least 3 clicks before the final click), so we will first analyse these.

In [120]:
# Keep only the games whose path has more than 4 articles.
# Filter first and copy afterwards: copying before the boolean filter (as was
# done previously) duplicates the whole frame and still leaves the filtered
# result as a derived slice, so the column assignments in the following cells
# could raise SettingWithCopyWarning. Copying the filtered result gives an
# independent frame that is safe to modify.
games_longer_than_4 = games[games['path_length'] > 4].copy()

The features we take into account are the inherent difficulty of the game and information from the first 3 clicks. The information from the first 3 clicks is: the shortest path length from the article to the goal, the fame of the article, whether the click was a backward click, the PageRank of the article, the (approximate) duration, and the probability that the link used is known to people. We also use the PageRank of the target. The target variable is whether the game was won or lost. We do not use information from the first article, as it is contained in the inherent difficulty.

In [121]:
# Preview the filtered games before any feature engineering.
games_longer_than_4.head()

Unnamed: 0,difficulty_rating,duration,finished,hashIP,num_backward,path,path_length,source,target,timestamp,type_end
0,,166,True,6a3701d319fc3754,0,"[14th_century, 15th_century, 16th_century, Pac...",9,14th_century,African_slave_trade,2011-02-15 03:26:49,
1,3.0,88,True,3824310e536af032,0,"[14th_century, Europe, Africa, Atlantic_slave_...",5,14th_century,African_slave_trade,2012-08-12 06:36:52,
2,,138,True,415612e93584d30e,0,"[14th_century, Niger, Nigeria, British_Empire,...",8,14th_century,African_slave_trade,2012-10-03 21:10:40,
3,3.0,175,True,015245d773376aab,0,"[14th_century, Italy, Roman_Catholic_Church, H...",7,14th_century,John_F._Kennedy,2013-04-23 15:27:08,
4,,110,True,5295bca242be81fe,0,"[14th_century, Europe, North_America, United_S...",6,14th_century,John_F._Kennedy,2013-07-03 22:26:54,


In [122]:
# Approximate the time spent on the first 3 clicks: the total duration is
# spread evenly over the (path_length - 1) clicks and scaled to 3 of them.
clicks_in_game = games_longer_than_4['path_length'] - 1
first_three_clicks_share = 3 / clicks_in_game
games_longer_than_4['duration'] = first_three_clicks_share * games_longer_than_4['duration']

In [123]:
# Remove the columns that are not used as features from here on.
columns_not_needed = ["difficulty_rating", 'hashIP', 'num_backward', 'path_length', 'type_end', 'timestamp']
games_longer_than_4 = games_longer_than_4.drop(columns=columns_not_needed)

In [124]:
# Preview the frame after rescaling the duration and dropping unused columns.
games_longer_than_4.head()

Unnamed: 0,duration,finished,path,source,target
0,62.25,True,"[14th_century, 15th_century, 16th_century, Pac...",14th_century,African_slave_trade
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade
3,87.5,True,"[14th_century, Italy, Roman_Catholic_Church, H...",14th_century,John_F._Kennedy
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy


In [125]:
# Extract the articles reached by the first three clicks; path[0] is the source
# article, so click k is path[k].
click_columns = ['first_click', 'second_click', 'third_click']
for click_position, click_column in enumerate(click_columns, start=1):
    games_longer_than_4[click_column] = games_longer_than_4['path'].apply(
        lambda visited, k=click_position: visited[k]
    )

# Count how many of the first three clicks are the '<' token. This is a
# vectorised comparison instead of a row-wise apply.
# NOTE(review): '<' presumably marks a back-click - confirm against the data
# format. The later check on num_back shows a maximum of 0 once the merges
# below have dropped rows, so this column may end up carrying no information.
games_longer_than_4['num_back'] = games_longer_than_4[click_columns].eq('<').sum(axis=1)

games_longer_than_4.head()


Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back
0,62.25,True,"[14th_century, 15th_century, 16th_century, Pac...",14th_century,African_slave_trade,15th_century,16th_century,Pacific_Ocean,0
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0
3,87.5,True,"[14th_century, Italy, Roman_Catholic_Church, H...",14th_century,John_F._Kennedy,Italy,Roman_Catholic_Church,HIV,0
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0


In [126]:
# Attach the pagerank of the source, the target and each of the first three
# clicks; the helper reports how many games it drops for lack of node statistics.
node_stats_df = load_or_compute_node_stats()
pagerank_columns = ['source', 'target', 'first_click', 'second_click', 'third_click']
games_longer_than_4 = merge_with_node_data(
    games_longer_than_4,
    node_stats_df,
    columns=pagerank_columns,
    data=['pagerank'],
)
games_longer_than_4.head()

Loaded 4604 node stats
Dropped 2538 games without node statistics


Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back,pagerank_source,pagerank_target,pagerank_first_click,pagerank_second_click,pagerank_third_click
0,62.25,True,"[14th_century, 15th_century, 16th_century, Pac...",14th_century,African_slave_trade,15th_century,16th_century,Pacific_Ocean,0,0.000642,3e-05,0.001024,0.001223,0.001408
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0,0.000642,3e-05,0.006698,0.003321,0.000145
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0,0.000642,3e-05,0.000408,0.00063,0.001197
3,87.5,True,"[14th_century, Italy, Roman_Catholic_Church, H...",14th_century,John_F._Kennedy,Italy,Roman_Catholic_Church,HIV,0,0.000642,0.000315,0.003975,0.002173,0.000357
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0,0.000642,0.000315,0.006698,0.002751,0.009774


In [127]:
# Attach the fame score of the source and of each of the first three clicks
# (the target is not included); the helper reports how many games it drops.
fame_df = load_fame()
fame_columns = ['source', 'first_click', 'second_click', 'third_click']
games_longer_than_4 = merge_with_fame_data(
    games_longer_than_4,
    fame_df,
    columns=fame_columns,
)

Dropped 4670 games without fame statistics


In [128]:
# Preview the frame with the pagerank and fame columns attached.
games_longer_than_4.head()

Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back,pagerank_source,pagerank_target,pagerank_first_click,pagerank_second_click,pagerank_third_click,fame_score_source,fame_score_first_click,fame_score_second_click,fame_score_third_click
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0,0.000642,3e-05,0.006698,0.003321,0.000145,8.0,8.0,9.0,6.0
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0,0.000642,3e-05,0.000408,0.00063,0.001197,8.0,6.0,8.0,8.0
3,87.5,True,"[14th_century, Italy, Roman_Catholic_Church, H...",14th_century,John_F._Kennedy,Italy,Roman_Catholic_Church,HIV,0,0.000642,0.000315,0.003975,0.002173,0.000357,8.0,9.0,9.0,9.0
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0,0.000642,0.000315,0.006698,0.002751,0.009774,8.0,8.0,9.0,10.0
6,59.666667,True,"[14th_century, English_peasants'_revolt_of_138...",14th_century,Rainbow,English_peasants'_revolt_of_1381,Archbishop_of_Canterbury,19th_century,0,0.000642,0.000144,6.8e-05,0.000469,0.002593,8.0,3.0,7.0,4.0


In [129]:
# Load the per-link probabilities and the link table through the project
# helpers. links_df is used further down to build the adjacency matrix.
link_proba = load_link_proba()
links_df = load_links_df()

# Count the links that have no probability value.
link_proba.isna().sum()

Loaded 119882 links in df of shape (119882, 2)


link_probability    5140
dtype: int64

In [130]:
# Report every game where one of the first three transitions is not a key of
# link_proba. The tag tells which transition (1st, 2nd or 3rd) is missing.
link_checks = (
    ("-1-", 'source', 'first_click'),
    ("-2-", 'first_click', 'second_click'),
    ("-3-", 'second_click', 'third_click'),
)
for row_label, game in games_longer_than_4.iterrows():
    for tag, origin, destination in link_checks:
        if (game[origin], game[destination]) not in link_proba.index:
            print(row_label, tag, game[origin], game[destination])

3167 -2- Finland Åland
8055 -3- Programming_language Wikipedia_Text_of_the_GNU_Free_Documentation_License
34184 -3- Communication Wikipedia_Text_of_the_GNU_Free_Documentation_License


In [131]:
# Drop every game in which one of the first three transitions is not a key of
# link_proba. The rows are found with the same membership test as the check
# cell above rather than by hard-coding the labels it printed (3167, 8055,
# 34184), so this cell stays correct if the upstream data changes and does not
# raise KeyError when it is re-run.
link_pairs = [
    ('source', 'first_click'),
    ('first_click', 'second_click'),
    ('second_click', 'third_click'),
]
known_links = link_proba.index
has_unknown_link = np.zeros(len(games_longer_than_4), dtype=bool)
for origin, destination in link_pairs:
    transitions = zip(games_longer_than_4[origin], games_longer_than_4[destination])
    has_unknown_link |= np.fromiter(
        (transition not in known_links for transition in transitions),
        dtype=bool,
        count=len(games_longer_than_4),
    )
games_longer_than_4 = games_longer_than_4.loc[~has_unknown_link].copy()

In [132]:
# Attach the probability of each of the first three links followed; the helper
# reports how many games it drops for lack of link statistics.
link_pairs_to_score = [
    ['source', 'first_click'],
    ['first_click', 'second_click'],
    ['second_click', 'third_click'],
]
link_feature_names = ['first_link', 'second_link', 'third_link']
games_longer_than_4 = add_link_proba_info(
    games_longer_than_4,
    link_proba,
    pairs=link_pairs_to_score,
    names=link_feature_names,
)
games_longer_than_4.head()

Dropped 2013 games without link statistics


Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back,pagerank_source,...,pagerank_first_click,pagerank_second_click,pagerank_third_click,fame_score_source,fame_score_first_click,fame_score_second_click,fame_score_third_click,first_link,second_link,third_link
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0,0.000642,...,0.006698,0.003321,0.000145,8.0,8.0,9.0,6.0,0.9,0.3,0.7
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0,0.000642,...,0.000408,0.00063,0.001197,8.0,6.0,8.0,8.0,0.1,0.9,0.3
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0,0.000642,...,0.006698,0.002751,0.009774,8.0,8.0,9.0,10.0,0.9,0.1,0.9
6,59.666667,True,"[14th_century, English_peasants'_revolt_of_138...",14th_century,Rainbow,English_peasants'_revolt_of_1381,Archbishop_of_Canterbury,19th_century,0,0.000642,...,6.8e-05,0.000469,0.002593,8.0,3.0,7.0,4.0,0.1,0.15,0.2
7,72.428571,True,"[14th_century, India, Rice, Rain, Acid_rain, C...",14th_century,Rainbow,India,Rice,Rain,0,0.000642,...,0.004343,0.000484,0.000403,8.0,10.0,9.0,9.0,0.2,0.3,0.01


In [51]:
# Build the article graph structures and load the per-(source, target) pair
# statistics with the project helpers.
# NOTE(review): the exact semantics of these helpers are defined in utils/ and
# are not visible in this notebook - confirm there.
articles_df = load_article_df()
article_names = articles_df['article_name'].tolist()
adj_matrix = construct_adjecency_matrix(links_df, article_names)
adj_list = from_adjacency_matrix_to_list(adj_matrix)
index_mapping = generate_inverse_index_mapping(adj_list)
pair_stats_path = '../src/data/pair_stats.txt'
pair_data = load_pair_data_with_multiindex(pair_stats_path, index_mapping)
pair_data.head()

Loaded 4604 articles in df of shape (4604, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,shortest_path_length,shortest_path_count,max_sp_node_degree,max_sp_avg_node_degree,avg_sp_avg_node_degree,one_longer_path_count,max_ol_node_degree,max_ol_avg_node_degree,avg_ol_avg_node_degree,two_longer_path_count,max_tl_node_degree,max_tl_avg_node_degree
source,target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10th_century,10th_century,0,1,0,0,0,0,0,0,0,0,0,0
10th_century,11th_century,1,1,26,13,13,2,62,29,26,7,112,46
10th_century,12th_century,2,5,48,24,23,121,180,93,37,30,169,70
10th_century,13th_century,2,4,79,35,28,131,169,84,39,30,169,71
10th_century,14th_century,2,4,53,26,22,113,169,84,37,30,169,70


In [52]:
# List the pair statistics that are available to use as features.
pair_data.columns

Index(['shortest_path_length', 'shortest_path_count', 'max_sp_node_degree',
       'max_sp_avg_node_degree', 'avg_sp_avg_node_degree',
       'one_longer_path_count', 'max_ol_node_degree', 'max_ol_avg_node_degree',
       'avg_ol_avg_node_degree', 'two_longer_path_count', 'max_tl_node_degree',
       'max_tl_avg_node_degree'],
      dtype='object')

In [133]:
# Attach the shortest-path length and count to the target, measured from the
# source and from each of the first three clicks.
pairs_to_target = [
    ['source', 'target'],
    ['first_click', 'target'],
    ['second_click', 'target'],
    ['third_click', 'target'],
]
games_longer_than_4 = add_pair_data(
    games_longer_than_4,
    pair_data,
    pairs=pairs_to_target,
    names=["source", "first", "second", "third"],
    data=['shortest_path_length', 'shortest_path_count'],
)

Dropped 2 games without link statistics


In [134]:
# Preview the frame with the shortest-path columns attached.
games_longer_than_4.head()

Unnamed: 0,duration,finished,path,source,target,first_click,second_click,third_click,num_back,pagerank_source,...,second_link,third_link,shortest_path_length_source,shortest_path_count_source,shortest_path_length_first,shortest_path_count_first,shortest_path_length_second,shortest_path_count_second,shortest_path_length_third,shortest_path_count_third
1,66.0,True,"[14th_century, Europe, Africa, Atlantic_slave_...",14th_century,African_slave_trade,Europe,Africa,Atlantic_slave_trade,0,0.000642,...,0.3,0.7,3.0,3.0,3.0,7.0,2.0,1.0,1.0,1.0
2,59.142857,True,"[14th_century, Niger, Nigeria, British_Empire,...",14th_century,African_slave_trade,Niger,Nigeria,British_Empire,0,0.000642,...,0.9,0.3,3.0,3.0,3.0,4.0,2.0,1.0,3.0,7.0
4,66.0,True,"[14th_century, Europe, North_America, United_S...",14th_century,John_F._Kennedy,Europe,North_America,United_States,0,0.000642,...,0.1,0.9,3.0,18.0,2.0,2.0,2.0,2.0,1.0,1.0
6,59.666667,True,"[14th_century, English_peasants'_revolt_of_138...",14th_century,Rainbow,English_peasants'_revolt_of_1381,Archbishop_of_Canterbury,19th_century,0,0.000642,...,0.15,0.2,3.0,5.0,3.0,2.0,3.0,4.0,3.0,17.0
7,72.428571,True,"[14th_century, India, Rice, Rain, Acid_rain, C...",14th_century,Rainbow,India,Rice,Rain,0,0.000642,...,0.3,0.01,3.0,5.0,3.0,6.0,2.0,1.0,2.0,1.0


In [135]:
# List all columns before removing the non-numeric identifier columns.
games_longer_than_4.columns

Index(['duration', 'finished', 'path', 'source', 'target', 'first_click',
       'second_click', 'third_click', 'num_back', 'pagerank_source',
       'pagerank_target', 'pagerank_first_click', 'pagerank_second_click',
       'pagerank_third_click', 'fame_score_source', 'fame_score_first_click',
       'fame_score_second_click', 'fame_score_third_click', 'first_link',
       'second_link', 'third_link', 'shortest_path_length_source',
       'shortest_path_count_source', 'shortest_path_length_first',
       'shortest_path_count_first', 'shortest_path_length_second',
       'shortest_path_count_second', 'shortest_path_length_third',
       'shortest_path_count_third'],
      dtype='object')

In [136]:
# Remove the path and article-name columns; only the derived numeric features
# and the 'finished' label are needed from here on.
identifier_columns = ['path', 'source', 'target', 'first_click', 'second_click', 'third_click']
games_longer_than_4 = games_longer_than_4.drop(columns=identifier_columns)

In [139]:
# Preview the final frame of features plus the 'finished' label.
games_longer_than_4.head()

Unnamed: 0,duration,finished,num_back,pagerank_source,pagerank_target,pagerank_first_click,pagerank_second_click,pagerank_third_click,fame_score_source,fame_score_first_click,...,second_link,third_link,shortest_path_length_source,shortest_path_count_source,shortest_path_length_first,shortest_path_count_first,shortest_path_length_second,shortest_path_count_second,shortest_path_length_third,shortest_path_count_third
1,66.0,True,0,0.000642,3e-05,0.006698,0.003321,0.000145,8.0,8.0,...,0.3,0.7,3.0,3.0,3.0,7.0,2.0,1.0,1.0,1.0
2,59.142857,True,0,0.000642,3e-05,0.000408,0.00063,0.001197,8.0,6.0,...,0.9,0.3,3.0,3.0,3.0,4.0,2.0,1.0,3.0,7.0
4,66.0,True,0,0.000642,0.000315,0.006698,0.002751,0.009774,8.0,8.0,...,0.1,0.9,3.0,18.0,2.0,2.0,2.0,2.0,1.0,1.0
6,59.666667,True,0,0.000642,0.000144,6.8e-05,0.000469,0.002593,8.0,3.0,...,0.15,0.2,3.0,5.0,3.0,2.0,3.0,4.0,3.0,17.0
7,72.428571,True,0,0.000642,0.000144,0.004343,0.000484,0.000403,8.0,10.0,...,0.3,0.01,3.0,5.0,3.0,6.0,2.0,1.0,2.0,1.0


In [140]:
# Sanity check on the back-click count over the first three clicks.
# NOTE(review): the recorded output is 0, i.e. no remaining game has a '<'
# among its first three clicks, so num_back is constant and is left out of the
# feature lists below.
games_longer_than_4['num_back'].max()

0

In [141]:
# List the remaining columns; the feature lists below are chosen from these.
games_longer_than_4.columns

Index(['duration', 'finished', 'num_back', 'pagerank_source',
       'pagerank_target', 'pagerank_first_click', 'pagerank_second_click',
       'pagerank_third_click', 'fame_score_source', 'fame_score_first_click',
       'fame_score_second_click', 'fame_score_third_click', 'first_link',
       'second_link', 'third_link', 'shortest_path_length_source',
       'shortest_path_count_source', 'shortest_path_length_first',
       'shortest_path_count_first', 'shortest_path_length_second',
       'shortest_path_count_second', 'shortest_path_length_third',
       'shortest_path_count_third'],
      dtype='object')

In [148]:
# Full candidate feature set for the first model (every column except the
# 'finished' label and 'num_back'), grouped by feature family.
features_1 = [
    # approximate time spent on the first three clicks
    'duration',
    # pagerank of the articles
    'pagerank_source',
    'pagerank_target',
    'pagerank_first_click',
    'pagerank_second_click',
    'pagerank_third_click',
    # fame score of the articles
    'fame_score_source',
    'fame_score_first_click',
    'fame_score_second_click',
    'fame_score_third_click',
    # probability of the links followed
    'first_link',
    'second_link',
    'third_link',
    # shortest-path statistics towards the target
    'shortest_path_length_source',
    'shortest_path_count_source',
    'shortest_path_length_first',
    'shortest_path_count_first',
    'shortest_path_length_second',
    'shortest_path_count_second',
    'shortest_path_length_third',
    'shortest_path_count_third',
]

In [155]:
# Fit the project's LogisticRegression wrapper on the full candidate feature
# set (features_1).
# NOTE(review): the target column, class balancing, train/test split and
# threshold selection are implemented in models/logistic_regression.py and are
# not visible in this notebook - confirm there.
model_1 = LogisticRegression(games_longer_than_4, features_1)
model_1.fit()

Class distribution: finished
True     0.5
False    0.5
Name: proportion, dtype: float64
Total number of samples: 8258
Optimization terminated successfully.
         Current function value: 0.514900
         Iterations 6
Training Set Metrics:
Threshold:   0.5750
F1 Score:    0.7506
Precision:   0.7527
Accuracy:    0.7510
              precision    recall  f1-score   support

       False     0.7118    0.7506    0.7307       826
        True     0.7362    0.6961    0.7156       826

    accuracy                         0.7234      1652
   macro avg     0.7240    0.7234    0.7232      1652
weighted avg     0.7240    0.7234    0.7232      1652



In [156]:
# Show the fitted model's summary table; its p-values drive the feature
# selection for the second model.
model_1.summary()

0,1,2,3
Dep. Variable:,finished,No. Observations:,6606.0
Model:,Logit,Df Residuals:,6585.0
Method:,MLE,Df Model:,20.0
Date:,"Wed, 11 Dec 2024",Pseudo R-squ.:,0.2572
Time:,22:56:04,Log-Likelihood:,-3401.4
converged:,True,LL-Null:,-4578.9
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,-0.5865,0.048,-12.222,0.000,-0.681,-0.492
x2,0.0320,0.033,0.973,0.331,-0.033,0.097
x3,0.3095,0.051,6.128,0.000,0.210,0.408
x4,0.0034,0.034,0.098,0.922,-0.064,0.070
x5,-0.0389,0.033,-1.179,0.238,-0.103,0.026
x6,-0.1384,0.032,-4.321,0.000,-0.201,-0.076
x7,-0.0035,0.032,-0.109,0.913,-0.066,0.059
x8,0.0444,0.032,1.375,0.169,-0.019,0.108
x9,0.0344,0.032,1.075,0.282,-0.028,0.097


Based on the p-values, we remove 'pagerank_source', 'pagerank_first_click', 'pagerank_second_click', 'fame_score_source', 'fame_score_first_click', 'fame_score_second_click', 'fame_score_third_click', 'first_link', 'second_link', 'third_link', 'shortest_path_count_source', 'shortest_path_length_first' and 'shortest_path_count_first'.

In [160]:
# Reduced feature set for the second model: the subset of features_1 kept
# after the p-value based selection.
features_2 = [
    'duration',
    'pagerank_target',
    'pagerank_third_click',
    'shortest_path_length_source',
    'shortest_path_length_second',
    'shortest_path_count_second',
    'shortest_path_length_third',
    'shortest_path_count_third',
]

In [161]:
# Refit the project's LogisticRegression wrapper on the reduced feature set
# (features_2) so it can be compared with the first model.
# NOTE(review): splitting, balancing and threshold selection are implemented in
# models/logistic_regression.py and are not visible in this notebook.
model_2 = LogisticRegression(games_longer_than_4, features_2)
model_2.fit()

Class distribution: finished
True     0.5
False    0.5
Name: proportion, dtype: float64
Total number of samples: 8258
Optimization terminated successfully.
         Current function value: 0.515755
         Iterations 6
Training Set Metrics:
Threshold:   0.5910
F1 Score:    0.7515
Precision:   0.7559
Accuracy:    0.7523
              precision    recall  f1-score   support

       False     0.7036    0.7760    0.7381       826
        True     0.7503    0.6731    0.7096       826

    accuracy                         0.7246      1652
   macro avg     0.7270    0.7246    0.7238      1652
weighted avg     0.7270    0.7246    0.7238      1652



In [162]:
# Show the summary table of the reduced model (coefficients x1..x8 follow the
# order of features_2 - presumably; confirm in the wrapper).
model_2.summary()

0,1,2,3
Dep. Variable:,finished,No. Observations:,6606.0
Model:,Logit,Df Residuals:,6598.0
Method:,MLE,Df Model:,7.0
Date:,"Wed, 11 Dec 2024",Pseudo R-squ.:,0.2559
Time:,22:56:38,Log-Likelihood:,-3407.1
converged:,True,LL-Null:,-4578.9
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,-0.5774,0.048,-12.147,0.000,-0.671,-0.484
x2,0.2966,0.047,6.247,0.000,0.204,0.390
x3,-0.1374,0.030,-4.611,0.000,-0.196,-0.079
x4,0.1340,0.039,3.422,0.001,0.057,0.211
x5,-0.1035,0.051,-2.026,0.043,-0.204,-0.003
x6,0.0730,0.034,2.120,0.034,0.006,0.141
x7,-1.3756,0.054,-25.299,0.000,-1.482,-1.269
x8,0.2439,0.034,7.091,0.000,0.176,0.311
