In [1]:
import re
import os
import pickle
from urllib.parse import unquote


import seaborn as sns
import plotly.subplots
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go


import scipy
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf


import networkx as nx

In [2]:
REMOVE_INTERNATIONAL = True
INTERNATIONAL_LABEL = "International"

PLOTS_PATH = "plots"
PLOTS_PATH_PLT = os.path.join(PLOTS_PATH, "plt")
PLOTS_PATH_PX = os.path.join(PLOTS_PATH, "px")
PLOTS_PATH_HTML = os.path.join(PLOTS_PATH, "html")

FIGURE_WIDTH = 800
FIGURE_HEIGHT = 600

for path in [PLOTS_PATH_PLT, PLOTS_PATH_PX, PLOTS_PATH_HTML]: 
    os.makedirs(path, exist_ok=True)

In [3]:
df_continents = pd.read_csv(os.path.join("Data", "continents.csv"))

if REMOVE_INTERNATIONAL:
    labeled_articles_all_count = len(df_continents)
    df_continents = df_continents[df_continents.continent != INTERNATIONAL_LABEL]
    labeled_articles_count = len(df_continents)
    print(f"Removing articles labeled as {INTERNATIONAL_LABEL}, Removed articles: {labeled_articles_all_count - labeled_articles_count}")

display(df_continents.head())
print("Size:", df_continents.shape)

df_categories = pd.read_csv(
    os.path.join("Data", "wikispeedia_paths-and-graph", "categories.tsv"),
    delimiter="\t",
    header=None,
    names=["article", "category"],
    skip_blank_lines=True,
    comment="#",
)

main_categories = []
for category in df_categories["category"].values:
    main_categories.append(category.split(".")[1])

df_categories["categoryMain"] = main_categories

display(df_categories.head())
print("Size:", df_categories.shape)

df_continents_categories = pd.merge(df_continents, df_categories, on="article")

display(df_continents_categories.head())
print("Size:", df_continents_categories.shape)

df_articles = df_continents_categories[["article", "continent"]].drop_duplicates()
df_articles = pd.merge(df_articles, df_continents_categories.groupby("article")["categoryMain"].apply(list).reset_index(), on="article")

df_pagerank = pd.read_csv(os.path.join("Data", "page_rank.csv"))
display(df_pagerank.head())
print("Size:", df_pagerank.shape)

df_articles = pd.merge(df_articles, df_pagerank, on="article")

plaintext_path = os.path.join("Data", "plaintext_articles")
word_counts = []
for article_name in df_articles.article:
    file_path = os.path.join(plaintext_path, article_name + ".txt")

    with open(file_path, "r", encoding="utf-8") as file:

        _ = file.readline() # Skip the first line because it contains the word #copyright
        content = file.read()

    content = content[:re.search("Retrieved from", content).start(0)]
    word_counts.append(len(content.split()))

df_articles["length"] = word_counts

display(df_articles.head())
print("Size:", df_articles.shape)

display(df_articles.head())
print("Size:", df_articles.shape)

df_paths_finished = pd.read_csv(
    os.path.join("Data", "wikispeedia_paths-and-graph", "paths_finished.tsv"),
    sep="\t",
    header=None,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"],
    skip_blank_lines=True,
    comment="#"
)
df_paths_unfinished = pd.read_csv(
    os.path.join("Data", "wikispeedia_paths-and-graph", "paths_unfinished.tsv"),
    sep="\t",
    header=None,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "target", "motif"],
    skip_blank_lines=True,
    comment="#"
)

df_paths_finished["backclicks"] = df_paths_finished["path"].apply(lambda x: x.count("<"))
df_paths_finished["pathSteps"] = df_paths_finished["path"].apply(lambda x: x.count(";") + 1)
df_paths_finished["uniqueArticles"] = df_paths_finished["pathSteps"] - df_paths_finished["backclicks"]
df_paths_finished["path"] = df_paths_finished["path"].apply(lambda x: x.split(";"))
df_paths_finished["start"] = df_paths_finished["path"].str[0]
df_paths_finished["target"] = df_paths_finished["path"].str[-1]
df_paths_finished["isFinished"] = True

df_paths_unfinished["backclicks"] = df_paths_unfinished["path"].apply(lambda x: x.count("<"))
df_paths_unfinished["pathSteps"] = df_paths_unfinished["path"].apply(lambda x: x.count(";") + 1)
df_paths_unfinished["uniqueArticles"] = df_paths_unfinished["pathSteps"] - df_paths_unfinished["backclicks"]
df_paths_unfinished["path"] = df_paths_unfinished["path"].apply(lambda x: x.split(";"))
df_paths_unfinished["start"] = df_paths_unfinished["path"].str[0]
df_paths_unfinished["isFinished"] = False

df_paths = pd.concat([df_paths_finished, df_paths_unfinished])
display(df_paths.head())

df_articles_all = pd.read_csv(
    os.path.join("Data", "wikispeedia_paths-and-graph", "articles.tsv"),
    delimiter="\t",
    header=None,
    names=["name"],
    skip_blank_lines=True,
    comment="#",
    encoding="UTF-8"
)

display(df_articles_all.head())
print("Size:", df_articles_all.shape)

shortest_paths = []
with open(os.path.join("Data", "wikispeedia_paths-and-graph", "shortest-path-distance-matrix.txt")) as file:
    for line in file:
        line = line.strip()
        if line == "" or line.startswith("#"):
            continue
        shortest_paths.append(list(map(lambda x: -1 if x == "_" else int(x), list(line))))
        
shortest_paths = np.array(shortest_paths)

df_shortest_paths = pd.DataFrame(shortest_paths, index=df_articles_all.name, columns=df_articles_all.name)

display(df_shortest_paths.head())
print("Size:", df_shortest_paths.shape)

df_articles_target = df_articles.copy()
df_articles_target.columns = [column[0].upper() + column[1:] for column in df_articles_target.columns]
df_articles_target = df_articles_target.add_prefix("target")

df_paths_articles = pd.merge(df_paths, df_articles_target, left_on="target", right_on="targetArticle", suffixes=["", ]).drop(columns="targetArticle")

df_start_articles = df_articles.copy()
df_start_articles.columns = [column[0].upper() + column[1:] for column in df_start_articles.columns]
df_start_articles = df_start_articles.add_prefix("start")
df_paths_articles = pd.merge(df_paths_articles, df_start_articles, left_on="start", right_on="startArticle", suffixes=["", ]).drop(columns="startArticle")

df_paths_articles["isFinishedInt"] = df_paths_articles["isFinished"].astype(int)

display(df_paths_articles.head())
print("Size:", df_paths_articles.shape)

Removing articles labeled as International, Removed articles: 1870


Unnamed: 0,article,continent
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Europe
1,%C3%85land,Europe
2,%C3%89douard_Manet,Europe
3,%C3%89ire,Europe
4,%C3%93engus_I_of_the_Picts,Europe


Size: (2734, 2)


Unnamed: 0,article,category,categoryMain
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.History.British_History.British_Histor...,History
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.People.Historical_figures,People
2,%C3%85land,subject.Countries,Countries
3,%C3%85land,subject.Geography.European_Geography.European_...,Geography
4,%C3%89douard_Manet,subject.People.Artists,People


Size: (5204, 3)


Unnamed: 0,article,continent,category,categoryMain
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Europe,subject.History.British_History.British_Histor...,History
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Europe,subject.People.Historical_figures,People
2,%C3%85land,Europe,subject.Countries,Countries
3,%C3%85land,Europe,subject.Geography.European_Geography.European_...,Geography
4,%C3%89douard_Manet,Europe,subject.People.Artists,People


Size: (3177, 4)


Unnamed: 0,article,pageRank
0,United_States,0.014263
1,United_Kingdom,0.007679
2,Scientific_classification,0.007209
3,Europe,0.007043
4,England,0.006815


Size: (4592, 2)


Unnamed: 0,article,continent,categoryMain,pageRank,length
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Europe,"[History, People]",3.3e-05,1836
1,%C3%85land,Europe,"[Countries, Geography]",3.3e-05,2412
2,%C3%89douard_Manet,Europe,[People],3.3e-05,2887
3,%C3%89ire,Europe,"[Countries, Geography]",3.3e-05,2026
4,%C3%93engus_I_of_the_Picts,Europe,"[History, People]",3.3e-05,2029


Size: (2731, 5)


Unnamed: 0,article,continent,categoryMain,pageRank,length
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Europe,"[History, People]",3.3e-05,1836
1,%C3%85land,Europe,"[Countries, Geography]",3.3e-05,2412
2,%C3%89douard_Manet,Europe,[People],3.3e-05,2887
3,%C3%89ire,Europe,"[Countries, Geography]",3.3e-05,2026
4,%C3%93engus_I_of_the_Picts,Europe,"[History, People]",3.3e-05,2029


Size: (2731, 5)


Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,backclicks,pathSteps,uniqueArticles,start,target,isFinished,motif
0,6a3701d319fc3754,1297740409,166,"[14th_century, 15th_century, 16th_century, Pac...",,0,9,9,14th_century,African_slave_trade,True,
1,3824310e536af032,1344753412,88,"[14th_century, Europe, Africa, Atlantic_slave_...",3.0,0,5,5,14th_century,African_slave_trade,True,
2,415612e93584d30e,1349298640,138,"[14th_century, Niger, Nigeria, British_Empire,...",,0,8,8,14th_century,African_slave_trade,True,
3,64dd5cd342e3780c,1265613925,37,"[14th_century, Renaissance, Ancient_Greece, Gr...",,0,4,4,14th_century,Greece,True,
4,015245d773376aab,1366730828,175,"[14th_century, Italy, Roman_Catholic_Church, H...",3.0,0,7,7,14th_century,John_F._Kennedy,True,


Unnamed: 0,name
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
1,%C3%85land
2,%C3%89douard_Manet
3,%C3%89ire
4,%C3%93engus_I_of_the_Picts


Size: (4604, 1)


name,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,%C3%85land,%C3%89douard_Manet,%C3%89ire,%C3%93engus_I_of_the_Picts,%E2%82%AC2_commemorative_coins,10th_century,11th_century,12th_century,13th_century,...,Ziad_Jarrah,Zimbabwe,Zinc,Zinc_chloride,Zion_National_Park,Zionism,Zirconium,Zoroaster,Zuid-Gelders,Zulu
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,0,-1,-1,-1,-1,-1,3,3,3,3,...,4,3,3,4,4,3,4,4,4,2
%C3%85land,-1,0,-1,-1,-1,-1,2,2,2,2,...,4,2,3,4,4,3,4,3,3,3
%C3%89douard_Manet,-1,-1,0,-1,-1,-1,3,3,2,2,...,4,3,2,3,4,3,4,3,3,3
%C3%89ire,-1,-1,-1,0,-1,-1,3,3,3,3,...,4,2,2,3,4,3,4,4,3,3
%C3%93engus_I_of_the_Picts,-1,-1,-1,-1,0,-1,2,2,3,2,...,4,2,3,4,4,3,4,3,3,3


Size: (4604, 4604)


Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,backclicks,pathSteps,uniqueArticles,start,target,...,motif,targetContinent,targetCategoryMain,targetPageRank,targetLength,startContinent,startCategoryMain,startPageRank,startLength,isFinishedInt
0,1a218aa161301e6e,1355086784,40,"[James_Bond, United_Kingdom, Europe, Africa, A...",,0,6,6,James_Bond,African_slave_trade,...,,Africa,[History],5.5e-05,2654,Europe,[Everyday_life],0.000186,7496,1
1,1ad6fbd964102221,1332642329,144,"[James_Bond, Star_Wars, Mythology, The_Lord_of...",,5,11,6,James_Bond,Iron_Maiden,...,restart,Europe,[Music],0.000159,4047,Europe,[Everyday_life],0.000186,7496,0
2,3e6b12634169fb72,1357250279,28,"[James_Bond, Sean_Connery, Scotland, Scottish_...",,0,4,4,James_Bond,Scottish_Gaelic_language,...,,Europe,[Language_and_literature],0.000243,4780,Europe,[Everyday_life],0.000186,7496,1
3,2141997163054c23,1272956123,18,"[James_Bond, United_States, Canada, Stephen_Ha...",,0,4,4,James_Bond,Stephen_Harper,...,,North America,[People],0.000107,4801,Europe,[Everyday_life],0.000186,7496,1
4,15945db656214ee5,1253827056,64,"[James_Bond, Germany, Adolf_Hitler, Nazi_Germa...",,0,5,5,James_Bond,Nazism,...,,Europe,[History],0.000706,7377,Europe,[Everyday_life],0.000186,7496,1


Size: (19164, 21)


In [4]:
df_analysis = df_paths_articles.copy()
df_analysis = df_analysis.fillna(0)
df_analysis["treatment"] = df_analysis.targetContinent == "Europe"

df_analysis["shortestPath"] = df_analysis.apply(lambda x: df_shortest_paths.loc[x["start"], x["target"]], axis="columns")

display(df_analysis.head())

for col in ["isFinishedInt", "durationInSec", "pathSteps", "rating"]:
    print(col, *scipy.stats.ttest_ind(df_analysis[df_analysis.treatment][col], df_analysis[~df_analysis.treatment][col], equal_var=False))


Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,backclicks,pathSteps,uniqueArticles,start,target,...,targetCategoryMain,targetPageRank,targetLength,startContinent,startCategoryMain,startPageRank,startLength,isFinishedInt,treatment,shortestPath
0,1a218aa161301e6e,1355086784,40,"[James_Bond, United_Kingdom, Europe, Africa, A...",0.0,0,6,6,James_Bond,African_slave_trade,...,[History],5.5e-05,2654,Europe,[Everyday_life],0.000186,7496,1,False,3
1,1ad6fbd964102221,1332642329,144,"[James_Bond, Star_Wars, Mythology, The_Lord_of...",0.0,5,11,6,James_Bond,Iron_Maiden,...,[Music],0.000159,4047,Europe,[Everyday_life],0.000186,7496,0,True,2
2,3e6b12634169fb72,1357250279,28,"[James_Bond, Sean_Connery, Scotland, Scottish_...",0.0,0,4,4,James_Bond,Scottish_Gaelic_language,...,[Language_and_literature],0.000243,4780,Europe,[Everyday_life],0.000186,7496,1,True,2
3,2141997163054c23,1272956123,18,"[James_Bond, United_States, Canada, Stephen_Ha...",0.0,0,4,4,James_Bond,Stephen_Harper,...,[People],0.000107,4801,Europe,[Everyday_life],0.000186,7496,1,False,2
4,15945db656214ee5,1253827056,64,"[James_Bond, Germany, Adolf_Hitler, Nazi_Germa...",0.0,0,5,5,James_Bond,Nazism,...,[History],0.000706,7377,Europe,[Everyday_life],0.000186,7496,1,True,2


isFinishedInt 2.1398924823233725 0.03237710067318371
durationInSec -1.2458380650637124 0.21284060850800837
pathSteps -3.0689467950970153 0.00215167329336182
rating 2.5344953935136685 0.01126962834502213


In [5]:
df_articles.columns

Index(['article', 'continent', 'categoryMain', 'pageRank', 'length'], dtype='object')

In [6]:
eq = "isFinishedInt ~ startLength + startPageRank + targetLength + targetPageRank"

model = smf.logit(eq, df_analysis).fit()

df_analysis["propensityScore"] = model.predict()

model.summary()

Optimization terminated successfully.
         Current function value: 0.577612
         Iterations 10


0,1,2,3
Dep. Variable:,isFinishedInt,No. Observations:,19164.0
Model:,Logit,Df Residuals:,19159.0
Method:,MLE,Df Model:,4.0
Date:,"Thu, 21 Dec 2023",Pseudo R-squ.:,0.04308
Time:,20:35:11,Log-Likelihood:,-11069.0
converged:,True,LL-Null:,-11568.0
Covariance Type:,nonrobust,LLR p-value:,1.935e-214

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.4316,0.038,11.263,0.000,0.357,0.507
startLength,-9.863e-06,6.49e-06,-1.520,0.129,-2.26e-05,2.86e-06
startPageRank,-37.8348,20.384,-1.856,0.063,-77.787,2.117
targetLength,3.569e-05,6.26e-06,5.704,0.000,2.34e-05,4.8e-05
targetPageRank,1201.1078,63.863,18.807,0.000,1075.938,1326.278


In [7]:
treatment_df = df_analysis[df_analysis["treatment"]].head(500)
control_df = df_analysis[~df_analysis["treatment"]].head(500)

def get_similarity(propensity_score1, propensity_score2):
    '''Calculate similarity for instances with given propensity scores'''
    return 1 - np.abs(propensity_score1 - propensity_score2)

G = nx.Graph()
for control_id, control_row in control_df.iterrows():
    for treatment_id, treatment_row in treatment_df.iterrows():

        if len(set(treatment_row['startCategoryMain']) & set(control_row['startCategoryMain'])) \
        and len(set(treatment_row['targetCategoryMain']) & set(control_row['targetCategoryMain'])) \
        and treatment_row["shortestPath"] == control_row["shortestPath"]:
            weight = get_similarity(treatment_row["propensityScore"], control_row["propensityScore"])
            G.add_edge(treatment_id, control_id, weight=weight)

matching = nx.max_weight_matching(G)

In [8]:
treatment_df = df_analysis[df_analysis["treatment"]].head(500)
control_df = df_analysis[~df_analysis["treatment"]].head(500)

def get_similarity(propensity_score1, propensity_score2):
    '''Calculate similarity for instances with given propensity scores'''
    return 1 - np.abs(propensity_score1 - propensity_score2)

G = nx.Graph()
for control_id, control_row in control_df.iterrows():
    for treatment_id, treatment_row in treatment_df.iterrows():

        if len(set(treatment_row['startCategoryMain']) & set(control_row['startCategoryMain'])) \
        and len(set(treatment_row['targetCategoryMain']) & set(control_row['targetCategoryMain'])) \
        and treatment_row["shortestPath"] == control_row["shortestPath"]:
            weight = get_similarity(treatment_row["propensityScore"], control_row["propensityScore"])
            G.add_edge(treatment_id, control_id, weight=weight)

matching = nx.max_weight_matching(G)

with open("matching.pkl", "wb") as file:
    pickle.dump(matching, file)


In [9]:
with open("matching.pkl", "rb") as file:
    matching = pickle.load(file)