In [1]:
import pandas as pd
from urllib.parse import unquote
from tqdm import tqdm

In [2]:
# Column labels for the finished-paths table. They are supplied explicitly, so
# pandas treats the first line of the file as data rather than as a header.
finished_path_columns = [
    "hashedIpAddress",
    "timestamp",
    "durationInSec",
    "path",
    "rating",
]

# Tab-separated file of finished navigation paths, one row per path.
data = pd.read_csv(
    "./wikispeedia_paths-and-graph/paths_finished.tsv",
    delimiter="\t",
    header=None,
    names=finished_path_columns,
)

data

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0
...,...,...,...,...,...
51313,15a13a1d66ef5456,1349231015,66,Yagan;Ancient_Egypt;Civilization,
51314,2ef7ac844cefda58,1300254138,165,Yagan;Folklore;Brothers_Grimm;<;19th_century;C...,3.0
51315,12863abb7887f890,1385095372,228,Yagan;Australia;England;France;United_States;T...,
51316,19f8284371753362,1298792567,56,Yarralumla%2C_Australian_Capital_Territory;Aus...,1.0


In [3]:
# Keep only paths that visit between 7 and 20 articles (inclusive); the number
# of articles is the number of ";"-separated entries in the "path" string.
article_counts = data["path"].map(lambda visited: len(visited.split(";")))
length_ok = (article_counts >= 7) & (article_counts <= 20)
ground_truths = data[length_ok]

# Remove paths with back-clicks, which appear as "<" inside the path string.
has_back_click = ground_truths["path"].str.contains("<")
ground_truths = ground_truths[~has_back_click]

# Paths that share the same source and target are deliberately NOT de-duplicated:
# keeping them lets the metrics be averaged over multiple user-extracted paths.
ground_truths

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0
9,08888b1b428dd90e,1232241601,167,14th_century;15th_century;Plato;Nature;Ultravi...,
11,1082b6b8501a04b1,1248791776,218,14th_century;Christianity;Bible;God;Nature;Ear...,3.0
...,...,...,...,...,...
51294,58a5abc229ae7bcf,1376751589,161,Tropical_Storm_Odette_%282003%29;Colombia;Sout...,3.0
51295,69c67c896118f02c,1338929469,110,Tropical_Storm_Odette_%282003%29;Caribbean_Sea...,
51300,54d0011452bb9d48,1349877073,75,United_States_Numbered_Highways;United_States;...,2.0
51311,4753cde919cd5ce5,1348670636,418,Work_%28thermodynamics%29;Energy;Aristotle;Poe...,3.0


In [6]:
# Persist the filtered paths; index=False keeps the row index out of the CSV.
ground_truths.to_csv(path_or_buf="./ground_truth_paths.csv", index=False)

In [7]:
# Upper bound on the number of space-separated tokens kept from each article.
MAX_TOKENS_PER_DOC = 4000
# Marker text removed from the article before it is truncated.
COPYRIGHT_MARKER = "#copyright\n\n"


def _load_article(doc_name):
    """Read one plaintext article and return its ``title`` / ``plain_text`` record.

    Args:
        doc_name: Article name exactly as it appears in a path. It is used
            verbatim as the file stem and URL-decoded to form the title.

    Returns:
        A dict with ``"title"`` (URL-decoded name) and ``"plain_text"`` (the
        article text with the copyright marker removed, truncated to the first
        ``MAX_TOKENS_PER_DOC`` space-separated tokens and stripped).

    Raises:
        OSError: If the article file cannot be opened.
        UnicodeDecodeError: If the article file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform locale, so non-ASCII article text could be mis-decoded (or fail
    # to decode) on systems whose default is not UTF-8.
    with open(f"./plaintext_articles/{doc_name}.txt", "r", encoding="utf-8") as fp:
        raw_text = fp.read()

    tokens = raw_text.replace(COPYRIGHT_MARKER, "").split(" ")
    return {
        "title": unquote(doc_name),
        "plain_text": " ".join(tokens[:MAX_TOKENS_PER_DOC]).strip(),
    }


# Every distinct article that occurs in at least one ground-truth path.
# Sorted for reproducibility, since the set may yield different orders that do
# not match the order of the embeddings.
doc_names = sorted({doc for path in ground_truths["path"] for doc in path.split(";")})

processed_docs = []

for doc in tqdm(doc_names):
    processed_docs.append(_load_article(doc))

100%|██████████| 3928/3928 [00:02<00:00, 1673.89it/s]


In [8]:
# Build a table with one row per entry of processed_docs
# (each entry is a dict with "title" and "plain_text" keys).
dataset = pd.DataFrame(data=processed_docs)
dataset

Unnamed: 0,title,plain_text
0,Édouard_Manet,Édouard Manet\n\n2007 Schools Wikipedia Select...
1,Éire,Éire\n\n2007 Schools Wikipedia Selection. Rela...
2,10th_century,10th century\n\n2007 Schools Wikipedia Selecti...
3,11th_century,11th century\n\n2007 Schools Wikipedia Selecti...
4,12th_century,12th century\n\n2007 Schools Wikipedia Selecti...
...,...,...
3923,Zionism,Zionism\n\n2007 Schools Wikipedia Selection. R...
3924,Zirconium,Zirconium\n\n2007 Schools Wikipedia Selection....
3925,Zoroaster,Zoroaster\n\n2007 Schools Wikipedia Selection....
3926,Zuid-Gelders,Zuid-Gelders\n\n2007 Schools Wikipedia Selecti...


In [9]:
# Persist the article table; index=False keeps the row index out of the CSV.
dataset.to_csv(path_or_buf="./text_data.csv", index=False)