# Expected folder structure

```
ada-2024-project-outliars/
    ├── analysis.ipynb
    └── data/
        ├── articles.tsv
        ├── categories.tsv
        ├── links.tsv
        ├── paths_finished.tsv
        ├── paths_unfinished.tsv
        ├── shortest-path-distance-matrix.txt
        └── plaintext_articles/
            └── ...
```

# Loading the data

In [6]:
import numpy as np
import pandas as pd
from urllib.parse import unquote

## Loading article names

In [28]:
# Read the single-column article list; '#'-prefixed lines are skipped and the
# column is named directly on load.
articles_df = pd.read_csv(
    "data/articles.tsv",
    sep="\t",
    comment="#",
    header=None,
    names=["article_name"],
)

# Replace %xx escapes in the names with the characters they stand for
articles_df["article_name"] = articles_df["article_name"].map(unquote)

articles_df.head()

Unnamed: 0,article_name
0,Áedán_mac_Gabráin
1,Åland
2,Édouard_Manet
3,Éire
4,Óengus_I_of_the_Picts


## Loading categories for each article

In [29]:
# Read the (article, category) pairs; '#'-prefixed lines are skipped
categories_df = pd.read_csv(
    "data/categories.tsv",
    sep="\t",
    comment="#",
    header=None,
    names=["article_name", "category"],
)

# Replace %xx escapes in the article names with the characters they stand for
categories_df["article_name"] = categories_df["article_name"].map(unquote)

# Break the dotted category string into one column per component and discard
# the first component (column 0 of the split)
df_split = categories_df["category"].str.split(".", expand=True).iloc[:, 1:]

# Label the remaining components by depth
df_split.columns = ["Level_1", "Level_2", "Level_3"]

# Swap the raw 'category' column for the level columns and index by article
categories_df = (
    categories_df[["article_name"]]
    .join(df_split)
    .set_index("article_name")
)

categories_df.head()

Unnamed: 0_level_0,Level_1,Level_2,Level_3
article_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Áedán_mac_Gabráin,History,British_History,British_History_1500_and_before_including_Roma...
Áedán_mac_Gabráin,People,Historical_figures,
Åland,Countries,,
Åland,Geography,European_Geography,European_Countries
Édouard_Manet,People,Artists,


## Loading existing links between articles

In [31]:
# Read the edge list (one source/target pair per row); '#'-prefixed lines are skipped
links_df = pd.read_csv(
    "data/links.tsv",
    sep="\t",
    comment="#",
    header=None,
    names=["source", "target"],
)

# Replace %xx escapes in both endpoint columns, one column at a time
links_df = links_df.apply(lambda column: column.map(unquote))

links_df.head()

Unnamed: 0,source,target
0,Áedán_mac_Gabráin,Bede
1,Áedán_mac_Gabráin,Columba
2,Áedán_mac_Gabráin,Dál_Riata
3,Áedán_mac_Gabráin,Great_Britain
4,Áedán_mac_Gabráin,Ireland


## Loading data about finished paths

In [32]:
# Read the finished-path log, naming its five tab-separated fields on load;
# '#'-prefixed lines are skipped
finished_df = pd.read_csv(
    "data/paths_finished.tsv",
    sep="\t",
    comment="#",
    header=None,
    names=["hashIP", "timestamp", "duration", "path", "difficulty_rating"],
)

# Turn each ';'-joined path string into a list of decoded entries
finished_df["path"] = finished_df["path"].map(
    lambda raw_path: list(map(unquote, raw_path.split(";")))
)

# Number of entries in each path list (every entry counts, "<" ones included)
finished_df["path_length"] = finished_df["path"].map(len)

# Number of "<" entries (backward clicks) in each path
finished_df["numBackward"] = finished_df["path"].map(
    lambda steps: sum(step == "<" for step in steps)
)

finished_df.head()

Unnamed: 0,hashIP,timestamp,duration,path,difficulty_rating,path_length,numBackward
0,6a3701d319fc3754,1297740409,166,"[14th_century, 15th_century, 16th_century, Pac...",,9,0
1,3824310e536af032,1344753412,88,"[14th_century, Europe, Africa, Atlantic_slave_...",3.0,5,0
2,415612e93584d30e,1349298640,138,"[14th_century, Niger, Nigeria, British_Empire,...",,8,0
3,64dd5cd342e3780c,1265613925,37,"[14th_century, Renaissance, Ancient_Greece, Gr...",,4,0
4,015245d773376aab,1366730828,175,"[14th_century, Italy, Roman_Catholic_Church, H...",3.0,7,0


## Loading data about unfinished paths

In [33]:
# Load the unfinished-path log; '#'-prefixed lines are skipped and the six
# tab-separated fields are named explicitly.
unfinished_df = pd.read_csv("data/paths_unfinished.tsv", sep = "\t", comment = '#', header = None)
unfinished_df.columns = ['hashIP', 'timestamp', 'duration', 'path', 'target_article', 'type_end']

# Decode article names and transform path to list
unfinished_df['path'] = unfinished_df['path'].apply(lambda a: [unquote(art) for art in a.split(";")])

# Decode the target article as well, so it uses the same spelling as the decoded
# names in 'path' and can be compared with them directly.
# na_action='ignore' leaves a missing target as NaN instead of raising inside unquote.
unfinished_df['target_article'] = unfinished_df['target_article'].map(unquote, na_action = 'ignore')

# Calculate length of unfinished paths (every list entry counts, "<" entries included)
unfinished_df['path_length'] = unfinished_df['path'].apply(len)

# Calculate number of backward clicks ("<" entries in the path)
unfinished_df['numBackward'] = unfinished_df['path'].apply(lambda a: a.count("<"))

unfinished_df.head()

Unnamed: 0,hashIP,timestamp,duration,path,target_article,type_end,path_length,numBackward
0,2426091a53125110,1297054935,1804,[Obi-Wan_Kenobi],Microsoft,timeout,1,0
1,26141fd878806294,1297055651,1805,[Julius_Caesar],Caracas,timeout,1,0
2,2b015fb8181c48f2,1297090819,1818,"[Malawi, Democracy, Alexander_the_Great]",First_Crusade,timeout,3,0
3,53a53bc244e08a6a,1297094761,49,[Paraguay],Mount_St._Helens,restart,1,0
4,53a53bc244e08a6a,1297099105,1808,"[Paraguay, Bolivia]",Mount_St._Helens,timeout,2,0


## Loading data about shortest paths between articles (Floyd-Warshall algorithm)

In [36]:
# Open text file with distances and keep only the matrix rows.
# Metadata lines are recognised by their leading '#' (the same marker the TSV
# loaders skip via comment='#') and blank lines are dropped, rather than
# relying on a hard-coded number of header lines.
with open('data/shortest-path-distance-matrix.txt', 'r') as file:
    lines = [line for line in file if line.strip() and not line.startswith('#')]

# Transform each line into a list of distances
distances = []
for line in lines:
    # Treat each character as a distance; '_' is stored as NaN
    distances.append([np.nan if char == '_' else int(char) for char in line.strip()])

# Retrieve list of article names (used as both row and column labels)
article_names = articles_df['article_name'].tolist()

# Ensure the matrix has exactly one row and one column per article in articles_df
assert len(article_names) == len(distances), (
    f"expected {len(article_names)} matrix rows, found {len(distances)}"
)
assert all(len(row) == len(article_names) for row in distances), (
    "every matrix row must contain one distance per article"
)

# Create the distance matrix dataframe
distance_df = pd.DataFrame(distances, columns=article_names, index=article_names)

distance_df.head()

Unnamed: 0,Áedán_mac_Gabráin,Åland,Édouard_Manet,Éire,Óengus_I_of_the_Picts,€2_commemorative_coins,10th_century,11th_century,12th_century,13th_century,...,Ziad_Jarrah,Zimbabwe,Zinc,Zinc_chloride,Zion_National_Park,Zionism,Zirconium,Zoroaster,Zuid-Gelders,Zulu
Áedán_mac_Gabráin,0.0,,,,,,3.0,3.0,3.0,3.0,...,4.0,3.0,3.0,4.0,4.0,3.0,4.0,4.0,4.0,2.0
Åland,,0.0,,,,,2.0,2.0,2.0,2.0,...,4.0,2.0,3.0,4.0,4.0,3.0,4.0,3.0,3.0,3.0
Édouard_Manet,,,0.0,,,,3.0,3.0,2.0,2.0,...,4.0,3.0,2.0,3.0,4.0,3.0,4.0,3.0,3.0,3.0
Éire,,,,0.0,,,3.0,3.0,3.0,3.0,...,4.0,2.0,2.0,3.0,4.0,3.0,4.0,4.0,3.0,3.0
Óengus_I_of_the_Picts,,,,,0.0,,2.0,2.0,3.0,2.0,...,4.0,2.0,3.0,4.0,4.0,3.0,4.0,3.0,3.0,3.0
