In [1]:
import pandas as pd
import os
from urllib.parse import unquote
import datetime
import numpy as np
import ast


os.chdir("/home/gabri/Desktop/ADA/ADA_wikispeedia")
print(os.getcwd())

SAVE_FOLDER = "data_processed"

/home/gabri/Desktop/ADA/ADA_wikispeedia


# Article Name tsv

In [22]:
df_names_path = "data/wikispeedia_paths-and-graph/articles.tsv"
df_names = pd.read_csv(df_names_path, sep='\t', header=None, comment='#')

# name a colum
df_names = df_names.rename(columns={0:"article_name"})

# Save encoded nmes
df_names["article_name_encoded"] = df_names.article_name

# Decode the article name
df_names.article_name = df_names.article_name.apply(unquote)

# Use " " instead of "_"
df_names.article_name = df_names.article_name.str.replace('_', ' ', regex=False)

#  Reset row index
df_names.reset_index(drop=True, inplace=True)

# put ID column
df_names["article_id"] = df_names.index.astype(str)

# Save processed df
df_names.to_csv(os.path.join(SAVE_FOLDER, "articles_processed.csv"), index = False)

display(df_names.head(20))

Unnamed: 0,article_name,article_name_encoded,article_id
0,Áedán mac Gabráin,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,0
1,Åland,%C3%85land,1
2,Édouard Manet,%C3%89douard_Manet,2
3,Éire,%C3%89ire,3
4,Óengus I of the Picts,%C3%93engus_I_of_the_Picts,4
5,€2 commemorative coins,%E2%82%AC2_commemorative_coins,5
6,10th century,10th_century,6
7,11th century,11th_century,7
8,12th century,12th_century,8
9,13th century,13th_century,9


# Article Category tsv

In [3]:
df_categories_path = "data/wikispeedia_paths-and-graph/categories.tsv"
df_categories = pd.read_csv(df_categories_path, sep='\t', header=None, comment='#')

# name a colum
df_categories = df_categories.rename(columns={0:"article_name", 1:"article_category"})

# Decode the article name
df_categories.article_name = df_categories.article_name.apply(unquote)

# Use " " instead of "_"
df_categories.article_name = df_categories.article_name.str.replace('_', ' ', regex=False)

# Use " " instead of "_"
df_categories.article_category = df_categories.article_category.str.replace('_', ' ', regex=False)

# Split the article_category into a list of categories
df_categories['article_category_list'] = df_categories['article_category'].str.split('.')

# Remove 'subject' from each list
df_categories['article_category_list'] = df_categories['article_category_list'].apply(lambda x: [cat for cat in x if cat.strip() != "subject"])

# Create a col for each cat
category_lists = df_categories['article_category_list']
category_df = pd.DataFrame(category_lists.tolist())
category_df.columns = [f'article_category_{i+1}' for i in range(category_df.shape[1])]
df_categories = pd.concat([df_categories, category_df], axis=1)


# HOW TO DEAL WITH ARTICLE THAT HAVE MORE CATEGORIES???
#print("Article with same names:")
#print(df_categories[df_categories["article_name"].duplicated()])
# ??????????????????????''
# ??????????????????????

#  Reset row index
df_categories.reset_index(drop=True, inplace=True)

display(df_categories)

df_categories.to_csv(os.path.join(SAVE_FOLDER, "categories_processed.csv"), index = False)


Unnamed: 0,article_name,article_category,article_category_list,article_category_1,article_category_2,article_category_3
0,Áedán mac Gabráin,subject.History.British History.British Histor...,"[History, British History, British History 150...",History,British History,British History 1500 and before including Roma...
1,Áedán mac Gabráin,subject.People.Historical figures,"[People, Historical figures]",People,Historical figures,
2,Åland,subject.Countries,[Countries],Countries,,
3,Åland,subject.Geography.European Geography.European ...,"[Geography, European Geography, European Count...",Geography,European Geography,European Countries
4,Édouard Manet,subject.People.Artists,"[People, Artists]",People,Artists,
...,...,...,...,...,...,...
5199,Zirconium,subject.Science.Chemistry.Chemical elements,"[Science, Chemistry, Chemical elements]",Science,Chemistry,Chemical elements
5200,Zoroaster,subject.People.Religious figures and leaders,"[People, Religious figures and leaders]",People,Religious figures and leaders,
5201,Zuid-Gelders,subject.Geography.European Geography,"[Geography, European Geography]",Geography,European Geography,
5202,Zuid-Gelders,subject.Language and literature.Languages,"[Language and literature, Languages]",Language and literature,Languages,


# Article links relationship

In [4]:
df_links_path = "data/wikispeedia_paths-and-graph/links.tsv"
df_links = pd.read_csv(df_links_path, sep='\t', header=None, comment='#')

# name a colum
df_links = df_links.rename(columns={0:"link_source", 1:"link_target"})

# Decode the article name and Use " " instead of "_"
df_links.link_source = df_links.link_source.apply(unquote)
df_links.link_source = df_links.link_source.str.replace('_', ' ', regex=False)

df_links.link_target = df_links.link_target.apply(unquote)
df_links.link_target = df_links.link_target.str.replace('_', ' ', regex=False)

# add coplumns with article Id
df_article = pd.read_csv(os.path.join(SAVE_FOLDER, "articles_processed.csv"))

    #soruce
df_links = pd.merge(left = df_links, 
                    right = df_article,
                    left_on="link_source",
                    right_on="article_name")
df_links = df_links.rename(columns={"article_id": "link_source_id"})
df_links.drop(["article_name"], axis=1, inplace=True)

    #target
df_links = pd.merge(left = df_links, 
                    right = df_article,
                    left_on="link_target",
                    right_on="article_name")

df_links = df_links.rename(columns={"article_id": "link_target_id"})
df_links.drop(["article_name"], axis=1, inplace=True)

df_links.to_csv(os.path.join(SAVE_FOLDER, "links_processed.csv"), index = False)

df_links


Unnamed: 0,link_source,link_target,link_source_id,link_target_id
0,Áedán mac Gabráin,Bede,0,530
1,Áedán mac Gabráin,Columba,0,974
2,Áedán mac Gabráin,Dál Riata,0,1115
3,Áedán mac Gabráin,Great Britain,0,1772
4,Áedán mac Gabráin,Ireland,0,2149
...,...,...,...,...
119877,Zulu,South Africa,4603,3803
119878,Zulu,Swaziland,4603,3967
119879,Zulu,United Kingdom,4603,4293
119880,Zulu,Zambia,4603,4587


# Path Finished


In [5]:
df_pf_path = "data/wikispeedia_paths-and-graph/paths_finished.tsv"
df_pf = pd.read_csv(df_pf_path, sep='\t', header=None, comment='#')

# name a colum
df_pf = df_pf.rename(columns={0:"hashed_ip", 1:"timestamp", 2:"duration", 3:"path", 4:"rating"})

# create a list of liks for rach path
df_pf['path_list'] = df_pf['path'].str.split(';')

# for each element of the path: 1) decode, 2) substitue "_" with " "
df_pf['path_list'] = df_pf['path_list'].apply(lambda links: [unquote(link.replace('_', ' ')) for link in links])

# source link and target link
df_pf['source_link'] = df_pf['path_list'].apply(lambda x: x[0] if len(x) > 0 else None)  # First element
df_pf['target_link'] = df_pf['path_list'].apply(lambda x: x[-1] if len(x) > 0 else None)  # Last element

# colum for finished or not
df_pf["finished"] = True

# column for cause fo unfinisehd in case (None for finished paths)
df_pf["type_unfinished"] = None

df_pf.head(2)

Unnamed: 0,hashed_ip,timestamp,duration,path,rating,path_list,source_link,target_link,finished,type_unfinished
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,,"[14th century, 15th century, 16th century, Pac...",14th century,African slave trade,True,
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0,"[14th century, Europe, Africa, Atlantic slave ...",14th century,African slave trade,True,


# Path Unfinished

In [6]:
df_uf_path = "data/wikispeedia_paths-and-graph/paths_unfinished.tsv"
df_uf = pd.read_csv(df_uf_path, sep='\t', header=None, comment='#')

# name a colum
df_uf = df_uf.rename(columns={0:"hashed_ip", 1:"timestamp", 2:"duration", 3:"path", 4:"target_link", 5:"type_unfinished"})

# create a list of liks for rach path
df_uf['path_list'] = df_uf['path'].str.split(';')

# for each element of the path: 1) decode, 2) substitue "_" with " "
df_uf['path_list'] = df_uf['path_list'].apply(lambda links: [unquote(link.replace('_', ' ')) for link in links])

# source link and target link
df_uf['source_link'] = df_uf['path_list'].apply(lambda x: x[0] if len(x) > 0 else None)  # First element

# colum for finished or not
df_uf["finished"] = False

# Rating columns (not present in unfinsef
df_uf["rating"] = None

# Remove usless cols

df_uf.head(5)

Unnamed: 0,hashed_ip,timestamp,duration,path,target_link,type_unfinished,path_list,source_link,finished,rating
0,2426091a53125110,1297054935,1804,Obi-Wan_Kenobi,Microsoft,timeout,[Obi-Wan Kenobi],Obi-Wan Kenobi,False,
1,26141fd878806294,1297055651,1805,Julius_Caesar,Caracas,timeout,[Julius Caesar],Julius Caesar,False,
2,2b015fb8181c48f2,1297090819,1818,Malawi;Democracy;Alexander_the_Great,First_Crusade,timeout,"[Malawi, Democracy, Alexander the Great]",Malawi,False,
3,53a53bc244e08a6a,1297094761,49,Paraguay,Mount_St._Helens,restart,[Paraguay],Paraguay,False,
4,53a53bc244e08a6a,1297099105,1808,Paraguay;Bolivia,Mount_St._Helens,timeout,"[Paraguay, Bolivia]",Paraguay,False,


# Merge

In [7]:
# Put cols in the same order
cols_name = df_pf.columns.to_list()
df_up = df_uf[cols_name]

# Syack veticllay the 2 df
df_p = pd.concat([df_pf, df_up], axis=0, ignore_index=True)

df_p

  df_p = pd.concat([df_pf, df_up], axis=0, ignore_index=True)


Unnamed: 0,hashed_ip,timestamp,duration,path,rating,path_list,source_link,target_link,finished,type_unfinished
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,,"[14th century, 15th century, 16th century, Pac...",14th century,African slave trade,True,
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0,"[14th century, Europe, Africa, Atlantic slave ...",14th century,African slave trade,True,
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,,"[14th century, Niger, Nigeria, British Empire,...",14th century,African slave trade,True,
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,,"[14th century, Renaissance, Ancient Greece, Gr...",14th century,Greece,True,
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0,"[14th century, Italy, Roman Catholic Church, H...",14th century,John F. Kennedy,True,
...,...,...,...,...,...,...,...,...,...,...
76188,109ed71f571d86e9,1389787605,180,Franz_Kafka;Tuberculosis;World_Health_Organiza...,,"[Franz Kafka, Tuberculosis, World Health Organ...",Franz Kafka,Cholera,False,restart
76189,232f992e57d43e8d,1389787697,6,Modern_history,,[Modern history],Modern history,Hollandic,False,restart
76190,2e09a7224600a7cd,1389798400,1900,Computer_programming;Linguistics;Culture;Popul...,,"[Computer programming, Linguistics, Culture, P...",Computer programming,The_Beatles,False,timeout
76191,60af9e2138051b96,1389799481,1903,Jamaica;United_Kingdom;World_War_II;Battle_of_...,,"[Jamaica, United Kingdom, World War II, Battle...",Jamaica,Alan_Turing,False,timeout


In [8]:
# Add other metrics

# number cliks /length path
df_p['n_click'] = df_p['path_list'].apply(len)

# Count occurrences of "<" in each list of path_list
df_p['n_back'] = df_p['path_list'].apply(lambda x: len([el for el in x if el == "<"]))

#convert timestamp into date
df_p['date'] = df_p['timestamp'].apply(datetime.datetime.fromtimestamp)

# click rate
df_p['click_rate'] = df_p['n_click']/df_p['duration']
df_p['normalized_duration'] = df_p['duration'] / df_p['n_click']
# Replace infinite values with 0
df_p['click_rate'].replace([np.inf, -np.inf], 0, inplace=True)

# freq back click
df_p['freq_back'] = df_p['n_back']/df_p['n_click']

df_p


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_p['click_rate'].replace([np.inf, -np.inf], 0, inplace=True)


Unnamed: 0,hashed_ip,timestamp,duration,path,rating,path_list,source_link,target_link,finished,type_unfinished,n_click,n_back,date,click_rate,normalized_duration,freq_back
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,,"[14th century, 15th century, 16th century, Pac...",14th century,African slave trade,True,,9,0,2011-02-15 04:26:49,0.054217,18.444444,0.000
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0,"[14th century, Europe, Africa, Atlantic slave ...",14th century,African slave trade,True,,5,0,2012-08-12 08:36:52,0.056818,17.600000,0.000
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,,"[14th century, Niger, Nigeria, British Empire,...",14th century,African slave trade,True,,8,0,2012-10-03 23:10:40,0.057971,17.250000,0.000
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,,"[14th century, Renaissance, Ancient Greece, Gr...",14th century,Greece,True,,4,0,2010-02-08 08:25:25,0.108108,9.250000,0.000
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0,"[14th century, Italy, Roman Catholic Church, H...",14th century,John F. Kennedy,True,,7,0,2013-04-23 17:27:08,0.040000,25.000000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76188,109ed71f571d86e9,1389787605,180,Franz_Kafka;Tuberculosis;World_Health_Organiza...,,"[Franz Kafka, Tuberculosis, World Health Organ...",Franz Kafka,Cholera,False,restart,8,1,2014-01-15 13:06:45,0.044444,22.500000,0.125
76189,232f992e57d43e8d,1389787697,6,Modern_history,,[Modern history],Modern history,Hollandic,False,restart,1,0,2014-01-15 13:08:17,0.166667,6.000000,0.000
76190,2e09a7224600a7cd,1389798400,1900,Computer_programming;Linguistics;Culture;Popul...,,"[Computer programming, Linguistics, Culture, P...",Computer programming,The_Beatles,False,timeout,5,1,2014-01-15 16:06:40,0.002632,380.000000,0.200
76191,60af9e2138051b96,1389799481,1903,Jamaica;United_Kingdom;World_War_II;Battle_of_...,,"[Jamaica, United Kingdom, World War II, Battle...",Jamaica,Alan_Turing,False,timeout,4,0,2014-01-15 16:24:41,0.002102,475.750000,0.000


# Encode paths nodes with article_id

In [9]:
# df for path name trnaspltion
df_names = pd.read_csv("data_processed/articles_processed.csv")

#Create a mapping dictionary from the DataFrame
mapping_dict = pd.Series(df_names.article_id.values, index=df_names.article_name).to_dict()
mapping_dict.update({"<":"<"})
#print(mapping_dict)


mapped_paths = []
for i, row in df_p.iterrows():
    path = row["path_list"]
    #print(path)
    mapped_path = [mapping_dict[item] for item in path]  
    #print(mapped_path)
    #print(i)
    mapped_paths.append(mapped_path)


df_p["path_list_id"] = mapped_paths

df_p

Unnamed: 0,hashed_ip,timestamp,duration,path,rating,path_list,source_link,target_link,finished,type_unfinished,n_click,n_back,date,click_rate,normalized_duration,freq_back,path_list_id
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,,"[14th century, 15th century, 16th century, Pac...",14th century,African slave trade,True,,9,0,2011-02-15 04:26:49,0.054217,18.444444,0.000,"[10, 12, 15, 3134, 377, 105, 128, 379, 143]"
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0,"[14th century, Europe, Africa, Atlantic slave ...",14th century,African slave trade,True,,5,0,2012-08-12 08:36:52,0.056818,17.600000,0.000,"[10, 1433, 128, 379, 143]"
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,,"[14th century, Niger, Nigeria, British Empire,...",14th century,African slave trade,True,,8,0,2012-10-03 23:10:40,0.057971,17.250000,0.000,"[10, 2982, 2984, 694, 3755, 128, 379, 143]"
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,,"[14th century, Renaissance, Ancient Greece, Gr...",14th century,Greece,True,,4,0,2010-02-08 08:25:25,0.108108,9.250000,0.000,"[10, 3464, 241, 1793]"
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0,"[14th century, Italy, Roman Catholic Church, H...",14th century,John F. Kennedy,True,,7,0,2013-04-23 17:27:08,0.040000,25.000000,0.000,"[10, 2183, 3529, 1836, 3542, 3342, 2266]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76188,109ed71f571d86e9,1389787605,180,Franz_Kafka;Tuberculosis;World_Health_Organiza...,,"[Franz Kafka, Tuberculosis, World Health Organ...",Franz Kafka,Cholera,False,restart,8,1,2014-01-15 13:06:45,0.044444,22.500000,0.125,"[1585, 4247, 4538, 2614, 3223, <, 3917, 1696]"
76189,232f992e57d43e8d,1389787697,6,Modern_history,,[Modern history],Modern history,Hollandic,False,restart,1,0,2014-01-15 13:08:17,0.166667,6.000000,0.000,[2802]
76190,2e09a7224600a7cd,1389798400,1900,Computer_programming;Linguistics;Culture;Popul...,,"[Computer programming, Linguistics, Culture, P...",Computer programming,The_Beatles,False,timeout,5,1,2014-01-15 16:06:40,0.002632,380.000000,0.200,"[1008, 2478, 1098, 3315, <]"
76191,60af9e2138051b96,1389799481,1903,Jamaica;United_Kingdom;World_War_II;Battle_of_...,,"[Jamaica, United Kingdom, World War II, Battle...",Jamaica,Alan_Turing,False,timeout,4,0,2014-01-15 16:24:41,0.002102,475.750000,0.000,"[2205, 4293, 4542, 506]"


## Resolve back-clicks

In [10]:
def replace_backClicks(path_arr):
  lastWasBC = False
  offset = 2
  #look into the path array and if you find a < character at position n, you replace it with the n-2 page.
  for idx, n in enumerate(path_arr):

    if n == "<" and idx >= 2:
      if(lastWasBC): 
        offset+=2
      path_arr[idx] = path_arr[idx-offset]
      lastWasBC = True
    else:
      lastWasBC = False
      offset = 2
      
  return path_arr


def resolve_backsteps(path):
    resolved_path = []  # Final resolved path
    resolved_path_stack = []  # Stack to track the unique articles added

    for item in path:
        if item != "<":  # Normal behavior, add article
            resolved_path.append(item)
            resolved_path_stack.append(item)
        elif item == "<":  # Handle the first backtracking step
            if len(resolved_path_stack) > 1:
                resolved_path_stack.pop()  # Remove the most recent article
                resolved_path.append(resolved_path_stack.pop())  # Append the second last article
                resolved_path_stack.append(resolved_path[-1])
    return resolved_path


# Apply the function to create a new column 'resolved_path_list_id'
df_p['resolved_path_list_id'] = df_p['path_list_id'].apply(resolve_backsteps)
# Save as array iof int
#df_p['resolved_path_list_id'] = df_p['resolved_path_list_id'].apply(lambda x: np.array(x, dtype=int))

# Add a column with resolved paths with names
def id2names(ids):
  list_names = []
  for id in ids:
     list_names.append(df_article["article_name"].iloc[id])
  return list_names
df_p['resolved_path_list_name'] = df_p['resolved_path_list_id'].apply(id2names)

# Display the DataFrame
display(df_p[['path_list_id', 'resolved_path_list_id', 'resolved_path_list_name']].head(20))


Unnamed: 0,path_list_id,resolved_path_list_id,resolved_path_list_name
0,"[10, 12, 15, 3134, 377, 105, 128, 379, 143]","[10, 12, 15, 3134, 377, 105, 128, 379, 143]","[14th century, 15th century, 16th century, Pac..."
1,"[10, 1433, 128, 379, 143]","[10, 1433, 128, 379, 143]","[14th century, Europe, Africa, Atlantic slave ..."
2,"[10, 2982, 2984, 694, 3755, 128, 379, 143]","[10, 2982, 2984, 694, 3755, 128, 379, 143]","[14th century, Niger, Nigeria, British Empire,..."
3,"[10, 3464, 241, 1793]","[10, 3464, 241, 1793]","[14th century, Renaissance, Ancient Greece, Gr..."
4,"[10, 2183, 3529, 1836, 3542, 3342, 2266]","[10, 2183, 3529, 1836, 3542, 3342, 2266]","[14th century, Italy, Roman Catholic Church, H..."
5,"[10, 1433, 3011, 4297, 3342, 2266]","[10, 1433, 3011, 4297, 3342, 2266]","[14th century, Europe, North America, United S..."
6,"[10, 894, 1820, 1511]","[10, 894, 1820, 1511]","[14th century, China, Gunpowder, Fire]"
7,"[10, 4147, 2165, 2466, 969, 3425]","[10, 4147, 2165, 2466, 969, 3425]","[14th century, Time, Isaac Newton, Light, Colo..."
8,"[10, 4147, 2466, 3425]","[10, 4147, 2466, 3425]","[14th century, Time, Light, Rainbow]"
9,"[10, 12, 3267, 2933, 4285, 969, 3425]","[10, 12, 3267, 2933, 4285, 969, 3425]","[14th century, 15th century, Plato, Nature, Ul..."


## Remove paths unfished with only one node (~ Unplayed games)

In [11]:
df_p = df_p[~((df_p["type_unfinished"] == "timeout") & (df_p["n_click"] == 1))]

#there are some paths that timed out without the player navigating, so path_length 1 and type timeout
print(f"The number of unfinished paths that timed out without the player making a move are {len(df_p[(df_p['type_unfinished'] == 'timeout') & (df_p['n_click'] == 1)])}")

The number of unfinished paths that timed out without the player making a move are 0


## Remove paths where initial and final nodes are the same

In [12]:
print(df_p[df_p["duration"] == 0].size)

df_p = df_p[df_p["duration"] != 0]

print(df_p[df_p["duration"] == 0].size)


209
0


## Check for Nan

In [13]:
# Cehck for nan values, after reomobn colsumn thata re not presdent in both df
missing_rows = df_p[df_p.drop(["rating", "type_unfinished"], axis=1).isnull().any(axis=1)]
missing_rows

Unnamed: 0,hashed_ip,timestamp,duration,path,rating,path_list,source_link,target_link,finished,type_unfinished,n_click,n_back,date,click_rate,normalized_duration,freq_back,path_list_id,resolved_path_list_id,resolved_path_list_name
10377,,1219094972,96,Heat;Latin;Europe;Italy,,"[Heat, Latin, Europe, Italy]",Heat,Italy,True,,4,0,2008-08-18 23:29:32,0.041667,24.0,0.0,"[1879, 2417, 1433, 2183]","[1879, 2417, 1433, 2183]","[Heat, Latin, Europe, Italy]"
13719,,1219094972,140,Tyrannosaurus;London;Temperate;Weather;Solar_S...,,"[Tyrannosaurus, London, Temperate, Weather, So...",Tyrannosaurus,Sun,True,,6,0,2008-08-18 23:29:32,0.042857,23.333333,0.0,"[4271, 2538, 4019, 4437, 3791, 3939]","[4271, 2538, 4019, 4437, 3791, 3939]","[Tyrannosaurus, London, Temperate, Weather, So..."
15545,,1219094972,106,Tropical_Storm_Bonnie_%282004%29;Gulf_of_Mexic...,,"[Tropical Storm Bonnie (2004), Gulf of Mexico,...",Tropical Storm Bonnie (2004),Rhodium,True,,12,3,2008-08-18 23:29:32,0.113208,8.833333,0.25,"[4225, 1818, <, 2931, 1888, 871, 381, 1345, <,...","[4225, 1818, 4225, 2931, 1888, 871, 381, 1345,...","[Tropical Storm Bonnie (2004), Gulf of Mexico,..."


## Save df

In [14]:
# Save
df_p.to_csv(os.path.join(SAVE_FOLDER, "all_articles_processed.csv"), index = False)

# BFS Matrix

In [15]:
shortest_distance_path = "data/wikispeedia_paths-and-graph/shortest-path-distance-matrix.txt"

def CreateShortestPathMatrix():
    f = open(shortest_distance_path, "r")

    num_articles = 0
    for line in f:
        if not (line[0] == "#" or len(line.strip()) == 0):
            num_articles += 1

    matrix = np.zeros(shape=(num_articles, num_articles))
    # reset pointer to start of file
    f.seek(0)

    current_article = 0
    for line in f:
        if line[0] == "#" or len(line.strip()) == 0:
            continue
        line = line.rstrip()
        digits = [int(d) if d != "_" else np.nan for d in line]
        matrix[current_article] = digits
        current_article += 1
    return matrix

shortest_path_matrix = CreateShortestPathMatrix()

print(shortest_path_matrix)

# returns shortest path distance between two articles, nan if cannot find the articles or no path exists
def ShortestPath(article_from, article_to):
    idx1 = df_names.index[df_names["article_name"] == article_from].tolist()
    idx2 = df_names.index[df_names["article_name"] == article_to].tolist()

    if not idx1:
        print(f"ShortestPath: can't find article {article_from}")
    if not idx2:
        print(f"ShortestPath: can't find article {article_to}")
    if not idx1 or not idx2:
        return np.nan

    return shortest_path_matrix[idx1[0]][idx2[0]]

ShortestPath(1, 0)

[[ 0. nan nan ...  4.  4.  2.]
 [nan  0. nan ...  3.  3.  3.]
 [nan nan  0. ...  3.  3.  3.]
 ...
 [nan nan nan ...  0.  3.  3.]
 [nan nan nan ...  4.  0.  3.]
 [nan nan nan ...  3.  3.  0.]]
ShortestPath: can't find article 1
ShortestPath: can't find article 0


nan

In [16]:
print(shortest_path_matrix.shape)

bfs_matrix = pd.DataFrame(shortest_path_matrix, index = df_names["article_name"], columns=df_names["article_name"])

display(bfs_matrix)

bfs_matrix.to_csv("data_processed/bfs_matrix.csv")

(4604, 4604)


article_name,Áedán mac Gabráin,Åland,Édouard Manet,Éire,Óengus I of the Picts,€2 commemorative coins,10th century,11th century,12th century,13th century,...,Ziad Jarrah,Zimbabwe,Zinc,Zinc chloride,Zion National Park,Zionism,Zirconium,Zoroaster,Zuid-Gelders,Zulu
article_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Áedán mac Gabráin,0.0,,,,,,3.0,3.0,3.0,3.0,...,4.0,3.0,3.0,4.0,4.0,3.0,4.0,4.0,4.0,2.0
Åland,,0.0,,,,,2.0,2.0,2.0,2.0,...,4.0,2.0,3.0,4.0,4.0,3.0,4.0,3.0,3.0,3.0
Édouard Manet,,,0.0,,,,3.0,3.0,2.0,2.0,...,4.0,3.0,2.0,3.0,4.0,3.0,4.0,3.0,3.0,3.0
Éire,,,,0.0,,,3.0,3.0,3.0,3.0,...,4.0,2.0,2.0,3.0,4.0,3.0,4.0,4.0,3.0,3.0
Óengus I of the Picts,,,,,0.0,,2.0,2.0,3.0,2.0,...,4.0,2.0,3.0,4.0,4.0,3.0,4.0,3.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zionism,,,,,,,2.0,2.0,2.0,2.0,...,3.0,2.0,2.0,3.0,3.0,0.0,3.0,3.0,3.0,2.0
Zirconium,,,,,,,3.0,3.0,3.0,3.0,...,3.0,3.0,2.0,2.0,3.0,3.0,0.0,3.0,4.0,3.0
Zoroaster,,,,,,,2.0,2.0,2.0,2.0,...,3.0,2.0,2.0,3.0,4.0,3.0,3.0,0.0,3.0,3.0
Zuid-Gelders,,,,,,,3.0,3.0,3.0,3.0,...,4.0,3.0,3.0,4.0,4.0,3.0,5.0,4.0,0.0,3.0
