# Load the Dataset into 2 Dataframes


## Article DF
| Article | Category | linkTargets | Article_id | distances | plain_text | 
| ------- | ---------- | ------------ | ---------- | ---------- | ---------- |
| string | string | list[string] | string | dict[string, int] | string | 


## Path DF
| Path | uid | start_article | target_article | length | finished | distance | rating | reason of fail |
| ------- | ---------- | ------------ | ---------- | ---------- | ---------- | ----- | ---- | ------ |
| list[string] | string | string | string | int  | boolean | int | int | string |

In [None]:
%pip install -r requirements.txt

In [16]:
from src.load_data import *
import pandas as pd 

In [17]:
# create the directory `Data/dataframes`
import os
if not os.path.exists('Data/dataframes'):
    os.makedirs('Data/dataframes')

In [18]:
articles = load_articles_into_df()
categories = load_categories()
links = load_links()
paths_finished = load_paths_finished()
paths_unfinished = load_paths_unfinished()
distances = load_distances()

In [27]:
distances

KeyboardInterrupt: 

In [19]:
# Print the shapes of articles, categories, links
print("Articles shape: ", articles.shape)
print("Categories shape: ", categories.shape)
print("Links shape: ", links.shape)

Articles shape:  (4604, 1)
Categories shape:  (4598, 2)
Links shape:  (4587, 2)


In [20]:
# List the articles that are in articles and not in categories
articles_not_in_categories = articles[~articles['article'].isin(categories['article'])]
articles_not_in_categories

Unnamed: 0,article
1210,Directdebit
1231,Donation
1600,Friend_Directdebit
3253,Pikachu
3849,Sponsorship_Directdebit
4545,Wowpurchase


In [21]:
# List the articles that are in articles and not in link sources
articles_not_in_source_links = articles[~articles['article'].isin(links['linkSource'])]
articles_not_in_source_links


Unnamed: 0,article
441,Badugi
970,Color_Graphics_Adapter
1210,Directdebit
1231,Donation
1237,Douglas_DC-4
1257,Duchenne_muscular_dystrophy
2351,Klinefelter's_syndrome
2530,Local_community
2543,Lone_Wolf_(gamebooks)
3108,Osteomalacia


In [22]:
# List the articles that are in articles and not in link targets
articles_not_in_target_links = articles[~articles['article'].isin(links['linkTarget'].explode())]
articles_not_in_target_links

Unnamed: 0,article
0,Áedán_mac_Gabráin
1,Åland
2,Édouard_Manet
3,Éire
4,Óengus_I_of_the_Picts
...,...
4566,Yellowhammer
4575,Yotsuya_Kaidan
4576,You're_Still_the_One
4580,"Yungay,_Peru"


In [23]:

# | Article | Category | linkTargets | Article_id | distances | plain_text | 

article_dataframe = articles

artiles_unrendered_unicode = load_articles_into_df(do_decode=False)
# Add articles_unrendered_unicode to article_dataframe
article_dataframe['article_unrendered_unicode'] = artiles_unrendered_unicode['article']

# Merge the articles and categories dataframes
article_dataframe = pd.merge(article_dataframe, categories, on='article', how='left')

# Merge the article_dataframe and links dataframes
article_dataframe = pd.merge(article_dataframe, links, left_on='article', right_on='linkSource', how='left')

# Merge the article_dataframe by inserting dictionary into `distances` column` where distances is dict[article_name, dict]
article_dataframe['distances'] = article_dataframe['article'].map(distances)

# for each row in article_dataframe, 
# load the plain_text from the file `Data/plaintext_articles/{article_name}.txt`
def load_plain_text(article_name):
    file_path = f'Data/plaintext_articles/{article_name}.txt'
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    return None

article_dataframe['plain_text'] = article_dataframe['article_unrendered_unicode'].apply(load_plain_text)


article_dataframe.to_feather('Data/dataframes/article_dataframe.feather')

In [24]:
article_dataframe

Unnamed: 0,article,article_unrendered_unicode,category,linkSource,linkTarget,distances,plain_text
0,Áedán_mac_Gabráin,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,[subject.History.British_History.British_Histo...,Áedán_mac_Gabráin,"[Bede, Columba, Dál_Riata, Great_Britain, Irel...","{'Áedán_mac_Gabráin': 0, 'Åland': None, 'Édoua...",#copyright\n\nÁedán mac Gabráin\n\n2007 Sch...
1,Åland,%C3%85land,"[subject.Countries, subject.Geography.European...",Åland,"[20th_century, Baltic_Sea, Crimean_War, Curren...","{'Áedán_mac_Gabráin': None, 'Åland': 0, 'Édoua...",#copyright\n\nÅland\n\n2007 Schools Wikiped...
2,Édouard_Manet,%C3%89douard_Manet,[subject.People.Artists],Édouard_Manet,"[Absinthe, Beer, Claude_Monet, Diego_Velázquez...","{'Áedán_mac_Gabráin': None, 'Åland': None, 'Éd...",#copyright\n\nÉdouard Manet\n\n2007 Schools...
3,Éire,%C3%89ire,"[subject.Countries, subject.Geography.European...",Éire,"[Canada, English_language, George_VI_of_the_Un...","{'Áedán_mac_Gabráin': None, 'Åland': None, 'Éd...",#copyright\n\nÉire\n\n2007 Schools Wikipedi...
4,Óengus_I_of_the_Picts,%C3%93engus_I_of_the_Picts,[subject.History.British_History.British_Histo...,Óengus_I_of_the_Picts,"[Dál_Riata, Durham, England, Great_Britain, Ir...","{'Áedán_mac_Gabráin': None, 'Åland': None, 'Éd...",#copyright\n\nÓengus I of the Picts\n\n2007...
...,...,...,...,...,...,...,...
4599,Zionism,Zionism,"[subject.People.Political_People, subject.Reli...",Zionism,"[18th_century, 19th_century, Adolf_Hitler, Alb...","{'Áedán_mac_Gabráin': None, 'Åland': None, 'Éd...",#copyright\n\nZionism\n\n2007 Schools Wikip...
4600,Zirconium,Zirconium,[subject.Science.Chemistry.Chemical_elements],Zirconium,"[Aluminium, Arabic_language, Australia, Bicycl...","{'Áedán_mac_Gabráin': None, 'Åland': None, 'Éd...",#copyright\n\nZirconium\n\n2007 Schools Wik...
4601,Zoroaster,Zoroaster,[subject.People.Religious_figures_and_leaders],Zoroaster,"[18th_century, 9th_century, Afghanistan, Age_o...","{'Áedán_mac_Gabráin': None, 'Åland': None, 'Éd...",#copyright\n\nZoroaster\n\n2007 Schools Wik...
4602,Zuid-Gelders,Zuid-Gelders,"[subject.Geography.European_Geography, subject...",Zuid-Gelders,"[Brabantian, Dutch_language, East_Flemish, Hol...","{'Áedán_mac_Gabráin': None, 'Åland': None, 'Éd...",#copyright\n\nZuid-Gelders\n\n2007 Schools ...


## Create the links dataframe

| Path | uid | start_article | target_article | length | finished | distance | rating | reason of fail |


In [10]:
paths_unfinished

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,target,type
0,2426091a53125110,1297054935,1804,Obi-Wan_Kenobi,Microsoft,timeout
1,26141fd878806294,1297055651,1805,Julius_Caesar,Caracas,timeout
2,2b015fb8181c48f2,1297090819,1818,Malawi;Democracy;Alexander_the_Great,First_Crusade,timeout
3,53a53bc244e08a6a,1297094761,49,Paraguay,Mount_St._Helens,restart
4,53a53bc244e08a6a,1297099105,1808,Paraguay;Bolivia,Mount_St._Helens,timeout
...,...,...,...,...,...,...
24870,109ed71f571d86e9,1389787605,180,Franz_Kafka;Tuberculosis;World_Health_Organiza...,Cholera,restart
24871,232f992e57d43e8d,1389787697,6,Modern_history,Hollandic,restart
24872,2e09a7224600a7cd,1389798400,1900,Computer_programming;Linguistics;Culture;Popul...,The_Beatles,timeout
24873,60af9e2138051b96,1389799481,1903,Jamaica;United_Kingdom;World_War_II;Battle_of_...,Alan_Turing,timeout


In [11]:
# Add a "finished" column to paths_finished
paths_finished['finished'] = True
paths_finished['failure_reason'] = None
# Extract the start_article and target_article from the `path` column
paths_finished['start_article'] = paths_finished['path'].apply(lambda x: x.split(';')[0])
paths_finished['target_article'] = paths_finished['path'].apply(lambda x: x.split(';')[-1])

# Same to paths_unfinished
paths_unfinished['finished'] = False
# Clean the paths_unfinished dataframe
# Rename 'type' to 'failure_reason'
paths_unfinished.rename(columns={'type': 'failure_reason', 'target' : 'target_article'}, inplace=True)
# extract the start_article from the `path` column
paths_unfinished['start_article'] = paths_unfinished['path'].apply(lambda x: x.split(';')[0])

In [12]:
p = paths_unfinished[paths_unfinished['start_article'] == '<'] 
p

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,target_article,failure_reason,finished,start_article


In [13]:
# add paths_finished and paths_unfinished together
paths = pd.concat([paths_finished, paths_unfinished])

In [14]:
paths

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,finished,failure_reason,start_article,target_article
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,,True,,14th_century,African_slave_trade
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0,True,,14th_century,African_slave_trade
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,,True,,14th_century,African_slave_trade
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,,True,,14th_century,Greece
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0,True,,14th_century,John_F._Kennedy
...,...,...,...,...,...,...,...,...,...
24870,109ed71f571d86e9,1389787605,180,Franz_Kafka;Tuberculosis;World_Health_Organiza...,,False,restart,Franz_Kafka,Cholera
24871,232f992e57d43e8d,1389787697,6,Modern_history,,False,restart,Modern_history,Hollandic
24872,2e09a7224600a7cd,1389798400,1900,Computer_programming;Linguistics;Culture;Popul...,,False,timeout,Computer_programming,The_Beatles
24873,60af9e2138051b96,1389799481,1903,Jamaica;United_Kingdom;World_War_II;Battle_of_...,,False,timeout,Jamaica,Alan_Turing


In [15]:
paths.to_feather('Data/dataframes/paths.feather')