In [20]:
import pandas as pd
# Folder holding the raw TSV inputs read in this notebook. File names are
# appended directly to this string, so the trailing slash is required.
DATA_FOLDER = 'data/wikispeedia_paths-and-graph/'

In [21]:
# Load the finished paths; the first 15 lines of the file are skipped and the
# column names are supplied by hand. Only the columns used later are kept.
paths_finished = pd.read_csv(
    f'{DATA_FOLDER}paths_finished.tsv',
    sep='\t',
    skiprows=15,
    names=['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'rating'],
)[['hashedIpAddress', 'path', 'rating']]

In [22]:
# Impute missing ratings with the median rating. Only the 'rating' column is
# filled: a frame-wide fillna(scalar) would also write the rating median into
# any missing 'hashedIpAddress' or 'path' value.
paths_finished['rating'] = paths_finished['rating'].fillna(paths_finished['rating'].median())
# Snapshot of the cleaned frame, written while 'path' is still the raw
# ';'-separated string. NOTE(review): presumably the 'tests/' directory must
# already exist for this write to succeed - confirm.
paths_finished.to_csv('tests/paths_finished_cleaned.tsv', sep='\t', index=False)

In [23]:
# Turn each ';'-separated path string into a list of steps.
# Not idempotent: running this cell twice fails because lists have no .split().
paths_finished['path'] = paths_finished['path'].map(lambda raw_path: raw_path.split(';'))

In [24]:
# TODO: add a column for the semantic distance between the start and end articles

In [25]:
# Load the link graph (first 11 lines of the file skipped) and collapse it to
# one row per source article whose 'linkTarget' cell is the list of targets.
links = (
    pd.read_csv(
        f'{DATA_FOLDER}links.tsv',
        sep='\t',
        skiprows=11,
        names=['linkSource', 'linkTarget'],
    )
    .groupby('linkSource')
    .agg(lambda targets: targets.tolist())
)

In [29]:
def create_choices_dataframe(paths, links):
    """Expand navigation paths into one row per link choice.

    Each path is first rewritten so that every back-click ('<') is followed by
    the article the player returned to. For example ``['A', 'B', '<', 'C']``
    becomes ``['A', 'B', '<', 'A', 'C']``, which yields the choices
    ``A -> B``, ``B -> '<'`` and ``A -> C``.

    Parameters
    ----------
    paths : pandas.DataFrame
        Must contain a 'path' column whose values are lists of steps, each
        step being an article name or '<' for a back-click.
    links : pandas.DataFrame
        Indexed by source article, with a 'linkTarget' column holding the
        list of articles reachable from that source.

    Returns
    -------
    pandas.DataFrame
        Columns 'run_id', 'article', 'links', 'link_chosen'. 'run_id' is the
        position of the path within ``paths`` (0-based), not its index label,
        so it is renumbered when a filtered frame is passed in. Steps whose
        article is not in ``links.index`` (including '<' itself) produce no
        row. The frame has these four columns even when no row is produced.
    """
    columns = ['run_id', 'article', 'links', 'link_chosen']
    known_sources = links.index
    rows = []

    # Only the 'path' column is needed, so iterate over it directly instead of
    # materialising a Series per row with iterrows().
    for run_id, raw_path in enumerate(paths['path']):
        visited = []  # stack of articles; the top is the current page
        steps = []    # rewritten path: '<' followed by the page returned to
        for step in raw_path:
            if step == '<':
                if len(visited) < 2:
                    # Malformed path: there is no previous article to return
                    # to, so the back-click is ignored instead of raising
                    # IndexError on the empty stack.
                    continue
                steps.append('<')
                visited.pop()
            else:
                visited.append(step)
            # After a forward click this is the new article; after a
            # back-click it is the article the player landed on again.
            steps.append(visited[-1])

        for article, chosen in zip(steps, steps[1:]):
            # Skip '<' and any article with no recorded outgoing links.
            if article not in known_sources:
                continue
            rows.append({
                'run_id': run_id,
                'article': article,
                'links': links.loc[article]['linkTarget'],
                'link_chosen': chosen,
            })

    return pd.DataFrame(rows, columns=columns)

In [30]:
# Build the choices table from every finished path, including the ones that
# contain back-clicks ('<').
wikispeedia_choices = create_choices_dataframe(paths=paths_finished, links=links)
wikispeedia_choices

Unnamed: 0,run_id,article,links,link_chosen
0,0,14th_century,"[13th_century, 15th_century, Abacus, Aztec, Bl...",15th_century
1,0,15th_century,"[10th_century, 11th_century, 12th_century, 13t...",16th_century
2,0,16th_century,"[10th_century, 11th_century, 12th_century, 13t...",Pacific_Ocean
3,0,Pacific_Ocean,"[16th_century, 17th_century, 18th_century, 19t...",Atlantic_Ocean
4,0,Atlantic_Ocean,"[Aberdeen, Abidjan, Accra, Africa, Airship, Al...",Accra
...,...,...,...,...
295414,51316,Yarralumla%2C_Australian_Capital_Territory,"[Anglican_Church_of_Australia, Australia, Belg...",Australia
295415,51316,Australia,"[Adelaide, Agnosticism, Anglican_Communion, As...",United_States
295416,51316,United_States,"[Abraham_Lincoln, Advertising, Agriculture, Am...",Abraham_Lincoln
295417,51317,Ziad_Jarrah,"[Afghanistan, Aircraft, Arabic_language, Atlan...",Germany


In [31]:
# Keep only the paths that contain no back-click ('<'), then build the choices
# table from them. run_id is the position within the frame passed in, so the
# run_id values here do not line up with those of wikispeedia_choices.
paths_finished_no_back = paths_finished.loc[
    paths_finished['path'].map(lambda steps: '<' not in steps)
]
wikispeedia_choices_no_back = create_choices_dataframe(paths=paths_finished_no_back, links=links)
wikispeedia_choices_no_back

Unnamed: 0,run_id,article,links,link_chosen
0,0,14th_century,"[13th_century, 15th_century, Abacus, Aztec, Bl...",15th_century
1,0,15th_century,"[10th_century, 11th_century, 12th_century, 13t...",16th_century
2,0,16th_century,"[10th_century, 11th_century, 12th_century, 13t...",Pacific_Ocean
3,0,Pacific_Ocean,"[16th_century, 17th_century, 18th_century, 19t...",Atlantic_Ocean
4,0,Atlantic_Ocean,"[Aberdeen, Abidjan, Accra, Africa, Airship, Al...",Accra
...,...,...,...,...
199940,42321,Yarralumla%2C_Australian_Capital_Territory,"[Anglican_Church_of_Australia, Australia, Belg...",Australia
199941,42321,Australia,"[Adelaide, Agnosticism, Anglican_Communion, As...",United_States
199942,42321,United_States,"[Abraham_Lincoln, Advertising, Agriculture, Am...",Abraham_Lincoln
199943,42322,Ziad_Jarrah,"[Afghanistan, Aircraft, Arabic_language, Atlan...",Germany


In [32]:
# Write both choices tables as tab-separated files, without the index column.
# The 'links' cells are Python lists and are stored as their text form.
wikispeedia_choices.to_csv(
    'tests/wikispeedia_choices.tsv',
    sep='\t',
    index=False,
)
wikispeedia_choices_no_back.to_csv(
    'tests/wikispeedia_choices_no_back.tsv',
    sep='\t',
    index=False,
)