In [14]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn', Mutes warnings when copying a slice from a DataFrame.

In [15]:
# Load the TSV file, ignoring lines that start with '#'
articles = pd.read_csv('data/wikispeedia_paths-and-graph/articles.tsv', sep='\t', comment='#')
categories = pd.read_csv('data/wikispeedia_paths-and-graph/categories.tsv', sep='\t', comment='#')
links = pd.read_csv('data/wikispeedia_paths-and-graph/links.tsv', sep='\t', comment='#')
paths_finished = pd.read_csv('data/wikispeedia_paths-and-graph/paths_finished.tsv', sep='\t', comment='#')
paths_unfinished = pd.read_csv('data/wikispeedia_paths-and-graph/paths_unfinished.tsv', sep='\t', comment='#')
shortest_path = pd.read_csv('data/wikispeedia_paths-and-graph/shortest-path-distance-matrix.txt', sep='\t', comment='#', header=None)

In [16]:
links.columns = ['linkSource', 'linkTarget']
paths_finished.columns = ['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'rating']
paths_unfinished.columns = ['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'target', 'type']
paths_finished['status'] = 'finished'
paths_unfinished['status'] = 'unfinished'

# Concatenate the dataframes with the added 'status' column
concatenated_df = pd.concat(
    [
        paths_finished[['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'rating', 'status']],
        paths_unfinished[['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'target', 'type', 'status']]
    ],
    ignore_index=True
)

In [17]:
# Create a copy of concatenated_df to avoid modifying the original DataFrame
clean_merge = concatenated_df.copy()

# Modify 'durationInSec' in the new DataFrame based on the condition
for index, row in clean_merge.iterrows():
    if row['type'] == 'timeout':  # Check if the last element in 'path' is 'timeout'
        clean_merge.loc[index, 'durationInSec'] -= 1800  # Subtract 1800 from 'durationInSec'

In [18]:
clean_merge.loc[clean_merge['status'] == 'finished', 'target'] = clean_merge['path'].dropna().str.split(';').str[-1]
clean_merge['type'] = clean_merge['type'].fillna('finished')
clean_merge.to_csv('clean_merge.csv', index=False)
clean_merge.sample(10)


Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,status,target,type
43957,250d394a041d0d57,1304457326,46,Nintendo;Europe;Switzerland;Cheese,,finished,Cheese,finished
64800,3a960fe56f4c9590,1350797753,162,Sistine_Chapel_ceiling;Renaissance;Crusades;Fr...,,unfinished,Scottish_Highlands,timeout
21702,5975c03e7ce87ba5,1351324519,162,Pottery;Archaeology;Peru;South_America;Belize,3.0,finished,Belize,finished
10818,75bd20c663d02c8f,1365789537,60,Greek_mythology;20th_century;Telephone,,finished,Telephone,finished
4575,6b039e9953cf075e,1241523213,25,Flower;United_Kingdom,,finished,United_Kingdom,finished
51229,350c57b058550816,1303301079,89,Santiago%2C_Chile;South_America;Asia;Europe;Me...,3.0,finished,Minoan_civilization,finished
40308,632be30a1ea5722a,1323804486,81,Hindi;English_language;Latin;Ancient_Rome;Roma...,,finished,Roman_road,finished
55666,6da0054672544f18,1308790416,5,Olympic_Games,,unfinished,Lyme_disease,timeout
36191,52893d622beb4234,1378740520,53,Buenos_Aires;Horse;Brain,2.0,finished,Brain,finished
47190,0d57c8c57d75e2f5,1279811514,198,Space_Shuttle_Columbia_disaster;Earth%27s_atmo...,,finished,Winter,finished
