In [1]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn', Mutes warnings when copying a slice from a DataFrame.

In [2]:
# Load the TSV file, ignoring lines that start with '#'
articles = pd.read_csv('data/wikispeedia_paths-and-graph/articles.tsv', sep='\t', comment='#')
categories = pd.read_csv('data/wikispeedia_paths-and-graph/categories.tsv', sep='\t', comment='#')
links = pd.read_csv('data/wikispeedia_paths-and-graph/links.tsv', sep='\t', comment='#')
paths_finished = pd.read_csv('data/wikispeedia_paths-and-graph/paths_finished.tsv', sep='\t', comment='#')
paths_unfinished = pd.read_csv('data/wikispeedia_paths-and-graph/paths_unfinished.tsv', sep='\t', comment='#')
shortest_path = pd.read_csv('data/wikispeedia_paths-and-graph/shortest-path-distance-matrix.txt', sep='\t', comment='#', header=None)

In [3]:
links.columns = ['linkSource', 'linkTarget']
paths_finished.columns = ['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'rating']
paths_unfinished.columns = ['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'target', 'type']
paths_finished['status'] = 'finished'
paths_unfinished['status'] = 'unfinished'

# Concatenate the dataframes with the added 'status' column
concatenated_df = pd.concat(
    [
        paths_finished[['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'rating', 'status']],
        paths_unfinished[['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'target', 'type', 'status']]
    ],
    ignore_index=True
)

In [4]:
# Create a copy of concatenated_df to avoid modifying the original DataFrame
clean_merge = concatenated_df.copy()

# Modify 'durationInSec' in the new DataFrame based on the condition
for index, row in clean_merge.iterrows():
    if row['type'] == 'timeout':  # Check if the last element in 'path' is 'timeout'
        clean_merge.loc[index, 'durationInSec'] -= 1800  # Subtract 1800 from 'durationInSec'

In [5]:
# Process each row based on 'status' and set 'target' accordingly
for i, row in clean_merge.iterrows():
    if row['status'] == 'finished':
        # Split the path and select the last element
        path_segments = row['path'].split(';') if pd.notnull(row['path']) else []
        row['target'] = path_segments[-1]

# To apply these changes to the DataFrame directly
clean_merge.loc[clean_merge['status'] == 'finished', 'target'] = clean_merge['path'].dropna().str.split(';').str[-1]


In [6]:
clean_merge.sample(10)

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,status,target,type
20240,556610582b5fc732,1274264145,401,Advertising;Ancient_Greece;<;United_States;<;A...,4.0,finished,Stonehenge,
25087,613cfa1c05cbbf9d,1222885439,127,Spacecraft_propulsion;Mars_Reconnaissance_Orbi...,4.0,finished,Bald_Eagle,
57329,3316cb8866a15793,1318658202,151,New_Zealand;United_Kingdom;England;London;Cult...,,unfinished,Krazy_Kat,timeout
5411,4207817a7bf1fcec,1272442921,43,Internet;German_language;Nazism,,finished,Nazism,
76148,3f166d0a191b66e6,1389688545,30,13th_century;Time,,unfinished,Climate_change,restart
3657,482972df0e234ce2,1260239473,167,David_Beckham;England;Jet_engine;Hydrogen;Heli...,1.0,finished,Actinium,
3537,7f930cd75dc18bcc,1329753073,76,Computer_science;Bioinformatics;Chemistry;Crys...,,finished,Edible_salt,
17751,75bd20c663d02c8f,1365713435,84,Ghana;England;Germany;Albert_Einstein;Quantum_...,,finished,String_theory,
39333,08e189ab09da7db4,1230084658,148,Glaciology;Geology;Crust_%28geology%29;Contine...,3.0,finished,Gardening,
27317,469181a156678629,1249175233,129,Ice_hockey;Cricket;<;Denmark;Sunlight;Photosyn...,2.0,finished,Sunflower,


In [7]:
clean_merge.fillna('finished', inplace=True)
clean_merge.to_csv('clean_merge.csv')
clean_merge.sample(10)

  clean_merge.fillna('finished', inplace=True)


Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,status,target,type
69276,546557f16be18156,1365780793,5,Data_Encryption_Standard,finished,unfinished,Vampire_bat,timeout
73383,21bf0de12623db9d,1380324312,246,Detroit%2C_Michigan;Chicago,finished,unfinished,Royal_National_Theatre,timeout
65099,11ef55361d13da9c,1351304221,97,Mars_Exploration_Rover;Cornell_University;Chem...,finished,unfinished,Engineering,timeout
15294,1afd1270383bc1fc,1388204772,157,Brain;Mind;Linguistics;Language;Communication;...,finished,finished,Telephone,finished
61261,6f7e60880ae91e69,1344153997,181,Exploration_of_Mars;Europe;British_Isles;Great...,finished,unfinished,Chelsea_F.C.,restart
53245,443d7b8528e2e471,1300598434,967,Theatre;Opera;Richard_Wagner;Germany;Country;E...,finished,unfinished,Zebra,restart
4435,1359a8ee0d6e01c7,1249083976,130,Evolution;United_States;Irish_people,2.0,finished,Irish_people,finished
60565,6594d284353701cc,1340888110,28,First_Macedonian_War,finished,unfinished,Love,restart
29614,1d6271903dd9e42c,1369165394,42,Honey;Water;Fish;Aquarium,3.0,finished,Aquarium,finished
36677,14adab216bfe1e0b,1236047974,166,Che_Guevara;DNA;Animal;Mammal;Chordate;<;Lion;...,3.0,finished,Bear,finished
