In [2]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn', Mutes warnings when copying a slice from a DataFrame.

In [3]:
# Load the TSV file, ignoring lines that start with '#'
articles = pd.read_csv('data/wikispeedia_paths-and-graph/articles.tsv', sep='\t', comment='#')
categories = pd.read_csv('data/wikispeedia_paths-and-graph/categories.tsv', sep='\t', comment='#')
links = pd.read_csv('data/wikispeedia_paths-and-graph/links.tsv', sep='\t', comment='#')
paths_finished = pd.read_csv('data/wikispeedia_paths-and-graph/paths_finished.tsv', sep='\t', comment='#')
paths_unfinished = pd.read_csv('data/wikispeedia_paths-and-graph/paths_unfinished.tsv', sep='\t', comment='#')
shortest_path = pd.read_csv('data/wikispeedia_paths-and-graph/shortest-path-distance-matrix.txt', sep='\t', comment='#', header=None)

In [4]:
links.columns = ['linkSource', 'linkTarget']
paths_finished.columns = ['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'rating']
paths_unfinished.columns = ['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'target', 'type']
paths_finished['status'] = 'finished'
paths_unfinished['status'] = 'unfinished'

# Concatenate the dataframes with the added 'status' column
concatenated_df = pd.concat(
    [
        paths_finished[['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'rating', 'status']],
        paths_unfinished[['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'target', 'type', 'status']]
    ],
    ignore_index=True
)

In [5]:
# Create a copy of concatenated_df to avoid modifying the original DataFrame
clean_merge = concatenated_df.copy()

# Modify 'durationInSec' in the new DataFrame based on the condition
for index, row in clean_merge.iterrows():
    if row['type'] == 'timeout':  # Check if the last element in 'path' is 'timeout'
        clean_merge.loc[index, 'durationInSec'] -= 1800  # Subtract 1800 from 'durationInSec'

In [6]:
# Process each row based on 'status' and set 'target' accordingly
for i, row in clean_merge.iterrows():
    if row['status'] == 'finished':
        # Split the path and select the last element
        path_segments = row['path'].split(';') if pd.notnull(row['path']) else []
        row['target'] = path_segments[-1]

# To apply these changes to the DataFrame directly
clean_merge.loc[clean_merge['status'] == 'finished', 'target'] = clean_merge['path'].dropna().str.split(';').str[-1]


In [7]:
clean_merge.to_csv('clean_merge.csv')

In [8]:
clean_merge.sample(10)

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,status,target,type
57039,100f45996ea4adac,1315850893,38,Minneapolis%2C_Minnesota;United_States;Preside...,,unfinished,Religious_Society_of_Friends,restart
57634,76baa89b5bf0f0df,1321294639,353,Drum_and_bass;Bristol;Ice_age;Climate;Weather,,unfinished,Soil,restart
47385,738879d35f1e86e9,1273978895,97,Stegosaurus;East_Africa;Indian_Ocean;Australia...,,finished,Kangaroo,
42770,4398032616bb1310,1254007062,62,Margaret_Sanger;United_States;Pacific_Ocean;Af...,,finished,Western_Sahara,
49933,11f274a11b01180a,1259068550,31,Van_Halen;Guitar;Musical_instrument;Flute,1.0,finished,Flute,
54026,2325e8e4315b363c,1303848450,30,Lusaka;Europe,,unfinished,Edward_VIII_of_the_United_Kingdom,restart
3292,04f22b1a33430c59,1315577149,17,Climate_change;North_Atlantic_oscillation;El_N...,1.0,finished,El_Ni%C3%B1o-Southern_Oscillation,
61988,5dbec8a62ba71d41,1344969761,7,Aluminium_chloride,,unfinished,Ku_Klux_Klan,restart
40070,65a490c216191d92,1313539314,255,Hello_Garci_scandal;Philippines;English_langua...,5.0,finished,Francis_Drake,
45759,0e3862ef0943ac7c,1249912167,47,Rail_transport;Iron;Manganese;Chromium;Molybde...,1.0,finished,Niobium,


In [9]:
clean_merge.fillna('finished', inplace=True)
clean_merge.sample(10)

  clean_merge.fillna('finished', inplace=True)


Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,status,target,type
29218,0d57c8c57d75e2f5,1288191394,160,The_Temptations;United_States;North_America;So...,1.0,finished,Chile,finished
4713,0b415aa936f6b968,1351200482,160,French_Revolution;Famine;Agriculture;Fossil_fu...,finished,finished,Calcium,finished
70429,73319ffd63eac41d,1368644071,30,Action_potential;Giraffe;<,finished,unfinished,The_Simpsons,timeout
75936,538bf36b082c7dd1,1388948446,1178,Jazz;United_States;Moon;Solar_System;Sun;Star;...,finished,unfinished,16_Cygni,timeout
14843,3504af4953148500,1348614073,108,Brain;Cell_%28biology%29;Human;Communication;T...,1.0,finished,Telephone,finished
72982,126c53a444812f50,1378744525,7,Famine;Soviet_Union,finished,unfinished,Arctic,restart
9372,5ee218bb01627793,1241160612,296,Russian_language;English_language;Liverpool;Lo...,finished,finished,Tim_Henman,finished
66112,4d5e123b578dc965,1353891168,78,Star_Trek;War;Market;Information,finished,unfinished,Aircraft_carrier,restart
19154,4336d85339763f62,1222831965,51,Socialism;Egypt;North_Africa;Africa;Tanzania,2.0,finished,Tanzania,finished
52981,282cc7a66f3d1d6c,1299795687,168,Sunlight;Earth;German_language;Germany;Gottfri...,finished,unfinished,Johann_Wolfgang_Goethe,timeout
