### Import raw datasets

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to the project's helper
# modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings

# Locations of the raw Wikispeedia path files, relative to this notebook.
DATA_FOLDER = "../data/"
PATHS_AND_GRAPH = DATA_FOLDER + "wikispeedia_paths-and-graph/"
PATHS_FINISHED = PATHS_AND_GRAPH + "paths_finished.tsv"
PATHS_UNFINISHED = PATHS_AND_GRAPH + "paths_unfinished.tsv"

# Leading columns shared by both path files; each file appends its own
# trailing columns after these.
_COMMON_PATH_COLUMNS = ["hashedIpAddress", "timestamp", "durationInSec", "path"]


def _load_paths(tsv_path, extra_columns, skiprows):
    """Read one headerless, tab-separated path file into a DataFrame.

    Parameters
    ----------
    tsv_path : str
        Path of the TSV file to read.
    extra_columns : list of str
        Names of the file-specific columns that follow the common ones.
    skiprows : int
        Number of leading lines to skip before the data starts
        (presumably the file's comment header - confirm against the raw file).

    Returns
    -------
    pandas.DataFrame
        The parsed rows, with ``timestamp`` converted from epoch seconds
        to ``datetime64``.
    """
    frame = pd.read_csv(
        tsv_path,
        sep="\t",
        header=None,
        names=_COMMON_PATH_COLUMNS + list(extra_columns),
        encoding="utf-8",
        skiprows=skiprows,
    )
    # read_csv already returns a freshly allocated frame, so no defensive
    # deep copy is needed before modifying it.
    frame["timestamp"] = pd.to_datetime(frame["timestamp"], unit="s")
    return frame


# The two files need a different number of skipped lines (16 vs. 17).
paths_finished = _load_paths(PATHS_FINISHED, ["rating"], skiprows=16)
paths_unfinished = _load_paths(PATHS_UNFINISHED, ["target", "type"], skiprows=17)

# Silence the FutureWarning noise emitted by pandas and seaborn on newer
# Python versions.
warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
from feature_49_generation import add_time_per_edge, add_BERTscore_metric, add_sentence_similarity_metric, split_into_edges

#### Get metric 4: time per edge

In [3]:
# Metric 4: time per edge. The returned frame (see the output below) carries
# two extra columns, `path_length` and `coarse_mean_time`; the latter appears
# to be durationInSec / path_length (e.g. 166 / 9 = 18.44 in the first row).
# Keep the result under its own name so later cells can reuse it instead of
# recomputing. Whether `paths_finished` is also modified in place is not
# visible from this notebook - TODO confirm in the helper module.
paths_finished_timed = add_time_per_edge(paths_finished)
paths_finished_timed

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,path_length,coarse_mean_time
0,6a3701d319fc3754,2011-02-15 03:26:49,166,14th_century;15th_century;16th_century;Pacific...,,9,18.444444
1,3824310e536af032,2012-08-12 06:36:52,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0,5,17.600000
2,415612e93584d30e,2012-10-03 21:10:40,138,14th_century;Niger;Nigeria;British_Empire;Slav...,,8,17.250000
3,64dd5cd342e3780c,2010-02-08 07:25:25,37,14th_century;Renaissance;Ancient_Greece;Greece,,4,9.250000
4,015245d773376aab,2013-04-23 15:27:08,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0,7,25.000000
...,...,...,...,...,...,...,...
51313,15a13a1d66ef5456,2012-10-03 02:23:35,66,Yagan;Ancient_Egypt;Civilization,,3,22.000000
51314,2ef7ac844cefda58,2011-03-16 05:42:18,165,Yagan;Folklore;Brothers_Grimm;<;19th_century;C...,3.0,9,18.333333
51315,12863abb7887f890,2013-11-22 04:42:52,228,Yagan;Australia;England;France;United_States;T...,,7,32.571429
51316,19f8284371753362,2011-02-27 07:42:47,56,Yarralumla%2C_Australian_Capital_Territory;Aus...,1.0,4,14.000000


#### Utility function to build an edge-based dataframe

In [4]:
# Build the edge-level frame from the finished paths. As the output shows, it
# has a single `edge` column holding pairs of article names (presumably
# (from_article, to_article) - confirm in the helper module). This frame is
# the second argument passed to both similarity-metric helpers in later cells.
edge_df = split_into_edges(paths_finished)
edge_df

Unnamed: 0,edge
0,"(Kazakhstan, Time_zone)"
1,"(Germany, Augustus)"
2,"(Alcohol, Salt)"
3,"(Neon, Aluminium)"
4,"(British_Isles, Iron_Age)"
...,...
55142,"(Beatles_for_Sale, Germany)"
55143,"(Carbon_dioxide, Argon)"
55144,"(Dinosaur, Cretaceous-Tertiary_extinction_event)"
55145,"(Johnston_Atoll, Pacific_Ocean)"


#### Old metric (BERTScore): shown for reference only, not used in the analysis

In [5]:
# Legacy BERTScore metric, kept for comparison only.
# The helper's result is indexable: element 0 is the per-path frame and
# element 1 the per-edge frame (see the two tables in the output).
bert_result = add_BERTscore_metric(paths_finished, edge_df)
display(bert_result[0], bert_result[1])

100%|██████████| 51318/51318 [02:08<00:00, 400.02it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  "mean_rating": [np.nanmean(a[:, 0]) for a in global_dict.values()],


Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,sucessive_pairs,sucessive_pairs_encoded,BERTscore
0,6a3701d319fc3754,2011-02-15 03:26:49,166,14th_century;15th_century;16th_century;Pacific...,,"[(14th_century, 15th_century), (15th_century, ...","[0.9999865336188821, 0.9996365919982175, 0.700...",0.728473
1,3824310e536af032,2012-08-12 06:36:52,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0,"[(14th_century, Europe), (Europe, Africa), (Af...","[0.9991989360002825, 0.9999737552461722, 0.999...",0.996099
2,415612e93584d30e,2012-10-03 21:10:40,138,14th_century;Niger;Nigeria;British_Empire;Slav...,,"[(14th_century, Niger), (Niger, Nigeria), (Nig...","[0.9996385206665049, 0.9999102184557794, 0.993...",0.918952
3,64dd5cd342e3780c,2010-02-08 07:25:25,37,14th_century;Renaissance;Ancient_Greece;Greece,,"[(14th_century, Renaissance), (Renaissance, An...","[0.9998434610819371, 0.9999529237925113, 0.999...",0.999921
4,015245d773376aab,2013-04-23 15:27:08,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0,"[(14th_century, Italy), (Italy, Roman_Catholic...","[0.9996227605786427, 0.9994910726698206, 0.997...",0.864102
...,...,...,...,...,...,...,...,...
51313,15a13a1d66ef5456,2012-10-03 02:23:35,66,Yagan;Ancient_Egypt;Civilization,,"[(Yagan, Ancient_Egypt), (Ancient_Egypt, Civil...","[0.8414925191386161, 0.6745653260705816]",0.758029
51314,2ef7ac844cefda58,2011-03-16 05:42:18,165,Yagan;Folklore;Brothers_Grimm;<;19th_century;C...,3.0,"[(Yagan, Folklore), (Folklore, Brothers_Grimm)...","[0.7061826798319183, 0.10386472073217597, 0.70...",0.668543
51315,12863abb7887f890,2013-11-22 04:42:52,228,Yagan;Australia;England;France;United_States;T...,,"[(Yagan, Australia), (Australia, England), (En...","[0.9795027929002406, 0.9999874136213855, 0.999...",0.947972
51316,19f8284371753362,2011-02-27 07:42:47,56,Yarralumla%2C_Australian_Capital_Territory;Aus...,1.0,"[(Yarralumla%2C_Australian_Capital_Territory, ...","[0.8427053940237842, 0.9968548124510662, 0.958...",0.932568


Unnamed: 0,edge,mean_bert_score,mean_rating
0,"(Kazakhstan, Time_zone)",0.998009,3.000000
1,"(Germany, Augustus)",0.997902,4.000000
2,"(Alcohol, Salt)",0.723050,3.000000
3,"(Neon, Aluminium)",0.187076,3.000000
4,"(British_Isles, Iron_Age)",0.725868,2.000000
...,...,...,...
55142,"(Beatles_for_Sale, Germany)",0.998158,2.000000
55143,"(Carbon_dioxide, Argon)",0.013619,3.666667
55144,"(Dinosaur, Cretaceous-Tertiary_extinction_event)",0.577140,1.500000
55145,"(Johnston_Atoll, Pacific_Ocean)",0.392250,2.000000


#### Sentence-transformers based word-pair similarity metric

In [6]:
# Sentence-transformers similarity metric for successive article pairs.
# Element 0 of the result is the per-path frame, element 1 the per-edge frame.
# NOTE(review): the displayed frames still label their score columns
# `BERTscore` / `mean_bert_score` (see output) although the values come from
# the sentence-similarity helper - consider renaming in the helper module.
sim_result = add_sentence_similarity_metric(paths_finished, edge_df)
display(sim_result[0], sim_result[1])

Batches:   0%|          | 0/66 [00:00<?, ?it/s]

Starting loop


Computing dot products: 100%|██████████| 51318/51318 [00:08<00:00, 5805.37it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  "mean_rating": [np.nanmean(a[:, 0]) for a in global_dict.values()],


Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,sucessive_pairs,sucessive_pairs_encoded,BERTscore
0,6a3701d319fc3754,2011-02-15 03:26:49,166,14th_century;15th_century;16th_century;Pacific...,,"[(14th_century, 15th_century), (15th_century, ...","[0.843832790851593, 0.8465811014175415, 0.2658...",0.596161
1,3824310e536af032,2012-08-12 06:36:52,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0,"[(14th_century, Europe), (Europe, Africa), (Af...","[0.2547887861728668, 0.650822639465332, 0.3347...",0.528948
2,415612e93584d30e,2012-10-03 21:10:40,138,14th_century;Niger;Nigeria;British_Empire;Slav...,,"[(14th_century, Niger), (Niger, Nigeria), (Nig...","[0.22429414093494415, 0.7818886637687683, 0.39...",0.488120
3,64dd5cd342e3780c,2010-02-08 07:25:25,37,14th_century;Renaissance;Ancient_Greece;Greece,,"[(14th_century, Renaissance), (Renaissance, An...","[0.3987607955932617, 0.3302411437034607, 0.666...",0.465273
4,015245d773376aab,2013-04-23 15:27:08,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0,"[(14th_century, Italy), (Italy, Roman_Catholic...","[0.17625783383846283, 0.30731645226478577, 0.2...",0.333869
...,...,...,...,...,...,...,...,...
51313,15a13a1d66ef5456,2012-10-03 02:23:35,66,Yagan;Ancient_Egypt;Civilization,,"[(Yagan, Ancient_Egypt), (Ancient_Egypt, Civil...","[0.17201462388038635, 0.40563252568244934]",0.288824
51314,2ef7ac844cefda58,2011-03-16 05:42:18,165,Yagan;Folklore;Brothers_Grimm;<;19th_century;C...,3.0,"[(Yagan, Folklore), (Folklore, Brothers_Grimm)...","[0.2101505845785141, 0.29172438383102417, 0.21...",0.321190
51315,12863abb7887f890,2013-11-22 04:42:52,228,Yagan;Australia;England;France;United_States;T...,,"[(Yagan, Australia), (Australia, England), (En...","[0.12989801168441772, 0.7208418846130371, 0.62...",0.396291
51316,19f8284371753362,2011-02-27 07:42:47,56,Yarralumla%2C_Australian_Capital_Territory;Aus...,1.0,"[(Yarralumla%2C_Australian_Capital_Territory, ...","[0.44925427436828613, 0.41598689556121826, 0.4...",0.436014


Unnamed: 0,edge,mean_bert_score,mean_rating
0,"(Kazakhstan, Time_zone)",0.230465,3.000000
1,"(Germany, Augustus)",0.160548,4.000000
2,"(Alcohol, Salt)",0.492251,3.000000
3,"(Neon, Aluminium)",0.403407,3.000000
4,"(British_Isles, Iron_Age)",0.365899,2.000000
...,...,...,...
55142,"(Beatles_for_Sale, Germany)",0.039925,2.000000
55143,"(Carbon_dioxide, Argon)",0.281742,3.666667
55144,"(Dinosaur, Cretaceous-Tertiary_extinction_event)",0.454815,1.500000
55145,"(Johnston_Atoll, Pacific_Ocean)",0.624163,2.000000
