Note: This notebook has not been updated since major changes were made to the project structure. Errors may arise from incorrectly specified relative paths for data loading, dependency imports, etc.

### A. Import relevant dependencies

In [18]:
import json
import os
import pickle
import sys

import numpy as np
import pandas as pd
import torch
import yaml
from torch_geometric.loader import NeighborLoader
from torch_geometric.utils import to_undirected
from torch_geometric.utils.convert import from_networkx

# Resolve the project root, then make the local "src" folder importable.
try:
    # Script context: __file__ exists, and the root is the parent of the script's folder
    current_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.abspath(os.path.join(current_dir, os.pardir))
except NameError:
    # Notebook context: __file__ is undefined, so fall back to the working
    # directory, which is assumed to be notebooks/multihop_dataset/
    two_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir)
    project_root = os.path.abspath(two_levels_up)

src_path = os.path.join(project_root, "src")
# Register "src" only once, even if this cell is executed repeatedly
if src_path not in sys.path:
    sys.path.append(src_path)

### B. Corpus

In [19]:
# Location of the corpus file inside the project's data folder
multihop_corpus_path = os.path.join(
    project_root,
    "data",
    "Multi-hop_RAG_dataset",
    "corpus.json",
)

# Parse the whole JSON file into memory
with open(multihop_corpus_path, mode="r", encoding="utf-8") as corpus_file:
    corpus = json.load(corpus_file)

In [20]:
# Build a DataFrame from the loaded corpus records
corpus_as_df = pd.DataFrame(corpus)
# Per-column summary statistics (count / unique / top / freq), shown as the cell output
corpus_as_df.describe()

Unnamed: 0,title,author,source,published_at,category,url,body
count,609,545,609,609,609,609,609
unique,609,300,49,606,6,609,609
top,200+ of the best deals from Amazon's Cyber Mon...,Natasha Lomas,Sporting News,2023-10-20T21:30:28+00:00,sports,https://mashable.com/article/cyber-monday-deal...,"Table of Contents Table of Contents Echo, Fire..."
freq,1,14,101,2,211,1,1


This corpus comprises 609 articles carrying 300 distinct author bylines (64 articles have no author recorded), drawn from 49 different sources and 6 different categories.

In [21]:
# Distinct values of the "author" column (articles without an author show up as None)
corpus_as_df["author"].unique()

array([None, 'Stan Choe', 'Colum Dell, Yardbarker', 'Dan Treacy',
       'Ian Valentino', 'Rob Miech', 'Arunima Bhanot',
       'The New York Times', 'Morgan Little', 'Jacquelyn Melinek',
       'Kyle Wiggers', 'Robert Calcutt', 'Will Unwin', 'David Pierce',
       'Rebecca Bellan', 'Jeffrey Parkin', 'Johnny Yu',
       'Laura Hampson and Meredith Clark', 'Dilvin Yasa', 'Amber Raiken',
       'Christy Doran', 'Catherine Shu', 'Natasha Lomas', 'Sarah Perez',
       'Cade Metz, Karen Weise, Nico Grant and Mike Isaac',
       'Ingrid Lunden', 'Ben Weiss', 'Tim Biggs', 'David Suggs',
       'Bryan Murphy', 'Kyle Irving', 'Romain Dillet', 'Vinnie Iyer',
       'Kyle Bonn', 'Haje Jan Kamps', 'Nell Geraets', 'Chelsea Ritschel',
       'Alyssa Stringer', 'Jessica Conditt', 'Laura Snapes',
       'Elizabeth Lopatto', 'Paul Sawers', 'Pete Volk',
       'Nick Brinkerhoff', 'Dave Richard', 'Shanna McCarriston',
       'Martin Derbyshire', 'Sam Eggleston', 'James Wallace', 'Rob Smyth',
       '', '

In [22]:
# Distinct values of the "source" column
corpus_as_df["source"].unique()

array(['Mashable', 'The Sydney Morning Herald',
       'Cnbc | World Business News Leader', 'Yardbarker', 'Sporting News',
       'Essentially Sports', 'The New York Times', 'TechCrunch',
       'TalkSport', 'The Guardian', 'The Verge', 'Polygon',
       'The Independent - Life and Style', 'FOX News - Entertainment',
       'The Age', 'The Roar | Sports Writers Blog', 'Fortune', 'Engadget',
       'CBSSports.com', 'FOX News - Health', 'FOX News - Lifestyle',
       'Globes English | Israel Business Arena', 'Hacker News',
       'Eos: Earth And Space Science News', 'Music Business Worldwide',
       'Sky Sports', 'Business Line',
       'Scitechdaily | Science Space And Technology News 2017',
       'Zee Business',
       'Business Today | Latest Stock Market And Economy News India',
       'Sport Grill', 'Wide World Of Sports', 'Financial Times',
       'Iot Business News', 'BBC News - Technology',
       'The Independent - Sports', 'Yahoo News',
       'Live Science: The Most Interest

In [23]:
# Distinct values of the "category" column
corpus_as_df["category"].unique()

array(['entertainment', 'business', 'sports', 'technology', 'health',
       'science'], dtype=object)

In [24]:
# Article length measured in words

# Split each body on whitespace, then store the number of tokens as a new column
body_tokens = corpus_as_df["body"].str.split()
corpus_as_df["body_wordcount"] = body_tokens.str.len()
# Summarise the distribution of word counts
corpus_as_df["body_wordcount"].describe()



count      609.000000
mean      1746.008210
std       1338.843744
min        839.000000
25%       1036.000000
50%       1298.000000
75%       1938.000000
max      12387.000000
Name: body_wordcount, dtype: float64

In [25]:
# Article length measured in characters

# Store the string length of each body as a new column
body_text = corpus_as_df["body"]
corpus_as_df["body_charcount"] = body_text.str.len()
# Summarise the distribution of character counts
corpus_as_df["body_charcount"].describe()

count      609.000000
mean     10340.182266
std       7809.231296
min       4770.000000
25%       6112.000000
50%       7836.000000
75%      11471.000000
max      71034.000000
Name: body_charcount, dtype: float64

The median word count is ~1,300 words, roughly equivalent to 3.5 to 5 pages depending on formatting.

In [26]:
# Display the corpus frame, including the body_wordcount and body_charcount columns
corpus_as_df

Unnamed: 0,title,author,source,published_at,category,url,body,body_wordcount,body_charcount
0,200+ of the best deals from Amazon's Cyber Mon...,,Mashable,2023-11-27T08:45:59+00:00,entertainment,https://mashable.com/article/cyber-monday-deal...,"Table of Contents Table of Contents Echo, Fire...",987,5813
1,ASX set to drop as Wall Street’s September slu...,Stan Choe,The Sydney Morning Herald,2023-09-26T19:11:30+00:00,business,https://www.smh.com.au/business/markets/asx-se...,"ETF provider Betashares, which manages $30 bil...",952,5649
2,Amazon sellers sound off on the FTC's 'long-ov...,,Cnbc | World Business News Leader,2023-10-06T21:31:00+00:00,business,https://www.cnbc.com/2023/10/06/amazon-sellers...,A worker sorts out parcels in the outbound doc...,1626,9801
3,"Christmas Day preview: 49ers, Ravens square of...","Colum Dell, Yardbarker",Yardbarker,2023-12-24T23:34:39+00:00,sports,https://www.yardbarker.com/nfl/articles/christ...,"Christmas Day isn't just for the NBA, as the N...",1101,6625
4,"Raiders vs. Lions live score, updates, highlig...",Dan Treacy,Sporting News,2023-10-30T22:20:03+00:00,sports,https://www.sportingnews.com/us/nfl/news/raide...,The Lions just needed to get themselves back i...,2766,15385
...,...,...,...,...,...,...,...,...,...
604,Cannabis use in pregnancy linked to small birt...,,Live Science: The Most Interesting Articles,2023-12-13T17:21:53+00:00,science,https://www.livescience.com/health/fertility-p...,Using cannabis during pregnancy may come with ...,934,5800
605,"Walks, tech and protein: how to parent your ow...",Annabel Streets,The Guardian,2023-10-10T09:00:35+00:00,entertainment,https://www.theguardian.com/lifeandstyle/2023/...,Parenting no longer starts and stops with our ...,1075,6557
606,Super Mario Bros. Wonder is a perfect end for ...,Chris Plante,Polygon,2023-10-18T13:00:00+00:00,entertainment,https://www.polygon.com/reviews/23920680/super...,The second level of Super Mario Bros. Wonder i...,1664,9857
607,The best apps and tools for managing your mone...,David Pierce,The Verge,2023-11-12T13:00:00+00:00,technology,https://www.theverge.com/2023/11/12/23955681/b...,"Hi, friends! Welcome to Installer No. 14, your...",2585,14259


In [33]:
# Print the full body text of one randomly drawn article (unseeded, so it varies per run)
random_article = corpus_as_df.sample()
print(random_article["body"].values[0])

Several would-be antidotes to the supersized and high-priced EV trend kicked the bucket this year. That’s bad news for everyone, even if you’d pick a beefy Cybertruck over something resembling a teeny Kei car.

Cars in general are no substitute for more efficient modes of transportation, including trains and bikes. But the last thing the climate and most budgets need is another tank-like truck or SUV. Large, heavy electric vehicles demand more materials and energy than smaller EVs; that means more avoidable greenhouse gas emissions during and after production. Despite these downsides, automakers build big because big sells.

Numerous vehicles from startups and some legacy automakers challenge the “bigger is better” notion via compact designs and smaller price tags. Yet, little typically comes of their resistance. We observed as much with several ill-fated EVs this year, including the ElectraMeccanica Solo, Sono Sion and Mazda MX-30.

Here we’ll take a look at the small and low(er)-pric

### C. Queries

In [13]:
# Location of the queries file inside the project's data folder
multihop_queries_path = os.path.join(
    project_root,
    "data",
    "Multi-hop_RAG_dataset",
    "MultiHopRAG.json",
)

# Parse the whole JSON file into memory
with open(multihop_queries_path, mode="r", encoding="utf-8") as queries_file:
    queries = json.load(queries_file)

In [14]:
# Inspect the first query record to see its structure
queries[0]

{'query': 'Who is the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by prosecutors of committing fraud for personal gain?',
 'answer': 'Sam Bankman-Fried',
 'question_type': 'inference_query',
 'evidence_list': [{'title': 'The FTX trial is bigger than Sam Bankman-Fried',
   'author': 'Elizabeth Lopatto',
   'url': 'https://www.theverge.com/2023/9/28/23893269/ftx-sam-bankman-fried-trial-evidence-crypto',
   'source': 'The Verge',
   'category': 'technology',
   'published_at': '2023-09-28T12:00:00+00:00',
   'fact': 'Before his fall, Bankman-Fried made himself out to be the Good Boy of crypto — the trustworthy face of a sometimes-shady industry.'},
  {'title': 'SBF’s trial starts soon, but how did he — and FTX — get here?',
   'author': 'Jacquelyn Melinek',
   'url': 'https://techcrunch.com/2023/10/01/ftx-lawsuit-timeline/',
   'source': 'TechCrunch',
   'catego

In [15]:
# Convert the loaded queries into a DataFrame
queries_as_df = pd.DataFrame(queries)

In [16]:
# Display the queries frame
queries_as_df

Unnamed: 0,query,answer,question_type,evidence_list
0,Who is the individual associated with the cryp...,Sam Bankman-Fried,inference_query,[{'title': 'The FTX trial is bigger than Sam B...
1,Which individual is implicated in both inflati...,Donald Trump,inference_query,[{'title': 'Donald Trump defrauded banks with ...
2,Who is the figure associated with generative A...,Sam Altman,inference_query,[{'title': 'OpenAI's ex-chairman accuses board...
3,Do the TechCrunch article on software companie...,Yes,comparison_query,"[{'title': 'Here’s how Rainforest, a budding S..."
4,Which online betting platform provides a welco...,Caesars Sportsbook,inference_query,[{'title': '2023 Kentucky online sports bettin...
...,...,...,...,...
2551,Who is the individual associated with using FT...,Sam Bankman-Fried,inference_query,[{'title': 'Sam Bankman-Fried was a terrible b...
2552,Who is the individual that was once likened to...,Sam Bankman-Fried,inference_query,"[{'title': 'SBF’s trial starts soon, but how d..."
2553,"Who, according to articles in Sporting News, s...",Bettors,inference_query,[{'title': 'Moneyline Betting - What is the Mo...
2554,Does the Cnbc | World Business News Leader art...,Yes,comparison_query,"[{'title': 'Sex, Signal messages, and sabotagi..."


In [17]:
# Distinct values of the "question_type" column
queries_as_df["question_type"].unique()

array(['inference_query', 'comparison_query', 'null_query',
       'temporal_query'], dtype=object)

In [45]:
# Draw one random query and show its full text.
# `sample_query` is defined in this cell so that it runs on a fresh kernel
# without depending on a later cell having been executed first.
sample_query = queries_as_df.sample()
sample_query['query'].values[0]

"Does the Engadget article claim that CyberGhost's cybersecurity measures exclude an independent security audit, a vulnerability disclosure program, and transparency reporting, while the TechCrunch article suggests that Keep Labs employs automated tools for code vulnerability assessments, indicating different approaches to product security?"

In [51]:
# Draw one random query (unseeded) and unpack its text and answer
sample_query = queries_as_df.sample()
sampled_row = sample_query.iloc[0]

query = sampled_row['query']
answer = sampled_row['answer']

print(f"Query: {query}")
print(f"Answer: {answer}")

# The cell output is the list of evidence entries attached to the sampled query
sampled_row['evidence_list']


Query: Which company recently underwent a leadership strategy allowing Ryan Petersen to focus more on customer relations, faced workforce reductions by the same individual, and had its internal affairs publicly described as "deeply concerning" by Dave Clark, as reported by Cnbc | World Business News Leader and TechCrunch?
Answer: Flexport


[{'title': "The inside story of Dave Clark's tumultuous last days at Flexport",
  'author': None,
  'url': 'https://www.cnbc.com/2023/10/02/the-inside-story-of-dave-clarks-tumultuous-last-days-at-flexport.html',
  'source': 'Cnbc | World Business News Leader',
  'category': 'business',
  'published_at': '2023-10-02T17:46:00+00:00',
  'fact': 'The co-CEO arrangement would free Petersen up to do what he loved – "getting beers with customers," in the words of two former Flexport employees.'},
 {'title': 'Cruise hits the brakes on driverless, UAW makes progress and more EV backpedaling',
  'author': 'Kirsten Korosec',
  'url': 'https://techcrunch.com/2023/10/30/cruise-hits-the-brakes-on-driverless-uaw-makes-progress-and-more-ev-backpedaling/',
  'source': 'TechCrunch',
  'category': 'technology',
  'published_at': '2023-10-30T10:15:24+00:00',
  'fact': 'Petersen has spent the past month cutting costs, including laying off about 20% of its workers, or about 600 people.'},
 {'title': 'Sam Al

### D. Mapping queries to relevant evidence

In [54]:
# Evidence entries attached to the first query
queries_as_df.iloc[0]['evidence_list']

[{'title': 'The FTX trial is bigger than Sam Bankman-Fried',
  'author': 'Elizabeth Lopatto',
  'url': 'https://www.theverge.com/2023/9/28/23893269/ftx-sam-bankman-fried-trial-evidence-crypto',
  'source': 'The Verge',
  'category': 'technology',
  'published_at': '2023-09-28T12:00:00+00:00',
  'fact': 'Before his fall, Bankman-Fried made himself out to be the Good Boy of crypto — the trustworthy face of a sometimes-shady industry.'},
 {'title': 'SBF’s trial starts soon, but how did he — and FTX — get here?',
  'author': 'Jacquelyn Melinek',
  'url': 'https://techcrunch.com/2023/10/01/ftx-lawsuit-timeline/',
  'source': 'TechCrunch',
  'category': 'technology',
  'published_at': '2023-10-01T14:00:29+00:00',
  'fact': 'The highly anticipated criminal trial for Sam Bankman-Fried, former CEO of bankrupt crypto exchange FTX, started Tuesday to determine whether he’s guilty of seven counts of fraud and conspiracy.'},
 {'title': 'Sam Altman backs teens’ startup, Google unveils the Pixel 8 an

In [56]:
# Title of the first evidence entry attached to the first query
queries_as_df.iloc[0]['evidence_list'][0]['title']

'The FTX trial is bigger than Sam Bankman-Fried'

In [60]:
# Corpus row index of the article whose title exactly matches that evidence title.
# .index[0] takes the first matching row and raises IndexError when nothing matches.
corpus_as_df[corpus_as_df['title'] == queries_as_df.iloc[0]['evidence_list'][0]['title']].index[0]

175

In [65]:
# Display the queries frame again for reference
queries_as_df

Unnamed: 0,query,answer,question_type,evidence_list
0,Who is the individual associated with the cryp...,Sam Bankman-Fried,inference_query,[{'title': 'The FTX trial is bigger than Sam B...
1,Which individual is implicated in both inflati...,Donald Trump,inference_query,[{'title': 'Donald Trump defrauded banks with ...
2,Who is the figure associated with generative A...,Sam Altman,inference_query,[{'title': 'OpenAI's ex-chairman accuses board...
3,Do the TechCrunch article on software companie...,Yes,comparison_query,"[{'title': 'Here’s how Rainforest, a budding S..."
4,Which online betting platform provides a welco...,Caesars Sportsbook,inference_query,[{'title': '2023 Kentucky online sports bettin...
...,...,...,...,...
2551,Who is the individual associated with using FT...,Sam Bankman-Fried,inference_query,[{'title': 'Sam Bankman-Fried was a terrible b...
2552,Who is the individual that was once likened to...,Sam Bankman-Fried,inference_query,"[{'title': 'SBF’s trial starts soon, but how d..."
2553,"Who, according to articles in Sporting News, s...",Bettors,inference_query,[{'title': 'Moneyline Betting - What is the Mo...
2554,Does the Cnbc | World Business News Leader art...,Yes,comparison_query,"[{'title': 'Sex, Signal messages, and sabotagi..."


In [70]:
# First evidence entry of the query at position 2
queries_as_df.iloc[2]['evidence_list'][0]

{'title': "OpenAI's ex-chairman accuses board of going rogue in firing Altman: 'Sam and I are shocked and saddened by what the board did'",
 'author': "Matt O'Brien, The Associated Press",
 'url': 'https://fortune.com/2023/11/18/how-did-openai-fire-sam-altman-greg-brockman-rogue-board/',
 'source': 'Fortune',
 'category': 'business',
 'published_at': '2023-11-18T15:33:09+00:00',
 'fact': 'Altman’s exit “is indeed shocking as he has been the face of” generative AI technology, said Gartner analyst Arun Chandrasekaran.'}

In [68]:
# Map every query to the corpus row indices of its evidence articles.
# Evidence entries are matched to corpus rows by exact article title.

# Build the title -> corpus index lookup once, instead of scanning the whole
# corpus frame for every evidence entry. setdefault keeps the FIRST row for a
# given title, which matches the `.index[0]` behaviour of a boolean-mask lookup.
title_to_corpus_index = {}
for corpus_idx, title in corpus_as_df['title'].items():
    title_to_corpus_index.setdefault(title, corpus_idx)

all_evidences = []
for i, evidence_list in enumerate(queries_as_df['evidence_list']):
    evidence_index_list = []
    for evidence in evidence_list:
        title = evidence['title']
        if title not in title_to_corpus_index:
            # Fail loudly and say which query/title is the problem
            raise KeyError(
                f"Query at position {i}: evidence title not found in corpus: {title!r}"
            )
        evidence_index_list.append(title_to_corpus_index[title])
    # Queries with an empty evidence list simply get an empty index list
    all_evidences.append(evidence_index_list)

# One list of corpus indices per query, aligned with the rows of queries_as_df
queries_as_df['evidence_index'] = all_evidences

In [74]:
# Evidence entries attached to the query at position 2552
queries_as_df.iloc[2552]['evidence_list']

[{'title': 'SBF’s trial starts soon, but how did he — and FTX — get here?',
  'author': 'Jacquelyn Melinek',
  'url': 'https://techcrunch.com/2023/10/01/ftx-lawsuit-timeline/',
  'source': 'TechCrunch',
  'category': 'technology',
  'published_at': '2023-10-01T14:00:29+00:00',
  'fact': 'Bankman-Fried was even compared to Warren Buffet and many called him the white horse of crypto (TechCrunch never did, for what it’s worth).'},
 {'title': "Sex, Signal messages, and sabotaging FTX's code: SBF execs and Bahamas roommates tell all in court",
  'author': None,
  'url': 'https://www.cnbc.com/2023/10/06/sex-signal-messages-and-sabotaging-ftxs-code-sbf-criminal-trial.html',
  'source': 'Cnbc | World Business News Leader',
  'category': 'business',
  'published_at': '2023-10-06T20:17:00+00:00',
  'fact': "During cross-examination, Huang said Paradigm pressed Bankman-Fried on the board issue and was told he didn't want investors as directors but he did plan on having a board with experts."},
 {

In [71]:
# Destination for the processed queries frame, under the project's data folder
queries_processed_path = os.path.join(project_root, "data", "multihop_queries_processed.pkl")

# Persist the frame in its current state as a pickle file.
# NOTE(review): pickle files should only ever be loaded from trusted sources.
queries_as_df.to_pickle(queries_processed_path)