In [7]:
import json
import os
import pandas as pd
import networkx as nx
import gremlin_python
from langchain_text_splitters import RecursiveCharacterTextSplitter
#import matplotlib.pyplot as plt

In [8]:
# Path to corpus file
multihop_corpus_path = os.path.join("..", "data", "Multi-hop_RAG_dataset", "corpus.json")

# Read JSON
with open(multihop_corpus_path, "r", encoding="utf-8") as f:
    corpus = json.load(f)

# Convert corpus data into df
corpus_as_df = pd.DataFrame(corpus)

In [12]:
# Add new feature to act as a unique identifier for each article
corpus_as_df["article_id"] = corpus_as_df.index.copy()
corpus_as_df.head()

Unnamed: 0,title,author,source,published_at,category,url,body,article_id
0,200+ of the best deals from Amazon's Cyber Mon...,,Mashable,2023-11-27T08:45:59+00:00,entertainment,https://mashable.com/article/cyber-monday-deal...,"Table of Contents Table of Contents Echo, Fire...",0
1,ASX set to drop as Wall Street’s September slu...,Stan Choe,The Sydney Morning Herald,2023-09-26T19:11:30+00:00,business,https://www.smh.com.au/business/markets/asx-se...,"ETF provider Betashares, which manages $30 bil...",1
2,Amazon sellers sound off on the FTC's 'long-ov...,,Cnbc | World Business News Leader,2023-10-06T21:31:00+00:00,business,https://www.cnbc.com/2023/10/06/amazon-sellers...,A worker sorts out parcels in the outbound doc...,2
3,"Christmas Day preview: 49ers, Ravens square of...","Colum Dell, Yardbarker",Yardbarker,2023-12-24T23:34:39+00:00,sports,https://www.yardbarker.com/nfl/articles/christ...,"Christmas Day isn't just for the NBA, as the N...",3
4,"Raiders vs. Lions live score, updates, highlig...",Dan Treacy,Sporting News,2023-10-30T22:20:03+00:00,sports,https://www.sportingnews.com/us/nfl/news/raide...,The Lions just needed to get themselves back i...,4


In [None]:
# create graph
G = nx.DiGraph()

# Define sets to store unique authors, sources, and categories and avoid duplicates
authors = set()
sources = set()
categories = set()

# Iterate over rows in corpus and add nodes and edges to graph
for i, row in enumerate(corpus_as_df.iterrows()):
    if i == 10:
        break

    # Extract data from row
    article_id = row["article_id"]

    # Add article node
    G.add_node(article_id, title=row["title"], type="article")

    # If author is not in set, add author node and edges
    author_id = row["author"]
    if author_id not in authors:
        authors.add(author_id)
        G.add_node(author_id, type="author")

    G.add_edge(article_id, author_id, relation="WRITTEN_BY")  # (Ar -> Au)
    G.add_edge(author_id, article_id, relation="AUTHORED")    # (Au -> Ar)

    # If author is not in set, add author node and edges
    source_id = row["source"]
    if source_id not in sources:
        sources.add(source_id)
        G.add_node(source_id, type="source")

    G.add_edge(article_id, source_id, relation="PUBLISHED_IN")  # (Ar -> So)
    G.add_edge(source_id, article_id, relation="PUBLISHES")     # (So -> Ar)

    # If author is not in set, add author node and edges
    category_id = row["category"]
    if category_id not in categories:
        categories.add(author_id)
        G.add_node(category_id, type="category")

    G.add_edge(article_id, category_id, relation="CLASSIFIED_WITHIN")  # (Ar -> Ca)
    G.add_edge(category_id, article_id, relation="COVERED_IN")         # (Ca -> Ar)

In [4]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
texts = text_splitter.create_documents([corpus])
print(texts[0])
print(texts[1])

TypeError: expected string or bytes-like object, got 'list'