In [2]:
%load_ext autoreload
%autoreload 2

 
import os
import sys
sys.path.append("..")
import pandas as pd

# Importing the LLM models and the necessary modules

In [None]:
from langchain_mistralai import ChatMistralAI
from langchain_mistralai import MistralAIEmbeddings

# Read the key from the environment so a real secret never has to be pasted
# into (and saved with) the notebook. "##" is kept as the placeholder fallback,
# so the cell behaves as before when MISTRAL_API_KEY is not set.
mistral_api_key = os.environ.get("MISTRAL_API_KEY", "##")

# Chat model handle for Mistral.
mistral_llm_model = ChatMistralAI(
    api_key=mistral_api_key,
    model="mistral-large-latest",
    temperature=0,
    max_retries=2,
)

# Embedding model handle for Mistral; it shares the same API key.
mistral_embeddings_model = MistralAIEmbeddings(
    model="mistral-embed",
    api_key=mistral_api_key,
)

In [4]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Read the key from the environment so a real secret never has to be pasted
# into (and saved with) the notebook. "##" is kept as the placeholder fallback,
# so the cell behaves as before when OPENAI_API_KEY is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY", "##")

# Chat model used by the parser and the graph builder further down.
openai_llm_model = ChatOpenAI(
    api_key=openai_api_key,
    # OpenAI's model id is hyphenated ("gpt-4o-mini"); the previous value
    # "gpt4o-mini" is not a published model id.
    model="gpt-4o-mini",
    #temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Embedding model used alongside the chat model.
openai_embeddings_model = OpenAIEmbeddings(
    api_key=openai_api_key,
    model="text-embedding-3-large",
)


In [5]:
from atom.utils import LangchainOutputParser

# Parser bound to the OpenAI chat + embedding models; the factoid-extraction
# helper defined later calls it.
lg = LangchainOutputParser(
    embeddings_model=openai_embeddings_model,
    llm_model=openai_llm_model,
)

# Loading the datasets

In [7]:
# Load the example datasets from Excel workbooks into DataFrames.
# NOTE(review): `openai_tweets` is read from france_covid_history.xlsx, the
# same workbook as `france_covid_history` below. This looks like a copy-paste
# slip (a later cell reads openai_tweets["Tweet"]); confirm the intended file.
openai_tweets = pd.read_excel("../datasets/llms_history_and_openai_posts/france_covid_history.xlsx")
llms_history = pd.read_excel("../datasets/llms_history_and_openai_posts/llms_history.xlsx")
france_covid_history = pd.read_excel("../datasets/llms_history_and_openai_posts/france_covid_history.xlsx")

# Extracting triplets and factoids

In [None]:
from atom.utils import Factoid

async def extract_factoids(batch):
    """Extract one factoid phrase per parsed result for a batch of contexts.

    Args:
        batch: The contexts forwarded unchanged to the module-level parser
            ``lg`` (the notebook passes a DataFrame column).

    Returns:
        A list with the ``phrase`` attribute of each object the parser
        returns, in the order the parser returned them.
    """
    parsed_items = await lg.extract_information_as_json_for_context(
        contexts=batch,
        output_data_structure=Factoid,
    )

    phrases = []
    for item in parsed_items:
        phrases.append(item.phrase)
    return phrases

In [105]:
openai_tweets["factoids"] = await extract_factoids(openai_tweets["Tweet"])

Total number of batches: 1


# Building the TKG
---

In [42]:
from atom import Atom

# Graph builder backed by the OpenAI chat + embedding models.
# NOTE(review): the variable `atom` shares its name with the `atom` package;
# later `from atom... import` statements still resolve the package.
atom = Atom(
    embeddings_model=openai_embeddings_model,
    llm_model=openai_llm_model,
)

In [49]:
import ast 

def to_dictionary(df):
    """Group a DataFrame's factoids by observation date.

    Args:
        df: DataFrame with an "observation date" column and a "factoids"
            column. Each "factoids" cell is either a list or the string
            representation of a list (e.g. as stored in a spreadsheet cell).

    Returns:
        A dict mapping ``str(observation date)`` to the concatenation, in row
        order, of every factoid list recorded for that date. Keys follow the
        sorted group order produced by ``groupby``. An empty frame yields ``{}``.

    The input frame is not modified.
    """
    if df.empty:
        # Nothing to group; also avoids peeking at a first row that does not exist.
        return {}

    def _as_list(cell):
        # Decide per cell rather than by inspecting only the first row, so the
        # function does not depend on the index containing the label 0.
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    factoid_lists = df["factoids"].apply(_as_list)

    # Summing lists concatenates them, giving one combined list per date.
    grouped = factoid_lists.groupby(df["observation date"]).sum()

    return {str(date): factoids for date, factoids in grouped.items()}

In [54]:
# Build the {observation date -> list of factoids} mapping from the LLM-history frame.
llms_history_dict = to_dictionary(llms_history)

In [67]:
# Display the mapping to sanity-check the grouping by observation date.
llms_history_dict

{'30-12-2017': ['Google researchers introduced the Transformer architecture on June 12, 2017.',
  "The paper 'Attention Is All You Need' was released on June 12, 2017.",
  'The Transformer architecture revolutionized natural language processing on June 12, 2017.',
  'The Transformer architecture enables models to process data more efficiently through self-attention mechanisms.'],
 '30-12-2018': ['OpenAI released GPT-1 on June 11, 2018.',
  'GPT-1 is the first Generative Pre-trained Transformer model.',
  'GPT-1 demonstrated the effectiveness of unsupervised pre-training for language understanding tasks.',
  'Google introduced BERT on October 11, 2018.',
  'BERT stands for Bidirectional Encoder Representations from Transformers.',
  'BERT is a transformer-based model.',
  'BERT achieved state-of-the-art results on various NLP benchmarks.',
  'BERT understands context from both directions.'],
 '30-12-2019': ['OpenAI announced GPT-2 on February 14, 2019.',
  'GPT-2 is a significantly larg

In [70]:
kg_llms_mini_3= await atom.build_graph_from_different_obs_times(atomic_facts_with_obs_timestamps=llms_history_dict, rel_threshold=0.7, ent_threshold=0.8)

[INFO] ------- Extracting Triplets
Total number of batches: 1
[INFO] ------- Extracting Triplets
Total number of batches: 1
[INFO] ------- Extracting Triplets
Total number of batches: 1
[INFO] ------- Extracting Triplets
Total number of batches: 1
[INFO] ------- Extracting Triplets
Total number of batches: 1
[INFO] ------- Extracting Triplets
Total number of batches: 1
[INFO] ------- Extracting Triplets
Total number of batches: 1
[INFO] ------- Extracting Triplets
Total number of batches: 1
[INFO] ------- Building Atomic KGs
[INFO] ------- Adding Source Context to Atomic KGs
[INFO] ------- Merging Atomic KGs
[INFO] Wohoo! Entity was matched --- [gpt 2:model] --merged --> [gpt 2:product] (score=0.88)
[INFO] Wohoo! Entity was matched --- [gpt 2:model] --merged --> [gpt 2:algorithm] (score=0.86)
[INFO] Exact match for Entity: openai
[INFO] Wohoo! Entity was matched --- [gpt 2:algorithm] --merged --> [gpt 2:product] (score=0.85)
[INFO] ------- Adding Timestamps to Relationships
[INFO] ----

# Draw the graph
---

This final section visualizes the constructed knowledge graph with `GraphIntegrator`. It connects to a Neo4j graph database using the credentials specified below and renders the entities and relationships extracted from the input data.

In [71]:
from atom.graph_integration import GraphIntegrator


# Neo4j connection settings. Values come from the environment when set, so the
# password never has to be saved in the notebook; the defaults are the values
# originally hardcoded here ("##" is a placeholder, not a real password).
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
PASSWORD = os.environ.get("NEO4J_PASSWORD", "##")

# Pass the built knowledge graph to GraphIntegrator for visualisation.
GraphIntegrator(uri=URI, username=USERNAME, password=PASSWORD).visualize_graph(knowledge_graph=kg_llms_mini_3)