In [None]:
import getpass
import os

# Turn on LangChain tracing for everything run from this kernel.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Prompt for the key only when it is not already present in the environment,
# so "Restart & Run All" does not block on an interactive prompt each time.
# getpass keeps the key out of the notebook source and the cell output.
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangChain API key: ")


# commands run:

## for langchain & langflow environment
1. pip install langchain
2. pip3 install torch torchvision
3. pip install langchain langsmith langchain-community langgraph langchain-cli langchainhub langchain-openai langchain-chroma bs4

## langflow start

python3 -m langflow run


## remote login

ssh -C -p 222 -L 7860:127.0.0.1:7860 cmejo@rstudio-tr.braverock.com # langflow http://localhost:7860/ 
ssh -C -p 222 -L 11434:127.0.0.1:11434 cmejo@rstudio-tr.braverock.com # for ollama http://localhost:11434/ 


In [None]:
# load and preprocess custom dataset

import json
import os
from transformers import AutoTokenizer, AutoModel
import numpy as np
import faiss
import torch

# Load a pre-trained Llama 3 model and tokenizer from the Hugging Face Hub.
# "meta-llama/llama3" is the name of Meta's GitHub repository, not a Hub model
# id, so from_pretrained() cannot resolve it. The 8B base checkpoint is
# published as "meta-llama/Meta-Llama-3-8B" (a gated repository: the licence
# must be accepted and the Hub session authenticated before downloading).
model_name = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Load custom dataset
def load_dataset(file_path):
    """Read a JSON dataset from disk.

    Args:
        file_path: Path to a JSON file.

    Returns:
        The decoded JSON content. Later cells iterate over it and read a
        "text" key from every element.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode as UTF-8 explicitly; without it open() falls back to the platform
    # default encoding and can fail on (or mis-decode) non-ASCII text.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

# Location of the dataset. DATASET_PATH (the same environment variable the
# Word2Vec cell reads) overrides the placeholder, so the path can be set in
# one place instead of being edited in every cell.
dataset_path = os.getenv("DATASET_PATH", "path/to/your/dataset.json")
dataset = load_dataset(dataset_path)

# Generate embeddings for the dataset
def generate_embeddings(text, tokenizer, model):
    """Embed ``text`` as the mean of the model's last hidden states.

    Args:
        text: String to embed.
        tokenizer: Callable that turns ``text`` into a mapping of PyTorch
            tensors when called with ``return_tensors="pt"``.
        model: Callable that accepts those tensors as keyword arguments and
            returns an object exposing ``last_hidden_state``.

    Returns:
        A float32 NumPy array: the hidden states averaged over dimension 1,
        with the leading batch dimension removed.
    """
    # truncation=True caps the input at the tokenizer's maximum length so a
    # long document cannot overflow the model's context window.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    pooled = outputs.last_hidden_state.mean(dim=1)
    # squeeze(0) removes only the batch axis. float()/cpu() make the tensor
    # convertible to a float32 array regardless of the dtype or device the
    # model was loaded with.
    return pooled.squeeze(0).float().cpu().numpy()

# Embed every document once. Row i of the matrix corresponds to dataset[i],
# which is how a FAISS hit is mapped back to its document later on.
embeddings_list = [generate_embeddings(doc["text"], tokenizer, model) for doc in dataset]
if not embeddings_list:
    # Without this guard an empty dataset fails below with an opaque
    # IndexError on embeddings_array.shape[1].
    raise ValueError(f"No documents found in {dataset_path!r}; cannot build a FAISS index.")
# FAISS operates on C-contiguous float32 matrices.
embeddings_array = np.ascontiguousarray(np.array(embeddings_list), dtype=np.float32)

# Create a FAISS index (exact L2 search over the embedding dimension)
dimension = embeddings_array.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_array)

# Save the index
faiss.write_index(index, "faiss_index.bin")


In [None]:
# langflow configuration

# NOTE: `langchain.chains` does not provide a `SimpleChain` class, so importing
# it raised ImportError before anything else in this cell could run. Nothing
# in the cell used it, so that import is dropped.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import Prompt
from langchain.vectorstores import FAISS

# Load the FAISS index written by the embedding cell
index = faiss.read_index("faiss_index.bin")

# Prompt template; {document} and {question} are filled in per request
prompt_template = """
You are an AI assistant specialized in answering questions based on the provided dataset. Given the following document text, provide a brief and accurate answer to the question:
Document: {document}
Question: {question}
Answer:
"""

# Define the function to find the most relevant document
def find_relevant_document(question, index, dataset, embeddings):
    """Return the text of the document closest to ``question`` in ``index``.

    Args:
        question: Query string.
        index: Object with a FAISS-style ``search(matrix, k)`` method that
            returns ``(distances, positions)``.
        dataset: Sequence of records with a "text" key, in the same order in
            which their vectors were added to ``index``.
        embeddings: Query embedder. LangChain-style objects are called through
            ``embed_query(question)``; objects that only offer
            ``generate_embeddings(question)`` are still supported.

    Returns:
        The "text" value of the best-matching record.

    Raises:
        LookupError: If the index returns no neighbour (FAISS reports this as
            position -1, e.g. when the index is empty).
    """
    if hasattr(embeddings, "embed_query"):
        vector = embeddings.embed_query(question)
    else:
        vector = embeddings.generate_embeddings(question)
    # FAISS expects a 2-D float32 matrix with one query per row.
    query_embedding = np.asarray(vector, dtype=np.float32).reshape(1, -1)
    _, positions = index.search(query_embedding, k=1)
    best = int(positions[0][0])
    if best < 0:
        # Without this check dataset[-1] would silently return the last
        # document as if it were a match.
        raise LookupError("The index returned no match for the question.")
    return dataset[best]["text"]

# Define the chatbot function
def rag_chatbot(question, index, dataset, embeddings, llm=None):
    """Answer ``question`` using the most relevant document as context.

    Args:
        question: Query string.
        index: Vector index, forwarded to ``find_relevant_document``.
        dataset: Records backing the index, forwarded unchanged.
        embeddings: Query embedder, forwarded unchanged.
        llm: Object with an ``invoke(prompt)`` method that returns the answer.
            When omitted, a LangChain Ollama client for the "llama3" model is
            created (it talks to the local Ollama server).

    Returns:
        Whatever ``llm.invoke`` returns for the filled-in prompt.
    """
    document = find_relevant_document(question, index, dataset, embeddings)
    # The template is a plain string, so str.format is all that is needed.
    # The previous code wrapped the result in Prompt(...) and called .run(),
    # but a prompt template has no run() method and never calls a model.
    prompt = prompt_template.format(document=document, question=question)
    if llm is None:
        # Imported lazily so callers that pass their own llm do not need the
        # Ollama integration installed.
        from langchain.llms import Ollama

        llm = Ollama(model="llama3")
    return llm.invoke(prompt)

# Example usage
class _QueryEmbedder:
    """Embed queries with the same tokenizer/model that built the FAISS index.

    HuggingFaceEmbeddings is configured with a ``model_name`` keyword, not with
    already-loaded model/tokenizer objects, and a different embedding model
    would produce vectors that do not match the index. This adapter reuses the
    notebook's ``generate_embeddings`` function instead.
    """

    def __init__(self, tokenizer, model):
        self._tokenizer = tokenizer
        self._model = model

    def generate_embeddings(self, text):
        """Return the query vector as a NumPy array."""
        return generate_embeddings(text, self._tokenizer, self._model)

    def embed_query(self, text):
        """Return the query vector as a list of floats (LangChain convention)."""
        return self.generate_embeddings(text).tolist()


question = "What is quantum entanglement?"
answer = rag_chatbot(question, index, dataset, _QueryEmbedder(tokenizer, model))
print("Answer:", answer)


In [None]:
# Ollama Configuration File

# NOTE(review): this layout does not look like Ollama's documented model
# configuration (a Modelfile with FROM / PARAMETER / SYSTEM instructions).
# Confirm which tool, if any, is expected to read this file.
api_version: v1
name: physics-rag-chatbot
description: A RAG chatbot for answering physics questions using LLaMA3 and LangFlow
models:
  # Model name and tag the service refers to.
  - name: llama3
    version: latest
endpoints:
  # presumably routes POST /ask to the notebook's rag_chatbot function; verify
  # how the handler name is resolved by the consuming tool.
  - path: /ask
    method: POST
    handler: rag_chatbot

In [None]:
# deploy ollama

# NOTE(review): `deploy` is not among the subcommands listed in the Ollama CLI
# documentation (serve, create, show, run, pull, push, list, ...). Confirm the
# intended command, e.g. `ollama pull llama3` to fetch the model.
ollama deploy


In [None]:
# create knowledge graph using 'rdflib'

from rdflib import Graph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import FOAF, DC

# Create an RDF graph
from urllib.parse import quote  # cell-local: used to build valid document URIs

g = Graph()
n = Namespace("http://example.org/")

# Add data to the graph: one resource per document, with Dublin Core title,
# description and (optional) creator properties.
for doc in dataset:
    # Percent-encode the id so characters such as spaces or quotes cannot
    # produce an invalid URI; "/", ":" and "@" are kept so that ordinary
    # identifiers map to the same URI as before.
    doc_id = quote(str(doc["id"]), safe="/:@")
    doc_uri = URIRef(f"http://example.org/document/{doc_id}")
    g.add((doc_uri, DC.title, Literal(doc["title"])))
    g.add((doc_uri, DC.description, Literal(doc["text"])))
    for author in doc.get("authors", []):
        g.add((doc_uri, DC.creator, Literal(author)))

# Save the graph as RDF/XML
g.serialize(destination="knowledge_graph.rdf", format="xml")


In [None]:
# Data Visualization in R

# Install each dependency only when it is missing, so re-running the notebook
# does not reinstall (and possibly upgrade) the packages every time.
for (pkg in c("ggplot2", "jsonlite")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}


In [None]:
# Load and Visualize Data:

library(ggplot2)
library(jsonlite)

# Load dataset
# simplifyVector = FALSE keeps the JSON array as a list of records. With the
# default (TRUE) jsonlite collapses an array of objects into a data frame, and
# the per-record lookups below would then iterate over columns and fail.
dataset <- fromJSON("path/to/your/dataset.json", simplifyVector = FALSE)

# Return one field of a record as a single string, or NA when it is absent,
# so a record without a title/date does not break the column construction.
field_or_na <- function(record, name) {
  value <- record[[name]]
  if (is.null(value)) NA_character_ else as.character(value)
}

# Convert to data frame
df <- data.frame(
  Title = vapply(dataset, field_or_na, character(1), name = "title"),
  Date = as.Date(vapply(dataset, field_or_na, character(1), name = "date")),
  stringsAsFactors = FALSE
)

# Plot the number of documents over time (30-day bins)
ggplot(df, aes(x = Date)) +
  geom_histogram(binwidth = 30) +
  labs(title = "Number of Documents Over Time",
       x = "Date",
       y = "Number of Documents")


# visualize.R code explanation

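Explanation (this describes the full visualize.R script; the cell above contains only the Title/Date data frame and the first plot)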
Load the Dataset:

The fromJSON function from the jsonlite package is used to load the dataset from a JSON file.
Convert to Data Frame:

The dataset is converted into a data frame with columns for Title, Date, and TextLength.
Plot 1: Number of Documents Over Time:

This plot shows the number of documents over time using a histogram. The bin width is set to 30 days.
Plot 2: Distribution of Text Lengths:

This plot shows the distribution of text lengths (number of characters) of the documents using a histogram.
Plot 3: Documents Over Time with Text Length:

This plot shows the text length of documents over time using a scatter plot.
Save the Plots:

The ggsave function is used to save each plot as a PNG file.
Running the Script
Save the above script as visualize.R and run it using R:

```bash
Rscript visualize.R
```
This script will generate and save three plots in the current directory:

number_of_documents_over_time.png
distribution_of_text_lengths.png
documents_over_time_with_text_length.png
Make sure to replace the placeholder path passed to fromJSON ("path/to/your/dataset.json") with the correct path to your custom dataset JSON file.

In [None]:
# n-grams using word2vec

import os
import json
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser

# Load the dataset; DATASET_PATH overrides the placeholder path.
dataset_path = os.getenv("DATASET_PATH", "path/to/your/custom_dataset.json")
# Decode as UTF-8 explicitly so the result does not depend on the platform's
# default encoding when the documents contain non-ASCII text.
with open(dataset_path, 'r', encoding="utf-8") as f:
    dataset = json.load(f)

# Preprocess the dataset: Tokenize and clean text
def preprocess_text(text):
    """Tokenize ``text`` with gensim's ``simple_preprocess``.

    ``deacc=True`` is forwarded so that gensim also strips accent marks from
    the tokens it produces.

    Args:
        text: Raw document text.

    Returns:
        The token list produced by ``simple_preprocess``.
    """
    tokens = simple_preprocess(text, deacc=True)
    return tokens

# Tokenize the "text" field of every document.
texts = list(map(preprocess_text, (doc["text"] for doc in dataset)))

# Learn bigram phrases from the token lists, then wrap the trained detector in
# a Phraser for applying it to documents.
bigram = Phrases(texts, min_count=5, threshold=100)
bigram_mod = Phraser(bigram)

# Learn trigram phrases on top of the bigram-merged corpus and wrap it too.
trigram = Phrases(bigram[texts], threshold=100)
trigram_mod = Phraser(trigram)

def make_bigrams(texts):
    """Apply the bigram phrase model to every tokenized document.

    Args:
        texts: Iterable of token lists.

    Returns:
        A list with one ``bigram_mod[...]`` result per input document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply the bigram model, then the trigram model, to every document.

    Args:
        texts: Iterable of token lists.

    Returns:
        A list with one ``trigram_mod[bigram_mod[...]]`` result per document.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

data_bigrams = make_bigrams(texts)
data_trigrams = make_trigrams(texts)

# Train a Word2Vec model on the trigram data.
# NOTE: this rebinds the notebook-level name `model`, which the embedding cell
# uses for the LLaMA model. Re-run that cell before re-running any cell that
# expects the transformer model.
model = Word2Vec(sentences=data_trigrams, vector_size=100, window=5, min_count=5, workers=4)

# Save the model, creating the target directory first; save() does not create
# it, so the default "models/" path failed on a fresh checkout.
model_path = os.getenv("MODEL_PATH", "models/word2vec_trigrams.model")
model_dir = os.path.dirname(model_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)
model.save(model_path)

# Example: look up the embedding of a token from a phrase-merged sentence.
example_trigram = trigram_mod[bigram_mod[preprocess_text("This is an example sentence for n-grams")]]
# Tokens seen fewer than min_count times are not in the vocabulary, and
# indexing model.wv with them raises KeyError, so use the first known token.
known_tokens = [token for token in example_trigram if token in model.wv]
if known_tokens:
    embedding = model.wv[known_tokens[0]]
    print(f"Trigram: {known_tokens[0]}, Embedding: {embedding}")
else:
    embedding = None
    print("None of the example tokens are in the Word2Vec vocabulary.")



Explanation
Loading the Dataset:

The dataset is loaded from a JSON file specified by the DATASET_PATH environment variable.
Preprocessing the Text:

The preprocess_text function tokenizes and cleans the text, removing punctuation.
Creating Bigrams and Trigrams:

Bigrams and trigrams are created using the gensim.models.phrases.Phrases and Phraser classes.
make_bigrams and make_trigrams functions are used to generate bigrams and trigrams for the entire dataset.
Training the Word2Vec Model:

The Word2Vec model is trained on the trigram data.
The vector_size parameter specifies the dimensionality of the word vectors.
The window parameter specifies the maximum distance between the current and predicted word within a sentence.
The min_count parameter ignores all words with total frequency lower than this.
Saving the Model:

The trained Word2Vec model is saved to the path specified by the MODEL_PATH environment variable.
Generating Embeddings:

An example sentence is run through the bigram and trigram models, and the embedding of its first resulting token (a single word unless a phrase was detected) is printed.

Running the Script
Save the script as word2vec_ngrams.py and run it:


python word2vec_ngrams.py

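.env file (note: the script reads these values with os.getenv and does not load a .env file itself, so export them in your shell or load the file before running):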

DATASET_PATH=/home/cmejo/arxiv-dataset/custom_dataset.json
MODEL_PATH=models/word2vec_trigrams.model
This script will preprocess your text data into bigrams and trigrams, train a Word2Vec model, and save the model for later use. You can then use this model to generate embeddings for any n-gram in your dataset.