## Engish Langaguge Model for Grammer Correction

In [1]:
# !pip install langchain
# !pip install langchain_nvidia_ai_endpoints
# !pip install faiss-cpu

In [1]:
import os
from dotenv import load_dotenv
import openai
import re
from typing import List, Union
import requests
from bs4 import BeautifulSoup
import glob
import os
import pickle
import re
import time
import textwrap
import umap
from tqdm import tqdm

import faiss
import matplotlib.pyplot as plt
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT, QA_PROMPT
from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain.document_loaders import PyPDFLoader
from datasets import load_dataset

# Include the NVIDIA API key
load_dotenv()
nvidia_api_key = os.environ.get("NVIDIA_API_KEY")

# client = openai.OpenAI(
#   base_url = "https://integrate.api.nvidia.com/v1",
#   api_key = nvidia_api_key
# )
# print(client)

In [2]:
# model_name = "microsoft/phi-3-mini-128k-instruct"
# llm = ChatNVIDIA(model=model_name, max_tokens=1000)

In [18]:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

model_kwargs = {
    "temperature": 0.1,  # Control the randomness of the predictions
    "max_length": 1000   # Set the maximum length for the generated sequences
}

# Define terminators for the text generation
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("")
]

pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    temperature=model_kwargs["temperature"],
    # max_length=model_kwargs["max_length"],
    do_sample=True,
    repetition_penalty=1.1,
    max_new_tokens=512,
    eos_token_id=terminators,
)

In [19]:
# Set up the HuggingFacePipeline with the defined pipeline
llm = HuggingFacePipeline(
    pipeline=pipe,
    model_kwargs=model_kwargs
)

In [20]:
result = llm.invoke("Please correct my sentence 'I could had been a engineering student'?")
print(result)

I could have been a engineering student


In [21]:
result = llm.invoke("What is grammer and how can I learn it properly?")
print(result)

a grammatically correct word


In [22]:
# Initialize HuggingFaceEmbeddings
model_path = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name=model_path,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Define embedding path
embedding_path = "embed/"

# Load the community-datasets/sentence-compression dataset from Hugging Face
dataset = load_dataset("embedding-data/sentence-compression")

documents = []
for example in dataset['train']:
    document = " ".join(example['set'])  # Concatenate list of strings into a single string
    documents.append(Document(page_content=document, metadata={}))

character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=512,
    chunk_overlap=0
)
docs = character_splitter.split_documents(documents)

print(docs[:2])
print(f"\nTotal chunks: {len(docs)}")




[Document(page_content="The USHL completed an expansion draft on Monday as 10 players who were on the rosters of USHL teams during the 2009-10 season were selected by the League's two newest entries, the Muskegon Lumberjacks and Dubuque Fighting Saints. USHL completes expansion draft"), Document(page_content='Major League Baseball Commissioner Bud Selig will be speaking at St. Norbert College next month. Bud Selig to speak at St. Norbert College')]

Total chunks: 180799


In [23]:
# Initialize token splitter
token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)

token_split_texts = []
for doc in docs:
    # Convert each chunk into the expected format
    split_chunks = token_splitter.split_text(doc.page_content)
    for chunk in split_chunks:
        token_split_texts.append({"page_content": chunk})

# Print the shortened text of the 10th chunk and the total number of chunks
print(textwrap.shorten(token_split_texts[10]["page_content"], width=200))
print(f"\nTotal chunks: {len(token_split_texts)}")

a woman has been seriously injured in a collision with a police van in north devon. woman seriously injured in collision with police van

Total chunks: 180799


In [24]:
# Step 2: Create embeddings for the documents
model_path = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": False}

# Define file paths
index_file_path = "language_helper_faiss_index.bin"
metadata_file_path = "language_helper_metadata.pkl"

# Recreate the HuggingFaceEmbeddings object
embeddings = HuggingFaceEmbeddings(
    model_name=model_path, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# Step 3: Create a FAISS vector store from the documents and embeddings
# Check if the FAISS index and metadata files exist
if os.path.isfile(index_file_path) and os.path.isfile(metadata_file_path):
    print("Loading existing FAISS index and metadata...")

    # Load the FAISS index from the file
    faiss_index = faiss.read_index(index_file_path)

    # Load the metadata and embeddings
    with open(metadata_file_path, "rb") as f:
        metadata = pickle.load(f)

    docs = metadata["docs"]
    index_to_docstore_id = metadata["index_to_docstore_id"]

    # Recreate the document store
    docstore = InMemoryDocstore(docs)

    # Recreate the FAISS vector store
    db = FAISS(
        embedding_function=embeddings.embed_query,
        index=faiss_index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id,
    )

    print("FAISS index and metadata loaded successfully.")
else:
    print("Creating new FAISS index and saving metadata...")

    # Assuming `docs` and `embeddings` are already defined before this step
    db = FAISS.from_documents(docs, embeddings)

    # Save the FAISS index to a file
    faiss.write_index(db.index, index_file_path)

    # Save the document store and index_to_docstore_id
    docstore = db.docstore
    index_to_docstore_id = db.index_to_docstore_id

    # Save the metadata
    metadata = {
        "docs": docstore._dict,
        "index_to_docstore_id": index_to_docstore_id,
        "embedding_model_name": model_path,
    }
    with open(metadata_file_path, "wb") as f:
        pickle.dump(metadata, f)

    print("FAISS index and metadata saved successfully.")

Creating new FAISS index and saving metadata...
FAISS index and metadata saved successfully.


In [106]:
def correct_my_sentence(question):
    # Define the template for the prompt
    template = """
    Context: {context}
    
    Sentence: {question}
    """
    
    # Creating a PromptTemplate
    prompt = PromptTemplate(template=template, input_variables=["context", "question"])
    
    # LangChain Chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_kwargs={"k": 1}),
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )
    
    # Define the context and question for the query
    context = "Correct the grammer of the following sentence:"
    query = f"Context: {context}\nQuestion: {question}"
    
    # Get the result
    result = qa_chain({"query": query})

    return result["result"].strip()


In [107]:
# Example usage
sentence = 'I are a boy'
result = correct_my_sentence(sentence)
print(result)

I am a boy


In [109]:
question = "I could had been went there"
result = correct_my_sentence(question)
print(result)

Whitney Houston
