In [None]:
!pip install -q -U langchain unstructured   arxiv datasets unstructured[all-docs] transformers
!pip install -q -U torch transformers transformers accelerate bitsandbytes langchain sentence-transformers faiss-gpu openpyxl

In [None]:
import os
import arxiv
import csv
import pandas as pd
import warnings
import torch

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from transformers import AutoTokenizer
from langchain.docstore.document import Document as LangchainDocument
from tqdm.notebook import tqdm
from datasets import Dataset,load_dataset,DatasetDict
from typing import Union,List, Dict, Optional,Tuple
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import DirectoryLoader

def download_papers(query, max_results, save_dir):
    """
    Search arXiv for ``query`` and store the PDF of every hit in ``save_dir``.

    The search is requested sorted by submission date. Each paper's title is
    printed before its PDF is downloaded, and ``save_dir`` is created first
    when it does not exist yet.

    Parameters:
    query (str): Search expression sent to arXiv.
    max_results (int): Upper bound on the number of papers requested.
    save_dir (str): Destination directory for the downloaded PDFs.
    """
    api_client = arxiv.Client()
    paper_search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )

    # Resolve the complete result list before touching the filesystem.
    papers = list(api_client.results(paper_search))

    target_exists = os.path.exists(save_dir)
    if not target_exists:
        os.makedirs(save_dir)

    for paper in papers:
        print(paper.title)
        paper.download_pdf(save_dir)


def write_to_csv(file_path: str, data: dict, write_header: bool) -> None:
    """
    Append one row to a CSV file, creating the file when necessary.

    Only the "content", "documents" and "metasource" entries of ``data`` are
    written; any other keys are ignored. Every field is quoted.

    Args:
    file_path (str): The path to the CSV file.
    data (dict): The row to append; must contain the three keys above.
    write_header (bool): Whether a header row may be written. The header is
        only emitted when the file does not exist yet or is still empty, so
        repeated calls never duplicate it.

    Raises:
    KeyError: If ``data`` lacks one of the three expected keys.
    """
    fieldnames = ["content", "documents", "metasource"]

    # A zero-byte file left behind by an earlier run needs its header just as
    # much as a missing file does; checking existence alone would skip it.
    file_is_empty = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
    needs_header = write_header and file_is_empty

    # Append mode creates the file when it is missing, so no mode switch is needed.
    with open(file_path, 'a', newline='', encoding='UTF-8', errors='ignore') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames, quoting=csv.QUOTE_ALL, escapechar='\\')
        if needs_header:
            writer.writeheader()
        try:
            writer.writerow({k: data[k] for k in fieldnames})
        except UnicodeEncodeError:
            # Last-resort guard: errors='ignore' should already drop
            # characters that cannot be encoded.
            print(f"Warning: UnicodeEncodeError encountered for file {data['documents']}. Skipping this file.")


def read_pdfs_from_folder(folder_path: str, csv_file_path: str) -> None:
    """
    Walk ``folder_path`` (including subfolders) and dump PDF text into a CSV.

    Every file whose name ends in ".pdf" is loaded with
    ``PyPDFLoader.load_and_split()``, and one row per returned piece is
    appended to the CSV through ``write_to_csv``.

    Args:
    folder_path (str): The root folder to search for PDF files.
    csv_file_path (str): The CSV file that receives the extracted rows.
    """
    for current_dir, _subdirs, entries in os.walk(folder_path):
        for entry in entries:
            if not entry.endswith(".pdf"):
                continue
            pdf_path = os.path.join(current_dir, entry)
            for piece in PyPDFLoader(pdf_path).load_and_split():
                write_to_csv(
                    csv_file_path,
                    {
                        "content": piece.page_content,
                        "documents": entry,
                        "metasource": piece.metadata['source'],
                    },
                    True,
                )

def load_files(folder_path: str, file_extension: str = None) -> List[str]:
    """
    Load files from a folder tree through ``DirectoryLoader``.

    Parameters:
    folder_path (str): The folder to search recursively.
    file_extension (str): Extension without the leading dot (e.g. "pdf").
        When omitted or empty, every entry whose name does not start with a
        dot is matched instead.

    Returns:
    Whatever ``DirectoryLoader.load()`` yields for the matching files.
    """
    if file_extension:
        pattern = f"**/*.{file_extension}"
    else:
        pattern = "**/[!.]*"

    return DirectoryLoader(
        folder_path,
        glob=pattern,
        show_progress=True,
        use_multithreading=True,
        silent_errors=True,
    ).load()

def advanced_data_loader(input: Union[str, Dict[str, str]], format: Optional[str] = None, split_ratios: Optional[Dict[str, float]] = None) -> Optional[DatasetDict]:
    """
    Load a dataset and re-split its 'train' split into train/test/eval parts.

    Only the loaded 'train' split is divided; any other loaded split is
    discarded. The 'train' entry of ``split_ratios`` is not read: the train
    part is whatever remains after the test and eval shares are removed.

    :param input: A dataset name or directory (str), or a dictionary mapping
        split names to file paths. (The name shadows the builtin but is kept
        because callers pass it by keyword.)
    :param format: 'csv' or 'json' when ``input`` is a dictionary of files,
        'text' when ``input`` is a directory, or None for a dataset name.
    :param split_ratios: A dictionary with keys 'train', 'test', and 'eval'.
        'test' and 'eval' must both be positive. Defaults to 0.8/0.1/0.1.
    :return: A DatasetDict with 'train', 'test' and 'eval' splits, or None
        (after a warning) when the arguments are invalid, the files are
        missing, or the loaded data has no 'train' split.
    """
    if split_ratios is None:
        split_ratios = {'train': 0.8, 'test': 0.1, 'eval': 0.1}

    test_ratio = split_ratios['test']
    eval_ratio = split_ratios['eval']
    # Reject unusable ratios before the (possibly slow) load: a zero share
    # would otherwise end in a ZeroDivisionError or a rejected split size.
    if test_ratio <= 0 or eval_ratio <= 0:
        warnings.warn("Invalid split_ratios. Both 'test' and 'eval' must be greater than 0.")
        return None

    try:
        # Load the dataset
        if isinstance(input, dict) and format in ['csv', 'json']:
            dataset = load_dataset(format, data_files=input)
        elif isinstance(input, str) and format == 'text':
            dataset = load_dataset(format, data_dir=input)
        elif isinstance(input, str) and format is None:
            dataset = load_dataset(input)
        else:
            warnings.warn("Invalid input or format. Please provide a valid dataset name, directory, or file paths.")
            return None
    except FileNotFoundError as e:
        warnings.warn(str(e))
        return None

    # Everything below divides the 'train' split, so it has to be present.
    if 'train' not in dataset:
        warnings.warn("The loaded dataset has no 'train' split to divide.")
        return None

    # First carve off the combined test+eval share, then divide that share.
    holdout_ratio = test_ratio + eval_ratio
    split_dataset = dataset['train'].train_test_split(test_size=holdout_ratio)
    test_eval_dataset = split_dataset['test'].train_test_split(test_size=eval_ratio / holdout_ratio)
    dataset = DatasetDict({
        'train': split_dataset['train'],
        'test': test_eval_dataset['train'],
        'eval': test_eval_dataset['test']
    })

    print("Splits: ", dataset.keys())
    print("Columns: ", {split: dataset[split].column_names for split in dataset.keys()})
    return dataset


def split_text_into_chunks(raw_knowledge_base: List[str], chunk_size: int = 1000, chunk_overlap: int = 100) -> List[str]:
    """
    Split documents into character-based chunks with RecursiveCharacterTextSplitter.

    Parameters:
    raw_knowledge_base (List[str]): The documents to split. Each element is
        handed to ``split_documents``, so these are presumably LangChain
        documents rather than plain strings — TODO confirm and fix the hint.
    chunk_size (int): Maximum number of characters in a chunk.
    chunk_overlap (int): Number of characters shared by neighbouring chunks.

    Returns:
    docs_processed (List[str]): The chunks, in input order.
    """
    # Define the markdown separators
    MARKDOWN_SEPARATORS = [
        "\n#{1,6} ", "```\n", "\n\\*\\*\\*+\n", "\n---+\n", "\n___+\n",
        "\n\n", "\n", " ", "", "\n### ", "\n***\n", "\n---\n", "\n___\n", "\n\n\n"
    ]

    # Honour the caller's sizes; they used to be hard-coded to the defaults,
    # which silently ignored any other value passed in.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,  # Maximum number of characters in a chunk
        chunk_overlap=chunk_overlap,  # Number of characters to overlap between chunks
        add_start_index=True,  # If `True`, includes chunk's start index in metadata
        strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
        separators=MARKDOWN_SEPARATORS,
    )

    # Process the documents
    docs_processed = []
    for doc in raw_knowledge_base:
        docs_processed.extend(text_splitter.split_documents([doc]))

    return docs_processed
def _strip_nul(row: Dict[str, str]) -> Dict[str, str]:
    """Return a copy of ``row`` with NUL characters removed from string values."""
    return {
        key: value.replace('\x00', '') if isinstance(value, str) else value
        for key, value in row.items()
    }


def save_to_csv(file_path: str, data: List[Dict[str, str]], mode: str = 'a') -> None:
    """
    Save or append data to a CSV file.

    The column order is taken from the keys of the first row. A header row is
    written only in 'w' mode. NUL characters are removed from string values
    before writing: extracted document text can contain them, and on some
    Python versions the csv writer rejects them with
    "need to escape, but no escapechar set".

    :param file_path: Path to the CSV file to save or append data.
    :param data: List of dictionaries containing the data to be saved. The
        dictionaries themselves are not modified.
    :param mode: File opening mode ('a' for append, 'w' for write).
    :raises ValueError: If a row has keys that the first row does not have.
    """
    with open(file_path, mode, newline='', encoding='utf-8') as csvfile:
        fieldnames = list(data[0].keys()) if data else []
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        if mode == 'w':
            writer.writeheader()

        writer.writerows(_strip_nul(row) for row in data)

def rename_columns(csv_file: str, new_columns: List[str]) -> None:
    """
    Replace the header of a CSV file in place.

    The file is read with pandas, its columns are relabelled positionally,
    and the result is written back to the same path without an index column.

    :param csv_file: The path to the CSV file.
    :param new_columns: The new column names, one per existing column.
    :raises ValueError: If the number of names differs from the number of columns.
    """
    frame = pd.read_csv(csv_file)

    existing_count = len(frame.columns)
    requested_count = len(new_columns)
    if existing_count != requested_count:
        raise ValueError("The number of new column names must match the number of columns in the CSV file.")

    frame.columns = new_columns
    frame.to_csv(csv_file, index=False)

def load_docs_from_folder(folder_path: str, file_glob: str) -> List[Dict[str, str]]:
    """
    Load every file under ``folder_path`` that matches ``file_glob``.

    :param folder_path: The path to the folder containing the files.
    :param file_glob: The glob pattern handed to ``DirectoryLoader``.
    :return: The result of ``DirectoryLoader.load()``. NOTE(review): callers
        in this file read ``page_content`` and ``metadata`` attributes on the
        items, so the dictionary type hint looks inaccurate — confirm.
    """
    loader_options = {
        "glob": file_glob,
        "show_progress": True,
        "use_multithreading": True,
        "silent_errors": True,
    }
    return DirectoryLoader(folder_path, **loader_options).load()

def Folder_csv_writer(folder_path: str, csv_output_path: str, file_extension: Union[str, List[str], None] = None, new_column_names: List[str] = None) -> None:
    """
    Load documents from a directory and save them into a CSV file.

    The CSV is overwritten and gets the columns 'id', 'content' and 'source'
    (optionally renamed afterwards through ``new_column_names``).

    :param folder_path: The path to the folder containing the files.
    :param csv_output_path: The path to the output CSV file.
    :param file_extension: One extension including its dot (e.g. '.pdf'), or
        a list of such extensions. If None or empty, a pattern matching all
        non-hidden files is used.
    :param new_column_names: A list of new column names for the CSV file.
    """
    if not file_extension:
        file_globs = ["**/[!.]*"]
    elif isinstance(file_extension, str):
        file_globs = [f"**/*{file_extension}"]
    else:
        # One glob per extension. Formatting the whole list into a single
        # pattern would turn it into a character class, not a set of suffixes.
        # dict.fromkeys drops repeated extensions while keeping their order.
        file_globs = [f"**/*{extension}" for extension in dict.fromkeys(file_extension)]

    docs = []
    for file_glob in file_globs:
        docs.extend(load_docs_from_folder(folder_path, file_glob))

    if not docs:
        print(f"No files found with extension {file_extension}.")
        return

    data_to_save = [{'id': str(i),
                     'content': doc.page_content,
                     'source': doc.metadata['source']} for i, doc in enumerate(docs) if hasattr(doc, 'metadata')]

    save_to_csv(csv_output_path, data_to_save, 'w')

    if new_column_names is not None:
        rename_columns(csv_output_path, new_column_names)
def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str],
) -> List[LangchainDocument]:
    """
    Split documents into chunks of at most `chunk_size` tokens and drop repeats.

    Chunk length is measured with the Hugging Face tokenizer named by
    `tokenizer_name`, neighbouring chunks overlap by a tenth of `chunk_size`,
    and only the first chunk with a given text is kept.
    """
    # NOTE(review): the entries listed after "" look unreachable, since an
    # empty separator matches any text — confirm against the splitter.
    separators = [
        "\n#{1,6} ", "```\n", "\n\\*\\*\\*+\n", "\n---+\n", "\n___+\n",
        "\n\n", "\n", " ", "", "\n### ", "\n***\n", "\n---\n", "\n___\n", "\n\n\n",
        # Additional rule-style dividers
        "\n====\n", "\n****\n", "\n----\n",
    ]

    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=separators,
    )

    chunks = []
    for document in knowledge_base:
        chunks.extend(splitter.split_documents([document]))

    # Keep the first occurrence of each distinct chunk text.
    seen_texts = set()
    unique_chunks = []
    for chunk in chunks:
        text = chunk.page_content
        if text in seen_texts:
            continue
        seen_texts.add(text)
        unique_chunks.append(chunk)

    return unique_chunks






In [None]:


# End-to-end RAG run over freshly downloaded arXiv papers:
# download -> CSV -> dataset -> chunks -> FAISS index -> retrieval -> answer.

# Define the directory where the papers will be saved
FOLDER_PATH = "/content/sample_data/Hemanth_papers1"
query = "LLMs large language models"
No_of_papers = 100
OUT_PUT_FILE = 'output3.csv'
TEXT_CONTENT = 'content'
LABEL_CONTENT = 'documents'
NEW_COLUMN_NAMES = ['ID', TEXT_CONTENT, LABEL_CONTENT]
input1 = {'train': OUT_PUT_FILE, 'test': OUT_PUT_FILE}

download_papers(query=query, max_results=No_of_papers, save_dir=FOLDER_PATH)
Folder_csv_writer(folder_path=FOLDER_PATH, csv_output_path=OUT_PUT_FILE, file_extension=['.md','.pdf','.csv'], new_column_names=NEW_COLUMN_NAMES)

dataset = advanced_data_loader(input=input1, format='csv')
if dataset is None:
    # The loader signals failure with None; stop here instead of failing
    # later with an unrelated TypeError on dataset['train'].
    raise RuntimeError(f"Could not load a dataset from {OUT_PUT_FILE}")
ds = dataset['train']
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=doc[TEXT_CONTENT], metadata={"source": doc[LABEL_CONTENT]})
    for doc in tqdm(ds)
]

EMBEDDING_MODEL_NAME = "thenlper/gte-small"
READER_MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
docs_processed = split_documents(
    chunk_size=512,  # We choose a chunk size adapted to our model
    knowledge_base=RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
)
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

USER_QUERY = "what is recent advancement on llms"
# Kept for inspection only; similarity_search embeds the query text itself.
query_vector = embedding_model.embed_query(USER_QUERY)
retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=USER_QUERY, k=5)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(READER_MODEL_NAME, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

READER_LLM = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=500,
)
prompt_in_chat_format = [
    {
        "role": "system",
        "content": """Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.""",
    },
    {
        "role": "user",
        "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""",
    },
]
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format, tokenize=False, add_generation_prompt=True
)
retrieved_docs_text = [
    doc.page_content for doc in retrieved_docs
]  # we only need the text of the documents
context = "\nExtracted documents:\n"
# Newline-separated so one document's text does not run into the next header.
context += "\n".join(f"Document {i}:::\n{doc}" for i, doc in enumerate(retrieved_docs_text))

# Ask the question the documents were retrieved for; a different hard-coded
# question would not match the retrieved context.
final_prompt = RAG_PROMPT_TEMPLATE.format(question=USER_QUERY, context=context)

# Generate the answer
answer = READER_LLM(final_prompt)[0]["generated_text"]
print(answer)

IM-3D: Iterative Multiview Diffusion and Reconstruction for High-Quality 3D Generation
Chain Reaction of Ideas: Can Radioactive Decay Predict Technological Innovation?
Mitigating Object Hallucination in Large Vision-Language Models via Classifier-Free Guidance
COLD-Attack: Jailbreaking LLMs with Stealthiness and Controllability
Graph Mamba: Towards Learning on Graphs with State Space Models
Human Curriculum Effects Emerge with In-Context Learning in Neural Networks
Model Assessment and Selection under Temporal Distribution Shift
Rec-GPT4V: Multimodal Recommendation with Large Vision-Language Models
Soliton gas of the integrable Boussinesq equation and its generalised hydrodynamics
Target Score Matching
Improving Generalization in Semantic Parsing by Increasing Natural Language Variation
Crystallization of C*-algebras
Complete Asymptotic Expansions for the Normalizing Constants of High-Dimensional Matrix Bingham and Matrix Langevin Distributions
Learning Emergent Gaits with Decentralize

100%|██████████| 100/100 [15:25<00:00,  9.25s/it]


Error: need to escape, but no escapechar set

In [None]:
pip install sentence-transformers

In [None]:
pip install accelerate



In [None]:
pip install bitsandbytes



In [None]:
# RAG run over the adversarial chatbot CSV stored on Google Drive:
# dataset -> chunks -> FAISS index -> retrieval -> answer.
ADVERSARIAL_DATASET_CSV = '/content/drive/MyDrive/Adversarial_chatbot_dataset _test.csv'
input1 = {'train': ADVERSARIAL_DATASET_CSV, 'test': ADVERSARIAL_DATASET_CSV, 'eval': ADVERSARIAL_DATASET_CSV}

dataset = advanced_data_loader(input=input1, format='csv')
if dataset is None:
    # The loader signals failure with None; stop here instead of failing
    # later with an unrelated TypeError on dataset['train'].
    raise RuntimeError(f"Could not load a dataset from {ADVERSARIAL_DATASET_CSV}")
ds = dataset['train']
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=doc['input'], metadata={"source": doc['output']})
    for doc in tqdm(ds)
]

EMBEDDING_MODEL_NAME = "thenlper/gte-small"
READER_MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
docs_processed = split_documents(
    chunk_size=512,  # We choose a chunk size adapted to our model
    knowledge_base=RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
)
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

USER_QUERY = "what is recent advancement on llms"
# Kept for inspection only; similarity_search embeds the query text itself.
query_vector = embedding_model.embed_query(USER_QUERY)
retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=USER_QUERY, k=5)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# Pass the quantization config: it was built but never handed to the model,
# so the reader was being loaded unquantized.
model = AutoModelForCausalLM.from_pretrained(READER_MODEL_NAME, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

READER_LLM = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=500,
)
prompt_in_chat_format = [
    {
        "role": "system",
        "content": """Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.""",
    },
    {
        "role": "user",
        "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""",
    },
]
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format, tokenize=False, add_generation_prompt=True
)
retrieved_docs_text = [
    doc.page_content for doc in retrieved_docs
]  # we only need the text of the documents
context = "\nExtracted documents:\n"
# Newline-separated so one document's text does not run into the next header.
context += "\n".join(f"Document {i}:::\n{doc}" for i, doc in enumerate(retrieved_docs_text))

# Ask the question the documents were retrieved for; a different hard-coded
# question would not match the retrieved context.
final_prompt = RAG_PROMPT_TEMPLATE.format(question=USER_QUERY, context=context)

# Generate the answer
answer = READER_LLM(final_prompt)[0]["generated_text"]
print(answer)

Splits:  dict_keys(['train', 'test', 'eval'])
Columns:  {'train': ['id', 'input', 'output'], 'test': ['id', 'input', 'output'], 'eval': ['id', 'input', 'output']}


  0%|          | 0/528 [00:00<?, ?it/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
import shutil

# Bundle the downloaded papers folder into a zip archive next to it.
shutil.make_archive(
    base_name='/content/sample_data/Hemanth_papers1',
    format='zip',
    root_dir='/content/sample_data/Hemanth_papers1',
)


'/content/sample_data/Hemanth_papers1.zip'

In [None]:
# Offer the zipped papers archive as a browser download (Google Colab helper).
from google.colab import files
files.download('/content/sample_data/Hemanth_papers1.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Build the RAG prompt from the already retrieved documents and generate an
# answer. Relies on `tokenizer`, `retrieved_docs`, `USER_QUERY` and
# `READER_LLM` from an earlier cell of the same session.
prompt_in_chat_format = [
    {
        "role": "system",
        "content": """Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.""",
    },
    {
        "role": "user",
        "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""",
    },
]
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format, tokenize=False, add_generation_prompt=True
)
retrieved_docs_text = [
    doc.page_content for doc in retrieved_docs
]  # we only need the text of the documents
context = "\nExtracted documents:\n"
# Newline-separated so one document's text does not run into the next header.
context += "\n".join(f"Document {i}:::\n{doc}" for i, doc in enumerate(retrieved_docs_text))

# Ask the question the documents were retrieved for; a different hard-coded
# question would not match the retrieved context.
final_prompt = RAG_PROMPT_TEMPLATE.format(question=USER_QUERY, context=context)

# Generate the answer
answer = READER_LLM(final_prompt)[0]["generated_text"]
print(answer)

NameError: name 'tokenizer' is not defined