In [None]:
import locale
def getpreferredencoding(do_setlocale=True):
    """Return the fixed encoding name "UTF-8".

    Stand-in for ``locale.getpreferredencoding``. ``do_setlocale`` is accepted
    so the signature matches the function being replaced; it is not used.
    """
    forced_encoding = "UTF-8"
    return forced_encoding


# Replace the stdlib hook so that every later caller in this kernel is told
# the preferred encoding is UTF-8.
# NOTE(review): presumably a workaround for a hosted runtime whose locale is
# not UTF-8 -- confirm it is still needed.
setattr(locale, "getpreferredencoding", getpreferredencoding)

In [None]:
!pip install -q -U torch transformers transformers accelerate bitsandbytes langchain sentence-transformers faiss-gpu openpyxl  datasets pacmap  unstructured unstructured[pdf]
!%reload_ext dotenv
!%dotenv

In [None]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from datasets import Dataset
import matplotlib.pyplot as plt
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
import os

# Show full cell contents when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

folderpath='/content/drive/MyDrive/Applied-Deep-Learning'

# The source folder lives on Google Drive, but the notebook's mount cell comes
# after this one. On a fresh kernel (Restart & Run All) the folder would not
# exist yet, so mount the drive here when the folder is missing. Mounting an
# already-mounted drive is a no-op, so the later mount cell stays harmless.
# The import is local because `google.colab` only exists on Colab runtimes.
if not os.path.isdir(folderpath):
    from google.colab import drive
    drive.mount('/content/drive')


def _load_matching(banner, pattern):
    """Print `banner`, then load every file under `folderpath` matching `pattern`.

    Args:
        banner (str): Text printed before loading starts.
        pattern (str): Glob pattern passed to ``DirectoryLoader``.

    Returns:
        tuple: ``(loader, documents)`` -- the configured ``DirectoryLoader``
        and the result of its ``load()`` call.
    """
    print(banner)
    loader = DirectoryLoader(
        folderpath,
        glob=pattern,
        show_progress=True,
        use_multithreading=True,
        silent_errors=True,
    )
    return loader, loader.load()


# One loader per file type; other extensions (.txt, .json, ...) can be added
# the same way, and "**/[!.]*" would match every non-hidden file.
ALL_markdown, docs_mds = _load_matching(
    "============================* markdown files *==============================", "**/*.md"
)
ALL_PDF, docs_pds = _load_matching(
    "============================* pdf files *===========================", "**/*.pdf"
)
ALL_Pythonfiles, docs_pys = _load_matching(
    "============================* py files *===========================", "**/*.py"
)
ALL_csvfiles, docs_csvs = _load_matching(
    "============================* csv files *===========================", "**/*.csv"
)
# The notebook loader gets its own name; it previously overwrote ALL_csvfiles.
ALL_ipynbfiles, docs_ipynbs = _load_matching(
    "============================* ipynb files *===========================", "**/*.ipynb"
)


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive. The loader cell's `folderpath`
# (/content/drive/MyDrive/Applied-Deep-Learning) sits below this mount point,
# so the drive has to be mounted before those loaders can find any files.
drive.mount('/content/drive')

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Separators tried in order, from coarsest to finest. Several entries are
# regular expressions (`#{1,6}`, `\*\*\*+`, `---+`, `___+`), so any splitter
# built from this list must be created with `is_separator_regex=True`;
# otherwise the patterns are escaped and matched as literal text.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",  # heading marker: one to six '#' followed by a space
    "```\n",  # code-fence line
    "\n\\*\\*\\*+\n",  # horizontal rule made of three or more '*'
    "\n---+\n",  # horizontal rule made of three or more '-'
    "\n___+\n",  # horizontal rule made of three or more '_'
    "\n\n",
    "\n",
    " ",
    "",
]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # the maximum number of characters in a chunk: we selected this value arbitrarily
    chunk_overlap=100,  # the number of characters to overlap between chunks
    add_start_index=True,  # If `True`, includes chunk's start index in metadata
    strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
    separators=MARKDOWN_SEPARATORS,
    is_separator_regex=True,  # the separators above are regex patterns, not literal strings
)

# Chunk every loaded document group with the character-based splitter.
# The notebook (.ipynb) documents are included as well: they are loaded
# alongside the other groups but were previously left out of this pass.
docs_processed = []
for label, documents in (
    ("md", docs_mds),
    ("pdfs", docs_pds),
    ("python", docs_pys),
    ("csv", docs_csvs),
    ("ipynbs", docs_ipynbs),
):
    # split_documents accepts the whole list, so no per-document loop is needed.
    docs_processed.extend(text_splitter.split_documents(documents))
    print(f"****completed*** {label} files")

print("length :", len(docs_processed))


In [None]:
from sentence_transformers import SentenceTransformer

# Print the maximum sequence length reported by the embedding model, so it
# can be compared with the token counts computed below.
embedding_max_seq_length = SentenceTransformer('thenlper/gte-small').max_seq_length
print(f"Model's maximum sequence length: {embedding_max_seq_length}")

from transformers import AutoTokenizer

# Encode every chunk with the embedding model's tokenizer and record how many
# token ids each one produces.
tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")
lengths = []
for chunk in tqdm(docs_processed):
    token_ids = tokenizer.encode(chunk.page_content)
    lengths.append(len(token_ids))
print("lengths of data:", len(lengths))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def visualize_distribution(lengths, plot_type='hist'):
    """
    Generate a distribution plot for a list of document lengths.

    Args:
        lengths (list): List of document lengths.
        plot_type (str): Type of plot to generate. Options:
            - 'hist': Histogram
            - 'kde': Kernel Density Estimate
            - 'box': Boxplot

    Returns:
        The matplotlib figure holding the plot.

    Raises:
        ValueError: If `plot_type` is not one of the options above.
    """
    # plot_type -> (drawing callable, x-axis label, y-axis label).
    # A box plot is drawn vertically, so the lengths sit on the y axis there.
    plotters = {
        'hist': (lambda ax: ax.hist(lengths), "Document Length", "Frequency"),
        'kde': (lambda ax: sns.kdeplot(lengths, ax=ax), "Document Length", "Density"),
        'box': (lambda ax: ax.boxplot(lengths), "", "Document Length"),
    }

    # Validate before creating the figure so that a bad argument does not
    # leave an empty figure behind.
    if plot_type not in plotters:
        raise ValueError(f"Invalid plot_type: {plot_type}")
    draw, x_label, y_label = plotters[plot_type]

    fig, ax = plt.subplots()
    draw(ax)

    ax.set_title(f"{plot_type.title()} of Document Lengths")
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

    fig.tight_layout()

    return fig



# Histogram drawn through the helper; the returned figure is kept in `fig1`.
fig1 = visualize_distribution(lengths, plot_type='hist')
plt.show()

token_length_title = "Distribution of document lengths in the knowledge base (in count of tokens)"

# Kernel density estimate of the same token counts.
sns.kdeplot(lengths)
plt.title(token_length_title)
plt.show()

# Plot the distribution of document lengths, counted as the number of tokens.
# Despite its name, `fig` holds whatever `Series.hist()` returns (pandas
# documents that as a matplotlib Axes).
lengths_series = pd.Series(lengths)
fig = lengths_series.hist()
plt.title(token_length_title)
plt.show()



In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

EMBEDDING_MODEL_NAME = "thenlper/gte-small"


def split_documents(
    chunk_size: int,
    knowledge_base,
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> list:
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.

    Args:
        chunk_size: Maximum chunk length, measured in tokens of the tokenizer
            named by `tokenizer_name`. The overlap between neighbouring chunks
            is one tenth of this value.
        knowledge_base: Iterable of documents to split. These should be the
            raw loaded documents, not chunks produced by an earlier split.
        tokenizer_name: Hugging Face model id whose tokenizer measures chunk
            length.

    Returns:
        The chunks in the order they were produced, keeping only the first
        occurrence of each distinct `page_content`.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
        is_separator_regex=True,  # MARKDOWN_SEPARATORS holds regex patterns
    )

    # Split exactly what the caller passed in, rather than reaching for the
    # notebook's global document lists.
    chunks = text_splitter.split_documents(list(knowledge_base))

    # Remove duplicates, keeping the first chunk seen for each text.
    seen_texts = set()
    unique_chunks = []
    for chunk in chunks:
        if chunk.page_content not in seen_texts:
            seen_texts.add(chunk.page_content)
            unique_chunks.append(chunk)

    return unique_chunks


# Feed the raw loaded documents to the splitter. Passing the previous
# `docs_processed` would re-split chunks that were already split, and would
# make this cell give different results each time it is re-run.
raw_documents = docs_mds + docs_pds + docs_pys + docs_csvs + docs_ipynbs

docs_processed = split_documents(
    512,  # We choose a chunk size adapted to our model
    raw_documents,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

# Let's visualize the chunk sizes we would have in tokens from a common model
from transformers import AutoTokenizer

# Count the tokens of every chunk with the embedding model's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
lengths = []
for chunk in tqdm(docs_processed):
    lengths.append(len(tokenizer.encode(chunk.page_content)))

chunk_length_title = "Distribution of document lengths in the knowledge base (in count of tokens)"

# Histogram of the token counts.
fig = pd.Series(lengths).hist()
plt.title(chunk_length_title)
plt.show()

# Import seaborn
import seaborn as sns

# Kernel density estimate of the same token counts, under the same title.
sns.kdeplot(lengths)
plt.title(chunk_length_title)
plt.show()