In [4]:
# Import necessary libraries
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.vectorstores import FAISS
from langchain_chroma import Chroma
from PyPDF2 import PdfReader
import os
import re
import concurrent.futures
from PyPDF2 import PdfReader
from io import BytesIO
import threading

In [5]:
def rename_files(directory):
    """Renames files in a given directory, keeping only letters, underscores, and periods.

    Every entry name is stripped of all characters outside ``[a-zA-Z._]`` and
    lower-cased. An entry is left untouched (and a message is printed) when:

    * its cleaned name is empty, or
    * the cleaned name is already taken by a *different* entry, so an
      existing file is never overwritten.

    Args:
        directory: The path to the directory.
    """

    for filename in os.listdir(directory):
        old_filepath = os.path.join(directory, filename)
        new_filename = re.sub(r"[^a-zA-Z._]", "", filename).lower()
        new_filepath = os.path.join(directory, new_filename)

        if old_filepath == new_filepath:
            continue

        # A name made only of digits/symbols cleans to "", which would turn
        # the target into the directory itself and make os.rename fail.
        if not new_filename:
            print(f"Skipped '{filename}': no valid characters left in its name")
            continue

        # os.rename replaces an existing target without error on POSIX, so the
        # collision has to be detected up front. samefile() lets a case-only
        # rename through on case-insensitive filesystems, where the "existing"
        # target is the very entry being renamed.
        if os.path.exists(new_filepath) and not os.path.samefile(old_filepath, new_filepath):
            print(f"{new_filename} already exists")
            continue

        try:
            os.rename(old_filepath, new_filepath)
            print(f"Renamed '{filename}' to '{new_filename}'")
        except FileExistsError:
            # Still possible on Windows if the target appears between the
            # check above and the rename.
            print(f"{new_filename} already exists")

# Clean up the names of the entries in ./train_files using rename_files
# (letters, underscores and periods only, lower-cased).
directory = "./train_files"
rename_files(directory)

about_howard___howard_university.pdf already exists


In [6]:

def get_filenames_for_creating_embeddings(directory):
  """Lists the regular files in a directory as paths prefixed with that directory.

  Sub-directories are left out, so every returned path can be opened as a
  file. The result is sorted by file name so the order does not depend on
  the order in which the operating system lists the directory.

  Args:
    directory: The path to the directory.

  Returns:
    A sorted list of strings of the form "<directory>/<filename>"; empty
    when the directory contains no regular files.
  """

  return [
    # The "/" separator is kept (rather than os.path.join) so the returned
    # strings look the same on every platform.
    f"{directory}/{filename}"
    for filename in sorted(os.listdir(directory))
    if os.path.isfile(os.path.join(directory, filename))
  ]

# Collect the training-file paths in alphabetical order and display them
directory = "./train_files"
FILES = sorted(get_filenames_for_creating_embeddings(directory))
print(FILES)

['./train_files/about_howard___howard_university.pdf', './train_files/about_howard___howard_university1.pdf', './train_files/about_our_team___human_resources.pdf', './train_files/administration___howard_university.pdf', './train_files/admission_policy___admission.pdf', './train_files/admission_profile___admission.pdf', './train_files/alma_mater___howard_university.pdf', './train_files/alumni.pdf', './train_files/announcements___the_dig_at_howard_university.pdf', './train_files/billing_policy.pdf', './train_files/bison_s.a.f.e.__the_universitys_safety_response_app___howard_university.pdf', './train_files/blog___admission.pdf', './train_files/calendar__.pdf', './train_files/campus_life___admission.pdf', './train_files/careers___human_resources.pdf', './train_files/course_catalogue.pdf', './train_files/diversity__inclusion___howard_university_student_affairs.pdf', './train_files/events___howard_university_alumni_relations.pdf', './train_files/faq_and_housing_meeting_recaps___howard_univer

In [None]:
# Set up the embedding model and the Chroma vector store that process_file
# adds documents to.
# Name of the sentence-transformers model handed to HuggingFaceEmbeddings.
embedding_model = "sentence-transformers/all-mpnet-base-v2"

# Initialize embeddings
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

# Initialize Chroma vector store
# NOTE(review): persist_directory is presumably where the collection is kept
# on disk between runs - confirm against the langchain_chroma documentation.
vector_store = Chroma(
    collection_name='howard_information',
    embedding_function=embeddings,
    persist_directory='./data/chroma'
)

def process_file(file_path):
    """Reads one PDF, splits its text into chunks and adds them to the vector store.

    Progress messages naming the file and the current thread are printed at
    the start and at the end. A PDF from which no text can be extracted is
    reported and adds nothing to the store.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        None. The chunks are added to the module-level ``vector_store``.
    """
    # Get the current thread name
    thread_name = threading.current_thread().name

    # Display the file name and thread name
    print(f"Processing {file_path} in thread {thread_name}")

    # Read the PDF; the file only needs to stay open while pages are extracted
    with open(file_path, 'rb') as file:
        pdf = PdfReader(file)

        # Join once instead of growing a string page by page; `or ""` keeps a
        # page that yields no text from breaking the concatenation.
        pdf_text = "".join(page.extract_text() or "" for page in pdf.pages)

    # Initialize text splitter and split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
    documents = text_splitter.create_documents([pdf_text])

    if documents:
        # Create embeddings and store them in the vector store
        vector_store.add_documents(documents)
    else:
        # Nothing to embed (e.g. a scanned, image-only PDF); an empty batch is
        # not sent to the vector store.
        print(f"No extractable text found in {file_path}; nothing was added")

    print(f"Completed processing {file_path} in thread {thread_name}")

# Spread the files across a pool of five worker threads
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Remember which file every submitted task belongs to
    futures = {}
    for pending_file in FILES:
        futures[executor.submit(process_file, pending_file)] = pending_file

    # Report each outcome as soon as its task finishes
    for finished in concurrent.futures.as_completed(futures):
        file = futures[finished]
        try:
            finished.result()
            print(f"File {file} has been processed successfully.")
        except Exception as error:
            print(f"Error processing file {file}: {error}")

print("All files have been processed.")
