INSTALLATIONS

In [None]:
# !pip install langchain
# !pip install python-dotenv
# !pip install openai
# !pip install pypdf


IMPORTS

In [2]:

from langchain.document_loaders import CSVLoader
from langchain.document_loaders import DirectoryLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.memory import ConversationTokenBufferMemory
from langchain.memory import ConversationSummaryBufferMemory
import os
import openai
import sys
from dotenv import load_dotenv, find_dotenv
# account for deprecation of LLM model
import datetime

API KEY

In [3]:

# Read the .env file *before* touching os.environ: load_dotenv() leaves
# variables that already exist untouched (override defaults to False), so
# seeding OPEN_API_KEY with "" first would hide the value stored in .env.
_ = load_dotenv(find_dotenv())
# Keep the variable defined (empty when nothing provided it) so that reading
# os.environ["OPEN_API_KEY"] never raises KeyError.
os.environ.setdefault("OPEN_API_KEY", "")
# Use the notebook's own variable first; when it is empty, fall back to
# OPENAI_API_KEY if that one is set.
openai.api_key = os.environ["OPEN_API_KEY"] or os.environ.get("OPENAI_API_KEY", "")

METHOD TO LOAD ONLY ONE PDF

In [14]:
# Load only one PDF file.
# NOTE(review): the directory-loading cell further down reuses the name
# LoadPDF, so this single-file version is replaced once that cell runs.
from langchain.document_loaders import PyPDFLoader
def LoadPDF(path):
    """Load a single PDF file with PyPDFLoader.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The result of ``PyPDFLoader(path).load()``.
    """
    # glob / show_progress / loader_cls are DirectoryLoader options; passing
    # them to PyPDFLoader fails, so only the file path is given here.
    loader = PyPDFLoader(path)
    return loader.load()

METHOD TO LOAD PDFS FROM A DIRECTORY

In [15]:

def LoadPDF(path):
    """Load the PDF files found under a directory.

    Args:
        path: Directory handed to DirectoryLoader.

    Returns:
        The result of the DirectoryLoader's ``load()`` call, configured with
        the ``**/*.pdf`` glob, progress display enabled and PyPDFLoader as
        the loader class.
    """
    directory_loader = DirectoryLoader(
        path,
        loader_cls=PyPDFLoader,
        glob="**/*.pdf",
        show_progress=True,
    )
    documents = directory_loader.load()
    return documents

LOAD MY PDFS

In [21]:
# Relative path of the folder that holds the PDF files.
pdf_directory = "pdf"

# Documents returned by LoadPDF for that folder.
loaded_documents = LoadPDF(pdf_directory)

100%|██████████| 14/14 [02:23<00:00, 10.25s/it]


VERIFY ALL PDFS ARE LOADED

In [26]:
from os.path import basename, splitext

# File names (without extension) expected among the loaded documents:
# "pdf1" ... "pdf14".
expected_pdf_files = [f"pdf{number}" for number in range(1, 15)]

# Extension-less file name of every loaded document's 'source' metadata.
# splitext() drops only the final extension, whatever its case, whereas
# str.replace('.pdf', '') removes every ".pdf" occurrence in the name and
# leaves an upper-case ".PDF" in place. A missing 'source' yields "" instead
# of crashing basename() with None.
loaded_pdf_files_base = [
    splitext(basename(document.metadata.get('source') or ''))[0]
    for document in loaded_documents
]

# Set lookup: each loaded document contributes one entry to the list above,
# so the list can be much longer than the number of distinct files.
loaded_pdf_names = set(loaded_pdf_files_base)
missing_pdf_files = [pdf_file for pdf_file in expected_pdf_files if pdf_file not in loaded_pdf_names]

if not missing_pdf_files:
    print("All pdf are loaded successfully.")
else:
    print("Failed loading:")
    for missing_pdf in missing_pdf_files:
        print(f"{missing_pdf}.pdf")

All pdf are loaded successfully.


METHOD TO LOAD ONE CSV

In [33]:
def LoadCSV(path):
    """Load one CSV file through CSVLoader.

    Args:
        path: Path of the CSV file, passed to CSVLoader as ``file_path``.

    Returns:
        The result of the loader's ``load()`` call; the loader is configured
        with UTF-8 encoding and a comma delimiter.
    """
    reader_options = {'delimiter': ','}
    return CSVLoader(file_path=path, csv_args=reader_options, encoding="utf-8").load()

LOAD ALL MY CSVS FROM A DIRECTORY

In [29]:
from tqdm import tqdm

# Folder that holds the CSV files, and the list that collects one LoadCSV
# result per file.
csv_directory = "csv"
csv_data_list = []

# Paths of the entries directly inside csv_directory whose name ends in ".csv".
csv_files = [
    os.path.join(csv_directory, name)
    for name in os.listdir(csv_directory)
    if name.endswith(".csv")
]

# Load the files one by one, with a tqdm progress bar.
for csv_file in tqdm(csv_files):
    csv_data = LoadCSV(csv_file)
    csv_data_list.append(csv_data)

100%|██████████| 41/41 [00:03<00:00, 11.45it/s]


SEARCH FOR CSVS WITH AN ENCODING OTHER THAN UTF-8

In [31]:

def csvLoadertoSearchEnc(path):
    """Try to load ``path`` as a UTF-8, comma-delimited CSV.

    Args:
        path: Path of the CSV file, passed to CSVLoader as ``file_path``.

    Returns:
        The result of the loader's ``load()`` call, or None when a
        UnicodeDecodeError is raised; in that case the path and the error
        message are printed first.
    """
    try:
        documents = CSVLoader(file_path=path, csv_args={'delimiter': ','}, encoding="utf-8").load()
    except UnicodeDecodeError as decode_error:
        # Report which file could not be decoded and why.
        print(f"Error in CSV: {path}")
        print(f"Error: {decode_error}")
        return None
    return documents

# Folder to scan, and the list that collects the result of every CSV that
# could be loaded.
csv_directory = "csv"
csv_data_list = []

# Paths of the entries directly inside csv_directory whose name ends in ".csv".
csv_files = [
    os.path.join(csv_directory, name)
    for name in os.listdir(csv_directory)
    if name.endswith(".csv")
]

for csv_file in csv_files:
    csv_data = csvLoadertoSearchEnc(csv_file)
    # None means the loader reported a decoding error for this file; skip it.
    if csv_data is None:
        continue
    csv_data_list.append(csv_data)




Error in CSV: csv\FIFA World Cup All Goals 1930-2022.csv
Error: 'utf-8' codec can't decode byte 0xe9 in position 814: invalid continuation byte


In [None]:
# Repair the encoding: read the file as Windows-1252 and write it back as UTF-8.
import csv

# NOTE(review): the decoding error above was reported for a file under the
# "csv" folder, while these bare names resolve against the current working
# directory; confirm where this cell is meant to run.
input_csv_file = "FIFA World Cup All Goals 1930-2022.csv"
output_csv_file = "FIFA World Cup All Goals 1930-2022_corregido.csv"

# The source must be decoded as cp1252 (Windows-1252): opening it as UTF-8
# raises the same UnicodeDecodeError this cell is meant to fix.
# newline='' on both files lets the csv module handle line endings itself.
with open(input_csv_file, 'r', newline='', encoding='cp1252') as input_file, \
        open(output_csv_file, 'w', newline='', encoding='utf-8') as output_file:
    csv_reader = csv.reader(input_file)
    csv_writer = csv.writer(output_file)

    # Rows are copied unchanged: csv.writer already quotes any field that
    # contains the delimiter, so inserting a backslash before commas would
    # only alter the data.
    csv_writer.writerows(csv_reader)

print("CSV reencoded.")


LOAD CSVS FROM A DIRECTORY AND ITS SUBDIRECTORIES

In [None]:

def load_csv_files_in_directory(directory_path):
    """Load every ``.csv`` file found while walking ``directory_path``.

    A "Loading <path>..." line is printed before each file is loaded.

    Args:
        directory_path: Top directory handed to ``os.walk``.

    Returns:
        A list with one ``LoadCSV`` result per CSV file, in the order the
        files are visited.
    """
    loaded = []

    # os.walk yields directory_path itself and then each directory below it.
    for folder, _subfolders, filenames in os.walk(directory_path):
        for filename in filenames:
            if not filename.endswith(".csv"):
                continue
            csv_file_path = os.path.join(folder, filename)
            print(f"Loading {csv_file_path}...")

            # Load the CSV file with LoadCSV (which uses CSVLoader).
            loaded.append(LoadCSV(csv_file_path))

    return loaded

# Top-level folder whose CSV files (including those in subdirectories) are loaded.
general_directory = "fifa_wc_2018"

# Bare call as the last expression, so the notebook displays the returned list.
load_csv_files_in_directory(directory_path=general_directory)


