<a href="https://colab.research.google.com/github/egerdm-ai/Langchain_Multiple_PDF_w_GDrive/blob/main/Langchain_Multiple_PDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain openai chromadb tiktoken pypdf GoogleNews fake_useragent newspaper3k

In [None]:
# Mount Google Drive at /content/drive; the later cells read and write files
# under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#!pip install newspaper3k
# Import necessary libraries
import os
import re

import newspaper
import pandas as pd
import requests
from fake_useragent import UserAgent
from GoogleNews import GoogleNews
from newspaper import fulltext
# Keyword to search for on Google News.
keyword = 'gpt'

# Query Google News (English, US region, last day) for the keyword, request
# result page 2 as well, and load the sorted results into a pandas DataFrame.
googlenews = GoogleNews(lang='en', region='US', period='1d', encode='utf-8')
googlenews.clear()
googlenews.search(keyword)
googlenews.get_page(2)
news_result = googlenews.result(sort=True)
news_data_df = pd.DataFrame.from_dict(news_result)

# For every search hit, download the article and extract its body text.
ua = UserAgent()
news_data_df_with_text = []
for index, headers in news_data_df.iterrows():
    news_title = str(headers['title'])
    news_media = str(headers['media'])
    news_update = str(headers['date'])
    news_timestamp = str(headers['datetime'])
    news_description = str(headers['desc'])
    news_link = str(headers['link'])
    print(news_link)
    news_img = str(headers['img'])

    # Reset per article: without this, a failed download would either raise
    # NameError (first article) or silently reuse the previous article's text.
    text = ''
    try:
        response = requests.get(news_link, headers={'User-Agent': ua.chrome}, timeout=5)
        # Treat HTTP error pages (403/404/5xx) as failures instead of
        # extracting "text" from the error page itself.
        response.raise_for_status()
        text = fulltext(response.text)
        print('Text Content Scraped')
    except Exception as exc:
        # Best-effort scraping: keep the article's metadata with empty text.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f'Text Content Scraped Error, Skipped ({exc})')

    news_data_df_with_text.append([news_title, news_media, news_update, news_timestamp,
                                   news_description, news_link, news_img, text])

news_data_with_text_df = pd.DataFrame(news_data_df_with_text, columns=['Title', 'Media', 'Update', 'Timestamp',
                                                                    'Description', 'Link', 'Image', 'Text'])

# Print the dataframe for a quick sanity check. A bare expression is only
# rendered by a notebook when it is the last statement of the cell.
print(news_data_with_text_df)

# Export the articles as a plain-text document into the Drive data folder.
output_dir = "/content/drive/MyDrive/data"
# Create the folder up front so a missing directory does not fail the run
# after all the scraping work has already been done.
os.makedirs(output_dir, exist_ok=True)
# Explicit UTF-8: article text regularly contains non-ASCII characters.
with open(os.path.join(output_dir, "recent_news.txt"), "w", encoding="utf-8") as f:
    f.write("Document Title: Recent News About GPT\n")
    for i, row in news_data_with_text_df.iterrows():
        f.write(f"Title: {row['Title']}\n")
        f.write(f"Media: {row['Media']}\n")
        f.write(f"Timestamp: {row['Timestamp']}\n")
        f.write(f"Description: {row['Description']}\n")
        f.write(f"Link: {row['Link']}\n")
        f.write(f"Text: {row['Text']}\n")
        f.write("\n")

In [None]:
import os
import sys
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import CharacterTextSplitter

# NOTE: replace the placeholder with a real OpenAI API key before running.
os.environ["OPENAI_API_KEY"] = 'YOUR-OPENAI-API-KEY'
root_dir = "/content/drive/MyDrive/"
data_dir = os.path.join(root_dir, "data")

# Loader class to use for each supported file suffix (compared lower-cased).
# NOTE(review): legacy ".doc" files are routed to Docx2txtLoader as before;
# confirm that loader can actually parse the old binary format.
loaders_by_suffix = {
    ".pdf": PyPDFLoader,
    ".docx": Docx2txtLoader,
    ".doc": Docx2txtLoader,
    ".txt": TextLoader,
}

# Build one flat list of Documents from every supported file in the data folder.
documents = []
# sorted() makes the load order deterministic; os.listdir() order is arbitrary.
for file in sorted(os.listdir(data_dir)):
    # Match the suffix case-insensitively so e.g. "REPORT.PDF" is not skipped.
    lowered_name = file.lower()
    loader_cls = next(
        (cls for suffix, cls in loaders_by_suffix.items() if lowered_name.endswith(suffix)),
        None,
    )
    if loader_cls is None:
        # Unsupported file type (or a sub-directory): ignore it.
        continue
    loader = loader_cls(os.path.join(data_dir, file))
    documents.extend(loader.load())

# Break the loaded documents into overlapping chunks before embedding them.
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter_options = {
    "chunk_size": 2000,
    "chunk_overlap": 100,
    "length_function": len,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
documents = text_splitter.split_documents(documents)

# Embed every chunk with OpenAI embeddings, store the vectors in Chroma and
# persist the store inside the Drive data folder.
persist_path = root_dir + "data"
embedding_model = OpenAIEmbeddings()
vectordb = Chroma.from_documents(
    documents,
    embedding=embedding_model,
    persist_directory=persist_path,
)
vectordb.persist()

# Assemble the conversational Q&A chain: a chat model plus a retriever that
# returns the 6 closest chunks from the vector store for each question.
chat_model = ChatOpenAI(temperature=0.7, model_name='gpt-3.5-turbo')
chunk_retriever = vectordb.as_retriever(search_kwargs={'k': 6})
pdf_qa = ConversationalRetrievalChain.from_llm(
    chat_model,
    retriever=chunk_retriever,
    return_source_documents=True,
    verbose=False,
)

# ANSI escape sequences used to colour the console output.
yellow = "\033[0;33m"
green = "\033[0;32m"
white = "\033[0;39m"

# Inputs that end the session (compared after trimming and lower-casing).
exit_commands = {"exit", "quit", "q", "f"}

# Interactive question/answer loop. Every (question, answer) pair is appended
# to chat_history and passed back to the chain on the next question.
chat_history = []
print(f"{yellow}------------------------------------------------")
print('You can start asking questions about your files')
print('-----------------------------------------------')
while True:
    try:
        # Trim whitespace so " exit " is recognised and blank-looking input
        # is never sent to the model.
        query = input(f"{green}Prompt: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / interrupt at the prompt: leave cleanly, no traceback.
        query = "exit"
    if query.lower() in exit_commands:
        # Switch back to the default colour so the console is not left green.
        print(f"{white}Exiting")
        sys.exit()
    if not query:
        continue
    result = pdf_qa(
        {"question": query, "chat_history": chat_history})
    print(f"{white}Answer: " + result["answer"])
    chat_history.append((query, result["answer"]))