<a href="https://colab.research.google.com/github/epicskills1/Alemeno-Assignment/blob/main/ContentEngineBot(withoutoutput).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Extract Text from PDFs

In [None]:
# Install PyMuPDF
!pip install PyMuPDF

import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output joined with no
        separator (an empty string when the document has no pages).
    """
    # The context manager closes the document (and its file handle) even if a
    # page fails to extract; previously the document was left open.
    with fitz.open(pdf_path) as doc:
        # join() avoids re-allocating a growing string once per page.
        return "".join(page.get_text() for page in doc)

# Upload files (Colab-only file picker)
import os

from google.colab import files
uploaded = files.upload()

# Fail early with a readable message if an expected filing is not in the
# working directory (for example because it was uploaded under another name).
EXPECTED_PDFS = ("Alphabet_10K.pdf", "Tesla_10K.pdf", "Uber_10K.pdf")
missing_pdfs = [path for path in EXPECTED_PDFS if not os.path.exists(path)]
if missing_pdfs:
    raise FileNotFoundError(f"Expected PDF file(s) not found: {missing_pdfs}")

# Extract text from each PDF
alphabet_text = extract_text_from_pdf("Alphabet_10K.pdf")
tesla_text = extract_text_from_pdf("Tesla_10K.pdf")
uber_text = extract_text_from_pdf("Uber_10K.pdf")

# Show a compact summary (character counts) rather than echoing three complete
# filings into the cell output.
{
    "alphabet_text": len(alphabet_text),
    "tesla_text": len(tesla_text),
    "uber_text": len(uber_text),
}


Generate Embeddings

In [None]:
# Install SentenceTransformers
!pip install sentence-transformers

from sentence_transformers import SentenceTransformer

# Sentence-embedding model; later cells reach it through the global `model`.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode each filing line by line: the text is split on newlines and every
# resulting piece (blank ones included) is embedded, in Alphabet, Tesla, Uber
# order.
alphabet_embeddings, tesla_embeddings, uber_embeddings = (
    model.encode(filing_text.split('\n'))
    for filing_text in (alphabet_text, tesla_text, uber_text)
)

# Display the three embedding arrays as the cell result
alphabet_embeddings, tesla_embeddings, uber_embeddings


Store in Vector Store using FAISS

In [None]:
# Install FAISS
!pip install faiss-cpu

import faiss
import numpy as np

def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index containing every embedding vector.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dimension). Any numeric
            dtype or nested list is accepted; it is converted to float32.

    Returns:
        A ``faiss.IndexFlatL2`` of the given dimension with all rows added, in
        input order.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional (for example an
            empty list), since the vector dimension cannot be determined.
    """
    # FAISS expects C-contiguous float32 data; ascontiguousarray converts only
    # when needed instead of unconditionally copying like np.array().
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_vectors, dimension), got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

# One FAISS index per company, built from that company's embeddings in
# Alphabet, Tesla, Uber order.
alphabet_index, tesla_index, uber_index = (
    create_faiss_index(company_embeddings)
    for company_embeddings in (alphabet_embeddings, tesla_embeddings, uber_embeddings)
)


Query the FAISS Index

In [None]:
def query_index(index, query_embedding, texts, k=5):
    """Return the texts whose vectors are nearest to the query embedding.

    Args:
        index: Object exposing ``search(query_embedding, k=...)`` that returns
            a ``(distances, ids)`` pair, such as a FAISS index.
        query_embedding: Query vector batch handed to ``index.search``; only
            the first row of the result is used.
        texts: Sequence indexed by the ids returned from the search; it must be
            in the same order as the vectors that were added to ``index``.
        k: Maximum number of neighbours to retrieve (default 5).

    Returns:
        List of up to ``k`` entries of ``texts``, in the order the search
        returned them.
    """
    _distances, neighbor_ids = index.search(query_embedding, k=k)
    # FAISS fills unused result slots with id -1 when the index holds fewer
    # than k vectors; without this filter texts[-1] would wrongly be returned.
    return [texts[i] for i in neighbor_ids[0] if i >= 0]

def get_query_embedding(query):
    """Encode one query with the global ``model`` and return the result.

    The query is wrapped in a one-element list, so ``model.encode`` receives a
    batch containing just this query.
    """
    single_query_batch = [query]
    return model.encode(single_query_batch)

# Example query
query = "What are the risk factors associated with Google and Tesla?"
query_embedding = get_query_embedding(query)

# Look up the nearest lines in each filing. The text is split on newlines here
# exactly as it was when the embeddings were computed, so result ids map back
# to the right lines.
alphabet_results = query_index(alphabet_index, query_embedding, alphabet_text.split('\n'))
tesla_results = query_index(tesla_index, query_embedding, tesla_text.split('\n'))
uber_results = query_index(uber_index, query_embedding, uber_text.split('\n'))

# Print one headed section per company, one matching line per row.
for heading, matches in (
    ("Alphabet Inc. Results:", alphabet_results),
    ("\nTesla, Inc. Results:", tesla_results),
    ("\nUber Technologies, Inc. Results:", uber_results),
):
    print(heading)
    for result in matches:
        print(result)


Integrate LLM

In [None]:
# Install Transformers
!pip install transformers

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Pretrained GPT-2 weights and the matching tokenizer, used by
# generate_insights() through these two globals.
gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def generate_insights(prompt, max_length=150):
    """Continue ``prompt`` with the global GPT-2 model and return the text.

    Args:
        prompt: Text to continue.
        max_length: Value forwarded to ``generate`` as ``max_length``
            (default 150). Per the transformers documentation this bounds the
            total sequence length, prompt tokens included.

    Returns:
        The first generated sequence decoded to a string with special tokens
        skipped.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which generate() needs to tell real tokens from padding.
    encoded = tokenizer(prompt, return_tensors='pt')
    outputs = gpt_model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_length,
        num_return_sequences=1,
        # GPT-2 has no pad token; reuse EOS explicitly so generate() does not
        # have to guess one (and warn about it) on every call.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example insights generation: GPT-2 continues the raw query text.
insights = generate_insights(query)
print("\nInsights:", insights, sep="\n")


Execution

In [None]:
!pip install streamlit pyngrok sentence-transformers faiss-cpu transformers PyMuPDF


In [None]:
 %%writefile app.py
import streamlit as st
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer
import fitz  # PyMuPDF
import faiss
import numpy as np

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output joined with no
        separator (an empty string when the document has no pages).
    """
    # The context manager closes the document (and its file handle) even if a
    # page fails to extract; previously the document was left open.
    with fitz.open(pdf_path) as doc:
        # join() avoids re-allocating a growing string once per page.
        return "".join(page.get_text() for page in doc)

# Function to create a FAISS index
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index containing every embedding vector.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dimension). Any numeric
            dtype or nested list is accepted; it is converted to float32.

    Returns:
        A ``faiss.IndexFlatL2`` of the given dimension with all rows added, in
        input order.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional (for example an
            empty list), since the vector dimension cannot be determined.
    """
    # FAISS expects C-contiguous float32 data; ascontiguousarray converts only
    # when needed instead of unconditionally copying like np.array().
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_vectors, dimension), got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

# Function to query the FAISS index
def query_index(index, query_embedding, texts, k=5):
    """Return the texts whose vectors are nearest to the query embedding.

    Args:
        index: Object exposing ``search(query_embedding, k=...)`` that returns
            a ``(distances, ids)`` pair, such as a FAISS index.
        query_embedding: Query vector batch handed to ``index.search``; only
            the first row of the result is used.
        texts: Sequence indexed by the ids returned from the search; it must be
            in the same order as the vectors that were added to ``index``.
        k: Maximum number of neighbours to retrieve (default 5).

    Returns:
        List of up to ``k`` entries of ``texts``, in the order the search
        returned them.
    """
    _distances, neighbor_ids = index.search(query_embedding, k=k)
    # FAISS fills unused result slots with id -1 when the index holds fewer
    # than k vectors; without this filter texts[-1] would wrongly be returned.
    return [texts[i] for i in neighbor_ids[0] if i >= 0]

# Function to get query embeddings
def get_query_embedding(query):
    """Encode one query with the global ``model`` and return the result.

    ``model`` is assigned further down in this script, before the first call.
    The query is wrapped in a one-element list, so ``model.encode`` receives a
    batch containing just this query.
    """
    single_query_batch = [query]
    return model.encode(single_query_batch)

# Function to generate insights using GPT-2
def generate_insights(prompt, max_length=150):
    """Continue ``prompt`` with the global GPT-2 model and return the text.

    Args:
        prompt: Text to continue.
        max_length: Value forwarded to ``generate`` as ``max_length``
            (default 150). Per the transformers documentation this bounds the
            total sequence length, prompt tokens included.

    Returns:
        The first generated sequence decoded to a string with special tokens
        skipped.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which generate() needs to tell real tokens from padding.
    encoded = tokenizer(prompt, return_tensors='pt')
    outputs = gpt_model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_length,
        num_return_sequences=1,
        # GPT-2 has no pad token; reuse EOS explicitly so generate() does not
        # have to guess one (and warn about it) on every call.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Load models
@st.cache_resource
def _load_models():
    """Load the sentence-embedding model, GPT-2 tokenizer and GPT-2 model.

    Streamlit re-executes this script on every widget interaction; caching the
    loaded objects keeps them from being re-created on each rerun.

    Returns:
        Tuple ``(embedding_model, tokenizer, gpt_model)``.
    """
    return (
        SentenceTransformer('all-MiniLM-L6-v2'),
        GPT2Tokenizer.from_pretrained('gpt2'),
        GPT2LMHeadModel.from_pretrained('gpt2'),
    )


embedding_model, tokenizer, gpt_model = _load_models()
# get_query_embedding(), defined earlier in this script, looks the embedding
# model up under the global name `model`. The second, identical definition of
# that function which used to follow here was redundant and has been dropped.
model = embedding_model

# Load and process PDF files
from pathlib import Path


def _resolve_pdf(preferred, fallback):
    """Return ``preferred`` if that file exists, otherwise ``fallback``."""
    return preferred if Path(preferred).exists() else fallback


@st.cache_resource
def _build_corpus(pdf_path):
    """Extract, embed and index one filing.

    Cached so the PDF is parsed and embedded once rather than on every
    Streamlit rerun (the script re-executes on each widget interaction).

    Returns:
        Tuple ``(text, embeddings, index)`` where ``embeddings`` has one row
        per newline-separated line of ``text`` and ``index`` is the FAISS
        index built from those rows.
    """
    text = extract_text_from_pdf(pdf_path)
    embeddings = embedding_model.encode(text.split('\n'))
    return text, embeddings, create_faiss_index(embeddings)


# The "... (1).pdf" names are kept as first choice, but the notebook's upload
# cell reads the plain "<Company>_10K.pdf" names, so fall back to those when
# the "(1)" copies are not present (e.g. on a fresh runtime).
alphabet_text, alphabet_embeddings, alphabet_index = _build_corpus(
    _resolve_pdf("Alphabet_10K (1).pdf", "Alphabet_10K.pdf")
)
tesla_text, tesla_embeddings, tesla_index = _build_corpus(
    _resolve_pdf("Tesla_10K (1).pdf", "Tesla_10K.pdf")
)
uber_text, uber_embeddings, uber_index = _build_corpus(
    _resolve_pdf("Uber_10K (1).pdf", "Uber_10K.pdf")
)

# Streamlit UI
st.title("Content Engine Chatbot")

query = st.text_input("Enter your query:")
if query:
    query_embedding = get_query_embedding(query)

    # (heading, index, filing text) for each company, rendered in this order.
    company_sections = (
        ("### Alphabet Inc.", alphabet_index, alphabet_text),
        ("### Tesla, Inc.", tesla_index, tesla_text),
        ("### Uber Technologies, Inc.", uber_index, uber_text),
    )
    for heading, company_index, company_text in company_sections:
        st.write(heading)
        # Split on newlines exactly as was done when the embeddings were
        # computed, so the ids returned by the index map to the right lines.
        matches = query_index(company_index, query_embedding, company_text.split('\n'))
        for passage in matches:
            st.write(passage)

    # GPT-2 continuation of the raw query text.
    st.write("### Insights")
    st.write(generate_insights(query))


In [None]:
!pip install --upgrade pyngrok

import os
from getpass import getpass

from pyngrok import ngrok

# Never hard-code the ngrok authtoken in the notebook: read it from the
# NGROK_AUTH_TOKEN environment variable, or prompt for it without echoing.
# NOTE(review): the token that used to be committed here should be treated as
# leaked and revoked in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_auth_token)

# Terminate all existing ngrok tunnels
ngrok.kill()

# Create a new ngrok tunnel to local port 8501, explicitly over HTTP
public_url = ngrok.connect(8501, proto="http")
print(f"Streamlit App URL: {public_url}")

In [None]:
import subprocess

# Start the Streamlit app as a background child process; Popen returns
# immediately, so the cell does not block while the server runs.
streamlit_command = ['streamlit', 'run', 'app.py']
subprocess.Popen(streamlit_command)


