In [2]:
# ! pip install python-dotenv
# ! pip install selenium
# ! pip install tiktoken
# ! pip install pinecone-client[grpc]
# ! pip install openai
# ! pip install langchain

### Get Env Keys

In [201]:
import getpass
import os

import pandas as pd
import tiktoken
from dotenv import load_dotenv
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.auto import tqdm

# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Ask for the Google key BEFORE reading it into `gemini_key`, so the variable
# holds the value that was typed in rather than a stale None. An empty value
# (e.g. `GOOGLE_API_KEY=` in .env) is treated the same as a missing one.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Provide your Google API Key")

gemini_key = os.environ["GOOGLE_API_KEY"]
# These two may be None when the variable is not set.
pinecone_key = os.getenv("PINECONE_API_KEY")
supabase_key = os.getenv("SUPABASE_API_KEY")

### All things Scraping

In [4]:

# Pages to scrape, written as "<topic>/<subtopic>" under the masterclass root.
# Order is kept as-is because it determines the row order of the scraped frame.
urls = [
    f"https://howdo.com/masterclass/{page}/"
    for page in (
        "mindset/growth-mindset",
        "mindset/resilience",
        "mindset/continuous-learning",
        "mindset/data-driven-decisions",
        "plan/customer-analysis",
        "plan/competition-analysis",
        "plan/market-analysis",
        "plan/solution-analysis",
        "plan/build-buy-partner-analysis",
        "tools/key-performance-indicators-kpis",
        "mindset/weekly-business-reviews",
        "tools/product-management",
        "tools/startup-accelerator",
        "tools/business-incubator",
        "tools/mergers-and-acquisitions",
        "tools/research-and-development",
        "team/talent-acquisition",
        "team/corporate-culture",
        "team/team-experience",
        "team/mentorship",
        "team/communities-of-practice",
    )
]

In [5]:


# Empty frame with the five columns the scraping loop fills in, one row per page.
df = pd.DataFrame(
    columns=["Topic", "Subtopic", "URL", "Text", "Total Token"],
)
# tiktoken encoding used by the scraping loop to count tokens in each page's text.
tokenizer = tiktoken.get_encoding("cl100k_base")

def get_topics_from_url(url):
    """Extract the (topic, subtopic) pair from a masterclass page URL.

    The URL is split on '/' and the pieces at positions 4 and 5 are used, e.g.
    'https://howdo.com/masterclass/mindset/resilience/' -> ('mindset', 'resilience').

    Args:
        url: URL string of the form 'scheme://host/section/topic/subtopic/'.

    Returns:
        A ``(topic, subtopic)`` tuple of strings. A piece that is not present
        in the URL is returned as ''.
    """
    parts = url.split('/')
    # Each piece is bounds-checked on its own: the previous single
    # `len(parts) > 4` guard still indexed parts[5] and raised IndexError for
    # URLs that carry a topic but no subtopic.
    topic = parts[4] if len(parts) > 4 else ''
    subtopic = parts[5] if len(parts) > 5 else ''
    return topic, subtopic

In [30]:
# Start a Chrome WebDriver session. The scraping loop below drives it, and a
# later cell ends it with driver.quit().
driver = webdriver.Chrome()

In [38]:
# Scrape each URL and populate the dataframe.
# Rows are collected in a list and appended to `df` once after the loop,
# instead of rebuilding the frame with pd.concat on every iteration.
scraped_rows = []
for url in urls:
    print("Working on URL: " + url)
    masterclass_topic, subtopic = get_topics_from_url(url)

    try:
        # Navigation sits inside the try so that one unreachable page is
        # reported and skipped rather than aborting the whole run.
        driver.get(url)
        # Wait up to 20 seconds for the main content column to become visible.
        element = WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.wp-block-column.right-side-content'))
        )
        text = element.text.strip()
        scraped_rows.append({
            'Topic': masterclass_topic,
            'Subtopic': subtopic,
            'URL': url,
            'Text': text,
            'Total Token': len(tokenizer.encode(text)),
        })
    # TimeoutException is selenium.common.exceptions.TimeoutException, imported
    # with the other selenium names in the notebook's import cell.
    except TimeoutException:
        print(f"Timeout while waiting for element at {url}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Append everything that was scraped successfully in a single concat.
if scraped_rows:
    df = pd.concat([df, pd.DataFrame(scraped_rows)], ignore_index=True)

Working on URL: https://howdo.com/masterclass/mindset/growth-mindset/
Working on URL: https://howdo.com/masterclass/mindset/resilience/
Working on URL: https://howdo.com/masterclass/mindset/continuous-learning/
Working on URL: https://howdo.com/masterclass/mindset/data-driven-decisions/
Working on URL: https://howdo.com/masterclass/plan/customer-analysis/
Working on URL: https://howdo.com/masterclass/plan/competition-analysis/
Working on URL: https://howdo.com/masterclass/plan/market-analysis/
Working on URL: https://howdo.com/masterclass/plan/solution-analysis/
Working on URL: https://howdo.com/masterclass/plan/build-buy-partner-analysis/
Working on URL: https://howdo.com/masterclass/tools/key-performance-indicators-kpis/
Working on URL: https://howdo.com/masterclass/mindset/weekly-business-reviews/
Working on URL: https://howdo.com/masterclass/tools/product-management/
Working on URL: https://howdo.com/masterclass/tools/startup-accelerator/
Working on URL: https://howdo.com/mastercla

In [51]:
# End the WebDriver session now that scraping is finished.
driver.quit()

In [7]:
df.head()

Unnamed: 0,Topic,Subtopic,URL,Text,Total Token
0,mindset,growth-mindset,https://howdo.com/masterclass/mindset/growth-m...,Igniting Business Evolution: Building Your Gro...,6439
1,mindset,resilience,https://howdo.com/masterclass/mindset/resilience/,Resilience\nUnlock the power of resilience in ...,7400
2,mindset,continuous-learning,https://howdo.com/masterclass/mindset/continuo...,Continuous Learning\nEmbark on a transformativ...,5021
3,mindset,data-driven-decisions,https://howdo.com/masterclass/mindset/data-dri...,Data-Driven Decisions\nElevate your leadership...,6359
4,plan,customer-analysis,https://howdo.com/masterclass/plan/customer-an...,1. Introduction\nYou will dive deep into your ...,13880


In [9]:
# Cast the three text-like columns to str, then remove every "Print" + newline
# marker from the page text (plain substring replacement, no regex needed).
df = df.astype({'Text': str, 'Topic': str, 'Subtopic': str})
df['Text'] = df['Text'].str.replace('Print\n', '', regex=False)

In [None]:
# Save the scraped rows to disk; the vectorisation section reloads this file.
# index=False keeps the row index out of the CSV.
df.to_csv('scraped_data.csv', index=False)

### Vectorisation of Scraped Data

In [202]:
# Reload the scraped pages from the CSV written at the end of the scraping section.
df = pd.read_csv('scraped_data.csv')
# Display the column names as a quick check of what was loaded.
df.columns

Index(['Topic', 'Subtopic', 'URL', 'Text', 'Total Token'], dtype='object')

In [203]:
df.head()

Unnamed: 0,Topic,Subtopic,URL,Text,Total Token
0,mindset,growth-mindset,https://howdo.com/masterclass/mindset/growth-m...,Igniting Business Evolution: Building Your Gro...,6439
1,mindset,resilience,https://howdo.com/masterclass/mindset/resilience/,Resilience\nUnlock the power of resilience in ...,7400
2,mindset,continuous-learning,https://howdo.com/masterclass/mindset/continuo...,Continuous Learning\nEmbark on a transformativ...,5021
3,mindset,data-driven-decisions,https://howdo.com/masterclass/mindset/data-dri...,Data-Driven Decisions\nElevate your leadership...,6359
4,plan,customer-analysis,https://howdo.com/masterclass/plan/customer-an...,1. Introduction\nYou will dive deep into your ...,13880


### Initialise and Test LLMs

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough


# Pinecone client, authenticated with the key read from the environment earlier.
pc = Pinecone(api_key=pinecone_key)

# Gemini chat model with temperature 0.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)

# Embedding model used both for indexing chunks and for embedding queries.
embed_model = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

# Splitter used in the chunking section: chunk_size 512 with an overlap of 100.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=100,
)

In [22]:
# Smoke test: send a single prompt to the chat model and print the reply text.
result = llm.invoke("What is photosynthesis? Tell me in 1 line.")
print(result.content)

Photosynthesis is the process by which plants use sunlight, water, and carbon dioxide to create food (sugar) and oxygen. 



In [23]:
# Multi-turn smoke test: a system message, one prior exchange, and a new question.
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
    HumanMessage(content="I'd like to understand who won 2011 cricket world cup?")
]
# Use .invoke(), as the single-prompt cell does; calling the model object
# directly (`llm(messages)`) is deprecated in LangChain.
res = llm.invoke(messages)
res.content

In [52]:
# Two short sample strings for a smoke test of the embedding model.
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]

# Embed both strings; the next cell inspects the shape of the result.
res = embed_model.embed_documents(texts)
print(res)

In [49]:
len(res), len(res[0])

(2, 768)

In [55]:
# Collect the name of every index the Pinecone client can see, then display the list.
existing_indexes = [info["name"] for info in pc.list_indexes()]
existing_indexes

In [143]:
df.columns

Index(['Topic', 'Subtopic', 'URL', 'Text', 'Total Token'], dtype='object')

In [204]:
# Open a handle to the "howdo" index and display its stats
# (dimension and vector counts) to see what is already stored.
index_name = "howdo"
howdo_index = pc.Index(index_name)
howdo_index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2268}},
 'total_vector_count': 2268}

In [205]:
## Initialise a vectorstore from Pinecone
# vectorstore = PineconeVectorStore.from_documents(chunk_list, embed_model, index_name=index_name)
## Delete all data from a Pinecone Index
# howdo_index.delete(delete_all=True)

### Vectorisation: Chunking

In [178]:
# Split every scraped page into chunks, tagging each chunk with its source URL
# so answers can cite it later.
# Rows are walked positionally (zip over the columns) rather than with
# df.loc[i, ...] on a range, so this no longer depends on `df` having a
# default 0..n-1 index.
chunk_list = []
for text, url in tqdm(zip(df['Text'], df['URL']), total=len(df)):
    chunk_list.extend(text_splitter.create_documents([text], [{'URL': url}]))

  0%|          | 0/21 [00:00<?, ?it/s]

100%|██████████| 21/21 [00:00<00:00, 129.39it/s]


In [179]:
len(chunk_list)

2268

In [180]:
chunk_list[0]

Document(metadata={'URL': 'https://howdo.com/masterclass/mindset/growth-mindset/'}, page_content="Igniting Business Evolution: Building Your Growth Mindset\nUnlock transformative growth in your business journey with our course, inspired by Dr. Carol Dweck's growth mindset philosophy. Dive deep into strategies that fuel innovation, resilience, and adaptability. Empower yourself with tools to propel your career, energize your team, and evolve your business. Ready to ignite your potential?\nSHARE THIS ON")

### Vectorisation: Create Embeddings and Upsert to Pinecone

In [183]:
index_name = "howdo"

# from_documents() embeds and upserts every chunk each time it runs, so
# re-running this cell against a populated index would store duplicates.
# Only upload when the index is empty; otherwise attach to what is already
# there. To re-index after the chunks change, clear the index first
# (index.delete(delete_all=True)) and run this cell again.
existing_vector_count = pc.Index(index_name).describe_index_stats().total_vector_count
if existing_vector_count == 0:
    vectorstore = PineconeVectorStore.from_documents(chunk_list, embed_model, index_name=index_name)
else:
    vectorstore = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embed_model)

# Retriever interface over the vector store, used by the retrieval chain.
retriever = vectorstore.as_retriever()

### Test Vector DB

In [184]:
# Sample query for checking what the vector store returns.
query = "Why You Need Resilience"

In [185]:
# Display the 3 chunks the vector store ranks as most similar to the query.
vectorstore.similarity_search(query, k=3)

[Document(metadata={'URL': 'https://howdo.com/masterclass/mindset/resilience/'}, page_content="Why You Need Resilience\nYou face challenges every day. A sudden market shift, a product failure, a critical decision that didn't pan out as expected. These aren't mere obstacles; they're opportunities, provided you have the resilience to see them as such.\nOvercoming Setbacks: When things go wrong, it's not the end; it's a lesson. Resilient leaders don't dwell on failure. They analyze, learn, and move forward."),
 Document(metadata={'URL': 'https://howdo.com/masterclass/mindset/resilience/'}, page_content="Whether you're an aspiring entrepreneur, a seasoned executive, or anyone looking to take their career and life to the next level, this course is designed with you in mind. Resilience is a skill, a mindset, a lifestyle, and a vital part of your professional DNA. By embracing these four strategies, you'll not only navigate the challenging world of business but excel in it.\nAre you ready to 

### Set Prompt

In [223]:
# System prompt for the cited-answer chain. It takes two variables:
#   {context}  - the documents returned by the retriever
#   {question} - the user's question (sent as the human message)
# The instructions refer to <context> and <query> tags, so the variables are
# wrapped in those tags here; previously the tags were described but never emitted.
template = """You're a helpful AI assistant. Given a user question and context, \
answer the user question and provide citations.
If none of the articles answer the question, just say you don't know.

The question is inside <query></query> tags.
The context is inside <context></context> tags.

The context consists of sources returned by a similarity search over a vector database.

Remember, you must return both an answer and citations. A citation consists of a number \
with a hyperlink to the source URL. Return a citation for every source from the context \
that justifies the answer. If possible, insert a VERBATIM quote that \
justifies the answer.

Strictly use the following format for your final output:

<div class="cited_answer">
    <p class="answer">.....adaf...<a href="https://www.google.com" target="_blank">[1]</a>....afafaf...fafa<a href="https://www.google.com" target="_blank">[2]</a>.</p>
    <ul class="citations">
        <li class="citation"><a href="https://www.google.com" target="_blank">[1]</a> <q>exact quote - only add if present</q></li>
        <li class="citation"><a href="https://www.google.com" target="_blank">[2]</a> <q>exact quote - only add if present</q></li>
        ...
    </ul>
</div>

Here are the context articles:
<context>
{context}
</context>

Now answer the question:"""
prompt = ChatPromptTemplate.from_messages(
    [("system", template), ("human", "<query>{question}</query>")]
)

### Setting up Retrieval Chain

In [224]:
# StrOutputParser is not imported by any of the cells above, so on a fresh
# kernel this cell would fail with a NameError; import it next to its only use.
from langchain_core.output_parsers import StrOutputParser

# Retrieval chain: the input string is passed through unchanged as "question"
# and is also given to the retriever to produce "context"; both fill the
# prompt, the model answers, and the reply is reduced to a plain string.
retrieval_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [225]:
# Question to run through the full retrieval chain.
query = "Why do we Need Resilience"

In [226]:
# Run the chain end to end; the result is the model's reply as a string.
answer = retrieval_chain.invoke(query)

In [227]:
answer

'<div class="cited_answer">\n    <p class="answer">Resilience is essential because it helps us overcome challenges and setbacks, manage stress effectively, learn from mistakes, and build strong relationships. <a href="https://howdo.com/masterclass/mindset/resilience/" target="_blank">[1]</a> It allows us to see challenges as opportunities for growth and to turn errors into enhancements. <a href="https://howdo.com/masterclass/mindset/resilience/" target="_blank">[2]</a> Resilience is a long-term strategy that helps us not only survive but thrive in a constantly changing business environment. <a href="https://howdo.com/masterclass/mindset/resilience/" target="_blank">[3]</a></p>\n    <ul class="citations">\n        <li class="citation"><a href="https://howdo.com/masterclass/mindset/resilience/" target="_blank">[1]</a> <q>You face challenges every day. A sudden market shift, a product failure, a critical decision that didn\'t pan out as expected. These aren\'t mere obstacles; they\'re opp

In [228]:
from IPython.display import HTML

In [229]:
# Render the answer string as HTML in the cell output.
# NOTE(review): the markup is produced by the LLM and rendered as-is.
HTML(answer)