## Installing Dependencies

In [None]:
!pip install langchain-openai unstructured==0.7.12 pinecone-client openai tiktoken langchain clean-text langchain-pinecone

Collecting langchain-openai
  Downloading langchain_openai-0.1.7-py3-none-any.whl (34 kB)
Collecting unstructured==0.7.12
  Downloading unstructured-0.7.12-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pinecone-client
  Downloading pinecone_client-4.1.0-py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.5/215.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-1.30.1-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading

## Importing Dependencies

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredURLLoader
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.vectorstores.pinecone import Pinecone
from langchain.chains import RetrievalQA
from langchain import OpenAI
from langchain_openai import ChatOpenAI
from cleantext import clean
import os
import nltk
import pinecone
import openai



## User Input for URLs to build the RAG agent

In [None]:
# Ask how many links to read, then collect them one per line in entry order.
urls = []
n_weblinks = int(input("How many web links you want the RAG agent to refer for response generation & insights? Enter here: "))
print("Enter your links below: ")
urls.extend(input() for _ in range(n_weblinks))

How many web links you want the RAG agent to refer for response generation & insights? Enter here: 1
Enter your links below: 
https://adasci.org/


## Loading URLs through LangChain's UnstructuredURLLoader

In [None]:
# Fetch every user-supplied URL and parse it into LangChain Document objects.
# The result gets its own name so that `urls` keeps holding the URL strings:
# rebinding `urls` to the loaded Documents would make this cell fail on a
# re-run, because the loader would then receive Documents instead of URLs.
loader = UnstructuredURLLoader(urls=urls)
loaded_docs = loader.load()

# Collect every loaded Document in one list for the text splitter.
documents = list(loaded_docs)

In [None]:
# Break the loaded documents into overlapping chunks for embedding.
splitter_options = {"chunk_size": 500, "chunk_overlap": 20}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
texts = text_splitter.split_documents(documents)
texts

[Document(page_content='Skip to content\n\nUpskill your Team on Generative AI. Start here >\n\nMemberships\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tClose Memberships\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tOpen Memberships\n\nAccreditations\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tClose Accreditations\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tOpen Accreditations\n\nContinuous Learning\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tClose Continuous Learning\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tOpen Continuous Learning\n\nCorporate Trainings\n\nContact\n\nIndividual Membership', metadata={'source': 'https://adasci.org/'}),
 Document(page_content='Join the world’s leading Data Science professional community. You can access both General & Premium Memberships.\n\nLearn More\n\nCorporate Membership\n\nAny corporate, organization

## Cleaning URL Content

In [None]:
def _clean_chunk(raw_text):
    """Normalize one chunk's text with the notebook's clean-text settings."""
    return clean(
        text=raw_text,
        fix_unicode=True,
        to_ascii=True,
        lower=True,
        no_line_breaks=False,
        no_urls=False,
        no_emails=False,
        no_phone_numbers=False,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=False,
        no_punct=False,
        replace_with_punct="",
        replace_with_url="This is a URL",
        replace_with_email="Email",
        replace_with_phone_number="",
        replace_with_number="123",
        replace_with_digit="0",
        replace_with_currency_symbol="$",
        lang="en",
    )


# `texts` is a list of Document objects. Handing the list itself to clean()
# cleans the repr of the whole list (brackets, "page_content=", metadata and
# all), so each chunk's page_content is cleaned on its own instead.
clean_chunks = [_clean_chunk(doc.page_content) for doc in texts]

# One string for display, cleaned chunks separated by blank lines.
clean_url_text = "\n\n".join(clean_chunks)
clean_url_text

'[document(page_content=\'skip to content\nupskill your team on generative ai. start here >\nmemberships\nclose memberships\nopen memberships\naccreditations\nclose accreditations\nopen accreditations\ncontinuous learning\nclose continuous learning\nopen continuous learning\ncorporate trainings\ncontact\nindividual membership\', metadata={\'source\': \'https://adasci.org/\'}), document(page_content=\'join the world\'s leading data science professional community. you can access both general & premium memberships.\nlearn more\ncorporate membership\nany corporate, organization or academic institution having common interests in the ai field can become a member of adasci.\nlearn more\nchartered data scientist™️\nthe chartered data scientist (cds) credential gives a strong understanding of advanced data science profession and in-depth, applied analytics skills.\nlearn more\', metadata={\'source\': \'https://adasci.org/\'}), document(page_content=\'learn more\ncertified data scientist - assoc

## OpenAI API Key Setting



In [None]:
# Read the OpenAI key from Colab's secret store and publish it through the
# environment variable that the OpenAI / LangChain clients look up.
from google.colab import userdata

os.environ.update(OPENAI_API_KEY=userdata.get("OPENAI_APIKEY"))

## Pinecone API Key and Index Setup

In [None]:
# Pinecone client setup.
# NOTE: this import rebinds the name `Pinecone`, which the import cell earlier
# bound to the LangChain vector-store class of the same name.
from pinecone import Pinecone

# Pull the key from Colab's secret store, mirror it into the environment,
# and hand the same value to the client.
api_key = userdata.get("PINECONE_API")
os.environ["PINECONE_API_KEY"] = api_key

pc = Pinecone(api_key=api_key)

In [None]:
from pinecone import ServerlessSpec

# Deployment target for the serverless index; an unset or empty environment
# variable falls back to the default on the right.
cloud = os.getenv("PINECONE_CLOUD") or "aws"
region = os.getenv("PINECONE_REGION") or "us-east-1"

spec = ServerlessSpec(region=region, cloud=cloud)

In [None]:
import time

index_name = "myindex"

# Start from a clean slate: drop an index left over from a previous run.
existing_indexes = pc.list_indexes().names()
if index_name in existing_indexes:
    pc.delete_index(index_name)

# Create the index afresh.
pc.create_index(
    index_name,
    spec=spec,
    metric="dotproduct",
    dimension=1536,  # dimensionality of text-embedding-ada-002
)

# Poll once per second until Pinecone reports the index as ready.
while True:
    if pc.describe_index(index_name).status["ready"]:
        break
    time.sleep(1)

In [None]:
# Open a handle on the index and show its current statistics.
index = pc.Index(index_name)
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

# Embedding model plus a LangChain vector-store wrapper over the Pinecone index.
embeddings = OpenAIEmbeddings()
vectorstore = PineconeVectorStore(embedding=embeddings, index_name=index_name)


In [None]:
# Embed the split chunks and write them into the index named by `index_name`.
vectorstore_from_docs = PineconeVectorStore.from_documents(
    texts,
    embedding=embeddings,
    index_name=index_name,
)

In [None]:
# Sanity-check retrieval: fetch the chunks most similar to a sample question.
query = "What is AdaSci?"
similar_docs = vectorstore.similarity_search(query)
similar_docs

[Document(page_content='Join the world’s leading Data Science professional community. You can access both General & Premium Memberships.\n\nLearn More\n\nCorporate Membership\n\nAny corporate, organization or academic institution having common interests in the AI field can become a member of ADaSci.\n\nLearn More\n\nChartered Data Scientist™\n\nThe Chartered Data Scientist (CDS) credential gives a strong understanding of advanced data science profession and in-depth, applied analytics skills.\n\nLearn More', metadata={'source': 'https://adasci.org/'}),
 Document(page_content='The Generative AI Talent Gap: How Businesses Can Cultivate Their Own Experts\n\nHow to bridge the Generative AI talent gap through upskilling and reskilling initiatives?\n\nADaSci Announces the 4th Edition of Deep Learning DevCon (DLDC) 2024\n\nDive into the world of Generative AI and LLMs at DLDC 2024, the premier conference for cutting-edge AI research\n\nGenpact Launches SkyDive Global Campus Academy 2024 with 

## Model Building and Execution

In [None]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Chat model used to write the answers; temperature 0.0 as in the original setup.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.0,
    openai_api_key=userdata.get("OPENAI_APIKEY"),
)

# "stuff" chain over the Pinecone-backed retriever.
retriever = vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)
qa.run(query)

'The primary goal of ADaSci is to advance AI expertise and provide global recognition as a certified professional in the field of data science and artificial intelligence. They offer memberships, accreditations, and opportunities for upskilling and reskilling in AI-related areas.'

In [None]:
# Instruction text describing the role the assistant should play.
initial_prompt = (
    "You are a researcher who is going to search the web links, "
    "summarize them and share insights as asked"
)

In [None]:
query = "What is the primary goal of AdaSci?"

# RetrievalQA only consumes the "query" input key, so an extra "prompt" entry
# in the input dict never reaches the model. For the instruction in
# `initial_prompt` to take effect it has to be part of the prompt template of
# the "stuff" chain, so a dedicated chain is built here.
from langchain.prompts import PromptTemplate

# Escape braces so the instruction text is treated literally by the template.
persona = initial_prompt.replace("{", "{{").replace("}", "}}")
qa_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        persona
        + "\n\nUse the following pieces of context to answer the question at the end. "
        "If you don't know the answer, just say that you don't know.\n\n"
        "{context}\n\n"
        "Question: {question}\n"
        "Helpful Answer:"
    ),
)
qa_with_prompt = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": qa_prompt},
)

# invoke() is the non-deprecated replacement for calling the chain object
# directly; the answer is still returned under the "result" key.
result = qa_with_prompt.invoke({"query": query})

  warn_deprecated(


In [None]:
# Show only the generated answer from the chain's output dict.
answer = result['result']
print(answer)

The primary goal of ADaSci is to advance AI expertise and provide global recognition as a certified professional in the field of data science and artificial intelligence. They offer memberships, accreditations, and opportunities for upskilling and reskilling in AI-related areas.
