## Installing Dependencies

In [23]:
!pip install langchain-openai unstructured==0.7.12 pinecone-client openai tiktoken langchain clean-text langchain-pinecone langchain-community

## Importing Dependencies

In [24]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredURLLoader
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.vectorstores.pinecone import Pinecone
from langchain.chains import RetrievalQA
from langchain import OpenAI
from langchain_openai import ChatOpenAI
from cleantext import clean
import os
import nltk
import pinecone
import openai

## User Input for URLs to build the RAG agent

In [25]:
urls = []
n_weblinks = int(input("How many web links you want the RAG agent to refer for response generation & insights? Enter here: "))
print("Enter your links below: ")
for i in range(0, n_weblinks):
  inp = input()
  # appending the element in list
  urls.append(inp)

How many web links you want the RAG agent to refer for response generation & insights? Enter here: 1
Enter your links below: 
https://adasci.org


## Loading URL through LangChain's UnstructuredURLLoader

In [26]:
loader = UnstructuredURLLoader(urls=urls)
urls = loader.load()

In [27]:
# Merging urls into a single list
documents = []
documents.extend(urls)

In [28]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
texts = text_splitter.split_documents(documents)
texts

[Document(metadata={'source': 'https://adasci.org'}, page_content='Skip to content\n\nMemberships\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tClose Memberships\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tOpen Memberships\n\nAccreditations\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tClose Accreditations\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tOpen Accreditations\n\nContinuous Learning\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tClose Continuous Learning\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tOpen Continuous Learning\n\nCorporate Trainings\n\nContact\n\nIndividual Membership'),
 Document(metadata={'source': 'https://adasci.org'}, page_content='Join the world’s leading Data Science professional community. You can access both General & Premium Memberships.\n\nLearn More\n\nCorporate Membership\n\nAny corporate, organization or academ

## Cleaning URL Content

In [29]:
clean_url_text = clean(text=texts,
            fix_unicode=True,
            to_ascii=True,
            lower=True,
            no_line_breaks=False,
            no_urls=False,
            no_emails=False,
            no_phone_numbers=False,
            no_numbers=False,
            no_digits=False,
            no_currency_symbols=False,
            no_punct=False,
            replace_with_punct="",
            replace_with_url="This is a URL",
            replace_with_email="Email",
            replace_with_phone_number="",
            replace_with_number="123",
            replace_with_digit="0",
            replace_with_currency_symbol="$",
            lang="en"
            )
clean_url_text

'[document(metadata={\'source\': \'https://adasci.org\'}, page_content=\'skip to content\nmemberships\nclose memberships\nopen memberships\naccreditations\nclose accreditations\nopen accreditations\ncontinuous learning\nclose continuous learning\nopen continuous learning\ncorporate trainings\ncontact\nindividual membership\'), document(metadata={\'source\': \'https://adasci.org\'}, page_content=\'join the world\'s leading data science professional community. you can access both general & premium memberships.\nlearn more\ncorporate membership\nany corporate, organization or academic institution having common interests in the ai field can become a member of adasci.\nlearn more\nchartered data scientist™️\nthe chartered data scientist (cds) credential gives a strong understanding of advanced data science profession and in-depth, applied analytics skills.\nlearn more\'), document(metadata={\'source\': \'https://adasci.org\'}, page_content=\'learn more\ncertified data scientist - associate 

## OpenAI API Key Setting



In [30]:
# Set the OpenAI API key as an environment variable
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_APIKEY")

## Pinecone API Setting

In [31]:
#Set the pinecone key
from pinecone import Pinecone
os.environ["PINECONE_API_KEY"] = userdata.get("PINECONE_API")

api_key = os.getenv("PINECONE_API_KEY")

# configure client
pc = Pinecone(api_key=api_key)

In [33]:
from pinecone import ServerlessSpec

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)

In [34]:
import time
index_name = "myindex"

if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# we create a new index
pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of text-embedding-ada-002
        metric='dotproduct',
        spec=spec
    )

# wait for index to be initialized
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

In [35]:
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [36]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)


In [37]:
vectorstore_from_docs = PineconeVectorStore.from_documents(
        texts,
        index_name=index_name,
        embedding=embeddings
    )

In [38]:
query = "What is AdaSci?"
vectorstore.similarity_search(query)

[Document(metadata={'source': 'https://adasci.org'}, page_content='Join the world’s leading Data Science professional community. You can access both General & Premium Memberships.\n\nLearn More\n\nCorporate Membership\n\nAny corporate, organization or academic institution having common interests in the AI field can become a member of ADaSci.\n\nLearn More\n\nChartered Data Scientist™\n\nThe Chartered Data Scientist (CDS) credential gives a strong understanding of advanced data science profession and in-depth, applied analytics skills.\n\nLearn More'),
 Document(metadata={'source': 'https://adasci.org'}, page_content='Shaping the future of AI talent, Association of Data Scientists accredits and elevates professionals with recognized certifications and transformative corporate training.\n\nJoin now to advance your AI expertise and achieve global recognition as a certified professional!\n\nBECOME A MEMBER\n\nSTART LEARNING AI\n\nOur Accreditations\n\nGet global recognition for AI skills\n

## Model Building and Execution

In [39]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
# completion llm
llm = ChatOpenAI(
    openai_api_key=userdata.get("OPENAI_APIKEY"),
    model_name='gpt-3.5-turbo',
    temperature=0.0
)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)
qa.run(query)

'ADaSci stands for the Association of Data Scientists. It is an organization that offers memberships, accreditations, and certifications in the field of data science and artificial intelligence. They provide training, exams, and resources for professionals looking to advance their skills in AI.'

In [40]:
initial_prompt = "You are a researcher who is going to search the web links, summarize them and share insights as asked"

In [41]:
query = "What is the primary goal of AdaSci?"
result = qa({"query": query, "prompt": initial_prompt})

In [42]:
print(result['result'])

The primary goal of AdaSci is to accredit and elevate professionals in the field of data science by providing recognized certifications and transformative corporate training. They aim to advance AI expertise and achieve global recognition for certified professionals.
