## Extracting the links from a web page by web scraping

In [1]:
import os

import requests
from bs4 import BeautifulSoup

In [2]:

# Define the URL to scrape
url = "https://www.rguktrkv.ac.in/Departments.php?view=EC"

# Set a user-agent to mimic a request from a browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Send a GET request to the website with headers.
# The timeout (in seconds) makes the cell fail fast with
# requests.exceptions.Timeout instead of hanging forever when the site
# does not answer; requests applies no timeout by default.
response = requests.get(url, headers=headers, timeout=30)
links_list = []

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the specific div containing the links
    div = soup.find('div', class_='row no-margin no-padding')
    if div:
        # Find the ul with class 'MenuBarVertical' within the div
        ul = div.find('ul', class_='MenuBarVertical')
        if ul:
            # Collect the href of every anchor that actually has one
            links = ul.find_all('a', href=True)
            links_list.extend(anchor['href'] for anchor in links)
        else:
            print("The 'ul' with class 'MenuBarVertical' was not found.")
    else:
        print("The 'div' with class 'row no-margin no-padding' was not found.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

# Always add the department page itself as the last entry of the list
links_list.append(url)

# Output the list of links

In [3]:
links_list

['Departments.php?view=EC&staff=TS',
 'Departments.php?view=EC&staff=NTS',
 'Syllabus.php?view=ECE',
 '#',
 '#lab',
 '#contact',
 'https://www.rguktrkv.ac.in/Departments.php?view=EC']

In [4]:
# Site root that is prepended to the relative hrefs scraped from the page
prefix = "https://www.rguktrkv.ac.in/"

In [5]:
# Turn every relative href into an absolute URL by prepending the site root.
# The check looks at the start of the string (scheme), not at any occurrence
# of "https": a substring test would wrongly prefix "http://..." links and
# would skip relative links that merely contain "https" in a query string.
for i, link in enumerate(links_list):
    if not link.lower().startswith(("http://", "https://")):
        links_list[i] = prefix + link
    

In [6]:
 links_list

['https://www.rguktrkv.ac.in/Departments.php?view=EC&staff=TS',
 'https://www.rguktrkv.ac.in/Departments.php?view=EC&staff=NTS',
 'https://www.rguktrkv.ac.in/Syllabus.php?view=ECE',
 'https://www.rguktrkv.ac.in/#',
 'https://www.rguktrkv.ac.in/#lab',
 'https://www.rguktrkv.ac.in/#contact',
 'https://www.rguktrkv.ac.in/Departments.php?view=EC']

In [7]:
## Open each link and keep only the ones you want

In [8]:
# Drop the entries at positions 4, 5 and 6 (in place). The indices are
# positional, so they depend on the order in which the links were scraped.
# NOTE(review): with the list shown above this removes '#lab', '#contact' and
# the department page URL that was appended last, while the bare '#' link at
# position 3 is kept - confirm this is the intended selection.
links_list[4:7] = []

In [9]:
links_list

['https://www.rguktrkv.ac.in/Departments.php?view=EC&staff=TS',
 'https://www.rguktrkv.ac.in/Departments.php?view=EC&staff=NTS',
 'https://www.rguktrkv.ac.in/Syllabus.php?view=ECE',
 'https://www.rguktrkv.ac.in/#']

## Loading the data from the links using a document loader (WebBaseLoader)

In [10]:
from langchain_community.document_loaders import WebBaseLoader
# file_paths is another name for the same list object as links_list (no copy is made)
file_paths = links_list
# Build a loader over all selected URLs; nothing is fetched until load() is called below
loader = WebBaseLoader(file_paths)

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [11]:
# load() reads every URL given to the loader and returns the full result in one
# go, so the complete content of all pages is held in memory at once.
webdata = loader.load()

In [12]:
print(webdata)

[Document(metadata={'source': 'https://www.rguktrkv.ac.in/Departments.php?view=EC&staff=TS', 'title': 'Electronics and Communication Engg - RK Valley :: RGUKT-AP', 'description': '', 'language': ''}, page_content="\n\n\n\n\n\n\n\n\n\n\nElectronics and Communication Engg - RK Valley :: RGUKT-AP\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nRajiv Gandhi University of Knowledge Technologies-Andhra Pradesh\nRK Valley Institute\n\n(Constituted under the A.P Govt. Act 18 of 2008 and recognized as per Section 2(f), 12(B) of UGC Act, 1956) \nAccredited by 'NAAC' with 'B+' Grade\n\n\n\n\n\n\n\n\n\n\n\nToggle navigation\n\n\n\n\n\n\n\n\n\n\n\nInstitute\r\n\t\t\t\t\t\t\t  \n\n\nAbout RGUKT\nVision and Mission\nBest Practices\nOrganization Chart\nStrategy Document\nGoverning Council\nAnnual Report\nConvocation\nOmbudsperson\nNAAC Certificate\nNIRF\nUGC Status (12B)\nUGC Status (2F)\nUGC-NAD/ABC Cell\nAICTE Mandatory Disclosure\n\n\n\n\n\nAcademics\r\n\t\t\t\t\t\t\t  \n\n\nAcademic Audit Reports\n\nMinu

In [14]:
# This cell used page_contents before any cell defines it (the only assignment
# is further down, after overalldata is built), so it failed with NameError on
# Restart & Run All. Build the list here from the web documents loaded above.
# NOTE(review): presumably the deleted cell In [13] did the same - confirm.
page_contents = [doc.page_content for doc in webdata]
page_contents

["\n\n\n\n\n\n\n\n\n\n\nElectronics and Communication Engg - RK Valley :: RGUKT-AP\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nRajiv Gandhi University of Knowledge Technologies-Andhra Pradesh\nRK Valley Institute\n\n(Constituted under the A.P Govt. Act 18 of 2008 and recognized as per Section 2(f), 12(B) of UGC Act, 1956) \nAccredited by 'NAAC' with 'B+' Grade\n\n\n\n\n\n\n\n\n\n\n\nToggle navigation\n\n\n\n\n\n\n\n\n\n\n\nInstitute\r\n\t\t\t\t\t\t\t  \n\n\nAbout RGUKT\nVision and Mission\nBest Practices\nOrganization Chart\nStrategy Document\nGoverning Council\nAnnual Report\nConvocation\nOmbudsperson\nNAAC Certificate\nNIRF\nUGC Status (12B)\nUGC Status (2F)\nUGC-NAD/ABC Cell\nAICTE Mandatory Disclosure\n\n\n\n\n\nAcademics\r\n\t\t\t\t\t\t\t  \n\n\nAcademic Audit Reports\n\nMinutes of Academic Council Meetings\nFaculty Ethics\nAcademic Programmes\nAcademic Regulations\nAcademic Calendar\nCurricula\nDepartments\nTime Table\nExaminations\n\n\n\nAdministration\r\n\t\t\t\t\t\t\t  \n\n\nChan

## Loading the data from a PDF using a document loader (PyPDFLoader)

In [15]:
from langchain_community.document_loaders import PyPDFLoader

# Point the loader at the syllabus PDF.
# NOTE(review): this is an absolute path on the author's machine; adjust it before running elsewhere.
loader = PyPDFLoader("C:\\Users\\Charan Akula\\Desktop\\Work Space\\scrapping\\ECE SYLLABUS.pdf")

# lazy_load() is an iterator that yields the document piece by piece instead of
# reading everything up front, which is friendlier to memory for large files.
# All yielded documents are collected here so they can be sliced by position.
pages = list(loader.lazy_load())

# Python slices are 0-based and exclude the end index:
ece_syllabus = pages[10:18]  # PDF pages 11 to 18
minor_ml = pages[249:252]    # PDF pages 250 to 252


In [16]:
#print(minor_ml)

In [17]:
# Combine the two page ranges extracted from the PDF into a single new list
pdf_data = [*ece_syllabus, *minor_ml]

In [18]:
print(pdf_data)

[Document(metadata={'source': 'C:\\Users\\Charan Akula\\Desktop\\Work Space\\scrapping\\ECE SYLLABUS.pdf', 'page': 10}, page_content='Rajiv Gandhi University of Knowledge Technologies - AP \nDepartment of Electronics & Communications Engineering \n \n \n11 \n \nChapter 2 \nSemester-Wise Structure of Curriculum \nMandatory Induction Program \n \n \n \n \nENGINEERING FIRST YEAR: SEMESTER-1 \nSLNO CATEGORY COURSE \nCODE SUBJECT NAME L-T-P Credits \n1 BSC 20MA1101 Differential Equations and \nMultivariable calculus 3-1-0 4 \n2 BSC 20PY1101 Engineering Physics 3-1-0 4 \n3 BSC 20PY1181 Engineering Physics Lab 0-0-3 1.5 \n4 PCC 20EC1203 Signals and Systems 3-1-0 \n \n4 \n5 ESC 20EE1110 Electrical Technology 3-1-0 4 \n6 ESC 20EE1180 Electrical Technology Lab 0-0-3 1.5 \n7 ESC 20EC1102 Introduction to Latest Technical \nAdvancements 1-0-0 1 \n8 ESC 20CS1108 Programming & Data Structures 3-0-0 3 \n9 ESC 20CS1188 Programming & Data Structures \nLab 0-0-3 1.5 \nTotal Credits 24.5 \nTotal contact h

## Splitting the data into chunks 

In [19]:
# Full corpus: the web documents first, followed by the selected PDF pages
overalldata = [*webdata, *pdf_data]

In [20]:
# Plain text of every document in the corpus, in corpus order (for inspection)
page_contents = [document.page_content for document in overalldata]

In [21]:
print(page_contents)

["\n\n\n\n\n\n\n\n\n\n\nElectronics and Communication Engg - RK Valley :: RGUKT-AP\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nRajiv Gandhi University of Knowledge Technologies-Andhra Pradesh\nRK Valley Institute\n\n(Constituted under the A.P Govt. Act 18 of 2008 and recognized as per Section 2(f), 12(B) of UGC Act, 1956) \nAccredited by 'NAAC' with 'B+' Grade\n\n\n\n\n\n\n\n\n\n\n\nToggle navigation\n\n\n\n\n\n\n\n\n\n\n\nInstitute\r\n\t\t\t\t\t\t\t  \n\n\nAbout RGUKT\nVision and Mission\nBest Practices\nOrganization Chart\nStrategy Document\nGoverning Council\nAnnual Report\nConvocation\nOmbudsperson\nNAAC Certificate\nNIRF\nUGC Status (12B)\nUGC Status (2F)\nUGC-NAD/ABC Cell\nAICTE Mandatory Disclosure\n\n\n\n\n\nAcademics\r\n\t\t\t\t\t\t\t  \n\n\nAcademic Audit Reports\n\nMinutes of Academic Council Meetings\nFaculty Ethics\nAcademic Programmes\nAcademic Regulations\nAcademic Calendar\nCurricula\nDepartments\nTime Table\nExaminations\n\n\n\nAdministration\r\n\t\t\t\t\t\t\t  \n\n\nChan

In [43]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [44]:
# chunk_size    - upper bound on the length of each chunk (1800 characters).
# chunk_overlap - number of characters shared between the end of one chunk and
#                 the beginning of the next (10 characters here).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1800, chunk_overlap=10)

In [45]:
# Split every document of the corpus into chunks using the splitter configured above
chunks = text_splitter.split_documents(overalldata)

In [46]:
len(chunks)

23

In [47]:
chunks[0]

Document(metadata={'source': 'https://www.rguktrkv.ac.in/Departments.php?view=EC&staff=TS', 'title': 'Electronics and Communication Engg - RK Valley :: RGUKT-AP', 'description': '', 'language': ''}, page_content="Electronics and Communication Engg - RK Valley :: RGUKT-AP\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nRajiv Gandhi University of Knowledge Technologies-Andhra Pradesh\nRK Valley Institute\n\n(Constituted under the A.P Govt. Act 18 of 2008 and recognized as per Section 2(f), 12(B) of UGC Act, 1956) \nAccredited by 'NAAC' with 'B+' Grade\n\n\n\n\n\n\n\n\n\n\n\nToggle navigation\n\n\n\n\n\n\n\n\n\n\n\nInstitute\r\n\t\t\t\t\t\t\t  \n\n\nAbout RGUKT\nVision and Mission\nBest Practices\nOrganization Chart\nStrategy Document\nGoverning Council\nAnnual Report\nConvocation\nOmbudsperson\nNAAC Certificate\nNIRF\nUGC Status (12B)\nUGC Status (2F)\nUGC-NAD/ABC Cell\nAICTE Mandatory Disclosure\n\n\n\n\n\nAcademics\r\n\t\t\t\t\t\t\t  \n\n\nAcademic Audit Reports\n\nMinutes of Academic Council

## Embedding (converting the text into numeric vectors)

In [48]:
## First we define the embedding model; the vector store below uses it to embed the chunks

In [49]:
from langchain_cohere import CohereEmbeddings
# Read the Cohere API key from the environment instead of hard-coding it in the
# notebook: a key written into a cell is leaked to everyone the notebook is
# shared with. A key that has already been committed should be revoked and
# replaced in the Cohere dashboard.
cohere_api_key = os.environ.get("COHERE_API_KEY")
if not cohere_api_key:
    raise RuntimeError("Set the COHERE_API_KEY environment variable before running this cell.")

# Embedding model that is used to embed the chunks for the vector store
embeddings = CohereEmbeddings(
    model="embed-english-v3.0",
    cohere_api_key=cohere_api_key,
)

In [50]:
embeddings

CohereEmbeddings(client=<cohere.client.Client object at 0x000002196B8A92D0>, async_client=<cohere.client.AsyncClient object at 0x000002196B985390>, model='embed-english-v3.0', truncate=None, cohere_api_key=SecretStr('**********'), embedding_types=['float'], max_retries=3, request_timeout=None, user_agent='langchain:partner', base_url=None)

## Vector store (we embed the chunks and store them in a vector database)

In [51]:
from langchain_community.vectorstores import FAISS
# Embed every chunk with the embedding model and index the vectors in FAISS
db = FAISS.from_documents(documents=chunks, embedding=embeddings)

## Retriever

In [52]:
## Retrieves the chunks related to the query; the model uses them to generate its answer

In [53]:
# Similarity search over the vector store, returning the 3 best-matching chunks per query
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [54]:
retriever.invoke("1st year ece syllabus")

[Document(metadata={'source': 'C:\\Users\\Charan Akula\\Desktop\\Work Space\\scrapping\\ECE SYLLABUS.pdf', 'page': 10}, page_content='Rajiv Gandhi University of Knowledge Technologies - AP \nDepartment of Electronics & Communications Engineering \n \n \n11 \n \nChapter 2 \nSemester-Wise Structure of Curriculum \nMandatory Induction Program \n \n \n \n \nENGINEERING FIRST YEAR: SEMESTER-1 \nSLNO CATEGORY COURSE \nCODE SUBJECT NAME L-T-P Credits \n1 BSC 20MA1101 Differential Equations and \nMultivariable calculus 3-1-0 4 \n2 BSC 20PY1101 Engineering Physics 3-1-0 4 \n3 BSC 20PY1181 Engineering Physics Lab 0-0-3 1.5 \n4 PCC 20EC1203 Signals and Systems 3-1-0 \n \n4 \n5 ESC 20EE1110 Electrical Technology 3-1-0 4 \n6 ESC 20EE1180 Electrical Technology Lab 0-0-3 1.5 \n7 ESC 20EC1102 Introduction to Latest Technical \nAdvancements 1-0-0 1 \n8 ESC 20CS1108 Programming & Data Structures 3-0-0 3 \n9 ESC 20CS1188 Programming & Data Structures \nLab 0-0-3 1.5 \nTotal Credits 24.5 \nTotal contact h

In [55]:
## using the above data the model will make predictions

## Creating the model

In [56]:
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [57]:
# Read the Groq API key (obtained from the Groq website) from the environment
# instead of hard-coding it in the notebook, where it is leaked to everyone the
# notebook is shared with. A key that has already been committed should be
# revoked and replaced.
Groq_api_key = os.environ.get("GROQ_API_KEY")
if not Groq_api_key:
    raise RuntimeError("Set the GROQ_API_KEY environment variable before running this cell.")

In [58]:
# Chat model served by Groq; max_tokens=None leaves the response length unset here
model = ChatGroq(
    model_name="llama-3.3-70b-versatile",
    groq_api_key=Groq_api_key,
    temperature=0.4,
    max_tokens=None,
)


In [59]:
# System prompt for the ECE assistant. Each instruction sits on its own line:
# the previous version relied on implicit concatenation of adjacent string
# literals with no separator, so the sentences ran into each other
# ("...RK ValleyYou are trained...", "...R200037Use the data...").
# The "{context}" placeholder is filled automatically with the retrieved chunks.
system_prompt = "\n".join(
    [
        "You are bot specially designed for answering the queries related to ECE department in RGUKT,RK Valley",
        "You are trained by a ece , R20 student Charan I'D number is R200037",
        "Use the data obtained from only  the retrieved context and provide the appropraite result",
        "If you dont know answer to the question say that sorry i dont know ",
        "{context}",
    ]
)

In [60]:
# Chat prompt: the system instructions (with the retrieved context), then the user's question.
# NOTE(review): the trailing empty "ai" message adds an empty assistant turn to
# every request - confirm it is needed.
prompt_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
    ("ai", ""),
]
template = ChatPromptTemplate.from_messages(prompt_messages)


In [61]:
from langchain.chains.combine_documents import create_stuff_documents_chain # This chain takes a list of documents and formats them all into a prompt, then passes that prompt to an LLM. 
from langchain.chains import create_retrieval_chain

# Step 2 of the pipeline: put the documents into the prompt and send that prompt to the LLM
question_answer_chain = create_stuff_documents_chain(model, template)

# Step 1 + step 2: first the chunks are retrieved, then they are combined with
# the prompt to get the response from the LLM
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [62]:
# Ask the chain a question about the department; the reply text is under the "answer" key
response = rag_chain.invoke({"input": "who is head  of the department ece"})

In [63]:
print(response["answer"])

The Head of the Department (HOD) of Electronics and Communication Engineering (ECE) is Mr. Y Arun Kumar Reddy. He is an Assistant Professor and can be reached at the email addresses hodece@rguktrkv.ac.in or yarunkumarreddy@rguktrkv.ac.in.


In [64]:
# Question outside the ECE scope, to see how the bot handles something it should not know
response2 = rag_chain.invoke({"input": "who is prime minister if india"})

In [65]:
print(response2["answer"])

Sorry, I don't know the current Prime Minister of India as my training data is limited to the context of RGUKT, RK Valley, and I don't have information about the current political leaders of India.


In [66]:
# Query the teaching staff and show the answer text
response3 = rag_chain.invoke({"input": "list out the teaching faculty in ece department"})
print(response3["answer"])

Here is the list of teaching faculty in the ECE department at RGUKT, RK Valley:

1. Mr. Y Arun Kumar Reddy - Assistant Professor (Head of the Department)
2. Mr. B. V. Sudhakar Reddy - Assistant Professor
3. Dr. SK Mahammad Rafi - Assistant Professor (Coordinator for Alumni)
4. Ms. G. Lakshmi Shireesha - Assistant Professor
5. Mr. K. Abdul Munaf - Assistant Professor
6. Mr. N Mohan Raju - Assistant Professor
7. Mrs. M. Anitha - Assistant Professor
8. Mr. P Janardhan Reddy - Assistant Professor
9. Mr. P. Siva Krishna - Assistant Professor
10. Mrs. V Lakshmi Prasanna - Assistant Professor
11. Mr. T Naresh - Assistant Professor
12. Mr. B Madhan Mohan - Assistant Professor
13. Mr. B Mohan Reddy - Assistant Professor
14. Mr. R. Pavan kumar - Assistant Professor
15. SHAIK RIAZUM - Assistant Professor
16. Mr. KRISHNAM HARINATHA REDDY - Assistant Professor
17. Mr. SAFARI BHASKAR RAO - Assistant Professor
18. S. Venkatesulu - Assistant Professor

There are 18 teaching faculty members in the ECE 

In [67]:
# Query the non-teaching staff and show the answer text
response4 = rag_chain.invoke({"input": "list out the non teaching faculty in ece department"})
print(response4["answer"])

Based on the provided data, the non-teaching faculty in the ECE department are:

1. Mr. Damodharareddy K - Lab Assistant
2. Mr. R Subba Rayudu - Lab Assistant
3. Mr. K.N.Koundinya Kumar - Lab Technician
4. Mr. Kuruva Venkateswarulu - Lab Technician

These individuals are part of the staff members in the ECE department, but they are not listed as faculty members (who are typically involved in teaching and research). Instead, they are involved in supporting roles such as lab assistance and technical support.


In [68]:
# Query the labs and show the answer text
response5 = rag_chain.invoke({"input": "list out the labs avliable in ece "})
print(response5["answer"])

Based on the provided context, the following labs are available in the ECE department:

1. Digital Logic Design Laboratory (20ECXX80)
2. Electronic Devices and Circuits Lab (20EC1281)
3. Computational Lab (20EC1285)
4. Object Oriented Programming Laboratory (20CS1289)
5. Internet of Things Lab (20EC2185)
6. Analog Electronic Circuits Lab (20EC2181)
7. Digital Logic Design Lab (20EC2182)
8. Digital Signal Processing Lab (20EC2183)

Additionally, the staff members listed have the following lab-related roles:

1. Mr. Damodharareddy K - Lab Assistant
2. Mr. R Subba Rayudu - Lab Assistant
3. Mr. K.N.Koundinya Kumar - Lab Technician
4. Mr. Kuruva Venkateswarulu - Lab Technician

Note that this list may not be exhaustive, as the context only provides a limited amount of information about the ECE department.


In [69]:
# Query the first-year, first-semester syllabus (taken from the PDF pages) and show the answer text
response6 = rag_chain.invoke({"input": "what is the 1st year 1st sem syllabus of ece"})
print(response6["answer"])

According to the provided context, the 1st year 1st sem syllabus of ECE in RGUKT, RK Valley is as follows:

1. BSC 20MA1101 - Differential Equations and Multivariable calculus (3-1-0, 4 credits)
2. BSC 20PY1101 - Engineering Physics (3-1-0, 4 credits)
3. BSC 20PY1181 - Engineering Physics Lab (0-0-3, 1.5 credits)
4. PCC 20EC1203 - Signals and Systems (3-1-0, credits not specified)
5. ESC 20EE1110 - Electrical Technology (3-1-0, 4 credits)
6. ESC 20EE1180 - Electrical Technology Lab (0-0-3, 1.5 credits)
7. ESC 20EC1102 - Introduction to Latest Technical Advancements (1-0-0, 1 credit)
8. ESC 20CS1108 - Programming & Data Structures (3-0-0, 3 credits)
9. ESC 20CS1188 - Programming & Data Structures Lab (0-0-3, 1.5 credits)

Total credits: 24.5
Total contact hours: 29 hours

Note: The credits for the course PCC 20EC1203 - Signals and Systems are not specified in the provided context.


In [70]:
# Query a single faculty member and show the answer text
response7 = rag_chain.invoke({"input": "tell me about mohammad rafi sir"})
print(response7["answer"])

Dr. SK Mahammad Rafi is an Assistant Professor in the Department of Electronics and Communication Engineering at RGUKT, RK Valley. 

Here are some details about him:

* Education: He has completed his M.Tech from JNTU Hyderabad and Ph.D from IIT Hyderabad.
* Email: rafi@rguktrkv.ac.in
* Responsibilities: He is the Coordinator for Alumni.

You can view his full profile on the RGUKT website for more information.


In [71]:
# Ask about the bot itself; the answer comes from the system prompt rather than the documents
response8 = rag_chain.invoke({"input": "who trained you"})
print(response8["answer"])

I was trained by Charan, an ECE student from the R20 batch, with the ID number R200037.


In [80]:
from langchain_core.load import dumps  # or dump for file-based serialization

# Serialize rag_chain to a JSON string with dumps() and write it to
# 'rag_chain_model2.json' in the current working directory. The file is opened
# in 'w' mode, so an existing file with that name is overwritten.
# NOTE(review): presumably parts of the chain that LangChain cannot serialize
# (for example the FAISS-backed retriever) are not fully captured in the JSON -
# verify that the saved file can actually be loaded back before relying on it.
with open('rag_chain_model2.json', 'w') as f:
    f.write(dumps(rag_chain))  # Serialize the chain and save it as a JSON file
