# Import Libraries
Importing libraries is the first step of any project.
We're going to import commonly used web scraping and data analysis libraries. Make sure all of them are installed.

In [1]:
import requests # for getting web contents
from bs4 import BeautifulSoup # for scraping web contents
import pandas as pd # for data analysis
from datetime import datetime # for datatime

# URL
For web scraping, we need the URL of each page we want to fetch.

In [2]:
# Scrape the NIRF university ranking table for every year listed below.
# Each year has its own page; the rows from all years are accumulated in `content`,
# and `headings` holds the header row(s) of the most recently scraped page.
years = [2017, 2018, 2019, 2020, 2021]

# Seconds to wait for the server before giving up, so that a stalled
# connection cannot hang the notebook indefinitely.
REQUEST_TIMEOUT = 30

# body rows collected across all years (one list of cell values per table row)
content = []

for year in years:
    URL = 'https://www.nirfindia.org/' + str(year) + '/UniversityRanking.html'

    # get web data; raise on an HTTP error status instead of parsing an error page
    page = requests.get(URL, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()

    # parse web data
    soup = BeautifulSoup(page.content, "html.parser")

    # find the table: the targeted one is taken to be the last <thead>/<tbody> on the page
    all_theads = soup.find_all('thead')
    all_tbodies = soup.find_all('tbody')
    if not all_theads or not all_tbodies:
        # explicit message instead of a bare IndexError from [-1] on an empty list
        raise ValueError('No ranking table found at ' + URL)

    # the table head carries the column names
    html_thead = all_theads[-1]

    # list to store all table headings (one entry per header row)
    headings = []

    # loop through the rows of the table head
    for tr in html_thead.find_all('tr'):
        # text of every <th>, with surrounding white space removed
        row = [cell.text.strip() for cell in tr.find_all(['th'])]
        # insert(-1, ...) places the value just before the last element,
        # so 'Year' ends up as the second-to-last column
        row.insert(-1, 'Year')
        headings.append(row)

    # print heading
    print(headings)

    # the table body carries the data rows
    html_tbody = all_tbodies[-1]

    # loop through the rows of the table body
    for tr in html_tbody.find_all('tr'):
        # text of every <th>/<td>, with surrounding white space removed
        row = [cell.text.strip() for cell in tr.find_all(['th', 'td'])]
        # same position as the 'Year' heading: just before the last element
        row.insert(-1, year)
        content.append(row)

[['Institute ID', 'Name', 'City', 'State', 'PR Score', 'PR Rank', 'Score', 'Year', 'Rank']]
[['Institute ID', 'Name', 'City', 'State', 'PR Score', 'PR Rank', 'Score', 'Year', 'Rank']]
[['Institute ID', 'Name', 'City', 'State', 'PR Score', 'PR Rank', 'Score', 'Year', 'Rank']]
[['Institute ID', 'Name', 'City', 'State', 'PR Score', 'PR Rank', 'Score', 'Year', 'Rank']]
[['Institute ID', 'Name', 'City', 'State', 'PR Score', 'PR Rank', 'Score', 'Year', 'Rank']]


In [3]:
# number of table rows collected across all scraped years
len(content)

500

In [4]:
# build a DataFrame from a copy of the scraped rows, using the first header row as column labels
column_names = headings[0]
data = pd.DataFrame(list(content), columns=column_names)

# Data Analysis

## Look at Data and Example Records

In [5]:
# display the DataFrame (the notebook shows a truncated preview for long frames)
data

Unnamed: 0,Institute ID,Name,City,State,PR Score,PR Rank,Score,Year,Rank
0,IR17-I-2-18243,Indian Institute of Science Bangalore,Bengaluru,Karnataka,83.33,1,83.28,2017,1
1,IR17-I-2-1-319,Jawaharlal Nehru University,New Delhi,Delhi,47.27,3,61.53,2017,2
2,IR17-I-2-18500,Banaras Hindu University,Varanasi,Uttar Pradesh,44.01,4,58.92,2017,3
3,IR17-I-2-30673,Jawaharlal Nehru Centre for Advanced Scientifi...,Bengaluru,Karnataka,6.92,55,58.25,2017,4
4,IR17-I-2-10326,Jadavpur University,Kolkata,West Bengal,28.81,9,57.32,2017,5
...,...,...,...,...,...,...,...,...,...
495,IR-O-U-0121,Goa University,Goa,Goa,20.01,115,38.96,2021,96
496,IR-O-U-0043,"Vignan's Foundation for Science, Technology an...",Guntur,Andhra Pradesh,18.00,142,38.92,2021,97
497,IR-O-U-0555,Graphic Era University,Dehradun,Uttarakhand,32.17,45,38.91,2021,98
498,IR-O-U-0223,"Jain university, Bangalore",Bengluru,Karnataka,25.52,76,38.89,2021,99


In [6]:
# preview the first rows of data (head() returns 5 rows by default)
data.head()

Unnamed: 0,Institute ID,Name,City,State,PR Score,PR Rank,Score,Year,Rank
0,IR17-I-2-18243,Indian Institute of Science Bangalore,Bengaluru,Karnataka,83.33,1,83.28,2017,1
1,IR17-I-2-1-319,Jawaharlal Nehru University,New Delhi,Delhi,47.27,3,61.53,2017,2
2,IR17-I-2-18500,Banaras Hindu University,Varanasi,Uttar Pradesh,44.01,4,58.92,2017,3
3,IR17-I-2-30673,Jawaharlal Nehru Centre for Advanced Scientifi...,Bengaluru,Karnataka,6.92,55,58.25,2017,4
4,IR17-I-2-10326,Jadavpur University,Kolkata,West Bengal,28.81,9,57.32,2017,5


## Descriptive Statistics

In [7]:
# descriptive statistics of the numeric columns: count, mean, std, min, 25%, 50%, 75%, max.
# Only 'Year' is summarised because every scraped column holds strings (object dtype).
data.describe()

Unnamed: 0,Year
count,500.0
mean,2019.0
std,1.41563
min,2017.0
25%,2018.0
50%,2019.0
75%,2020.0
max,2021.0


## Summary of data-type, columns, non-null values, memory usage.

In [8]:
# print a concise summary: index range, column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Institute ID  500 non-null    object
 1   Name          500 non-null    object
 2   City          500 non-null    object
 3   State         500 non-null    object
 4   PR Score      500 non-null    object
 5   PR Rank       500 non-null    object
 6   Score         500 non-null    object
 7   Year          500 non-null    int64 
 8   Rank          500 non-null    object
dtypes: int64(1), object(8)
memory usage: 35.3+ KB


## Column labels

In [9]:
# column labels of the DataFrame, in order
data.columns

Index(['Institute ID', 'Name', 'City', 'State', 'PR Score', 'PR Rank', 'Score',
       'Year', 'Rank'],
      dtype='object')

# Save Data into CSV

In [10]:
# write the combined rankings to a CSV file in the working directory, leaving out the row index
csv_path = 'IndianUniversityRankingFrom2017to2021.csv'
data.to_csv(csv_path, index=False)