## Setup

In [None]:
!pip install bs4 pandas matplotlib 
!pip install requests
!pip install seaborn

In [None]:
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

def _hidden_form_fields(response):
    """Return the hidden form inputs (names starting with ``__``) of a response as a name -> value dict."""
    soup = BeautifulSoup(response.content, 'html.parser')
    # .get() tolerates an input that has no value attribute instead of raising KeyError.
    return {tag['name']: tag.get('value', '') for tag in soup.select('input[name^=__]')}


def josaa_scrape(year, Round):
    """
    Download the JoSAA opening/closing rank archive table for one year and round.

    The form is replayed one field at a time: before every POST the hidden
    ``__*`` inputs of the previous response are copied into the form data,
    then a single selection is added and the accumulated form is submitted.
    The order is year, round, the remaining dropdowns, and finally the
    submit button.

    Parameters
    ----------
    year : str
        Value submitted for the year dropdown, e.g. "2018".
    Round : str
        Value submitted for the round dropdown, e.g. "1".

    Returns
    -------
    pandas.DataFrame
        The rows of the result grid with all-empty rows removed, plus a
        "Year" and a "Round" column holding the arguments exactly as passed.

    Raises
    ------
    requests.HTTPError
        If any request comes back with an error status.
    ValueError
        If the final response does not contain the result grid.

    Sample usage: df = josaa_scrape("2018", "1")
    df.info()
    """
    url = 'https://josaa.admissions.nic.in/applicant/seatmatrix/OpeningClosingRankArchieve.aspx'
    # url for 2022 is different and you have to parse that as well for 2022 data

    # Remaining selections, submitted one per request in this order (submit button last).
    params = {
        "ctl00$ContentPlaceHolder1$ddlInstype": "ALL",
        "ctl00$ContentPlaceHolder1$ddlInstitute": "ALL",
        "ctl00$ContentPlaceHolder1$ddlBranch": "ALL",
        "ctl00$ContentPlaceHolder1$ddlSeatType": "OPNO",
        "ctl00$ContentPlaceHolder1$btnSubmit": "Submit"
    }
    selections = [
        ("ctl00$ContentPlaceHolder1$ddlYear", year),
        ("ctl00$ContentPlaceHolder1$ddlroundno", Round),
        *params.items(),
    ]
    timeout = 60  # seconds per request, so a stalled server cannot hang the notebook forever

    with requests.Session() as s:
        R = s.get(url, timeout=timeout)
        R.raise_for_status()
        data = {}
        for key, value in selections:
            # Carry over the hidden fields of the latest response, then add one more selection.
            data.update(_hidden_form_fields(R))
            data[key] = value
            R = s.post(url, data=data, timeout=timeout)
            R.raise_for_status()

    table = BeautifulSoup(R.text, 'html.parser').find(id='ctl00_ContentPlaceHolder1_GridView1')
    if table is None:
        # Fail with a readable message instead of AttributeError on None.prettify().
        raise ValueError(f"No result table found for year={year!r}, round={Round!r}")

    # read_html expects a file-like object; passing literal HTML text is deprecated in recent pandas.
    df = pd.read_html(StringIO(table.prettify()))[0]

    return df.dropna(how="all").assign(Year=year, Round=Round)

## Read Data

In [None]:
years = ['2016', '2017', '2018', '2019', '2020', '2021']
rounds = ['1']

# One scrape per (year, round). The frames are collected in a list and
# concatenated once: DataFrame.append was removed in pandas 2.0, and growing a
# frame inside the loop re-copies all earlier rows on every iteration.
frames = []
for year in years:
    for Round in rounds:
        df_temp = josaa_scrape(year, Round)
        frames.append(df_temp)

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
df = pd.concat(frames) if frames else pd.DataFrame()


In [None]:
# Show the combined scrape results for all requested years and rounds.
df

## Simple preprocessing 
### Analyzing Gender-Neutral, OPEN seats from 2016-2021 for IITs only.

In [None]:
# Analyzing Gender Neutral, OPEN seats from 2016-2021 for IITs only.
IIT_PREFIX = 'Indian Institute of Technology'

# Work on a copy so the raw scrape held in `df` is left untouched.
df_open_IIT = df.copy()

# Drop the last 34 characters of every program name.
# NOTE(review): presumably this targets a trailing degree/duration suffix of that
# length; names whose suffix has a different length are cut unevenly (the DSAI
# list used later in this notebook contains 'Statistics and Data Scie') - confirm.
df_open_IIT['Academic Program Name'] = df_open_IIT['Academic Program Name'].map(lambda name: name[:-34])

# Row filters, all computed on the same frame so they can be combined directly.
from_iit = df_open_IIT['Institute'].map(lambda inst: inst[:len(IIT_PREFIX)] == IIT_PREFIX).astype(bool)
open_seat = df_open_IIT['Seat Type'] == 'OPEN'
# A missing Gender counts as 'Gender-Neutral'. Typically seen in 2017 as
# 'Female-only (including Supernumerary)' was not introduced.
gender_neutral = (
    df_open_IIT['Gender'].map(lambda g: str(g) == 'nan').astype(bool)
    | (df_open_IIT['Gender'] == 'Gender-Neutral')
)

# Only keep IITs, OPEN, Gender-Neutral
df_open_IIT = df_open_IIT[from_iit & open_seat & gender_neutral]

# Integer ranks and their midpoint
opening_rank = df_open_IIT['Opening Rank'].map(int)
closing_rank = df_open_IIT['Closing Rank'].map(int)

df_open_IIT = (
    df_open_IIT
    .assign(**{
        'Opening Rank': opening_rank,
        'Closing Rank': closing_rank,
        'Mean Rank': (opening_rank + closing_rank) / 2,
    })
    # These columns are constant after the filters above (or no longer needed).
    .drop(columns=['Quota', 'Seat Type', 'Gender'])
    .reset_index(drop=True)
)

df_open_IIT


In [None]:
# Program names (as they appear after the name trimming above) that count as
# data-science / AI programs.
DSAI = [
    'Data Science and Artificial Intelligence',
    'Artificial Intelligence and Data Science',
    'Data Science and Engineering',
    'Artificial Intelligence',
    'Statistics and Data Scie',
]

# Keep only those programs, ordered by institute, with a fresh 0..n-1 index.
df_open_IIT_dsai = (
    df_open_IIT[df_open_IIT['Academic Program Name'].isin(DSAI)]
    .sort_values(by=['Institute'])
    .reset_index(drop=True)
)
df_open_IIT_dsai

In [None]:
# Plot ORCR range for an institute
def plot_orcl(Institute, Year):
    """
    Bar chart of opening and closing ranks per program for one institute and year.

    Reads the module-level ``df_open_IIT`` frame.

    Parameters
    ----------
    Institute : str
        Exact value of the 'Institute' column to select.
    Year : str
        Exact value of the 'Year' column to select (the notebook stores years as strings).
    """
    # Build both conditions on the same frame. Filtering the already-filtered
    # subset with a mask taken from the full frame forces pandas to reindex the
    # mask and emits a "Boolean Series key will be reindexed" warning.
    selected = (df_open_IIT['Institute'] == Institute) & (df_open_IIT['Year'] == Year)
    dd = df_open_IIT[selected]

    # Closing ranks are drawn first and opening ranks on top, so the part of each
    # closing bar that stays visible is the opening-to-closing span
    # (assuming opening <= closing).
    plt.bar(dd['Academic Program Name'], dd['Closing Rank'].astype(int))
    plt.bar(dd['Academic Program Name'], dd['Opening Rank'].astype(int))
    plt.xticks(rotation=90)
    plt.ylabel('Rank')
    plt.legend(['Closing Rank', 'Opening Rank'])
    # f-string instead of '+' so a non-string Year does not raise TypeError here.
    plt.title(f'{Institute} {Year} ORCR Distribution')
    plt.show()

plot_orcl('Indian Institute of Technology Hyderabad', '2021')


In [None]:
def institute_trend(Institute):
    """
    Line plot of 'Mean Rank' against 'Year', one line per program of an institute.

    Reads the module-level ``df_open_IIT`` frame.

    Parameters
    ----------
    Institute : str
        Exact value of the 'Institute' column to select.
    """
    dd = df_open_IIT[df_open_IIT['Institute']==Institute]
    DPTS = dd['Academic Program Name'].unique()

    # NOTE: this sets the global default figure size, so it also applies to
    # figures created later in the notebook.
    plt.rcParams['figure.figsize'] = [10,10]
    for dpt in DPTS:
        # Select the program's rows once instead of once per plotted column.
        rows = dd[dd['Academic Program Name']==dpt]
        plt.plot(rows['Year'], rows['Mean Rank'], '--o')
    plt.legend(DPTS, bbox_to_anchor=(1,1), loc="upper left")
    # Label the figure so consecutive calls for different institutes can be told apart.
    plt.xlabel('Year')
    plt.ylabel('Mean Rank')
    plt.title(f'{Institute} Mean Rank Trend')
    plt.show()

institute_trend('Indian Institute of Technology Bombay')
institute_trend('Indian Institute of Technology Guwahati')
institute_trend('Indian Institute of Technology Dharwad')

## Analyzing programs whose mean rank is within the first 1000

In [None]:
def plot_range(Institute, Year, max_mean_rank=1000):
    """
    Point plot of the opening and closing ranks of an institute's top programs in one year.

    Reads the module-level ``df_open_IIT`` frame.

    Parameters
    ----------
    Institute : str
        Exact value of the 'Institute' column to select.
    Year : str
        Exact value of the 'Year' column to select (the notebook stores years as strings).
    max_mean_rank : int or float, optional
        Only programs whose 'Mean Rank' is at most this value are plotted.
        Defaults to 1000.
    """
    df_1000 = df_open_IIT[
        (df_open_IIT['Mean Rank'] <= max_mean_rank)
        & (df_open_IIT['Year'] == Year)
        & (df_open_IIT['Institute'] == Institute)
    ]

    # Stack every selected row twice, once with its opening and once with its
    # closing rank in a common 'Ranks' column, so each program contributes both
    # values to the plot. pd.concat replaces DataFrame.append, which was removed
    # in pandas 2.0.
    df_1000_all = pd.concat(
        [
            df_1000.assign(Ranks=df_1000['Opening Rank']),
            df_1000.assign(Ranks=df_1000['Closing Rank']),
        ],
        ignore_index=True,
    )
    sns.catplot(x="Academic Program Name", y="Ranks",kind="point",
                data=df_1000_all)
    plt.xticks(rotation=90)
    # Title the figure so plots for different years can be told apart.
    plt.title(f'{Institute} {Year} Opening/Closing Ranks (Mean Rank <= {max_mean_rank})')
    plt.show()

plot_range('Indian Institute of Technology Kanpur', '2021')
plot_range('Indian Institute of Technology Kanpur', '2020')
