# Import Libraries
Every project starts by importing the libraries it depends on.
Here we import commonly used web-scraping and data-analysis libraries. Make sure they are all installed before running the notebook.

In [1]:
import requests # for getting web contents
from bs4 import BeautifulSoup # for scraping web contents
import pandas as pd # for data analysis

# URL
Web scraping starts from a URL: the address of the page whose contents we want to extract.

In [2]:
# Address of the page to scrape (the Lok Sabha 2019 section of myneta.info).
URL = 'https://myneta.info/loksabha2019/'

# Download the page. The timeout stops the cell from hanging forever on a
# stalled connection, and raise_for_status() fails right here on an HTTP error
# instead of letting an error page be parsed as if it were data.
page = requests.get(URL, timeout=30)
page.raise_for_status()

# Parse the raw HTML into a searchable tree.
soup = BeautifulSoup(page.content, "html.parser")

In [3]:
# The page holds a list of links. They live inside <div class="items">
# elements of the last <table> on the page, so start from that table.
table = soup.find_all('table')[-1]
divs = table.find_all('div', class_="items")

# Parallel lists: links[i] is the page to scrape for cities[i].
links = []
cities = []

for div in divs:
    for a in div.find_all('a', href=True):
        # Keep only the part of the href after its last '/'. rsplit() returns
        # the whole href when it contains no '/', so one expression covers
        # both cases and `links` only ever holds plain strings (never a mix
        # of strings and one-element lists).
        links.append(a['href'].rsplit('/', 1)[-1])
        cities.append(a.text.strip())

In [4]:
# function to scrape the rows of the last table found at a url
def url_scrraper(url, city):
    """Scrape the last ``<table>`` on ``url`` and record its rows.

    The function works through two notebook-level globals, which must both
    exist before it is called:

    * ``headings`` - the text of every ``<th>`` in the table is appended to
      it, then ``'City'`` is inserted just before its last entry.
    * ``content`` - one list per data row is appended to it, laid out as
      ``[name, party, case, edu, age, ta, city, la]`` (the first seven table
      cells, with ``city`` placed just before the last one).

    Parameters
    ----------
    url : str
        Address of the page to download.
    city : str
        Value stored in the ``City`` position of every row from this page.

    Returns
    -------
    None

    Raises
    ------
    requests.RequestException
        If the download times out or the server answers with an HTTP error.
    IndexError
        If the page contains no ``<table>`` at all.
    """
    # Bound the wait and fail loudly on HTTP errors rather than parsing an
    # error page as data.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.find_all('table')[-1]

    headings.extend(th.text for th in table.find_all('th'))
    headings.insert(-1, 'City')

    # The first <tr> is skipped, as the column labels are taken from <th> above.
    for tr in table.find_all('tr')[1:]:
        td = tr.find_all('td')
        # Seven cells are read below; rows with fewer (including rows with no
        # <td> at all) cannot be a complete record and are skipped instead of
        # raising IndexError.
        if len(td) < 7:
            continue

        # The name is read from the link inside the first cell; fall back to
        # the cell's own text if a row has no link. strip() removes stray
        # whitespace such as a trailing tab around the name.
        anchor = td[0].find('a')
        name = (anchor if anchor is not None else td[0]).get_text().strip()
        party = td[1].get_text()
        case = td[2].get_text()
        edu = td[3].get_text()
        age = td[4].get_text()
        ta = td[5].get_text()
        la = td[6].get_text()

        # city sits just before the last value, mirroring where 'City' was
        # inserted into headings.
        content.append([name, party, case, edu, age, ta, city, la])

In [5]:
# Address that every collected link is appended to.
base_url = 'https://myneta.info/loksabha2019/'

# Rows from all pages accumulate here; url_scrraper() appends to it.
content = []
# Defined up front so the name exists even when there are no links to visit.
headings = []

for link, city in zip(links, cities):
    # url_scrraper() appends to the global `headings`, so start from an empty
    # list on every pass; after the loop it holds the labels of the last page.
    headings = []
    # A link may be a plain string or a one-element list holding the string.
    page_name = link[0] if isinstance(link, list) else link
    url_scrraper(base_url + page_name, city)

In [6]:
# build a DataFrame: one row per scraped record, columns labelled by the scraped headings
data = pd.DataFrame(data=content, columns=headings)

# Data Analysis

## Check Data Shape

In [7]:
# size of the scraped data as (number of rows, number of columns)
print(data.shape)

(7968, 8)


## Look at Example Records

In [8]:
# look at the first few rows as a quick sanity check of the scrape
data.head()

Unnamed: 0,Candidate,Party,Criminal Cases,Education,Age,Total Assets,City,Liabilities
0,Kuldeep Rai Sharma,INC,0,Graduate Professional,52,"Rs 13,22,33,012 ~ 13 Crore+",ANDAMAN AND NICOBAR ISLANDS,"Rs 8,04,50,870 ~ 8 Crore+"
1,Ayan Mandal,AITC,0,Graduate,30,"Rs 72,70,440 ~ 72 Lacs+",ANDAMAN AND NICOBAR ISLANDS,"Rs 15,00,000 ~ 15 Lacs+"
2,C G Saji Kumar,All India Hindustan Congress Party,0,12th Pass,48,"Rs 1,20,000 ~ 1 Lacs+",ANDAMAN AND NICOBAR ISLANDS,Rs 0 ~
3,C U Rasheed,IND,0,12th Pass,34,"Rs 2,02,808 ~ 2 Lacs+",ANDAMAN AND NICOBAR ISLANDS,"Rs 17,00,000 ~ 17 Lacs+"
4,Gour Chandra Majumder,IND,0,Graduate,52,"Rs 60,62,000 ~ 60 Lacs+",ANDAMAN AND NICOBAR ISLANDS,Rs 0 ~


## Descriptive Statistics

In [9]:
# Generate descriptive statistics of the data. Every column here is text
# (object dtype), so the summary shows count, unique, top and freq per column
# rather than mean, std, min, quartiles and max.
data.describe()

Unnamed: 0,Candidate,Party,Criminal Cases,Education,Age,Total Assets,City,Liabilities
count,7968,7968,7968,7968,7968,7968,7968,7968
unique,7665,681,31,12,66,6719,542,2279
top,Sunil Kumar,IND,0,Graduate,43,Nil,NIZAMABAD,Rs 0 ~
freq,11,3389,6461,1442,262,60,183,4301


## Summary of data-type, columns, non-null values, memory usage.

In [10]:
# print column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7968 entries, 0 to 7967
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Candidate       7968 non-null   object
 1   Party           7968 non-null   object
 2   Criminal Cases  7968 non-null   object
 3   Education       7968 non-null   object
 4   Age             7968 non-null   object
 5   Total Assets    7968 non-null   object
 6   City            7968 non-null   object
 7   Liabilities     7968 non-null   object
dtypes: object(8)
memory usage: 498.1+ KB


## Column labels

In [11]:
# get the column labels of the data (returned as a pandas Index)
data.columns

Index(['Candidate', 'Party', 'Criminal Cases', 'Education', 'Age',
       'Total Assets', 'City', 'Liabilities'],
      dtype='object')

# Data Cleaning

## Drop Column

In [12]:
# remove the two money columns; the remaining columns are kept unchanged
money_columns = ['Total Assets', 'Liabilities']
data = data.drop(columns=money_columns)

In [13]:
# display the data after dropping the columns (the display is truncated to the first and last rows)
data

Unnamed: 0,Candidate,Party,Criminal Cases,Education,Age,City
0,Kuldeep Rai Sharma,INC,0,Graduate Professional,52,ANDAMAN AND NICOBAR ISLANDS
1,Ayan Mandal,AITC,0,Graduate,30,ANDAMAN AND NICOBAR ISLANDS
2,C G Saji Kumar,All India Hindustan Congress Party,0,12th Pass,48,ANDAMAN AND NICOBAR ISLANDS
3,C U Rasheed,IND,0,12th Pass,34,ANDAMAN AND NICOBAR ISLANDS
4,Gour Chandra Majumder,IND,0,Graduate,52,ANDAMAN AND NICOBAR ISLANDS
...,...,...,...,...,...,...
7963,Maksuda Khatun,CPI(M),0,Doctorate,58,ULUBERIA
7964,Minati Sarkar,SUCI(C),0,Graduate,61,ULUBERIA
7965,Shoma Ranisree Roy\t,INC,0,Post Graduate,40,ULUBERIA
7966,Simal Saren,Indian Unity Centre,0,Graduate Professional,33,ULUBERIA


# Save Data into CSV

In [14]:
# save the cleaned data to LokSabha2019.csv; index=False leaves the row index out of the file
data.to_csv('LokSabha2019.csv', index=False)