# Import Libraries
Importing libraries is the first step of any project.
We're going to import commonly used web scraping and data analysis libraries. Make sure all of them are installed.

In [1]:
import requests # for getting web contents
from bs4 import BeautifulSoup # for scraping web contents
import pandas as pd # for data analysis

# URL
For web scraping, we first need the URL of the page to scrape.

In [6]:
# Index page to scrape (the ls2014 section of myneta.info)
URL = 'https://myneta.info/ls2014/'

# Fetch the page. The timeout stops the cell from hanging forever on a stalled
# connection, and raise_for_status() fails fast on an HTTP error instead of
# silently parsing an error page.
page = requests.get(URL, timeout=30)
page.raise_for_status()

# Parse the raw HTML so its tables and links can be searched
soup = BeautifulSoup(page.content, "html.parser")

In [8]:
# The index page is a list of links, one per city page.
# Those links sit in the last <table> on the page.
table = soup.find_all('table')[-1]

# Inside that table the links are grouped in <div class="items"> elements
divs = table.find_all('div', class_="items")

links = []   # last path segment of each href, always a plain string
cities = []  # visible link text, index-aligned with `links`

for div in divs:
    for a in div.find_all('a', href=True):
        # Keep only the part after the last '/'. split() returns the whole
        # href when it contains no '/', so one expression covers both cases.
        # Indexing with [-1] (not slicing with [-1:]) keeps every entry a str
        # rather than a one-element list.
        links.append(a['href'].split('/')[-1])
        cities.append(a.text.strip())

In [9]:
# function to scrape the candidate rows from one city page
def url_scrraper(url, city):
    """Scrape the last table at ``url`` and record its rows.

    Results are written to two module-level lists that the caller must
    create before calling:

    * ``headings`` - receives the text of every ``<th>`` cell of the table,
      with ``'City'`` inserted before the last entry.
    * ``content`` - receives one list per data row:
      ``[name, party, case, edu, age, total assets, city, liabilities]``.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    city : str
        Label stored in every row, placed before the last cell value so it
        lines up with the ``'City'`` heading.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page contains no ``<table>`` at all.
    """
    # Timeout keeps one stalled request from hanging the whole scrape;
    # raise_for_status() avoids parsing an error page as if it were data.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.find_all('table')[-1]

    headings.extend(th.text for th in table.find_all('th'))
    headings.insert(-1, 'City')

    # The first <tr> is skipped (presumably the header row)
    for tr in table.find_all('tr')[1:]:
        td = tr.find_all('td')
        # Seven cells are read below; shorter rows (empty spacers, notes)
        # cannot be a full record, so skip them instead of raising IndexError.
        if len(td) < 7:
            continue

        # The name is normally wrapped in a link; fall back to the cell's own
        # text when the link is missing instead of failing on None.
        anchor = td[0].find('a')
        name = (anchor if anchor is not None else td[0]).get_text()

        # party, case, edu, age, total assets, liabilities
        row = [name] + [cell.get_text() for cell in td[1:7]]
        row.insert(-1, city)

        content.append(row)

In [11]:
# Root address that each collected link name is appended to
base_url = 'https://myneta.info/ls2014/'
content = []

for link, city in zip(links, cities):
    # url_scrraper appends to `headings`, so give every page a fresh list;
    # after the loop it holds the headings of the last page scraped
    headings = []
    # an entry of `links` may be a one-element list or a plain string
    page_name = link[0] if isinstance(link, list) else link
    url_scrraper(base_url + page_name, city)

In [12]:
# Build the DataFrame: one row per scraped record, columns named by the scraped headings
data = pd.DataFrame(data=content, columns=headings)

# Data Analysis

## Check Data Shape

In [13]:
# Print the data's shape as (number of rows, number of columns)
print(data.shape)

(8484, 8)


## Look at Example Records

In [14]:
# Preview the first rows of the data (head() shows 5 by default)
data.head()

Unnamed: 0,Candidate,Party,Criminal Cases,Education,Age,Total Assets,City,Liabilities
0,Bishnu Pada Ray,BJP,1,Graduate,64,"Rs 56,58,980 ~ 56 Lacs+",ANDAMAN AND NICOBAR ISLANDS,"Rs 1,75,115 ~ 1 Lacs+"
1,A Pandian,AIFB,0,12th Pass,41,"Rs 8,59,410 ~ 8 Lacs+",ANDAMAN AND NICOBAR ISLANDS,"Rs 3,50,000 ~ 3 Lacs+"
2,Anita Mondal,AITC,1,Graduate,48,"Rs 54,71,073 ~ 54 Lacs+",ANDAMAN AND NICOBAR ISLANDS,"Rs 6,000 ~ 6 Thou+"
3,C G Saji Kumar,IND,0,12th Pass,41,"Rs 50,000 ~ 50 Thou+",ANDAMAN AND NICOBAR ISLANDS,Rs 0 ~
4,Gaur Chandra Majumder,BSP,0,Graduate,47,"Rs 31,92,819 ~ 31 Lacs+",ANDAMAN AND NICOBAR ISLANDS,"Rs 6,88,689 ~ 6 Lacs+"


## Descriptive Statistics

In [15]:
# Generate descriptive statistics of the data. For numeric columns describe() reports
# count, mean, std, min, 25%, 50%, 75% and max; every column here is stored as text
# (object dtype), so the summary shows count, unique, top and freq instead.
data.describe()

Unnamed: 0,Candidate,Party,Criminal Cases,Education,Age,Total Assets,City,Liabilities
count,8484,8484,8484,8484,8484,8484,8484,8484
unique,8114,513,33,12,64,7041,571,2270
top,Ashok Kumar,IND,0,Graduate,43,Nil,VARANASI,Rs 0 ~
freq,15,3337,7036,1560,300,78,42,4904


## Summary of data-type, columns, non-null values, memory usage.

In [16]:
# Print each column's dtype and non-null count, plus the frame's memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8484 entries, 0 to 8483
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Candidate       8484 non-null   object
 1   Party           8484 non-null   object
 2   Criminal Cases  8484 non-null   object
 3   Education       8484 non-null   object
 4   Age             8484 non-null   object
 5   Total Assets    8484 non-null   object
 6   City            8484 non-null   object
 7   Liabilities     8484 non-null   object
dtypes: object(8)
memory usage: 530.4+ KB


## Column labels

In [17]:
# List the column labels of the data
data.columns

Index(['Candidate', 'Party', 'Criminal Cases', 'Education', 'Age',
       'Total Assets', 'City', 'Liabilities'],
      dtype='object')

# Data Cleaning

## Drop Column

In [18]:
# Drop the Total Assets and Liabilities columns.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the columns are already gone.
data = data.drop(columns=['Total Assets', 'Liabilities'], errors='ignore')

In [19]:
# Display the data after the column drop (a long frame is shown truncated: first and last rows only)
data

Unnamed: 0,Candidate,Party,Criminal Cases,Education,Age,City
0,Bishnu Pada Ray,BJP,1,Graduate,64,ANDAMAN AND NICOBAR ISLANDS
1,A Pandian,AIFB,0,12th Pass,41,ANDAMAN AND NICOBAR ISLANDS
2,Anita Mondal,AITC,1,Graduate,48,ANDAMAN AND NICOBAR ISLANDS
3,C G Saji Kumar,IND,0,12th Pass,41,ANDAMAN AND NICOBAR ISLANDS
4,Gaur Chandra Majumder,BSP,0,Graduate,47,ANDAMAN AND NICOBAR ISLANDS
...,...,...,...,...,...,...
8479,Sabiruddin Molla,CPI(M),0,Graduate,43,ULUBERIA : BYE ELECTION ON 29-01-2018
8480,Sanjib Karmakar,IND,0,12th Pass,59,ULUBERIA : BYE ELECTION ON 29-01-2018
8481,Simal Saren,IND,0,Graduate,30,ULUBERIA : BYE ELECTION ON 29-01-2018
8482,Sk. Madassar Hossain Warsi,INC,0,Post Graduate,35,ULUBERIA : BYE ELECTION ON 29-01-2018


# Save Data into CSV

In [20]:
# Write the data to LokSabha2014.csv; index=False leaves the row index out of the file
data.to_csv('LokSabha2014.csv', index=False)