#### Webscraping using Python ( Example 1 )

References: 
https://first-web-scraper.readthedocs.io/en/latest/
http://web.stanford.edu/~zlotnick/TextAsData/Web_Scraping_with_Beautiful_Soup.html
http://altitudelabs.com/blog/web-scraping-with-python-and-beautiful-soup/

Installation:
`pip install BS4`  | 
`pip install Requests`

In [1]:
!pip install BS4
!pip install Requests

Collecting BS4
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Building wheels for collected packages: BS4
  Building wheel for BS4 (setup.py) ... [?25ldone
[?25h  Created wheel for BS4: filename=bs4-0.0.1-cp37-none-any.whl size=1273 sha256=9d85e7b7f31970d459db4495e2176bfe0be422c46483560dc10741d055f7fafa
  Stored in directory: /Users/sbharadwaj/Library/Caches/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built BS4
Installing collected packages: BS4
Successfully installed BS4-0.0.1


In [2]:
import csv
import requests
from bs4 import BeautifulSoup
from IPython.display import HTML

Scraping Rules
--------------
- You should check a website’s Terms and Conditions before you scrape it. Be careful to read the statements about legal use of data. Usually, the data you scrape should not be used for commercial purposes.
- Do not request data from the website too aggressively with your program (also known as spamming), as this may break the website. Make sure your program behaves in a reasonable manner (i.e. acts like a human). One request for one webpage per second is good practice.
- The layout of a website may change from time to time, so make sure to revisit the site and rewrite your code as needed

In [3]:
# scrape the current Detainees of Boone County Jail from webpage into CSV
url = 'https://report.boonecountymo.org/mrcjava/servlet/SH01_MP.I00290s'
headers = {'User-Agent': "Chrome/54.0.2840.90"}
response = requests.get(url, headers=headers)
html = response.content 

In [4]:
HTML('<iframe src=http://www.showmeboone.com/sheriff/JailResidents/JailResidents.asp width=900 height=350></iframe>')



In [5]:
soup = BeautifulSoup(html, "lxml")
table = soup.find('tbody', attrs={'class': 'stripe'})

In [6]:
tmpRow = (table.findAll('tr')[1:])
print (tmpRow)


[<tr class="even">
<td class="two td_left" data-th="Last Name">ADAMS</td>
<td class="two td_left" data-th="First Name">JEREMIAH</td>
<td class="two td_left" data-th="Middle Name">SETH</td>
<td class="two td_left" data-th="Suffix"> </td>
<td class="two td_left" data-th="Sex">M</td>
<td class="two td_left" data-th="Race">W</td>
<td class="two td_right" data-th="Age">45</td>
<td class="two td_left" data-th="City">COLUMBIA</td>
<td class="two td_left" data-th="State">MO</td>
<td class="two td_left" data-th="">
<a class="_lookup btn btn-primary" height="600" href="RMS01_MP.R00040s?run=2&amp;R001=&amp;R002=&amp;ID=2885&amp;hover_redir=&amp;width=950" linkedtype="I" mrc="returndata" target="_lookup" width="860"><i class="fa fa-large fa-fw fa-list-alt"> </i>Details</a>
</td>
</tr>, <tr class="odd">
<td class="one td_left" data-th="Last Name">ADAMS</td>
<td class="one td_left" data-th="First Name">LACEE</td>
<td class="one td_left" data-th="Middle Name">KABRINA</td>
<td class="one td_left" data

In [7]:
### Editing the code to fetch details of every record. The 'Case Number' and 'Charge Description' is Fetched for every Record.
list_of_rows = []
try:
    outfile = open("./inmates.csv", "w")
    writer = csv.writer(outfile)
    writer.writerow(["Last", "First", "Middle", "Gender", "Race", "Age", "City", "State","Case Number","Charge Description"])
    for row in table.findAll('tr'):
        list_of_cells = []
        for cell in row.findAll("td"):
            if(cell['data-th'] == ''):
                continue
            text = cell.text.replace('&nbsp;', '')
            list_of_cells.append(text)
        for anchor in row.findAll('a'):
            #href is fetched for every record and the response is parsed for every individual hit to href.
            details_href = "https://report.boonecountymo.org/mrcjava/servlet/"+anchor['href']
            details_response = requests.get(details_href)
            details_html = details_response.content
            details_soup = BeautifulSoup(details_html, "lxml")
            details_table = details_soup.find('tbody', attrs={'class': 'stripe'})
            if details_table is not None:
                details_table_tr = details_table.find('tr', attrs={"class": "detailBackground"})
                if details_table_tr is not None:
                    list_of_cells.append((details_table_tr.find('td', attrs = {'data-th' : 'CASE #'})).text)
                    list_of_cells.append((details_table_tr.find('td', attrs = {'data-th' : 'CHARGE DESCRIPTION'})).text)
        arrLength = len(list_of_cells)
        writer.writerow(list_of_cells)
finally:
    outfile.close()  

In [8]:
# data cleaning 
import pandas as pd
import numpy as np

In [9]:
df = pd.read_csv('./inmates.csv')
df.head()

Unnamed: 0,Last,First,Middle,Gender,Race,Age,City,State,Case Number,Charge Description
0,ADAM,OMER,SIRAJ,,M,B,32,COLUMBIA,MO,
1,ADAMS,JEREMIAH,SETH,,M,W,45,COLUMBIA,MO,
2,ADAMS,LACEE,KABRINA,,F,B,24,ARMSTRONG,MO,
3,AHMED,ELSADIG,IBRAHIM SALIH,,M,B,34,COLUMBIA,MO,
4,ARMSTRONG,TYRONE,MARTEZ,,M,B,28,COLUMBIA,MO,


In [10]:
# Drop certain fields if they are not relevant for analysis
df.drop(['Race'], inplace=True, axis=1)
df.head()

Unnamed: 0,Last,First,Middle,Gender,Age,City,State,Case Number,Charge Description
0,ADAM,OMER,SIRAJ,,B,32,COLUMBIA,MO,
1,ADAMS,JEREMIAH,SETH,,W,45,COLUMBIA,MO,
2,ADAMS,LACEE,KABRINA,,B,24,ARMSTRONG,MO,
3,AHMED,ELSADIG,IBRAHIM SALIH,,B,34,COLUMBIA,MO,
4,ARMSTRONG,TYRONE,MARTEZ,,B,28,COLUMBIA,MO,


In [11]:
# Expanding M as Male and F as Female.
gender = df['Gender']
male = gender.str.contains('M')
female = gender.str.contains('F')
df['Gender'] = np.where(male, 'Male',
                       np.where(female, 'Female',
                               gender))
df.head()

Unnamed: 0,Last,First,Middle,Gender,Age,City,State,Case Number,Charge Description
0,ADAM,OMER,SIRAJ,,B,32,COLUMBIA,MO,
1,ADAMS,JEREMIAH,SETH,,W,45,COLUMBIA,MO,
2,ADAMS,LACEE,KABRINA,,B,24,ARMSTRONG,MO,
3,AHMED,ELSADIG,IBRAHIM SALIH,,B,34,COLUMBIA,MO,
4,ARMSTRONG,TYRONE,MARTEZ,,B,28,COLUMBIA,MO,


In [12]:
# Check for missing values in Case Number
df['Case Number'].isnull().values.any()

False

In [13]:
# Fill In missing values with a default value
df['Case Number'].fillna("No Case Number Available", inplace=True)
df['Charge Description'].fillna("No Description Available", inplace=True)
df.head()

Unnamed: 0,Last,First,Middle,Gender,Age,City,State,Case Number,Charge Description
0,ADAM,OMER,SIRAJ,,B,32,COLUMBIA,MO,No Description Available
1,ADAMS,JEREMIAH,SETH,,W,45,COLUMBIA,MO,No Description Available
2,ADAMS,LACEE,KABRINA,,B,24,ARMSTRONG,MO,No Description Available
3,AHMED,ELSADIG,IBRAHIM SALIH,,B,34,COLUMBIA,MO,No Description Available
4,ARMSTRONG,TYRONE,MARTEZ,,B,28,COLUMBIA,MO,No Description Available


In [14]:
# Remove whitespaces from Charge description and changing to uppercase
df['Charge Description'] = df['Charge Description'].str.strip()
df['Charge Description'] = df['Charge Description'].str.upper()
df.head()

Unnamed: 0,Last,First,Middle,Gender,Age,City,State,Case Number,Charge Description
0,ADAM,OMER,SIRAJ,,B,32,COLUMBIA,MO,NO DESCRIPTION AVAILABLE
1,ADAMS,JEREMIAH,SETH,,W,45,COLUMBIA,MO,NO DESCRIPTION AVAILABLE
2,ADAMS,LACEE,KABRINA,,B,24,ARMSTRONG,MO,NO DESCRIPTION AVAILABLE
3,AHMED,ELSADIG,IBRAHIM SALIH,,B,34,COLUMBIA,MO,NO DESCRIPTION AVAILABLE
4,ARMSTRONG,TYRONE,MARTEZ,,B,28,COLUMBIA,MO,NO DESCRIPTION AVAILABLE


In [15]:
# save final results
df.to_csv("./cleaninmates.csv")