You will scrape this <a href="https://sandeepmj.github.io/scrape-example-page/homework-site.html">mockup site</a> that lists a few data points for addiction centers.

In [1]:
pip install icecream

Note: you may need to restart the kernel to use updated packages.


In [2]:
## import library(ies)
from bs4 import BeautifulSoup ## package to parse HTML and XML
## NOTE(review): the next line binds the icecream *module* to the name `ic`,
## so the debug helper has to be called as ic.ic(...); `from icecream import ic`
## is the form that makes a bare ic(...) call work.
import icecream as ic ## for debugging
import requests ## HTTP client - captures content from the web

In [3]:
## capture the contents of the site in a response object
url = "https://sandeepmj.github.io/scrape-example-page/homework-site.html"
## requests waits forever by default, so give the call an explicit timeout (seconds)
response = requests.get(url, timeout=30)
## stop here on a 4xx/5xx answer so later cells never parse an error page
response.raise_for_status()
response.status_code

200

In [4]:
## parse the downloaded HTML with Python's built-in parser, then print the indented tree
soup = BeautifulSoup(markup=response.text, features="html.parser")
print(soup.prettify())

<html>
 <body>
  <div class="register-list">
   <div class="container">
    <h1>
     Addiction Treatment Center Inspections
    </h1>
    <div class="wrap">
     <h4>
      <a href="/public-register/recovery-foundation/">
       Recovery Foundation
      </a>
     </h4>
     <div class="row">
      <p class="status">
       Inspection: Passed
      </p>
      <p class="registration">
       Registration# 4235
      </p>
      <p class="regulation">
       Non Regulated
      </p>
      <p class="cost">
       $10,000
      </p>
      <p class="loans">
       Loans available
      </p>
     </div>
    </div>
    <div class="wrap">
     <h4>
      <a href="/public-register/new-horizons/">
       New Horizons
      </a>
     </h4>
     <div class="row">
      <p class="status">
       Inspection: Failed
      </p>
      <p class="registration">
       Registration# 4234
      </p>
      <p class="regulation">
       Non Regulated
      </p>
      <p class="cost">
       $15,000
      </p

In [5]:
## check which class the parsed document object is an instance of
type(soup)

bs4.BeautifulSoup

In [6]:
### First <h4> in the document = the first center's name, markup included
soup.find("h4")

<h4><a href="/public-register/recovery-foundation/">Recovery Foundation</a></h4>

In [7]:
### Same first <h4>, reduced to its text content (no html)
soup.find("h4").get_text()

'Recovery Foundation'

In [9]:
### href attribute of the first link in the document = the first center's URL
soup.find("a").get("href")

'/public-register/recovery-foundation/'

In [10]:
### The first element with class "wrap" holds ALL the data for one center:
### its name/link plus the row of data points
soup.find(attrs={"class": "wrap"})

<div class="wrap">
<h4><a href="/public-register/recovery-foundation/">Recovery Foundation</a></h4>
<div class="row">
<p class="status">Inspection: Passed</p>
<p class="registration">Registration# 4235</p>
<p class="regulation">Non Regulated</p>
<p class="cost">$10,000</p>
<p class="loans">Loans available</p>
</div>
</div>

In [11]:
#### Every element with class "row" = the data points of every center
soup.find_all(attrs={"class": "row"})

[<div class="row">
 <p class="status">Inspection: Passed</p>
 <p class="registration">Registration# 4235</p>
 <p class="regulation">Non Regulated</p>
 <p class="cost">$10,000</p>
 <p class="loans">Loans available</p>
 </div>,
 <div class="row">
 <p class="status">Inspection: Failed</p>
 <p class="registration">Registration# 4234</p>
 <p class="regulation">Non Regulated</p>
 <p class="cost">$15,000</p>
 <p class="loans">Loans available</p>
 </div>,
 <div class="row">
 <p class="status">Inspection: Passed</p>
 <p class="registration">Registration# 4231</p>
 <p class="regulation">Regulated</p>
 <p class="cost">$11,000</p>
 <p class="loans">Loans available</p>
 </div>]

In [12]:
### Every <p> whose class is "registration"
soup.find_all("p", attrs={"class": "registration"})

[<p class="registration">Registration# 4235</p>,
 <p class="registration">Registration# 4234</p>,
 <p class="registration">Registration# 4231</p>]

### Place all the registration data into a list that contains only the numbers.
It should look like this:

```['4235', '4234', '4231']```

In [13]:
## drop the fixed "Registration# " label from each entry so only the number is left
registration_data = [
    tag.get_text().replace("Registration# ", "")
    for tag in soup.find_all(class_="registration")
]
registration_data

['4235', '4234', '4231']

### Place all the company names into a list.
It should look like this:

```['Recovery Foundation','New Horizons','Renewable Light']```

In [14]:
## loop version: the text of every link on the page is a company name
company_names = []
for anchor in soup.find_all("a"):
    link_text = anchor.get_text()
    company_names.append(link_text)
company_names

['Recovery Foundation', 'New Horizons', 'Renewable Light']

In [15]:
### list-comprehension version of the same company-name extraction
company_names_lc = [anchor.get_text() for anchor in soup.find_all("a")]
company_names_lc

['Recovery Foundation', 'New Horizons', 'Renewable Light']

### Place all the URLs into a list.


In [19]:
## keep every <a> tag in one list; the hrefs are pulled out of it in the next cell
centers = soup.find_all(name="a")
centers

[<a href="/public-register/recovery-foundation/">Recovery Foundation</a>,
 <a href="/public-register/new-horizons/">New Horizons</a>,
 <a href="/public-register/renewal-light/">Renewable Light</a>]

In [20]:
## read the href attribute off each collected <a> tag
all_urls_lc = [link.get("href") for link in centers]
all_urls_lc

['/public-register/recovery-foundation/',
 '/public-register/new-horizons/',
 '/public-register/renewal-light/']

### Place all the statuses into a list, recoding a passed inspection as `Active` and anything else as `Inactive`.
It should look like this:

```['Active', 'Inactive', 'Active']```

In [40]:
## raw status text of every center, e.g. "Inspection: Passed"
status = [tag.get_text() for tag in soup.find_all("p", class_="status")]

## recode: text containing "Passed" -> 'Active', anything else -> 'Inactive'
new_status = ['Active' if "Passed" in text else 'Inactive' for text in status]
new_status

['Active', 'Inactive', 'Active']

### Turn these lists into a DataFrame and export it to a CSV file

In [41]:
## import pandas
import pandas as pd

In [48]:
### combine the three parallel lists into one record (dict) per center
## zip() silently stops at the shortest list, so make sure nothing would be dropped
assert len(company_names_lc) == len(new_status) == len(all_urls_lc), "scraped lists differ in length"

## a comprehension keeps its loop variables local, so the notebook-level `url`
## (the page address used for the request) is not overwritten by the last link
centers_dict_list = [
    {"company_name": center_name, "status": center_status, "link": center_link}
    for center_name, center_status, center_link in zip(company_names_lc, new_status, all_urls_lc)
]

centers_dict_list

[{'company_name': 'Recovery Foundation',
  'status': 'Active',
  'link': '/public-register/recovery-foundation/'},
 {'company_name': 'New Horizons',
  'status': 'Inactive',
  'link': '/public-register/new-horizons/'},
 {'company_name': 'Renewable Light',
  'status': 'Active',
  'link': '/public-register/renewal-light/'}]

In [49]:
## build the DataFrame from the list of records and choose the output file name
df = pd.DataFrame(data=centers_dict_list)
filename = "centersdict.csv"
df

Unnamed: 0,company_name,status,link
0,Recovery Foundation,Active,/public-register/recovery-foundation/
1,New Horizons,Inactive,/public-register/new-horizons/
2,Renewable Light,Active,/public-register/renewal-light/


In [50]:
## write the table to disk as UTF-8, leaving out the numeric row index
df.to_csv(path_or_buf=filename, index=False, encoding="UTF-8")
