# Web Scraping - Fake Jobs
- Scrape the job listings of the Real Python "fake jobs" demo page with `requests` and Beautiful Soup

In [76]:
import requests
import pandas as pd
import numpy as np

In [81]:
# Address of the fake-jobs demo page that the rest of the notebook scrapes.
url = 'https://realpython.github.io/fake-jobs/'
url

'https://realpython.github.io/fake-jobs/'

In [82]:
# Download the page. The timeout stops the cell from hanging forever if the
# server never answers, and raise_for_status() turns an HTTP error (4xx/5xx)
# into an exception instead of letting later cells parse an error page.
page = requests.get(url, timeout=10)
page.raise_for_status()

In [83]:
# Echo the response object; its repr shows the HTTP status code.
page

<Response [200]>

In [84]:
page.text[:100]  # first 100 characters of the HTML (start at 0 so the leading '<' is kept)

'!DOCTYPE html>\n<html>\n  <head>\n    <meta charset="utf-8">\n    <meta name="viewport" content="width='

- `page.text` holds the HTML of the page as a string
- the page is static content
- https://webformatter.com/html - paste the URL there to see the formatted content
- Part of the HTML:
    - `<div class="media-content">`
    - `<h2 class="title is-5">Senior Python Developer</h2>`
    - `<h3 class="subtitle is-6 company">Payne, Roberts and Davis</h3>`
    - `</div>`
-  `class="title is-5"` -> contains the title of the job
-  `class="subtitle is-6 company"` -> contains the name of the company offering the job
-  `class="location"` -> location of the job

# Beautiful Soup
- parses structured data (HTML / XML)
- lets you interact with pages much like the browser's developer tools
- has many functions to parse and extract values
- supports different parsers to extract data
  - html.parser
  - xml
  - lxml
  - html5lib
- pages can also come in different encodings
  - iso-8859-8 , xx-7

In [85]:
from bs4 import BeautifulSoup

In [86]:
# Parse the response that was already downloaded with requests above; there is
# no need to hit the server a second time.
# page.content (raw bytes) is better than page.text for parsing HTML, because
# Beautiful Soup can then work out the document encoding itself.
soup = BeautifulSoup(page.content, 'html.parser')

## Find elements by ID
- in an HTML web page, every element can have an `id` attribute assigned, which uniquely identifies the element in the page
- we can parse the page by selecting a specific element by its ID
- the element we are looking for here is
  - a `div` with the attribute `id="ResultsContainer"`
  - it has other attributes too, but we will select it by `id` here
  - 20: `<div id="ResultsContainer" class="columns is-multiline">`
    - .. all job listings are under this container


In [87]:
# Look up the element whose id attribute is 'ResultsContainer'.
results = soup.find(attrs={'id': 'ResultsContainer'})

In [88]:
# results  # uncomment to display all the HTML from this container onwards
# from here on we work with only this part of the HTML

In [89]:
# Preview the first 400 characters of the pretty-printed container HTML.
results.prettify()[:400]

'<div class="columns is-multiline" id="ResultsContainer">\n <div class="column is-half">\n  <div class="card">\n   <div class="card-content">\n    <div class="media">\n     <div class="media-left">\n      <figure class="image is-48x48">\n       <img alt="Real Python Logo" src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1"/>\n      </figure>\n     </div>\n   '

## Find elements by Class Name
- every job is wrapped in a `<div>` element with `class="card-content"`

```html
  <div class="card-content">
  <div class="media">
      <div class="media-left">
        <figure class="image is-48x48">
          <img src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1" alt="Real Python Logo">
        </figure>
      </div>
      <div class="media-content">
        <h2 class="title is-5">Senior Python Developer</h2>
        <h3 class="subtitle is-6 company">Payne, Roberts and Davis</h3>
      </div>
    </div>
```

In [90]:
# Collect every <div> under the container whose class is 'card-content'
# (one such element per job posting).
job_elements = results.find_all(name='div', attrs={'class': 'card-content'})

In [91]:
job_elements[:1]  # one-element slice holding the first job

[<div class="card-content">
 <div class="media">
 <div class="media-left">
 <figure class="image is-48x48">
 <img alt="Real Python Logo" src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1"/>
 </figure>
 </div>
 <div class="media-content">
 <h2 class="title is-5">Senior Python Developer</h2>
 <h3 class="subtitle is-6 company">Payne, Roberts and Davis</h3>
 </div>
 </div>
 <div class="content">
 <p class="location">
         Stewartbury, AA
       </p>
 <p class="is-small has-text-grey">
 <time datetime="2021-04-08">2021-04-08</time>
 </p>
 </div>
 <footer class="card-footer">
 <a class="card-footer-item" href="https://www.realpython.com" target="_blank">Learn</a>
 <a class="card-footer-item" href="https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html" target="_blank">Apply</a>
 </footer>
 </div>]

In [92]:
# Dump the raw HTML of the first five job cards, numbered from 1.
for job_no, job_element in enumerate(job_elements[:5], start=1):
    print('Job No : ', job_no, '\n', job_element, end='\n\n')

Job No :  1 
 <div class="card-content">
<div class="media">
<div class="media-left">
<figure class="image is-48x48">
<img alt="Real Python Logo" src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1"/>
</figure>
</div>
<div class="media-content">
<h2 class="title is-5">Senior Python Developer</h2>
<h3 class="subtitle is-6 company">Payne, Roberts and Davis</h3>
</div>
</div>
<div class="content">
<p class="location">
        Stewartbury, AA
      </p>
<p class="is-small has-text-grey">
<time datetime="2021-04-08">2021-04-08</time>
</p>
</div>
<footer class="card-footer">
<a class="card-footer-item" href="https://www.realpython.com" target="_blank">Learn</a>
<a class="card-footer-item" href="https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html" target="_blank">Apply</a>
</footer>
</div>

Job No :  2 
 <div class="card-content">
<div class="media">
<div class="media-left">
<figure class="image is-48x48">
<img alt="Real 

# Finer extraction
- There are still many elements inside each job card to read / extract

In [93]:
# For the first five job cards, show the tags holding the title, the company
# and the location (still as HTML elements, not plain text).
for job_no, job_element in enumerate(job_elements[:5], start=1):
    print('Job No - ', job_no)
    title = job_element.find('h2', class_='title is-5')
    coy = job_element.find('h3', class_='subtitle is-6 company')
    loc = job_element.find('p', class_='location')
    print(title, '\n', coy, '\n', loc)
    print()

Job No -  1
<h2 class="title is-5">Senior Python Developer</h2> 
 <h3 class="subtitle is-6 company">Payne, Roberts and Davis</h3> 
 <p class="location">
        Stewartbury, AA
      </p>

Job No -  2
<h2 class="title is-5">Energy engineer</h2> 
 <h3 class="subtitle is-6 company">Vasquez-Davidson</h3> 
 <p class="location">
        Christopherville, AA
      </p>

Job No -  3
<h2 class="title is-5">Legal executive</h2> 
 <h3 class="subtitle is-6 company">Jackson, Chambers and Levy</h3> 
 <p class="location">
        Port Ericaburgh, AA
      </p>

Job No -  4
<h2 class="title is-5">Fitness centre manager</h2> 
 <h3 class="subtitle is-6 company">Savage-Bradley</h3> 
 <p class="location">
        East Seanview, AP
      </p>

Job No -  5
<h2 class="title is-5">Product manager</h2> 
 <h3 class="subtitle is-6 company">Ramirez Inc</h3> 
 <p class="location">
        North Jamieview, AP
      </p>



## Extract only text content
- add `.text` to a Beautiful Soup object to get only its text
- but the text still contains surrounding whitespace

In [94]:
# Same lookup for the first two job cards, but print only the text inside each
# tag; the surrounding whitespace from the HTML is still present.
for job_no, job_element in enumerate(job_elements[:2], start=1):
    print('Job No - ', job_no)
    title = job_element.find('h2', class_='title is-5')
    coy = job_element.find('h3', class_='subtitle is-6 company')
    loc = job_element.find('p', class_='location')
    print(title.text, '\n', coy.text, '\n', loc.text)
    print()

Job No -  1
Senior Python Developer 
 Payne, Roberts and Davis 
 
        Stewartbury, AA
      

Job No -  2
Energy engineer 
 Vasquez-Davidson 
 
        Christopherville, AA
      



In [96]:
# First five job cards again, now with leading/trailing whitespace stripped
# from each text and the three fields printed tab-separated on one line.
for job_no, job_element in enumerate(job_elements[:5], start=1):
    print('Job No - ', job_no)
    title = job_element.find('h2', class_='title is-5')
    coy = job_element.find('h3', class_='subtitle is-6 company')
    loc = job_element.find('p', class_='location')
    print(title.text.strip(), '\t', coy.text.strip(), '\t', loc.text.strip())
    print()

Job No -  1
Senior Python Developer 	 Payne, Roberts and Davis 	 Stewartbury, AA

Job No -  2
Energy engineer 	 Vasquez-Davidson 	 Christopherville, AA

Job No -  3
Legal executive 	 Jackson, Chambers and Levy 	 Port Ericaburgh, AA

Job No -  4
Fitness centre manager 	 Savage-Bradley 	 East Seanview, AP

Job No -  5
Product manager 	 Ramirez Inc 	 North Jamieview, AP



In [112]:
# Turn every job card into one record (serial number, title, company, location)
# and build the DataFrame from the full list in a single call. Adding rows one
# at a time with df.loc[i] = ... copies the growing frame on each iteration.
rows = []
for ser, job_element in enumerate(job_elements, start=1):
    title = job_element.find('h2', class_='title is-5').text.strip()
    coy = job_element.find('h3', class_='subtitle is-6 company').text.strip()
    loc = job_element.find('p', class_='location').text.strip()
    rows.append({'ser': ser, 'title': title, 'coy': coy, 'loc': loc})

# Use a 1-based index so the row labels line up with the 'ser' column.
df = pd.DataFrame(
    rows,
    columns=['ser', 'title', 'coy', 'loc'],
    index=range(1, len(rows) + 1),
)
df.head()

Unnamed: 0,ser,title,coy,loc
1,1,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA"
2,2,Energy engineer,Vasquez-Davidson,"Christopherville, AA"
3,3,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA"
4,4,Fitness centre manager,Savage-Bradley,"East Seanview, AP"
5,5,Product manager,Ramirez Inc,"North Jamieview, AP"


In [115]:
# Summary (count / unique / top / freq) of the object-dtype columns.
df.describe(include=['object'])

Unnamed: 0,title,coy,loc
count,100,100,100
unique,92,99,100
top,Python Programmer (Entry-Level),Garcia PLC,"Stewartbury, AA"
freq,3,2,1


In [None]:
# (number of rows, number of columns) of the scraped table.
df.shape

# Search for only Python jobs