BeautifulSoup Guide
---

In [None]:
#imports
from bs4 import BeautifulSoup 
'''
BeautifulSoup beautifies html content, 
and basically turns html tags into python objects
'''


In [None]:
# Accessing the contents of a saved web page.
# Say we have a file called home.html.

# Open the (in our case hypothetical) file. 'r' stands for read, and "html_file"
# is the name we give to the open file handle inside the `with` block.
with open("home.html", "r") as html_file:
    content = html_file.read()
    # print(content) would just print all of the raw HTML

# The file is only needed for reading, so parsing happens after it is closed.
# Create a BeautifulSoup instance: the 1st argument is the HTML text that was read,
# the 2nd is the parser to use ('lxml').
soup = BeautifulSoup(content, 'lxml')
# print(soup.prettify()) shows the HTML in a nicer, indented way
# (note the parentheses: prettify is a method and has to be called)

# find() returns the 1st element with the given tag name and stops searching there
tag = soup.find('h5')
# find_all() returns every matching element as a list-like result we can iterate through
tags = soup.find_all('h5')
# print(tags)

# A separate loop variable is used so the raw HTML held in `content` is not overwritten
for heading in tags:
    print(heading.text)  # prints the text of each <h5> element in the list

In [None]:
# Program that goes through specific HTML tags and extracts some info from their child tags.
# Example: going through 3 <div> tags and extracting a price from each <div>'s child <a> tag.
with open("home.html", "r") as html_file:
    content = html_file.read()

soup = BeautifulSoup(content, 'lxml')
# Say you just want certain <div> tags: filter on the HTML class attribute with `class_`
tags_with_class = soup.find_all('div', class_='anyclass')

for div_tag in tags_with_class:
    # Both lookups must be made on the <div> tag itself. Rebinding the loop variable to
    # the first text (a plain str) would make the second lookup fail with AttributeError.
    # `specifictag` / `anothertag` are placeholders for real tag names, e.g. <h5> and <a>.
    main_content = div_tag.specifictag.text  # text of one child tag, e.g. the <h5> child
    sub_content = div_tag.anothertag.text  # text of another child tag, e.g. the <a> child


In [None]:
with open("home.html", "r") as html_file:
    content = html_file.read()

soup = BeautifulSoup(content, 'lxml')
tags_with_class = soup.find_all('div', class_='anyclass')

for div_tag in tags_with_class:
    # Look both child tags up on the <div> itself. Reusing one name for the tag and then
    # for its text would leave a plain str, and the second lookup would raise AttributeError.
    course_name = div_tag.specifictag.text

    # Now say that anothertag has some text and we need just the last word/info, like a price.
    # split() breaks the text into words, and index -1 picks the last one.
    words = div_tag.anothertag.text.split()
    if not words:
        # An empty tag has no last word; skip it instead of failing with IndexError
        continue
    sub_content = words[-1]

    # An f-string embeds expressions in a string literal, which gives a tidy summary line.
    # Here the first value describes some online course and `sub_content` is its price.
    print(f'{course_name} costs {sub_content}')
        

---
Mini Project
---

Now scraping real websites for jobs. 
This program pulls the latest published job from a website.
- Request URL
- Instantiate the BeautifulSoup class (create a soup object)
- Set up variables for the HTML tags we need
- All set

In [2]:
from bs4 import BeautifulSoup
import requests

In [3]:
# Request the HTML text from this URL.
# A timeout keeps the cell from hanging forever if the site does not answer, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(
    'https://www.timesjobs.com/candidate/job-search.html?searchType=personalizedSearch&from=submit&txtKeywords=python&txtLocation=',
    timeout=10,
)
response.raise_for_status()
html_text = response.text

# Instantiate the BeautifulSoup class
soup = BeautifulSoup(html_text, 'lxml')

# If you go into inspect, there's an unordered list of all the jobs, each in an <li> tag
# with a class. We reference both the tag name and the class.
# We only want the first job returned by the search, so soup.find() is enough.
job = soup.find('li', class_='clearfix job-bx wht-shd-bx')  # the tag holding all of the first job's information
if job is None:
    # Fail with a clear message rather than an AttributeError on the next line
    raise RuntimeError('No job listing found - the page layout may have changed.')

# Use the job tag to find its first matching <h3>. split() + join() trims the padding
# around the name and collapses runs of whitespace, while keeping the single spaces
# between the words of a multi-word company name.
company_name = ' '.join(job.find('h3', class_="joblist-comp-name").text.split())


Now what HTML element has the skills required for the job?

In [4]:
# The <span> tag has one skill in its text, and a child <strong> tag containing more skills
# in that tag's text. We just need the text of the whole span.
skills_text = job.find('span', class_='srp-skills').text

# Split on commas and tidy each skill separately. This drops the newlines/padding around
# the list and keeps the space inside multi-word skills, instead of deleting every space.
skills = ', '.join(
    ' '.join(skill.split())
    for skill in skills_text.split(',')
    if skill.strip()  # ignore empty pieces, e.g. after a trailing comma
)
print(skills)



rest, python, security, debugging




In [5]:
# Now to find the published date.
# The lookup below first finds the <span> with class 'sim-posted' and then steps into
# the <span> nested inside it, reading the text of that inner tag only.
# The published date will be used for functionality: it lets our code output just the
# results posted 'a few days ago' (keeping it recent).
posted_span = job.find('span', class_='sim-posted')
published_date = posted_span.span.text
print(published_date)

Posted few days ago


Now we want the jobs just from the FIRST page (the recent ones)

In [None]:
# Iterate over every job on the page, whatever its published date
jobs = soup.find_all('li', class_='clearfix job-bx wht-shd-bx')

for job in jobs:
    # The published date is checked first so that older jobs are skipped
    # before any further parsing is done for them.
    published_date = job.find('span', class_='sim-posted').span.text
    if 'few' not in published_date:
        continue

    # split() + join() trims the padding and collapses whitespace without gluing
    # the words of a multi-word company name together
    company_name = ' '.join(job.find('h3', class_="joblist-comp-name").text.split())

    # Tidy each comma-separated skill on its own so multi-word skills keep their space
    skills_text = job.find('span', class_='srp-skills').text
    skills = ', '.join(
        ' '.join(skill.split())
        for skill in skills_text.split(',')
        if skill.strip()
    )

    # One print per line, so the source indentation does not leak into the output
    print(f'Company Name: {company_name}')
    print(f'Required Skills: {skills}')
    print('')

ALWAYS PRINT THE HTML YOU ARE PULLING TO SEE WHAT METHODS YOU WILL HAVE TO USE TO BEAUTIFY IT

---
Now for more features.
---

- Say we want to let a user enter skills they are not familiar with, so that jobs requiring those skills are filtered out of the search results
- We can accomplish this with a conditional statement that checks whether an unfamiliar skill appears in a job's list of skills

In [1]:
from bs4 import BeautifulSoup
import requests
import time

# Ask the user which skill(s) they are not familiar with
print('Input skill(s) you are not familiar with:')
skills_answer = input('>')
print(f'Filtering out {skills_answer}')

# Drop the commas, then break the answer into separate words
answer_without_commas = skills_answer.replace(',', '')
unfamiliar_skills = answer_without_commas.split()

Input skill(s) you are not familiar with:
Filtering out git


In [None]:
# Goes through every job on the web page and keeps the most recent ones, denoted by
# 'few days ago' in the HTML, skipping any job that lists an unfamiliar skill.
# A timeout keeps the cell from hanging if the site does not answer, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(
    'https://www.timesjobs.com/candidate/job-search.html?searchType=personalizedSearch&from=submit&txtKeywords=python&txtLocation=',
    timeout=10,
)
response.raise_for_status()
html_text = response.text
soup = BeautifulSoup(html_text, 'lxml')
jobs = soup.find_all('li', class_='clearfix job-bx wht-shd-bx')

# Lower-case the user's skills once so the comparison below ignores letter case
unwanted_skills = [skill.lower() for skill in unfamiliar_skills]

for job in jobs:
    published_date = job.find('span', class_='sim-posted').span.text
    if 'few' not in published_date:
        continue

    # split() + join() trims the padding and collapses whitespace without gluing
    # the words of a multi-word company name together
    company_name = ' '.join(job.find('h3', class_="joblist-comp-name").text.split())

    # Tidy each comma-separated skill on its own so multi-word skills keep their space
    skills_text = job.find('span', class_='srp-skills').text
    skills = ', '.join(
        ' '.join(skill.split())
        for skill in skills_text.split(',')
        if skill.strip()
    )

    # Walk down the nested tags to the link with more info about the job.
    # The URL is not text, so it is read from the 'href' attribute of the <a> tag.
    more_info = job.header.h2.a['href']

    # Skip the job if ANY unfamiliar skill appears in its skills. Deciding once per job
    # (not once per unfamiliar skill) prints each job at most once, and never prints a
    # job that contains one of the unfamiliar skills. With no unfamiliar skills given,
    # every recent job is shown.
    skills_lower = skills.lower()
    if any(skill in skills_lower for skill in unwanted_skills):
        continue

    print(f"Company Name: {company_name}")
    print(f"Required Skills: {skills}")
    print(f"More Info: {more_info}")
    print(published_date)

    print('')
