In [1]:
import glob

import pandas as pd
from bs4 import BeautifulSoup as bs

In [2]:
# Collect the path of every saved job-posting page in the html directory.
# glob.glob() returns matches in arbitrary, filesystem-dependent order, so the
# result is sorted to keep the row order of everything derived from `files`
# identical across runs and machines.
# NOTE(review): this is an absolute, machine-specific path -- consider moving it
# to a configurable data directory.
files = sorted(glob.glob('E:\\liveproject\\job-postings\\html_job_postings\\*.html'))

In [3]:
# sanity check: number of HTML file paths matched by the glob pattern
len(files)

1337

In [4]:
# Read every HTML file fully into memory as UTF-8 text.
# html_content[i] holds the raw markup of files[i].
html_content = []
for path in files:
    with open(path, encoding="utf8") as handle:
        page_text = handle.read()
    html_content.append(page_text)

In [5]:
# inspect the last entry (index 1336 of the 1337 pages) to make sure it looks ok
# html_content[1336]

In [6]:
# Prototype the extraction on a single page before generalising it.
# Results are collected in a dict of lists (one list per column), a layout that
# pandas converts directly into a DataFrame.
html_dict = {key: [] for key in ['title', 'body', 'bullets']}

# parse the first page into a BeautifulSoup tree so tags can be looked up by name
first_page = html_content[0]
soup = bs(first_page, 'lxml')

title = soup.find('title').text
body = soup.find('body').text
bullets = soup.find_all('li')

html_dict['title'].append(title)
html_dict['body'].append(body)
# keep only the text of each <li>, without leading/trailing whitespace
html_dict['bullets'].append([b.text.strip() for b in bullets])

df = pd.DataFrame(html_dict)

In [7]:
# preview the one-row prototype frame built from the first page
df.head()

Unnamed: 0,title,body,bullets
0,"Data Engineer - Columbus, GA 31909","Data Engineer - Columbus, GA 31909\nCelebratin...","[Bachelor’s or Master’s degree in statistics, ..."


In [8]:
def get_html_content(html_pages):
    """
    Extracts title, body, and list items (bullets) from HTML job postings.

    Parameters
    ----------
    html_pages : iterable
        HTML documents, one per job posting (the notebook passes a list of
        strings read from disk). Each one is parsed with the 'lxml' parser.

    Returns
    -------
    pandas.DataFrame
        One row per page, in input order, with three columns:
        'title'   -- text of the first <title> tag, or a missing value (None)
                     when the page has no <title>
        'body'    -- text of the first <body> tag, or a missing value (None)
                     when the page has no <body>
        'bullets' -- list holding the whitespace-stripped text of every <li>
                     tag (an empty list when the page has none)
    """
    html_dict = {'title': [], 'body': [], 'bullets': []}

    for html in html_pages:
        soup = bs(html, 'lxml')

        # find() returns None when the tag is absent (e.g. an empty or truncated
        # page). Record a missing value instead of failing on `.text`, so one
        # malformed page does not abort the whole batch.
        title_tag = soup.find('title')
        body_tag = soup.find('body')
        html_dict['title'].append(title_tag.text if title_tag is not None else None)
        html_dict['body'].append(body_tag.text if body_tag is not None else None)

        # remove extra leading and trailing whitespace with strip()
        html_dict['bullets'].append([li.text.strip() for li in soup.find_all('li')])

    return pd.DataFrame(html_dict)

In [9]:
# run the extraction over every loaded page; this rebinds `df`, replacing the
# one-page prototype frame
df = get_html_content(html_content)

In [10]:
# preview the first rows of the frame built from all pages
df.head()

Unnamed: 0,title,body,bullets
0,"Data Engineer - Columbus, GA 31909","Data Engineer - Columbus, GA 31909\nCelebratin...","[Bachelor’s or Master’s degree in statistics, ..."
1,"Data Analyst - St. Louis, MO","Data Analyst - St. Louis, MO\nDuties\nSummary\...",[Job family (Series)\n1501 General Mathematics...
2,"Data Scientist - Newark, CA","Data Scientist - Newark, CA\nData Scientist\n\...","[Design, develop, document and maintain machin..."
3,Patient Care Assistant / PCA - Med/Surg (Fayet...,Patient Care Assistant / PCA - Med/Surg (Fayet...,[Provides all personal care services in accord...
4,"Scientific Programmer - Berkeley, CA","Scientific Programmer - Berkeley, CA\nCaribou ...","[Demonstrated proficiency with Python, JavaScr..."


In [11]:
# (rows, columns) before de-duplication
df.shape

(1337, 3)

In [12]:
# Lists are unhashable, which drop_duplicates() cannot handle, so turn every
# bullet list into a tuple first.
# Only the function is passed: Series.apply takes no `axis` argument, and a second
# positional value would bind to the deprecated `convert_dtype` parameter.
df['bullets'] = df['bullets'].apply(tuple)

In [13]:
# remove postings whose title, body and bullets all match an earlier row,
# then show the resulting (rows, columns)
df = df.drop_duplicates()
df.shape

(1328, 3)

In [14]:
# save the de-duplicated frame to disk as a pickle file in the working directory
df.to_pickle('step_df.pk')