In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
import plotly.express as px

In [7]:
# Fetch the NLTK resources used below: the Punkt tokenizer models for
# word_tokenize and the stop-word lists.
# Newer NLTK releases load the tokenizer from 'punkt_tab' rather than 'punkt',
# so both are requested to keep word_tokenize working across versions.
for nltk_package in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_package)

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# URL of the job listing page that the scraping cells below start from.
jobsite = "https://realpython.github.io/fake-jobs/"

In [9]:
# Fetch the job listing page and parse its HTML.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
with requests.get(jobsite, timeout=30) as response:
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

In [10]:
# Collect the URL behind every 'Apply' link on the listing page.
# href=True skips anchors that have no link target (indexing them would raise
# KeyError), and comparing the stripped text tolerates whitespace around the label.
tags = soup.find_all('a', href=True)
targets = [tag['href'] for tag in tags if tag.get_text(strip=True) == 'Apply']

In [11]:
# Download each job advert and collect its fields as one tuple per advert.
job_ads = []
# A single Session reuses the connection across all detail-page requests.
with requests.Session() as session:
    for job_url in targets:
        # Distinct names are used for the URL, the response and the parsed page so
        # the loop variable is not shadowed and the listing-page `soup` is not
        # overwritten (which would break re-running the link-extraction cell).
        response = session.get(job_url, timeout=30)
        response.raise_for_status()
        job_soup = BeautifulSoup(response.content, 'html.parser')

        # Query the paragraphs once instead of re-scanning the document per field.
        paragraphs = job_soup.find_all('p')

        # Fields are picked by position: second <h1>, first <h2>, and the
        # second to fourth <p> elements of the page.
        title = job_soup.find_all('h1')[1].text
        company = job_soup.find('h2').text
        desc = paragraphs[1].text
        location = paragraphs[2].text
        date = paragraphs[3].text
        job_ads.append((title, company, desc, location, date))

In [12]:
# One row per scraped advert; the column order mirrors the tuple layout in job_ads.
df = pd.DataFrame(
    data=job_ads,
    columns=['Title', 'Company', 'Description', 'Location', 'Date'],
)

In [13]:
#create functions for NLP feature engineering
def remove_stopwords(tokens, stop_words=None):
    """Return the tokens that are not stop words.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter.
    stop_words : iterable of str, optional
        Words to drop. Defaults to NLTK's English stop-word list.

    Returns
    -------
    list of str
        The surviving tokens, in their original order and original casing.
        Matching is case-insensitive, so 'The' is dropped when 'the' is a
        stop word.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup set once per call rather than re-reading the list per token.
    stop_set = {word.lower() for word in stop_words}
    return [token for token in tokens if token.lower() not in stop_set]

def tokenise(dataframe, series):
    """Tokenise a text column and store the result in ``dataframe['Tokens']``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that receives the new 'Tokens' column (modified in place).
    series : pandas.Series
        Text to tokenise, one document per row.

    Each document is split with ``word_tokenize`` and English stop words are
    removed from the resulting token list. Nothing is returned.
    """
    # Load the stop-word list once for the whole column instead of once per row.
    english_stop_words = frozenset(stopwords.words('english'))
    dataframe['Tokens'] = series.apply(
        lambda text: remove_stopwords(word_tokenize(text), english_stop_words)
    )

    

In [14]:
# Add the 'Tokens' column to df, derived from the job descriptions.
tokenise(df, df['Description'])

In [15]:
def vectorise(dataframe, series, random_state=42):
    """Embed a text column in two dimensions using TF-IDF followed by t-SNE.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Not used by the computation; kept so existing calls keep working.
    series : pandas.Series or iterable of str
        One text document per entry.
    random_state : int or None, default 42
        Seed passed to t-SNE so repeated runs give the same layout. Pass
        ``None`` for an unseeded run.

    Returns
    -------
    pandas.DataFrame
        Columns 'x' and 'y', one row per document. When ``series`` is a pandas
        Series the result carries its index, so a column-wise concat with the
        source frame lines up row for row.
    """
    vectoriser = TfidfVectorizer()
    X = vectoriser.fit_transform(series)

    # t-SNE's perplexity must stay below the number of samples; cap the usual
    # default of 30 so small corpora do not fail.
    n_samples = X.shape[0]
    perplexity = min(30.0, max(n_samples - 1, 1))

    # init='random' is kept from the original call; X is a sparse TF-IDF matrix.
    embedder = TSNE(
        n_components=2,
        init='random',
        perplexity=perplexity,
        random_state=random_state,
    )
    X_embedded = embedder.fit_transform(X)

    row_index = series.index if isinstance(series, pd.Series) else None
    coords = pd.DataFrame(X_embedded, columns=['x', 'y'], index=row_index)
    return coords
    



In [16]:
# 2-D coordinates for every job description.
coords = vectorise(df, df['Description'])

In [17]:
# Place the coordinate columns alongside the job table.
processed_data = pd.concat(objs=[df, coords], axis='columns')

In [22]:
# Interactive scatter of the embedded adverts; hovering a point shows its job title.
# The title and axis labels let the figure be read on its own.
fig1 = px.scatter(
    processed_data,
    x='x',
    y='y',
    hover_name='Title',
    title='Job descriptions projected to 2-D (TF-IDF + t-SNE)',
    labels={'x': 't-SNE dimension 1', 'y': 't-SNE dimension 2'},
)
fig1.show()