In [1]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup 
import nltk
import numpy as np
import re
from nltk.corpus import wordnet

In [15]:
def raw_text(url, timeout=10):
    """Download a LinkedIn job ad and return the text of its description body.

    Parameters
    ----------
    url : str
        Address of the job-ad page to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout ``requests.get`` may block indefinitely.

    Returns
    -------
    str
        Text of the first ``<div>`` whose class attribute matches the
        description-markup class string used below.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    IndexError
        If the page contains no matching description ``<div>``.
    """
    ad = requests.get(url, timeout=timeout)  # Retrieve the webpage
    ad.raise_for_status()  # Fail loudly on HTTP errors instead of parsing an error page
    html = BeautifulSoup(ad.text, 'html.parser')  # Parse the HTML into a searchable tree
    matches = html.find_all('div',
                            {'class': "show-more-less-html__markup show-more-less-html__markup--clamp-after-5"})
    if not matches:
        # Same exception type as indexing an empty result list, but with a
        # message that says what actually went wrong.
        raise IndexError("No job-description <div> found at " + str(url))
    return matches[0].text
def clean_text(doc):
    """Tokenize an unstructured document and standardize every word.

    Steps: extract alphabetical tokens (keeping '+' and '#' so that "C++" and
    "C#" survive), split words that were glued together by the scrape
    ('ThisExample' -> 'This', 'Example'), lowercase, drop English stop words,
    POS-tag and lemmatize.

    Parameters
    ----------
    doc : str
        Raw text of the document.

    Returns
    -------
    list of str
        Lowercased, lemmatized tokens in the order they appear in ``doc``.
    """
    # A character class is used on purpose: inside a regex, `"+"` means
    # "one or more double quotes", not a literal plus sign.
    tokens = re.findall(r'[a-zA-Z#+]+', doc)

    words = []
    for token in tokens:
        if token.isupper() or token.islower():
            # Uniform case: a regular word or an acronym, keep as is
            words.append(token)
        else:
            # Mixed case: words stuck together, split before each capital
            words.extend(re.findall('[a-zA-Z][^A-Z]*', token))

    # Document order is preserved because the POS tagger below uses context.
    stopwords = set(nltk.corpus.stopwords.words("english"))  # set -> O(1) lookups
    words = [word.lower() for word in words]  # Lowercase all words
    words = [word for word in words if word not in stopwords]  # Filter out stop words

    tagged = nltk.pos_tag(words)  # Tag each word so the lemmatizer knows its part of speech
    lemmatizer = nltk.WordNetLemmatizer()
    # wordnet_pos converts the tagger's tag into one the lemmatizer understands
    return [lemmatizer.lemmatize(word, wordnet_pos(tag)) for word, tag in tagged]
def wordnet_pos(tag):
    """Translate a POS tag (as returned by nltk.pos_tag) into a WordNet POS constant.

    Only the first letter of the tag is inspected: N -> noun, V -> verb,
    R -> adverb, J -> adjective. Every other tag falls back to noun.
    """
    # Mapping taken from a function created by Bo Ning in Week 6-2.
    wordnet_by_initial = dict(zip("NVRJ",
                                  (wordnet.NOUN, wordnet.VERB, wordnet.ADV, wordnet.ADJ)))
    initial = tag[0]
    if initial in wordnet_by_initial:
        return wordnet_by_initial[initial]
    return wordnet.NOUN  # Default to a noun
def lang_count(TXT, languages=None):
    """Count how many distinct programming languages / tools are mentioned.

    Parameters
    ----------
    TXT : list of str
        Cleaned tokens of the ad. Membership is tested with ``in``, so with a
        token list short names such as 'r' and 'c' only match whole tokens.
    languages : iterable of str, optional
        Keywords to look for. Defaults to the built-in list below.

    Returns
    -------
    int
        Number of keywords that occur at least once in ``TXT``; repeated
        mentions of the same keyword count once.
    """
    if languages is None:
        languages = ('python', 'r', 'sql', 'sa', 'c',  # SAS turns into 'sa' after lemmatization
                     'c++', 'c#', 'java', 'javascript',
                     'julia', 'matlab', 'swift', 'tableau',
                     'microsoft', 'github', 'excel')
    # Each membership test is a bool; summing them counts the keywords present
    return sum(language in TXT for language in languages)
def get_salary(TXT):
    """From a body of raw text, retrieve the highest salary figure mentioned.

    Recognizes dollar amounts written with thousands separators and optional
    cents, e.g. "$70,000", "$70,000.00" or "$1,200,000".

    Parameters
    ----------
    TXT : str
        Raw text of the ad.

    Returns
    -------
    str
        The matched amount with the largest numeric value, exactly as written
        in the text, or the string "NaN" when no amount is found.
    """
    # One pattern covers both the decimal and the non-decimal spelling, so a
    # range such as "$50,000.00 to $70,000" is compared as a whole.
    amounts = re.findall(r"\$\d+(?:,\d+)+(?:\.\d{1,2})?", TXT)
    if not amounts:
        return "NaN"  # No salary present
    # Compare by numeric value rather than relying on position in the text
    return max(amounts, key=lambda amount: float(amount[1:].replace(",", "")))
def ML_skill(TXT):
    """Check whether machine learning is mentioned in a body of clean text.

    Parameters
    ----------
    TXT : list of str
        Cleaned tokens of the ad.

    Returns
    -------
    str
        'Yes' if the tokens contain 'ml', or 'machine' together with
        'learn' / 'learning'; otherwise 'No'.
    """
    # The lemmatizer may leave "learning" untouched when it is tagged as a
    # noun, so both spellings are accepted.
    mentions_phrase = 'machine' in TXT and ('learn' in TXT or 'learning' in TXT)
    return 'Yes' if mentions_phrase or 'ml' in TXT else 'No'
def get_edu(TXT):
    """Using a body of raw text, retrieve the highest education level mentioned.

    Levels are checked from highest to lowest, so the first hit wins.

    Parameters
    ----------
    TXT : str
        Raw text of the ad.

    Returns
    -------
    str
        "PhD", "Master", "Bachelor", or "NaN" when no level is mentioned.
    """
    if "PhD" in TXT or 'Ph.D' in TXT:  # Start with the highest level
        return "PhD"
    # The two-letter abbreviations must stand alone as words; a plain
    # substring test would also fire inside "MATLAB", "DBMS", "JOBS", ...
    if "Master" in TXT or re.search(r'\b(?:MS|MA)\b', TXT):
        return "Master"
    if "Bachelor" in TXT or re.search(r'\b(?:BS|BA)\b', TXT):
        return "Bachelor"
    return "NaN"  # No education specified
def benefits(TXT):
    """Using a body of raw text, check if benefits are mentioned.

    Parameters
    ----------
    TXT : str
        Raw text of the ad.

    Returns
    -------
    str
        'Yes' if "benefit" occurs in the text in any letter case
        (so "Benefits" and "BENEFITS" both count), otherwise 'No'.
    """
    # Lowercasing first makes the check independent of capitalization
    return 'Yes' if 'benefit' in TXT.lower() else 'No'
def exp(TXT):
    """Using a body of raw text, retrieve the years of experience mentioned.

    Looks for a number written directly in front of "year(s)" / "yr(s)",
    allowing forms such as "3+ years", "3-5 years", "3 to 5 years" and
    "2 or more years". For a range the lower bound is returned.

    Parameters
    ----------
    TXT : str
        Raw text of the ad.

    Returns
    -------
    str
        Digits of the first such number in the text, or "NaN" if none exists.
    """
    # The number has to sit right before the unit. A looser pattern such as
    # r"\d+.*year" would latch onto any earlier number in the sentence
    # (a founding year, "40 hours/week", ...).
    match = re.search(
        r"(\d+)(?:\.\d+)?\s*\+?\s*"               # the number, optional decimals and '+'
        r"(?:(?:[-\u2013]|to)\s*\d+\s*\+?\s*)?"   # optional upper bound of a range
        r"(?:or more\s+)?"
        r"(?:years?|yrs?)\b",
        TXT,
        flags=re.IGNORECASE,
    )
    return match.group(1) if match else "NaN"
def collect_data(url):
    """Scrape one LinkedIn ad and gather every extracted field into a dict.

    The raw description text feeds the salary, education, benefits and
    experience extractors; the cleaned token list feeds the language count
    and the machine-learning check. The URL itself is stored under 'url'.
    """
    raw_body = raw_text(url)
    tokens = clean_text(raw_body)

    record = {}
    record['Languages'] = lang_count(tokens)
    record['Salary'] = get_salary(raw_body)
    record['Machine Learning'] = ML_skill(tokens)
    record['Education'] = get_edu(raw_body)
    record['Benefits'] = benefits(raw_body)
    record['Experience'] = exp(raw_body)
    record['url'] = url
    return record

In [13]:
# Example LinkedIn job-ad URL used by the cells below to exercise the helpers.
url = 'https://www.linkedin.com/jobs/view/3509038110/?alternateChannel=search&refId=G%2BHojJUbFYAcF8KaPjs4lw%3D%3D&trackingId=3fxp2x2KsGBgYD%2FIzR%2FPyw%3D%3D'

In [22]:
# Scrape the description text for the example URL and display the raw string.
a = raw_text(url)
a

'\n        As an intern at Synopsys, you will gain hands-on experience while working alongside industry professionals. You will develop and refine skills relevant to your major and future career by contributing to high-impact business projects. The Synopsys intern program integrates our interns into the culture with career opportunities upon graduation.This summer internship is full-time (40 hours/week) beginning May/June 2023 and will last three monthResponsibilitiesIntern will be part of a dynamic Business Intelligence team under Enterprise Marketing Communications, supporting the Data Over Time (DOT) initiative and DOT-managed projects at Synopsys. The Data Over Time (DOT) initiative is designed to accelerate digital transformation at One Synopsys through effective data governance and delivery of data analytics and insights resulting in data-driven decisions and actions. The Intern will contribute to data analytics and data science projects that provide value to various business gro

In [8]:
# Tokenize / lemmatize the scraped text.
# NOTE(review): this cell's execution count is lower than that of the cell
# assigning `a`, so the stored output may belong to a different ad --
# re-run with Restart Kernel -> Run All to confirm.
clean_text(a)

['usa',
 'department',
 'aspects',
 'clinical',
 'development',
 'process',
 'range',
 'clinical',
 'trial',
 'design',
 'regulatory',
 'submission',
 'apply',
 'scientific',
 'rigor',
 'statistical',
 'method',
 'interpretation',
 'result',
 'also',
 'advise',
 'conduct',
 'clinical',
 'study',
 'database',
 'development',
 'data',
 'quality',
 'assurance',
 'analysis',
 'clinical',
 'endpoint',
 'mind',
 'lead',
 'one',
 'clinical',
 'trial',
 'lead',
 'coordination',
 'analysis',
 'study',
 'report',
 'document',
 'provide',
 'program',
 'validation',
 'support',
 'core',
 'stakeholder',
 'provide',
 'statistical',
 'expertise',
 'support',
 'new',
 'product',
 'development',
 'npd',
 'regulatory',
 'submission',
 'e',
 'g',
 'pma',
 'ce',
 'pmda',
 'regulatory',
 'document',
 'project',
 'study',
 'team',
 'coordinate',
 'communicate',
 'management',
 'team',
 'member',
 'regard',
 'project',
 'study',
 'status',
 'timeline',
 'statistical',
 'expertise',
 'ad',
 'hoc',
 'data',
 '

In [16]:
# Run the full extraction pipeline on the example ad.
collect_data(url)

{'Languages': 2,
 'Salary': 'NaN',
 'Machine Learning': 'Yes',
 'Education': 'Master',
 'Benefits': 'Yes',
 'Experience': 'NaN',
 'url': 'https://www.linkedin.com/jobs/view/3509038110/?alternateChannel=search&refId=G%2BHojJUbFYAcF8KaPjs4lw%3D%3D&trackingId=3fxp2x2KsGBgYD%2FIzR%2FPyw%3D%3D'}

In [12]:
# Check how an all-caps acronym comes out of clean_text
# (ML_skill looks for the token 'ml').
clean_text('ML')

['ml']

In [11]:
# Substring check: the singular 'benefit' also matches the plural 'benefits'.
'benefit' in 'benefits'

True

In [17]:
# Scratch check: sum() needs an iterable, so passing a bare boolean raises TypeError.
sum(1 == 1)

TypeError: 'bool' object is not iterable

In [20]:
# Wrapping the comparison in a list gives sum() an iterable; a False entry counts as 0.
sum([1 == 0])

0