In [1]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup 
import nltk
import numpy as np
import re
from nltk.corpus import wordnet

In [2]:
def raw_text(url, timeout=30):
    """Scrape a web page and return the text of its job-description block.

    Downloads ``url`` and returns the text of the first ``div`` whose class
    attribute is ``show-more-less-html__markup
    show-more-less-html__markup--clamp-after-5`` (the container this notebook
    looks for in a LinkedIn ad).

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely.

    Returns:
        The text content of the first matching ``div``.

    Raises:
        IndexError: If the page contains no matching ``div``.
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    ad = requests.get(url, timeout=timeout)  # Retrieve webpage
    html = BeautifulSoup(ad.text, 'html.parser')  # Parse the HTML into a searchable tree
    text_body = html.find_all(
        'div',
        {'class': "show-more-less-html__markup show-more-less-html__markup--clamp-after-5"})
    if not text_body:
        # Same exception type the bare ``text_body[0]`` lookup would raise,
        # but with enough context to diagnose the failing page.
        raise IndexError(
            f"No job description block found at {url} "
            f"(HTTP status {ad.status_code})")
    return text_body[0].text
def clean_text(doc):
    """Tokenize an unstructured document into lowercase, lemmatized words.

    Steps:
      1. Extract tokens made of letters, ``#`` and ``+`` (so ``C#`` and
         ``C++`` survive as single tokens).
      2. Split mixed-case tokens such as ``'ThisExample'`` into
         ``['This', 'Example']``.
      3. Lowercase everything and drop English stopwords.
      4. POS-tag the words and lemmatize each one with its tag.

    Note that single-case tokens come first in the result, followed by the
    pieces of the split mixed-case tokens, so the output does not preserve
    the original word order of ``doc``.

    Args:
        doc: Raw text to clean.

    Returns:
        A list of cleaned, lemmatized, lowercase words.
    """
    # The previous pattern used '"+"', which matches double quotes rather
    # than a literal plus sign, so "C++" was reduced to "C".
    words = re.findall(r'[a-zA-Z#+]+', doc)
    # Tokens written in a single case are taken as they are.
    clean = [w for w in words if w.isupper() or w.islower()]
    # Mixed-case tokens are treated as words glued together.
    dirty = [w for w in words if not w.islower() and not w.isupper()]
    # Split at each capital letter, e.g. 'ThisExample' -> ['This', 'Example'].
    clean2 = [part for w in dirty for part in re.findall('[a-zA-Z][^A-Z]*', w)]
    words = clean + clean2  # Combine all the words together
    # A set gives constant-time membership tests while filtering.
    stopwords = set(nltk.corpus.stopwords.words("english"))
    words = [w.lower() for w in words]  # Lowercase all words
    words = [w for w in words if w not in stopwords]  # Filter out stopwords
    tag_words = nltk.pos_tag(words)  # Tag each word with its part of speech
    lemmatizer = nltk.WordNetLemmatizer()
    # Convert each tag into one the lemmatizer understands, then lemmatize.
    return [lemmatizer.lemmatize(w, wordnet_pos(t)) for w, t in tag_words]
def wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet POS constant.

    Only the first character of ``tag`` is inspected: ``N`` maps to noun,
    ``V`` to verb, ``R`` to adverb and ``J`` to adjective. Any other first
    character falls back to noun.
    """
    # Function created by Bo Ning in Week 6-2
    initial = tag[0]
    if initial == "N":
        return wordnet.NOUN
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    if initial == "J":
        return wordnet.ADJ
    # Anything unrecognised is treated as a noun.
    return wordnet.NOUN
def lang_count(TXT):
    """Count how many known programming languages/tools appear in clean text.

    Each entry of the internal list is counted at most once, no matter how
    many times it occurs in ``TXT``.

    Args:
        TXT: Cleaned text to search; membership is tested with ``in``, so a
            list of tokens gives whole-word matching.

    Returns:
        The number of list entries found in ``TXT`` as an ``int``.
    """
    # SAS turns into 'sa' after lemmatization.
    # A comma was missing after 'tableau', which silently merged it with
    # 'microsoft' into 'tableaumicrosoft' so neither was ever counted.
    languages = ['python', 'r', 'sql', 'sa', 'c',
                 'c++', 'c#', 'java', 'javascript',
                 'julia', 'matlab', 'swift', 'tableau',
                 'microsoft', 'github']
    # ADD MORE LANGUAGES IF NECESSARY
    # Check whether each language is in the ad and sum the hits.
    return sum(lang in TXT for lang in languages)
def get_salary(TXT):
    """Pull a salary figure out of raw text.

    Amounts written with cents (e.g. ``$95,500.50``) take priority; if none
    exist, amounts without cents (e.g. ``$120,000``) are used. In either
    case the last match in the text is returned. The string ``"NaN"`` is
    returned when no amount is found.
    """
    patterns = (
        r"(\$\d+\,\d+\.\d{1,2})",  # dollar amount with thousands comma and cents
        r"(\$\d+\,\d+)",           # dollar amount with thousands comma only
    )
    for pattern in patterns:
        found = re.findall(pattern, TXT)
        if found != []:
            # The last amount listed is the one kept.
            return found[-1]
    return "NaN"
def ML_skill(TXT):
    """Flag whether clean text mentions both 'machine' and 'learning'.

    Returns the string "1" when both words are present in ``TXT`` and "0"
    otherwise.
    """
    required = ('machine', 'learning')
    mentions_ml = all(term in TXT for term in required)
    return "1" if mentions_ml else "0"
def get_edu(TXT):
    """Classify the education level mentioned in raw text.

    Returns one of the following string codes:
      "4" - both "Master" and "Bachelor" appear (Bachelor's is the minimum
            but a higher level is preferred)
      "3" - "PhD" appears
      "2" - only "Master" appears
      "1" - only "Bachelor" appears
      "0" - no education specified
    """
    has_master = "Master" in TXT
    has_bachelor = "Bachelor" in TXT

    # The combined category is checked first, so it wins even over "PhD".
    if has_master and has_bachelor:
        return "4"
    if "PhD" in TXT:
        return "3"
    if has_master:
        return "2"
    if has_bachelor:
        return "1"
    return "0"
def ad_type(TXT):
    """Flag whether clean text contains 'science'.

    Returns "1" when 'science' is found in ``TXT`` (taken as a data
    scientist ad) and "0" otherwise (taken as a data analyst ad).
    """
    mentions_science = "science" in TXT
    return "1" if mentions_science else "0"
def benefits(TXT):
    """Flag whether raw text mentions benefits.

    Returns "1" when either 'Benefits' or 'benefits' occurs in ``TXT`` and
    "0" otherwise. Other capitalisations are not recognised.
    """
    spellings = ('Benefits', 'benefits')
    mentioned = any(word in TXT for word in spellings)
    return "1" if mentioned else "0"
def exp(TXT):
    """Extract a number of years (e.g. of experience) from raw text.

    The text is split into sentences, and the first sentence containing a
    number followed somewhere later on the same line by ``year`` is used.
    The first run of digits in that match is returned. No check is made
    that the number actually refers to experience.

    Args:
        TXT: Raw text of the ad.

    Returns:
        The digits as a string (e.g. ``"5"``), or ``"NaN"`` when no sentence
        mentions a number of years. Previously that case raised IndexError,
        which aborted data collection for the whole ad.
    """
    for sentence in nltk.sent_tokenize(TXT):  # Split text into sentences
        matches = re.findall(r"\d+.*year", sentence)
        if matches:
            # Each match starts with digits, so this lookup cannot fail.
            return re.findall(r'\d+', matches[0])[0]
    # Same missing-value marker that get_salary uses.
    return "NaN"
def collect_data(url):
    """Scrape a LinkedIn ad and gather every extracted feature into a dict.

    The raw page text feeds the salary, education, benefits and experience
    extractors; the cleaned token list feeds the language count, machine
    learning flag and ad type. The source ``url`` is stored as well.
    """
    raw = raw_text(url)
    clean = clean_text(raw)

    record = {}
    record['Languages'] = lang_count(clean)
    record['Salary'] = get_salary(raw)
    record['Machine Learning'] = ML_skill(clean)
    record['Education'] = get_edu(raw)
    record['Type'] = ad_type(clean)
    record['Benefits'] = benefits(raw)
    record['Experience'] = exp(raw)
    record['url'] = url
    return record

In [5]:
# Example LinkedIn job ad used to try out the scraping helpers.
url = 'https://www.linkedin.com/jobs/view/3503242054/?alternateChannel=search&refId=%2Fh5nZgpQ9Su8xDDBZZA1IQ%3D%3D&trackingId=Eoeykt8O6VJhW%2FaGgfAWQA%3D%3D'

In [7]:
# Download the example ad and keep its raw description text (performs a network request).
a = raw_text(url)

In [8]:
# Show the cleaned, lemmatized tokens produced from the raw ad text.
clean_text(a)

['usa',
 'department',
 'aspects',
 'clinical',
 'development',
 'process',
 'range',
 'clinical',
 'trial',
 'design',
 'regulatory',
 'submission',
 'apply',
 'scientific',
 'rigor',
 'statistical',
 'method',
 'interpretation',
 'result',
 'also',
 'advise',
 'conduct',
 'clinical',
 'study',
 'database',
 'development',
 'data',
 'quality',
 'assurance',
 'analysis',
 'clinical',
 'endpoint',
 'mind',
 'lead',
 'one',
 'clinical',
 'trial',
 'lead',
 'coordination',
 'analysis',
 'study',
 'report',
 'document',
 'provide',
 'program',
 'validation',
 'support',
 'core',
 'stakeholder',
 'provide',
 'statistical',
 'expertise',
 'support',
 'new',
 'product',
 'development',
 'npd',
 'regulatory',
 'submission',
 'e',
 'g',
 'pma',
 'ce',
 'pmda',
 'regulatory',
 'document',
 'project',
 'study',
 'team',
 'coordinate',
 'communicate',
 'management',
 'team',
 'member',
 'regard',
 'project',
 'study',
 'status',
 'timeline',
 'statistical',
 'expertise',
 'ad',
 'hoc',
 'data',
 '

In [None]:
# Display the raw scraped text for inspection.
a