In [None]:

# Mount Google Drive at /content/drive; force_remount=True re-mounts even if it is already mounted.
# The model and data paths used later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Document Ranking: BM25 -- Whole Dataset


## Importing libraries and defining functions
- Importing all necessary packages for the project (BM25, NLTK, spaCy, etc.)
- Defining the helper functions used throughout the project
- Loading our spaCy trained model for entity extraction and further pre-processing

In [None]:
# Install the rank-bm25 package, which provides the BM25 classes imported in the next cell
!pip install rank-bm25



In [None]:
## import from BM25 package: https://github.com/dorianbrown/rank_bm25
from rank_bm25 import BM25Plus, BM25Okapi, BM25L

import pandas as pd
import nltk
import ast
import string
import spacy

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources the helper functions rely on:
# the stopword list, the punkt tokenizer data (word_tokenize) and WordNet (WordNetLemmatizer)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
## Load the spaCy pipeline stored on the shared drive. `nlp` is applied to the raw
## description column later in the notebook to extract entities (`nlp(text).ents`).
nlp = spacy.load("/content/drive/Shareddrives/SI650 Project [Info Retrieval]/entity-model/")

### Helper Functions
- <b>process_text(df_col):</b> function to clean the description column (and title, in this notebook)
- <b>standard_query(q):</b> function to standardize query for search (lemmatized, cleaned)
- <b>query_scores(q):</b> function to get the scores for a given query (used in next helper function)
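- <b>top_n_queries(query, corpus, n, column):</b> return the rows of the full dataframe whose <code>column</code> text is among the top-n BM25 matches for the query, together with a naive 0/1 relevance flag
- <b>retrieve_docs(query, **filters):</b> return a dataframe with the top results for the query, excluding postings that match any of the filter keywords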
- <b>evaluate_mAP (retrieved_docs, rel_column):</b> evaluate performance of retrieval using mean Average Precision @ 20 (Note: just for ground truth annotated data)

In [None]:
def process_text(df_col):
    """
    Clean a column of raw text into lists of lemmatized tokens.

    Every entry is lowercased, stripped of ASCII punctuation, tokenized,
    filtered for English stopwords and lemmatized with WordNet; finally the
    "show more / show less" residue tokens are dropped.

    :input: iterable of strings (a dataframe column such as description, title or company)
    :output: list with one list of clean, lemmatized tokens per input entry;
             entries that are not strings (e.g. NaN for a missing value) give an empty list
    """
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    # raw string: same pattern as before, but "\w", "\$", "\d", ... are no longer
    # invalid string escapes (which newer Python versions warn about)
    tokenizer = RegexpTokenizer(pattern=r'\w+|\$[\d\.]+|\S+')

    ## tokens left over from the "show more/less" idiosyncrasy of the source text
    to_remove = {'show', 'moreshow', 'less'}

    ret_list = []

    for d in df_col:
        ## missing / non-text values cannot be lowercased: they contribute no tokens
        if not isinstance(d, str):
            ret_list.append([])
            continue

        ## lowercase text
        text = d.lower()
        ## remove punctuation, keep it as string
        text = "".join(c for c in text if c not in string.punctuation)
        ## tokenization
        token_text = tokenizer.tokenize(text)
        ## filter out stopwords
        filt_tokens = [t for t in token_text if t not in stop_words]
        ## lemmatize (lemmas were preferred over stems for this project)
        lemma_tokens = [lemmatizer.lemmatize(t) for t in filt_tokens]
        ## drop the "show more/less" residue
        lemma_tokens = [t for t in lemma_tokens if t not in to_remove]

        ret_list.append(lemma_tokens)

    return ret_list

# input: query string, return: lemmatized query string
def standard_query(q):
    """
    Standardize a query the same way the document text is cleaned by process_text.

    The query is lowercased, stripped of ASCII punctuation, tokenized,
    filtered for English stopwords and lemmatized.

    :input: query string
    :output: clean, standardized query string (lemmatized tokens joined by single spaces)
    """
    stop_words = set(stopwords.words("english"))
    # raw string: same pattern, without invalid string escapes
    tokenizer = RegexpTokenizer(pattern=r'\w+|\$[\d\.]+|\S+')
    lemmatizer = WordNetLemmatizer()

    # The punctuation is stripped from the *lowercased* text. The previous version
    # stripped it from the raw `q`, which discarded the lowercasing, so capitalized
    # stopwords ("The", "And", ...) slipped through and capitalized words were lemmatized as-is.
    text = q.lower()
    text = "".join(c for c in text if c not in string.punctuation)
    token_text = tokenizer.tokenize(text)
    filt_tokens = [t for t in token_text if t not in stop_words]

    lemmatized_query = [lemmatizer.lemmatize(t) for t in filt_tokens]

    # the final lower() is kept so the output is lowercase whatever the lemmatizer returns
    query_str = " ".join(lemmatized_query).lower()

    return query_str
# standard_query("devops engineer")


# input: query string, return: bm25 scores list (can get top n scores)
def query_scores(q):
    """
    Score every indexed document against a query.

    The query goes through the same cleaning as the documents (lowercase,
    punctuation removal, tokenization, stopword filtering, lemmatization) and
    the resulting tokens are scored by the notebook-level `bm25` model.

    :input: query string
    :output: document scores for the query, as returned by bm25.get_scores
    """
    stop_words = set(stopwords.words("english"))
    # raw string: same pattern, without invalid string escapes
    tokenizer = RegexpTokenizer(pattern=r'\w+|\$[\d\.]+|\S+')
    lemmatizer = WordNetLemmatizer()

    # same preprocessing applied in process_text
    text = q.lower()
    text = "".join(c for c in text if c not in string.punctuation)
    token_text = tokenizer.tokenize(text)
    filt_tokens = [t for t in token_text if t not in stop_words]
    lemmatized_query = [lemmatizer.lemmatize(t) for t in filt_tokens]

    # `bm25` is the model fitted in the "BM25 Model Retrieval" section of the notebook
    doc_scores = bm25.get_scores(lemmatized_query)

    return doc_scores
# query_scores("machine learning engineer")


# input: query string, corpus, top_n; return: df subset with entries from Full_DF, including naive relevance score
def top_n_queries(query, corpus, n, column):
    """
    Return the dataframe rows that match the top-n BM25 documents for a query.

    Relies on the notebook-level `bm25` model and `df` dataframe.

    :input: query string; corpus (the list of document strings `bm25` was fitted on,
            in the same order); n (number of top documents to keep); column (name of
            the df column that holds the corpus strings)
    :output: copy of the df rows whose `column` value is among the top-n documents, with an
             extra "in_title" column: 1 if the standardized query is a substring of the
             row's `column` text, else 0
    """
    query = standard_query(query)
    tokenized_query = query.split(" ")

    # BM25 expects a list of tokens. Passing the raw string (as before) makes it
    # iterate over single characters and score those instead of the query words.
    top_n = bm25.get_top_n(tokenized_query, corpus, n=n)

    results = df.copy()

    # naive relevance measure: 1 if the whole standardized query appears in the text, else 0
    results["in_title"] = [1 if query in text else 0 for text in results[column]]

    return results[results[column].isin(top_n)]


def retrieve_docs(query, **filters):
    """
    Retrieve the best-matching postings for a query with the fitted BM25 model,
    optionally excluding postings that mention unwanted keywords.

    Relies on the notebook-level `corpus` and `df`, and on the helpers
    standard_query, top_n_queries and query_scores.

    :input: query string; optional keyword arguments whose values are strings of
            keywords to exclude (e.g. filters="jp morgan security")
    :output: dataframe with at most 50 postings, sorted by descending "bm25_score",
             with a fresh 0..k-1 index
    """
    # all keyword values are stringified together; standard_query then strips the
    # list punctuation and normalizes the keywords like any other query
    clean_filters = standard_query(str(list(filters.values())))
    filter_tokens = word_tokenize(clean_filters)

    ## df with every posting plus its score for this query (copy: a column is added below)
    given_query = top_n_queries(query, corpus, len(df), "company_title_entities").copy()
    given_query["bm25_score"] = query_scores(query)

    if filter_tokens:
        ## tokens joined by | act as OR: a posting is dropped if it contains any of them
        masking_filter_string = "|".join(filter_tokens)
        excluded = given_query.company_title_entities.str.contains(masking_filter_string)
        ret_df = given_query[~excluded]
    else:
        ## no filter keywords: an empty pattern would match (and so exclude) every row,
        ## so the filtering step is skipped entirely
        ret_df = given_query

    ret_df = ret_df.sort_values(by="bm25_score", ascending=False)[:50]

    return ret_df.reset_index(drop=True)

def evaluate_mAP(retrieved_docs, rel_column):
    """
    Compute the average precision of a ranked result list with ground-truth annotations.

    The rows are read in order as ranks 1, 2, 3, ...; the precision at each rank that
    holds a relevant document (value > 0) is averaged over the number of relevant
    documents found. No cut-off is applied here: pass only the top 20 rows to get the
    score @ 20.

    :input: retrieved docs df, string of relevant column (either 'ds_rel', 'ux_rel', 'security_rel')
    :output: average precision rounded to 6 decimals; 0.0 when no retrieved document
             is relevant (this case used to raise ZeroDivisionError)
    """
    relevant_count = 0
    precision_sum = 0.0

    for rank, rel in enumerate(retrieved_docs[rel_column], 1):
        if rel > 0:
            relevant_count += 1
            # precision at this rank = relevant documents seen so far / rank
            precision_sum += relevant_count / rank

    if relevant_count == 0:
        return 0.0

    return round(precision_sum / relevant_count, 6)


## Data Preparation and Further Pre-Processing
- Read in Full data (from our folder) to build our corpus
- Creating different variations of df columns (detailed below)

In [None]:
## read the complete dataset from the shared drive; it is the basis of the corpus
Full_DF = pd.read_csv("/content/drive/Shareddrives/SI650 Project [Info Retrieval]/data/FULL_DF.csv")

# description_clean is stored in the CSV as the string form of a list of strings;
# literal_eval turns each cell back into a real list
Full_DF["description_clean"] = Full_DF["description_clean"].apply(ast.literal_eval)

In [None]:
# Show one random row to sanity-check the loaded columns
Full_DF.sample(1)

Unnamed: 0,job_id,link,apply_link,title,company,place,description,date,seniority,job_function,employment_type,industries,description_clean,english
6611,2220588268,https://www.linkedin.com/jobs/view/data-scien...,https://www.linkedin.com/jobs/view/externalAp...,Data Scientist,Virgin Galactic,"Los Angeles, California, United States",Who We Are\n\nVirgin Galactic www.VirginGalac...,2020-10-23,Not Applicable,"Engineering, Information Technology",Full-time,"Information Technology and Services, Aviation...","[virgin, galactic, wwwvirgingalacticcom, track...",1


In [None]:
## The helper functions read a notebook-level dataframe called `df`, so the full data is bound to that name.
## Note: this is an alias, not a copy -- columns added to `df` below are also added to `Full_DF`.
df = Full_DF

In [None]:
# Show one random row of `df` to confirm it holds the full data
df.sample(1)

Unnamed: 0,job_id,link,apply_link,title,company,place,description,date,seniority,job_function,employment_type,industries,description_clean,english
7021,2248381553,https://www.linkedin.com/jobs/view/product-gr...,https://www.linkedin.com/jobs/view/externalAp...,Product Growth Analyst,Facebook,"New York, New York, United States","A prominent, data based global technology fir...",2020-10-27,Entry level,"Engineering, Information Technology",Full-time,"Information Technology and Services, Computer...","[prominent, data, based, global, technology, f...",1


In [None]:
# Size of the dataframe as (rows, columns)
df.shape

(12546, 14)

### Adding Columns to our DataFrame
- "title_clean" (list and string)
- "description_clean_string"
- "title_and_description"
- "entities"
- "entities_list"
- "entities_clean_string"
- "title_and_entities"
- "company_title_entities"

In [None]:
## the title has no cleaned version yet: run it through the shared text-cleaning helper
df["title_clean"] = process_text(df["title"])

## flatten the token lists into space-separated strings so they can be concatenated
df["title_clean_string"] = df["title_clean"].map(" ".join)
df["description_clean_string"] = df["description_clean"].map(" ".join)

## clean title followed by clean description, as one candidate text column for retrieval
df["title_and_description"] = df["title_clean_string"] + " " + df["description_clean_string"]


In [None]:
## run the spaCy model over every raw description and keep the extracted entities as one string:
## the `ents` tuple is converted with str() and its parentheses and commas are stripped
## (note: parentheses/commas occurring inside an entity's own text are removed as well)
df["entities"] = df["description"].apply(lambda x: str((nlp(x).ents)).replace("(", "").replace(")", "").replace(",", ""))

## clean the entity strings with the same helper used for the other text columns;
## process_text returns one list of clean tokens per row
df["entities_list"] = process_text(df.entities)

## join each token list back into a clean string for concatenation with the title
df["entities_clean_string"] = [" ".join(i) for i in df.entities_list]

## concatenating title and entities (similar to concatenating title and description)
df["title_and_entities"] = df.title_clean_string + " " + df.entities_clean_string

In [None]:
## clean the company names: token list first, then a space-separated string for concatenation
df["company_token_list"] = process_text(df["company"])
df["company_clean_string"] = df["company_token_list"].map(" ".join)

## company name + job title + extracted entities in a single text column
df["company_title_entities"] = (
    df["company_clean_string"]
    + " "
    + df["title_clean_string"]
    + " "
    + df["entities_clean_string"]
)

In [None]:
# Show one random row to inspect the newly added columns
df.sample(1)

Unnamed: 0,job_id,link,apply_link,title,company,place,description,date,seniority,job_function,employment_type,industries,description_clean,english,title_clean,title_clean_string,description_clean_string,title_and_description,entities,entities_list,entities_clean_string,title_and_entities,company_token_list,company_clean_string,company_title_entities
8289,2251347305,https://www.linkedin.com/jobs/view/c%23-dw-fu...,https://www.linkedin.com/jobs/view/externalAp...,C# DW Full Stack Developer,CareerAddict,"Chicago, IL",\nC# Full Stack Developer\n\n*We are unable t...,2020-10-29,Entry level,"Engineering, Information Technology",Full-time,Computer & Network Security\n,"[c, full, stack, developer, unable, sponsor, p...",1,"[c, dw, full, stack, developer]",c dw full stack developer,c full stack developer unable sponsor permanen...,c dw full stack developer c full stack develop...,C# unable to sponsor API Report development SS...,"[c, unable, sponsor, api, report, development,...",c unable sponsor api report development ssrs d...,c dw full stack developer c unable sponsor api...,[careeraddict],careeraddict,careeraddict c dw full stack developer c unabl...


## BM25 Model Retrieval
- For our final model, we use BM25L and the "company_title_entities" column as the corpus


In [None]:
# Show one random row before fitting the model (the last column is the text that gets indexed)
df.sample(1)

Unnamed: 0,job_id,link,apply_link,title,company,place,description,date,seniority,job_function,employment_type,industries,description_clean,english,title_clean,title_clean_string,description_clean_string,title_and_description,entities,entities_list,entities_clean_string,title_and_entities,company_token_list,company_clean_string,company_title_entities
10153,2243755423,https://www.linkedin.com/jobs/view/automotive...,https://www.linkedin.com/jobs/view/externalAp...,Automotive Systems Engineer,ESG Automotive USA,"Detroit, Michigan, United States",Mission of the Position:\n\n\n\n\n\nThe Syste...,2020-10-01,Mid-Senior level,Engineering,Full-time,Automotive\n,"[mission, position, system, engineer, advanced...",1,"[automotive, system, engineer]",automotive system engineer,mission position system engineer advanced engi...,automotive system engineer mission position sy...,Mission leading- team IP Magna's Health Health...,"[mission, leading, team, ip, magnas, health, h...",mission leading team ip magnas health health f...,automotive system engineer mission leading tea...,"[esg, automotive, usa]",esg automotive usa,esg automotive usa automotive system engineer ...


In [None]:
### To experiment, swap the BM25 variant and/or the column that is indexed.
### The final model uses company + title + entities with BM25L.

## the corpus is the plain-string version of the target column
## ("company_title_entities" is hardcoded as the column used for indexing)
corpus = list(df["company_title_entities"].values)

## BM25 works on tokens: split every document on single spaces
tokenized_corpus = [document.split(" ") for document in corpus]

## fit the BM25 variant used for retrieval
## https://github.com/dorianbrown/rank_bm25/blob/master/rank_bm25.py
bm25 = BM25L(tokenized_corpus) #k1=1.5, b=0.75, delta=0.5
bm25

<rank_bm25.BM25L at 0x7f72fd26c5f8>

## Retrieve Documents
- input: query string, [optional] filters string
- output: top 50 job postings according to our retrieval model

In [None]:
## Example search against the BM25 model fitted above

## to test out the retrieval model, change the query and filters to whatever.
## QUERY: a string with a job title, maybe some skills
## FILTERS: a string with a few keywords you want to exclude from your search results

query = "developer engineer sql python"
filters = "jp morgan security"

## the filters string is passed as a keyword argument (retrieve_docs collects keyword arguments via **filters)
retrieved_documents = retrieve_docs(query=query, filters=filters)

In [None]:
## user-facing view of the results for someone looking for a job: the raw description is
## not very "clean", so the link to the posting is shown instead
## (add "bm25_score" to the list to inspect the scores)
cols_return = [
    "title",
    "company",
    "seniority",
    "job_function",
    "employment_type",
    "link",
]

## top 5 documents only
retrieved_documents[:5][cols_return]

Unnamed: 0,title,company,seniority,job_function,employment_type,link
0,SQL Developer,EdgeLink,Mid-Senior level,Information Technology,Contract,https://www.linkedin.com/jobs/view/sql-develo...
1,Python / Django Developer,"OneinaMil, LLC",Entry level,"Engineering, Information Technology",Full-time,https://www.linkedin.com/jobs/view/python-dja...
2,Financial Analyst II,Adventist HealthCare,Mid-Senior level,Information Technology,Contract,https://www.linkedin.com/jobs/view/financial-...
3,"Data Analyst, Finance",Faire,Associate,"Business Development, Sales",Full-time,https://www.linkedin.com/jobs/view/data-analy...
4,"Operations Analyst, Inspection Center Team",Carvana,Associate,"Business Development, Sales",Full-time,https://www.linkedin.com/jobs/view/operations...
