In [None]:
# Mount Google Drive at /content/drive so the shared-drive paths used later in this
# notebook (model directory and CSV files) can be read.
# NOTE(review): force_remount=True presumably re-mounts even if Drive is already mounted — confirm against Colab docs.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Document Ranking: BM25 --  Ground Truth Dataset

## Importing functions and libraries
- Importing all necessary packages for the project (BM25, NLTK, spaCy, etc.)
- Defining our personal helper functions necessary for project
- Loading our spaCy trained model for entity extraction and further pre-processing

In [None]:
!pip install rank-bm25

Collecting rank-bm25
  Downloading https://files.pythonhosted.org/packages/16/5a/23ed3132063a0684ea66fb410260c71c4ffda3b99f8f1c021d1e245401b5/rank_bm25-0.2.1-py3-none-any.whl
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.1


In [None]:
## import from BM25 package: https://github.com/dorianbrown/rank_bm25
from rank_bm25 import BM25Plus, BM25Okapi, BM25L

import pandas as pd
import nltk
import ast
import string
import spacy

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources needed by the imports above.
nltk.download('stopwords')  # English stop-word list read through stopwords.words("english")
nltk.download('punkt')      # presumably for word_tokenize / sent_tokenize; the helpers visible below use RegexpTokenizer instead
nltk.download('wordnet')    # WordNet data backing WordNetLemmatizer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
## Load the spaCy pipeline stored on the shared drive into `nlp`; it is applied to the
## description column further down to extract entities (nlp(text).ents).
nlp = spacy.load("/content/drive/Shareddrives/SI650 Project [Info Retrieval]/entity-model/")

### Helper Functions
- <b>process_text(df_col):</b> function to clean the description column (and title, in this notebook)
- <b>standard_query(q):</b> function to standardize query for search (lemmatized, cleaned)
- <b>query_scores(q):</b> function to get the scores for a given query (used in next helper function)
- <b>top_n_queries(query, corpus, n, column):</b> return a dataframe subset for the top-n BM25 matches of a query against the given column, with a naive relevance flag
- <b>retrieve_docs(query, column):</b> return a df subset with our top 20 results, ranked by BM25 score
- <b>evaluate_mAP(retrieved_docs, rel_column):</b> evaluate performance of retrieval using mean Average Precision @ 20

In [None]:
def process_text(df_col):
    """
    Clean a column of raw text (description, title, ...) into lemmatized tokens.

    Every entry is lower-cased, stripped of ASCII punctuation, tokenized,
    filtered of English stop words and lemmatized with WordNet. Tokens left
    over from the "show more" / "show less" text are removed at the end.

    :input: iterable of strings, e.g. a dataframe column (each entry must support .lower())
    :output: list with one list of clean, lemmatized tokens per input entry
    """
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    # Raw string: \w, \$, \d, \. and \S are regex escapes, not valid string escapes,
    # so a plain literal triggers invalid-escape warnings. The pattern itself is unchanged.
    tokenizer = RegexpTokenizer(pattern=r'\w+|\$[\d\.]+|\S+')
    punctuation = set(string.punctuation)

    ## residue of the "show more/less" idiosyncrasy in the scraped text
    to_remove = {'show', 'moreshow', 'less'}

    ret_list = []

    for d in df_col:
        ## lowercase text, then drop punctuation while keeping it a string
        text = "".join(c for c in d.lower() if c not in punctuation)
        ## tokenize and filter out stopwords
        filt_tokens = [t for t in tokenizer.tokenize(text) if t not in stop_words]
        ## lemmatize (lemmas were preferred over stems for this project)
        lemma_tokens = [lemmatizer.lemmatize(t) for t in filt_tokens]
        ## the removal runs on the lemmas, exactly as before
        ret_list.append([t for t in lemma_tokens if t not in to_remove])

    return ret_list

# input: query string, return: clean, lemmatized query string
def standard_query(q):
    """
    Standardize a query with the same steps used for the document text
    (lower-case, strip punctuation, tokenize, drop stop words, lemmatize).

    :input: query string
    :output: clean, standardized query string (lemmatized tokens joined by single spaces)
    """
    stop_words = set(stopwords.words("english"))
    # Raw string so the regex escapes are not read as (invalid) string escapes.
    tokenizer = RegexpTokenizer(pattern=r'\w+|\$[\d\.]+|\S+')
    lemmatizer = WordNetLemmatizer()

    # same preprocessing applied in "process_text", but for a given query
    text = q.lower()
    # Strip punctuation from the lower-cased text. Reading from `q` here would
    # discard the lower-casing, so upper-case query words would neither match the
    # stop-word list nor the lower-cased documents.
    text = "".join(c for c in text if c not in string.punctuation)
    token_text = tokenizer.tokenize(text)
    filt_tokens = [t for t in token_text if t not in stop_words]

    ## lemmatizing (kept in line with the document preprocessing)
    lemmatized_query = [lemmatizer.lemmatize(t) for t in filt_tokens]

    return " ".join(lemmatized_query)
# standard_query("devops engineer")


# input: query string, return: bm25 scores list (can get top n scores)
def query_scores(q):
    """
    Score a query with the currently fitted BM25 model.

    The query goes through the same preprocessing as the documents
    (lower-case, strip punctuation, tokenize, drop stop words, lemmatize) and the
    resulting token list is handed to the module-level `bm25` object.

    :input: query string
    :output: document scores for the given query, as returned by bm25.get_scores
    """
    stop_words = set(stopwords.words("english"))
    # Raw string so the regex escapes (\w, \$, \d, \., \S) are not read as invalid
    # string escapes; the compiled pattern is identical.
    tokenizer = RegexpTokenizer(pattern=r'\w+|\$[\d\.]+|\S+')
    lemmatizer = WordNetLemmatizer()

    # same preprocessing applied in "process_text"
    text = q.lower()
    text = "".join(c for c in text if c not in string.punctuation)
    token_text = tokenizer.tokenize(text)
    filt_tokens = [t for t in token_text if t not in stop_words]
    lemmatized_query = [lemmatizer.lemmatize(t) for t in filt_tokens]

    # scores come from whichever BM25 variant is currently bound to `bm25`
    doc_scores = bm25.get_scores(lemmatized_query)

    return doc_scores
# query_scores("machine learning engineer")


# input: query string, corpus, top_n, column; return: df subset with entries from df, including naive relevance flag
def top_n_queries(query, corpus, n, column):
    """
    Return the rows of `df` whose `column` text is among the top-n BM25 matches.

    The query is standardized with standard_query, split on spaces and ranked
    against `corpus` with the module-level `bm25` model. A naive 0/1 flag is
    added in the "in_title" column: 1 when the standardized query string occurs
    as a substring of the row's `column` text (the column keeps that name
    whichever `column` is searched).

    :input: query string, corpus (list of document strings the model was fitted on),
            top n, name of the df column holding those same document strings
    :output: copy of the matching df rows, in df order (not ranked), with "in_title" added
    """
    query = standard_query(query)
    tokenized_query = query.split(" ")

    # get_top_n iterates over its query argument, so it must receive the token
    # list; the raw string would be scored character by character.
    top_n = bm25.get_top_n(tokenized_query, corpus, n=n)
    results = df.copy()

    # simple relevance measure for the given query (query in text -> 1, else 0)
    results["in_title"] = [1 if query in cs else 0 for cs in results[column]]

    return results[results[column].isin(top_n)]


def retrieve_docs(query, column, top_k=20):
    """
    Retrieve the best-scoring documents for a query from the fitted BM25 model.

    All rows are requested from top_n_queries (n = len(df)), so the score vector
    from query_scores is attached to them positionally; the rows are then sorted
    by score and cut to the first `top_k`.

    :input: query string, name of the df column that was fitted into the BM25 model,
            optional number of rows to return (default 20)
    :output: df with title, the three relevance annotations and bm25_score for the
             top_k postings, best first, with a fresh 0..k-1 index
    """
    given_query = top_n_queries(query, corpus, len(df), column)
    # assign() builds a new frame instead of writing into the filtered frame
    # returned above, which avoids pandas' SettingWithCopyWarning.
    given_query = given_query.assign(bm25_score=query_scores(query))

    ranked = given_query.sort_values(by="bm25_score", ascending=False)
    ret_df = ranked[['title','ds_rel','security_rel','ux_rel','bm25_score']].iloc[:top_k]

    return ret_df.reset_index(drop=True)

def evaluate_mAP(retrieved_docs, rel_column):
    """
    Compute average precision over a ranked result list with ground truth annotations.

    A document counts as relevant when its annotation is > 0. Precision is taken
    at the rank of every relevant document and averaged over the number of
    relevant documents found in the list. With the 20 rows returned by
    retrieve_docs this is AP@20 for that query.

    :input: retrieved docs df (ranked best first), string of relevant column
            (either 'ds_rel', 'ux_rel', 'security_rel')
    :output: score rounded to 6 decimals; 0.0 when the list holds no relevant document
    """
    precisions = []
    relevant_count = 0

    for rank, rel in enumerate(retrieved_docs[rel_column], 1):
        if rel > 0:
            relevant_count += 1
            precisions.append(relevant_count / rank)
        # non-relevant rows contribute 0 to the sum, so nothing is recorded for them

    # No relevant document (or an empty list): the score is 0 rather than a
    # ZeroDivisionError.
    if relevant_count == 0:
        return 0.0

    return round(sum(precisions) / relevant_count, 6)


## Data Preparation and Further Pre-Processing
- Read in ground truth data and full dataframe (from code folder) to obtain all relevant ground_truth dimensions 
- Creating different variations of df columns: 
    - "title" and "description" strings from list
    - "title_and_description" feature
    - extracted entities with spaCy

In [None]:
## read in ground truth annotated data
# Absolute path under the /content/drive mount point, spelled like the other shared-drive
# paths in this notebook, so the read does not depend on the current working directory.
ground_truth_path = '/content/drive/Shareddrives/SI650 Project [Info Retrieval]/data/ground_truth1.csv'
# drop the "Unnamed: 0" column read from the CSV, without mutating in place
ground_truth = pd.read_csv(ground_truth_path).drop(columns="Unnamed: 0")

In [None]:
## FULL_DF holds the complete postings (description included) that get merged onto the ground truth
Full_DF = pd.read_csv("/content/drive/Shareddrives/SI650 Project [Info Retrieval]/data/FULL_DF.csv")

# description_clean is stored as the string form of a list of strings;
# literal_eval parses each entry back into a real list
Full_DF["description_clean"] = Full_DF["description_clean"].apply(ast.literal_eval)

In [None]:
# Peek at one random row of the full postings frame (no random_state is passed, so the row differs between runs).
Full_DF.sample(1)

Unnamed: 0,job_id,link,apply_link,title,company,place,description,date,seniority,job_function,employment_type,industries,description_clean,english
5416,2233953256,https://www.linkedin.com/jobs/view/pentester-...,https://www.linkedin.com/jobs/view/externalAp...,Pentester - for an innovative security as a p...,Wiley Job Network,"Flensburg, Minnesota, United States",Hi InfiCare has been providing Contingent Sta...,2020-10-20,Entry level,Other,Full-time,"Information Technology and Services, Computer...","[hi, inficare, providing, contingent, staffing...",1


In [None]:
# Peek at one random annotated row: job_id, title and the three relevance columns.
ground_truth.sample(1)

Unnamed: 0,job_id,title,ds_rel,security_rel,ux_rel
67,2222623935,Java Developer,-2,-1,-2


In [None]:
## Attach the Full_DF posting fields to our 200 annotated ground-truth entries.
## "title" is dropped from the ground truth first because Full_DF already has it.
### ALSO, WE CAN DEFINE df=Full_DF for our last retrieval model, fitting the entire corpus we have into BM25.
df = (
    Full_DF
    .merge(
        ground_truth.drop("title", axis=1),
        left_on="job_id",
        right_on="job_id",
        how="right",
    )
    .drop_duplicates("job_id")
)

In [None]:
# Peek at one random row of the merged frame.
# NOTE(review): the saved output lists columns (title_clean, entities, ...) that are only
# created in later cells, while df.shape below reports 17 columns — the output looks like
# it came from an out-of-order run; re-run top to bottom to refresh it.
df.sample(1)

Unnamed: 0,job_id,link,apply_link,title,company,place,description,date,seniority,job_function,employment_type,industries,description_clean,english,ds_rel,security_rel,ux_rel,title_clean,title_clean_string,description_clean_string,title_and_description,entities,entities_list,entities_clean_string,title_and_entities
73,2176252488,https://www.linkedin.com/jobs/view/cloud-devo...,,Cloud DevOps Engineering,Benchmark IT - Technology Talent,"Tarrytown, New York, United States",Why We Work at Dun & Bradstreet\nWe are at a ...,2020-10-13,Entry level,Other,Full-time,"Information Technology and Services, Computer...","[work, dun, bradstreet, transformational, mome...",1,0,1,-2,"[cloud, devops, engineering]",cloud devops engineering,work dun bradstreet transformational moment co...,cloud devops engineering work dun bradstreet t...,Why growth San Mateo Short Hills ; Center Vall...,"[growth, san, mateo, short, hill, center, vall...",growth san mateo short hill center valley anal...,cloud devops engineering growth san mateo shor...


In [None]:
# (rows, columns) of the merged, de-duplicated frame
df.shape

(199, 17)

### Adding Columns to our DataFrame
- "title_clean" (list and string)
- "description_clean_string"
- "title_and_description"
- "entities"
- "entities_list"
- "entities_clean_string"
- "title_and_entities"
- "company_title_entities"

In [None]:
## titles have no cleaned version yet, so run them through the same text cleaner
df["title_clean"] = process_text(df["title"])

## flatten both token lists into space-separated strings so they can be concatenated
df["title_clean_string"] = df["title_clean"].map(" ".join)
df["description_clean_string"] = df["description_clean"].map(" ".join)

## clean title followed by clean description, as one searchable string per posting
## (for "better" retrieval, prolly use BM25L?)
df["title_and_description"] = df["title_clean_string"] + " " + df["description_clean_string"]


In [None]:
## create an entities column: the texts of the entities found by the spaCy model,
## separated by single spaces (not clean yet, processed right below).
## Joining the entity texts directly avoids depending on how the `ents` tuple prints.
df["entities"] = df["description"].apply(lambda x: " ".join(ent.text for ent in nlp(x).ents))

## process_text returns a clean list of tokens per row (punctuation is stripped there)
df["entities_list"] = process_text(df["entities"])

## turn the token lists into clean strings for concatenation with the title
df["entities_clean_string"] = df["entities_list"].map(" ".join)

## concatenating title and entities (similar to concatenating title and description)
df["title_and_entities"] = df["title_clean_string"] + " " + df["entities_clean_string"]

In [None]:
## clean token list for the company name, plus its space-joined string form
df["company_token_list"] = process_text(df["company"])
df["company_clean_string"] = df["company_token_list"].map(" ".join)

## company name + job title + extracted entities, in that order, as one string
df["company_title_entities"] = (
    df["company_clean_string"]
    + " " + df["title_clean_string"]
    + " " + df["entities_clean_string"]
)

In [None]:
# Peek at one random row to check the newly added columns (title, entity and company variants).
df.sample(1)

Unnamed: 0,job_id,link,apply_link,title,company,place,description,date,seniority,job_function,employment_type,industries,description_clean,english,ds_rel,security_rel,ux_rel,title_clean,title_clean_string,description_clean_string,title_and_description,entities,entities_list,entities_clean_string,title_and_entities,company_token_list,company_clean_string,company_title_entities
207,2220771341,https://www.linkedin.com/jobs/view/senior-sof...,https://www.linkedin.com/jobs/view/externalAp...,Senior Software Engineer in Test (San Francis...,"Okta, Inc.","San Francisco, CA",Okta is looking for a Software Engineer in Te...,2020-10-29,Not Applicable,"Engineering, Information Technology",Full-time,"Computer & Network Security, Computer Softwar...","[okta, looking, software, engineer, test, join...",1,-1,-2,-2,"[senior, software, engineer, test, san, franci...",senior software engineer test san francisco ca...,okta looking software engineer test join end u...,senior software engineer test san francisco ca...,usability scalable software APIs UX Java UI Se...,"[usability, scalable, software, apis, ux, java...",usability scalable software apis ux java ui se...,senior software engineer test san francisco ca...,"[okta, inc]",okta inc,okta inc senior software engineer test san fra...


## BM25 Model Retrieval
- Try out all variations of BM25 with all variations of our column dimensions 
- Only using ground_truth subset, since those are the only ones annotated for relevance and therefore the only ones that can be empirically evaluated
- BM25 variants:
    - BM25L
    - BM25Plus
    - BM25Okapi
- Columns to be fit into BM25:
    - title_clean_string
    - description_clean_string
    - title_and_description
    - entities_clean_string
    - title_and_entities
    - company_title_entities
- Evaluating variants using mAP @ 20

In [None]:
# One more random row, as a reminder of the columns available for fitting BM25.
df.sample(1)

Unnamed: 0,job_id,link,apply_link,title,company,place,description,date,seniority,job_function,employment_type,industries,description_clean,english,ds_rel,security_rel,ux_rel,title_clean,title_clean_string,description_clean_string,title_and_description,entities,entities_list,entities_clean_string,title_and_entities
134,2187591553,https://www.linkedin.com/jobs/view/backend-ja...,https://www.linkedin.com/jobs/view/externalAp...,Backend Java Developer,"Ab Ovo, Inc.","Mountain View, California, United States","Backend Developer\n\nIn a matter of minutes, ...",2020-10-16,Entry level,"Engineering, Information Technology",Full-time,"Information Technology and Services, Computer...","[backend, developer, matter, minute, aible, he...",1,-1,0,-2,"[backend, java, developer]",backend java developer,backend developer matter minute aible help bus...,backend java developer backend developer matte...,scalable APIs Python Jinja2 PostgresQL Redis G...,"[scalable, apis, python, jinja2, postgresql, r...",scalable apis python jinja2 postgresql redis g...,backend java developer scalable apis python ji...


In [None]:
### CHANGE THE BM25 VARIANT AND THE COLUMN TO BE FIT INTO BM25 HERE

## every text dimension that can be fitted (string version of each column)
possible_columns = [
    "title_clean_string",
    "description_clean_string",
    "title_and_description",
    "entities_clean_string",
    "title_and_entities",
    "company_title_entities",
]

## WHAT TO FIT TO BM25 MODEL -- change the index to fit a different column
COLUMN_VAR = possible_columns[5]

## corpus: one document string per posting, taken from the chosen column
corpus = df[COLUMN_VAR].tolist()

## BM25 is fitted on token lists, so split every document on single spaces
tokenized_corpus = [doc.split(" ") for doc in corpus]

## uncomment the BM25 variation to be used, can also give hyperparams here
## https://github.com/dorianbrown/rank_bm25/blob/master/rank_bm25.py
bm25 = BM25L(tokenized_corpus) #k1=1.5, b=0.75, delta=0.5
# bm25 = BM25Okapi(tokenized_corpus)
# bm25 = BM25Plus(tokenized_corpus)
bm25

<rank_bm25.BM25L at 0x7f903e2a4c88>

In [None]:
## The column passed here must be the one that was fitted into the BM25 model;
## using COLUMN_VAR keeps the two in sync without editing this cell by hand.
ds_docs = retrieve_docs("data science", column=COLUMN_VAR)

ds_docs

Unnamed: 0,title,ds_rel,security_rel,ux_rel,bm25_score
0,Lead Data Architect,1,-2,1,29.168741
1,Senior Data Scientist,2,-2,-2,18.587222
2,Hadoop/ Spark Developer,1,-1,-2,15.499726
3,Data Analyst/Integration Specialist,1,-1,-1,10.240076
4,Data Architect,1,-2,0,9.772508
5,"Practice Director, Data Science",1,-2,-1,9.43796
6,Data Scientist - Adtech - Remote,2,-2,-2,9.363572
7,Senior Software Engineer - Scan Services,-1,-2,-2,8.897336
8,Tire Intelligence Algorithm Development Co-op...,-1,-2,-2,6.989392
9,Software Engineer III - Big Data Platform - S...,0,-2,-2,6.776567


In [None]:
# Score the "data science" ranking against the ds_rel annotations (values > 0 count as relevant).
evaluate_mAP(ds_docs, rel_column="ds_rel")

0.891453

In [None]:
## The column passed here must be the one that was fitted into the BM25 model;
## using COLUMN_VAR keeps the two in sync without editing this cell by hand.
ux_docs = retrieve_docs("ux designer", column=COLUMN_VAR)

ux_docs

Unnamed: 0,title,ds_rel,security_rel,ux_rel,bm25_score
0,Salesforce Project Manager,-2,-2,-2,9.211409
1,DESIGNER 3,-2,-2,1,5.495823
2,Mechanical Engineer / Machine Designer - Entr...,-2,-2,-2,5.290321
3,Senior Test Specialist,0,1,-2,4.79372
4,Jr Web Developer,0,0,1,4.269083
5,Lead Data Engineer,1,0,-2,4.135054
6,Senior Software Engineer in Test (San Francis...,-1,-2,-2,4.103957
7,Principal Software Engineer,-1,-2,-2,3.960996
8,Product Manager Fusion 360 Documentation,-2,-2,-1,3.646645
9,Software Engineer III - Big Data Platform - S...,0,-2,-2,3.404213


In [None]:
# Score the "ux designer" ranking against the ux_rel annotations (values > 0 count as relevant).
evaluate_mAP(ux_docs, rel_column="ux_rel")

0.358824

In [None]:
## The column passed here must be the one that was fitted into the BM25 model;
## using COLUMN_VAR keeps the two in sync without editing this cell by hand.
security_docs = retrieve_docs("security", column=COLUMN_VAR)

security_docs

Unnamed: 0,title,ds_rel,security_rel,ux_rel,bm25_score
0,Customer Success Security Sales Engineer,-2,-2,-2,7.906124
1,Security Engineer,-1,2,-2,6.857971
2,Penetration Tester with Security Clearance,-2,2,-2,6.818188
3,Application Security Engineer,-2,1,-2,6.728213
4,Network Administrator,-1,1,-2,6.408393
5,Senior Security Engineer,-1,2,-2,6.088032
6,Senior Investigator,-2,-1,-2,5.822962
7,"Security Software Engineer - Cryptography, C/...",0,2,-2,5.722788
8,Senior Security Engineer (Cryptography),-2,2,-2,5.67495
9,Software Development Engineer 1,-1,-2,-2,5.583443


In [None]:
# Score the "security" ranking against the security_rel annotations (values > 0 count as relevant).
evaluate_mAP(security_docs, rel_column="security_rel")

0.70123