# Entity Resolution using Jaccard Similarity

In [None]:
import pandas as pd
import re
import sqlite3

In [1]:
# Load every row of the job_post table from the local SQLite file into a DataFrame.
con = sqlite3.connect("collectors/data.sqlite3")
try:
    job_df = pd.read_sql_query("SELECT * from job_post", con)
finally:
    # Release the database handle even when the query raises
    # (for example, when the job_post table does not exist).
    con.close()

NameError: name 'sqlite3' is not defined

In [3]:
# Show the loaded postings as the cell output (the rendered table elides the middle rows).
job_df

Unnamed: 0,id,title,company,location,description,source,search_kw
0,1,Data Scientist,Aquatic Informatics,"Vancouver, BC",Do you want a meaningful role in a company tha...,indeed.com,data scientist
1,2,Business Intelligence Analyst,GLENTEL,"Burnaby, BC",Brand: Glentel Corporate\nLocation: Burnaby Of...,indeed.com,data scientist
2,3,Human Resources Data Scientist,Rio Tinto,Canada,2 x newly created Data Scientist opportunities...,indeed.com,data scientist
3,4,Lead - Human Resource Data Scientist,Rio Tinto,Canada,Newly created data science lead embedded withi...,indeed.com,data scientist
4,5,Machine Learning Engineer,Skycope Technologies Inc,"Vancouver, BC","Who We are\nFounded in 2016, Skycope Technolog...",indeed.com,data scientist
...,...,...,...,...,...,...,...
1445,1456,Human Resources Data Scientist,Rio Tinto,Canada,2 x newly created Data Scientist opportunities...,indeed.com,data scientist
1446,1457,Lead - Human Resource Data Scientist,Rio Tinto,Canada,Newly created data science lead embedded withi...,indeed.com,data scientist
1447,1458,Principal Data Engineer,FreshBooks,Toronto,FreshBooks has a big vision. We launched in 20...,glassdoor.com,data engineer
1448,1459,Data Engineer,Prodigy Game,Oakville,"Prodigy Education connects students, parents, ...",glassdoor.com,data engineer


In [4]:
# Postings scraped from Glassdoor.
# `where` blanks every non-matching row to NaN and `dropna` then removes those
# rows -- together with any matching row that has a missing field. The NaN
# round-trip is also why the integer `id` column comes back as float.
glassdoor_df = job_df.where(job_df["source"].eq("glassdoor.com")).dropna()

# Postings scraped from Indeed, selected the same way.
indeed_df = job_df.where(job_df["source"].eq("indeed.com")).dropna()

In [5]:
# Preview the first five Glassdoor postings (head() defaults to five rows).
glassdoor_df.head()

Unnamed: 0,id,title,company,location,description,source,search_kw
1004,1015.0,Data Scientist,Unbounce,Vancouver,We are looking for a Data Scientist to help us...,glassdoor.com,data scientist
1005,1016.0,Machine Learning Engineer,PressReader,Richmond,PressReader is the “Netflix” of newspapers and...,glassdoor.com,data scientist
1006,1017.0,Data Scientist I,Amazon,Vancouver,"Terms of employment: Full time, permanent\n\nJ...",glassdoor.com,data scientist
1007,1018.0,Data Scientist,Jungle Scout,Vancouver,Did you know that Jungle Scout collects and pr...,glassdoor.com,data scientist
1008,1019.0,Data Scientist,SkyHive Technologies,Vancouver,The Opportunity\n\nDo you enjoy solving comple...,glassdoor.com,data scientist


In [6]:
# Preview the first five Indeed postings (head() defaults to five rows).
indeed_df.head()

Unnamed: 0,id,title,company,location,description,source,search_kw
0,1.0,Data Scientist,Aquatic Informatics,"Vancouver, BC",Do you want a meaningful role in a company tha...,indeed.com,data scientist
1,2.0,Business Intelligence Analyst,GLENTEL,"Burnaby, BC",Brand: Glentel Corporate\nLocation: Burnaby Of...,indeed.com,data scientist
2,3.0,Human Resources Data Scientist,Rio Tinto,Canada,2 x newly created Data Scientist opportunities...,indeed.com,data scientist
3,4.0,Lead - Human Resource Data Scientist,Rio Tinto,Canada,Newly created data science lead embedded withi...,indeed.com,data scientist
4,5.0,Machine Learning Engineer,Skycope Technologies Inc,"Vancouver, BC","Who We are\nFounded in 2016, Skycope Technolog...",indeed.com,data scientist


## Data Preprocessing (Record --> Token Set)

In [None]:
def preprocess_df(df, cols):
    """Tokenize the chosen columns of every record into a ``joinKey`` token list.

    For each row, the values of ``cols`` are converted to strings, joined with
    single spaces, lower-cased and split on runs of non-word characters.
    Empty fragments and every ``'nan'`` token (the text a NaN cell turns into
    once stringified) are discarded.

    Args:
        df: DataFrame holding the records. It is modified in place: a
            ``joinKey`` column is added, or overwritten if already present.
        cols: Non-empty sequence of column names to concatenate, in order.
            Any number of columns is accepted.

    Returns:
        The same ``df`` object, now carrying the ``joinKey`` column. Each cell
        is a list of tokens in order of appearance, duplicates included.

    Raises:
        ValueError: If ``cols`` is empty.
        KeyError: If a name in ``cols`` is not a column of ``df``.
    """
    if len(cols) == 0:
        raise ValueError("cols must name at least one column")

    # Stringify each column once; a NaN cell becomes the literal text 'nan'.
    text_columns = [df[col].map(str) for col in cols]

    all_tokens = []
    for parts in zip(*text_columns):
        fragments = re.split(r'\W+', " ".join(parts).lower())
        # Drop empty fragments and *all* 'nan' placeholders. Removing only the
        # first one would leave stray 'nan' tokens on rows with several
        # missing columns, and those tokens would match each other downstream.
        all_tokens.append([tok for tok in fragments if tok and tok != 'nan'])

    # Adding a new column 'joinKey' with all tokens as values
    df['joinKey'] = all_tokens
    return df

## Filtering Obviously Non-matching Pairs

In [None]:
def filtering(df1, df2):
    """Return the candidate record pairs that share at least one token.

    Both inputs must have an ``id`` column and a ``joinKey`` column whose cells
    are lists of tokens. A record of ``df1`` and a record of ``df2`` form a
    candidate pair when their token lists have a token in common. Records with
    an empty token list never form a pair.

    Args:
        df1: First DataFrame of records (its columns end up suffixed ``1``).
        df2: Second DataFrame of records (its columns end up suffixed ``2``).

    Returns:
        A new DataFrame with columns ``id1``, ``joinKey1``, ``id2``,
        ``joinKey2`` and a fresh 0..n-1 index, one row per candidate pair.
        ``joinKey1`` is the full token list of the ``df1`` record ``id1`` and
        ``joinKey2`` is the full token list of the ``df2`` record ``id2``.
        Neither input is modified.
    """
    # Keep only what is needed and rename up front, so later merges have no
    # overlapping column names and cannot mix up which side a list belongs to.
    left = df1[['id', 'joinKey']].rename(columns={"id": "id1", "joinKey": "joinKey1"})
    right = df2[['id', 'joinKey']].rename(columns={"id": "id2", "joinKey": "joinKey2"})

    # Flatten to one row per (record id, token). An empty token list explodes
    # to NaN; drop it, because merge would pair NaN keys with each other.
    left_tokens = (left.explode('joinKey1')
                       .dropna(subset=['joinKey1'])
                       .rename(columns={"joinKey1": "token"}))
    right_tokens = (right.explode('joinKey2')
                         .dropna(subset=['joinKey2'])
                         .rename(columns={"joinKey2": "token"}))

    # Join on the token itself, then collapse to distinct id pairs: two
    # records sharing several tokens must still yield a single candidate row.
    pair_ids = (left_tokens.merge(right_tokens, on='token')[['id1', 'id2']]
                           .drop_duplicates())

    # Re-attach the unflattened token lists to each side of the pair.
    cand_df = right.merge(pair_ids, on='id2').merge(left, on='id1')
    return cand_df[['id1', 'joinKey1', 'id2', 'joinKey2']].reset_index(drop=True)

## Computing Jaccard Similarity for Survived Pairs

In [7]:
def verification(cand_df, threshold):
    """Score candidate pairs by Jaccard similarity and keep those at or above a threshold.

    The similarity of a pair is ``|r & s| / |r | s|`` where ``r`` and ``s`` are
    the sets of tokens in ``joinKey1`` and ``joinKey2``. A pair whose token
    sets are both empty is scored ``0.0``.

    Args:
        cand_df: DataFrame with ``joinKey1`` and ``joinKey2`` columns whose
            cells are iterables of tokens. Any index is accepted. It is not
            modified.
        threshold: Minimum similarity (inclusive) a pair needs to be kept.

    Returns:
        A new DataFrame with the columns of ``cand_df`` plus a float
        ``jaccard`` column, restricted to rows with ``jaccard >= threshold``
        and re-indexed 0..n-1.
    """
    scores = []
    for tokens1, tokens2 in zip(cand_df['joinKey1'], cand_df['joinKey2']):
        r = set(tokens1)
        s = set(tokens2)
        union_size = len(r | s)
        # Two empty token sets have no defined Jaccard value; score them 0.0
        # so they never count as a match instead of raising ZeroDivisionError.
        scores.append(len(r & s) / union_size if union_size else 0.0)

    # Give the scores cand_df's own index so each score lands on its own row
    # whatever labels cand_df carries (label-based alignment against a fresh
    # 0..n-1 index would misplace them).
    jaccard = pd.Series(scores, index=cand_df.index, dtype=float)
    result_df = cand_df.assign(jaccard=jaccard)

    # Keep only the rows whose similarity reaches the threshold.
    return result_df[result_df['jaccard'] >= threshold].reset_index(drop=True)


def jaccard_join(cols1, cols2, threshold, df1=None, df2=None):
    """Run the full similarity join: preprocess, filter candidates, verify.

    Prints the number of pairs before filtering, after filtering and after
    verification.

    Args:
        cols1: Column names of the first frame to tokenize into its join key.
        cols2: Column names of the second frame to tokenize into its join key.
        threshold: Minimum Jaccard similarity (inclusive) for a pair to be kept.
        df1: First frame of records. Defaults to the notebook-level
            ``glassdoor_df`` when omitted.
        df2: Second frame of records. Defaults to the notebook-level
            ``indeed_df`` when omitted.

    Returns:
        The DataFrame produced by ``verification`` for the surviving pairs.
    """
    # Fall back to the notebook's frames so existing three-argument calls
    # keep working unchanged.
    if df1 is None:
        df1 = glassdoor_df
    if df2 is None:
        df2 = indeed_df

    # Use the columns the caller passed in rather than the notebook globals.
    new_df1 = preprocess_df(df1, cols1)
    new_df2 = preprocess_df(df2, cols2)
    print("Before filtering: %d pairs in total" % (df1.shape[0] * df2.shape[0]))

    cand_df = filtering(new_df1, new_df2)
    print("After Filtering: %d pairs left" % (cand_df.shape[0]))

    result_df = verification(cand_df, threshold)
    print("After Verification: %d similar pairs" % (result_df.shape[0]))

    return result_df


# Text columns tokenized into each posting's join key, per source.
glassdoor_cols = ["title", "company", "location"]
indeed_cols = ["title", "company", "location"]

# Join the two sources, keeping pairs with Jaccard similarity of at least 0.5.
result_df = jaccard_join(cols1=glassdoor_cols, cols2=indeed_cols, threshold=0.5)

Before filtering: 443256 pairs in total
After Filtering: 312016 pairs left
After Verification: 873 similar pairs
