## Inputs and output


In [2]:
# Path of the DBLP input CSV (loaded into the `dblp` data frame).
fname_dblp = "../inputs/DBLP1.csv"
# Path of the Scholar input CSV (loaded into the `scholar` data frame).
fname_scholar = "../inputs/Scholar.csv"
# Output path. Not referenced in the cells visible here; presumably the file
# the final DBLP-to-Scholar mapping is written to (TODO confirm).
fname_result = "../outputs/DBLP_Scholar_perfectMapping_GuofengZhao.csv"

## Import required packages
- `difflib` - sequence matching based on the Ratcliff/Obershelp algorithm
- `distance` - Levenshtein, Hamming, Jaccard and Sorensen distance measures

In [3]:
import pandas as pd
import numpy as np
import re
import difflib
import distance
from datetime import datetime

# Minimum scores a candidate pair must reach in row_similar() to count as the
# same publication. The pair is rejected as soon as one score falls below its
# threshold.
title_similarity_threshold = 0.90
# The loosest of the four thresholds.
authors_similarity_threshold = 0.50
venue_similarity_threshold = 0.90
# year_similarity() only ever returns 0 or 1, so 0.99 effectively demands that
# the years are equal (or that one of them is missing).
year_similarity_threshold = 0.99

## Load the DBLP CSV file into a data frame

In [7]:
# 'ansi' is a Windows-only codec alias and raises LookupError on Linux/macOS.
# cp1252 is the code page it resolves to on Western-locale Windows, so the
# notebook stays runnable on every platform.
# NOTE(review): confirm the input file really is cp1252-encoded.
dblp = pd.read_csv(fname_dblp, quotechar='"', encoding='cp1252', engine='python')


## Cleaning DBLP

In [4]:
# Report the size of the raw DBLP frame and the non-null count per column.
print ("Total DBLP rows: {}".format(dblp.shape))
print(dblp.count())

# Normalise the text columns: lower-case and trim every plain-str value and
# pass anything else (e.g. missing values) through untouched.
for text_col in ('title', 'authors', 'venue'):
    dblp[text_col] = dblp[text_col].map(
        lambda value: value.lower().strip() if type(value) == str else value
    )
dblp.head(5)


Total DBLP rows: (2615, 6)
idDBLP     2615
title      2615
authors    2397
venue      2404
year       2615
Row_ID     2615
dtype: int64


Unnamed: 0,idDBLP,title,authors,venue,year,Row_ID
0,journals/sigmod/EisenbergM02,sql/xml is making good progress,"a eisenberg, j melton",sigmod record,2002,2
1,conf/vldb/AmmannJR95,using formal methods to reason about semantics...,"p ammann, s jajodia, i ray",vldb,1995,3
2,journals/sigmod/Liu02,editor's notes,l liu,sigmod record,2002,4
3,journals/sigmod/Hammer02,report on the acm fourth international worksho...,,,2002,5
4,conf/vldb/FerrandinaMZFM95,schema and database evolution in the o2 object...,"f ferrandina, t meyer, r zicari, g ferran, j m...",vldb,1995,6


## Deduplicate DBLP

In [None]:
# define a few helper functions
def title_similarity_difflib (row):
    """Score the two titles of a candidate pair with difflib.

    Reads row["title"] and row["title_R"] and returns
    SequenceMatcher.quick_ratio(), a float in [0, 1]. quick_ratio() is an
    upper bound on SequenceMatcher.ratio(), so the score can overstate how
    similar the two strings really are.
    """
    left_title, right_title = row["title"], row["title_R"]
    matcher = difflib.SequenceMatcher(None, left_title, right_title)
    return matcher.quick_ratio()

def title_similarity_levenshtein (row):
    """Score the two titles of a candidate pair by normalised Levenshtein distance.

    Reads row["title"] and row["title_R"] and returns one minus
    distance.nlevenshtein(..., method=1).
    NOTE(review): method=1 presumably normalises by the length of the longer
    string - confirm against the `distance` package documentation.
    """
    normalised_gap = distance.nlevenshtein(row["title"], row["title_R"], method=1)
    return 1 - normalised_gap

def title_similarity_sorensen (row):
    """Score the two titles of a candidate pair by Sorensen distance.

    Reads row["title"] and row["title_R"] and returns one minus
    distance.sorensen(...).
    NOTE(review): the strings are passed as-is, so the measure presumably
    works on their characters rather than on words - confirm.
    """
    gap = distance.sorensen(row["title"], row["title_R"])
    return 1 - gap

def title_similarity_jaccard (row):
    """Score the two titles of a candidate pair by Jaccard distance.

    Reads row["title"] and row["title_R"] and returns one minus
    distance.jaccard(...).
    NOTE(review): the strings are passed as-is, so the measure presumably
    works on their characters rather than on words - confirm.
    """
    gap = distance.jaccard(row["title"], row["title_R"])
    return 1 - gap

def title_similarity(row, method = "difflib"):
    """Score the titles of a candidate pair with the requested measure.

    Parameters:
        row: mapping or Series providing "title" and "title_R".
        method: case-insensitive name of the measure - "levenshtein",
            "sorensen" or "jaccard". Any other value, including the default
            "difflib", selects the difflib-based score.

    Returns:
        The similarity score produced by the selected helper.
    """
    scorers = {
        "levenshtein": title_similarity_levenshtein,
        "sorensen": title_similarity_sorensen,
        "jaccard": title_similarity_jaccard,
    }
    # Unknown names fall back to difflib instead of raising.
    scorer = scorers.get(method.lower(), title_similarity_difflib)
    return scorer(row)
    
def authors_similarity (row):
    """Score the author lists of a candidate pair.

    Reads row["authors"] and row["authors_R"]. If either is missing the pair
    gets the benefit of the doubt and 1 is returned. Otherwise both strings
    are split on ';', ',' or whitespace (plus any whitespace that follows) and
    the two token lists are compared with SequenceMatcher.quick_ratio(),
    giving a float in [0, 1].
    """
    for side in ("authors", "authors_R"):
        if pd.isnull(row[side]):
            return 1
    tokenize = re.compile(r'[;,\s]\s*').split
    left_tokens = tokenize(row["authors"])
    right_tokens = tokenize(row["authors_R"])
    return difflib.SequenceMatcher(None, left_tokens, right_tokens).quick_ratio()

def venue_similarity (row):
    """Score the venues of a candidate pair.

    Reads row["venue"] and row["venue_R"]. If either is missing the pair gets
    the benefit of the doubt and 1 is returned; otherwise the result is
    SequenceMatcher.ratio() of the two strings, a float in [0, 1].
    """
    for side in ("venue", "venue_R"):
        if pd.isnull(row[side]):
            return 1
    matcher = difflib.SequenceMatcher(None, row["venue"], row["venue_R"])
    return matcher.ratio()

def year_similarity (row):
    """Score the publication years of a candidate pair.

    Reads row["year"] and row["year_R"]. Returns 1 when either year is
    missing or when both are equal, and 0 otherwise.
    """
    for side in ("year", "year_R"):
        if pd.isnull(row[side]):
            return 1
    years_match = row["year"] == row["year_R"]
    return 1 if years_match else 0

def row_similar (row):
    """Decide whether a candidate pair describes the same publication.

    Title, authors, venue and year are scored in that order and each score is
    compared with its module-level threshold. Evaluation stops at the first
    score that falls below its threshold.

    Returns:
        True when no score is below its threshold, False otherwise.
    """
    checks = (
        (title_similarity, title_similarity_threshold),
        (authors_similarity, authors_similarity_threshold),
        (venue_similarity, venue_similarity_threshold),
        (year_similarity, year_similarity_threshold),
    )
    # any() consumes the generator lazily, so later scores are not computed
    # once one check has failed.
    return not any(score(row) < minimum for score, minimum in checks)

In [None]:
# Generate the N*N pair matrix and fold it diagonally, so that every unordered
# pair of DBLP records is compared only once.
dblp['tmp'] = 1  # constant join key: merging on it yields the full cross product
max_row_id = dblp.loc[:, ["Row_ID"]].max().at["Row_ID"]
dblp_pair = dblp.merge(dblp, on='tmp', suffixes=["","_R"])
print("Total pairs after merge: {}".format(dblp_pair.shape))
# Keep the upper triangle (Row_ID < Row_ID_R). The record with the highest
# Row_ID has no such pair, so the second condition keeps its pair with itself,
# purely so that it still appears as a left-hand Row_ID in dblp_pair.
# .copy() makes the result an independent frame, so adding a column below does
# not write into a slice of the merged frame.
dblp_pair = dblp_pair[(dblp_pair["Row_ID"] < dblp_pair["Row_ID_R"]) | (dblp_pair["Row_ID_R"] == max_row_id)].copy()
print("Total pairs after half-fold deduplicate: {}".format(dblp_pair.shape))
print("Start calculating similarity - " + str(datetime.now()))
dblp_pair['row_similar'] = dblp_pair.apply(row_similar, axis = 1)
# A record always matches itself. Without this reset the self-pair kept above
# would flag the highest-Row_ID record as a duplicate every time.
is_self_pair = dblp_pair["Row_ID"] == dblp_pair["Row_ID_R"]
dblp_pair.loc[is_self_pair, 'row_similar'] = False
print("Finished calculating similarity - " + str(datetime.now()))
print("Total pairs after similarity evaluation: {}".format(dblp_pair.shape))
dblp_pair.head(5)

In [None]:
# Collapse the pair table to one row per left-hand Row_ID. The maximum of
# row_similar is True when the record matched at least one partner.
x = dblp_pair.groupby(by="Row_ID", as_index=False).max()

# Records that matched a partner are treated as duplicates ...
is_duplicate = x["row_similar"] == True
similar_rows = x[is_duplicate]
print("similar pairs: {}".format(similar_rows.shape))

# ... and only the records without any match are kept.
is_unique = x["row_similar"] == False
unique_rows = x[is_unique]
print("unique Row_IDs kept: {}".format(unique_rows.shape))

# Keep the left-hand record columns; 'tmp' is the constant key used for
# cross joins.
kept_columns = ['idDBLP', 'title', 'authors', 'venue', 'year', 'Row_ID', 'tmp']
dblp_clean = unique_rows[kept_columns]
print("clean DBLP: {}".format(dblp_clean.shape))
dblp_clean.head(10)

## Load the Scholar CSV file into a data frame

In [6]:
# 'ansi' is a Windows-only codec alias and raises LookupError on Linux/macOS.
# cp1252 is the code page it resolves to on Western-locale Windows, so the
# notebook stays runnable on every platform.
# NOTE(review): confirm the input file really is cp1252-encoded.
scholar = pd.read_csv(fname_scholar, quotechar='"', encoding='cp1252', engine='python')
# Rename the key column from 'ROW_ID' to 'Row_ID'.
scholar.rename(columns={'ROW_ID':'Row_ID'}, inplace = True)

## Cleaning Scholar

In [None]:
# Report the size of the raw Scholar frame and the non-null count per column.
print ("Total Scholar rows: {}".format(scholar.shape))
print(scholar.count())

# Normalise the text columns: lower-case and trim every plain-str value and
# pass anything else (e.g. missing values) through untouched.
for text_col in ('title', 'authors', 'venue'):
    scholar[text_col] = scholar[text_col].map(
        lambda value: value.lower().strip() if type(value) == str else value
    )
scholar.head(5)

## Link records from DBLP to Scholar

In [None]:
# generate the N*M pair matrix
scholar['tmp'] = 1
#max_row_id = scholar.loc[:, ["Row_ID"]].max().at["Row_ID"]
dblp_scholar = dblp_clean.merge(scholar, on='tmp', suffixes=["","_R"])
print("Total pairs after merge: {}".format(dblp_scholar.shape))
dblp_scholar.drop(labels=['tmp'], axis=1, inplace=True)
#dblp_scholar = dblp_scholar[(dblp_scholar["Row_ID"] < dblp_scholar["Row_ID_R"]) | (dblp_scholar["Row_ID_R"] == max_row_id)]
#print("Total pairs after half-fold deduplicate: {}".format(scholar_pair.shape))
print("Start calculating similarity - " + str(datetime.now()))
dblp_scholar['row_similar'] = dblp_scholar.apply(row_similar, axis = 1)
print("Finished calculating similarity - " + str(datetime.now()))
print("Total pairs after similarity evaluation: {}".format(dblp_scholar.shape))
dblp_scholar.head(5)