In [1]:
import requests
import pandas as pd 
import time

from tqdm import tqdm


In [2]:
# Venue names / keywords identifying the venues of interest. They are joined
# with '|' because the resulting string is used as a regex alternation when
# filtering the 'venue' column.
venues = '|'.join([
    'SIGMOD', 'Management of Data',
    'EDBT', 'Extending Database Technology',
    'ICDE', 'Data Engineering',
    'CIKM', 'Information and Knowledge Management',
    'PVLDB', 'VLDB', 'Very Large',
    'KDD', 'Knowledge Discovery and Data',
    'CIDR', 'Innovative Data Systems',
    'TODS',
    'TKDE', 'Transactions on Knowledge and Data Engineering',
    'PODS',
])
# Base URL of the paper endpoint; a paper id is appended to it.
partial_url = 'https://api.semanticscholar.org/v1/paper/'
# Query string appended when unknown references should be included.
unk_ref = '?include_unknown_references=true'

# Get citations/references from a given paper

original_paper_id: Semantic Scholar paper id (it is appended to the API URL)

type_of_papers: 'citations' or 'references' 

include_venues: String containing the name of the venues or keywords describing the venues separated by '|' 

__Return__: pandas DataFrame containing a list of papers and their details as returned from Semantic Scholar

In [8]:
def get_papers_from_original_paper(original_paper_id, type_of_papers, include_venues, include_unk_ref=True):
    """Fetch the citations or references of a paper and keep the selected venues.

    Parameters
    ----------
    original_paper_id : str
        Paper id appended to ``partial_url`` to build the request URL.
    type_of_papers : str
        Key of the JSON response to read: 'citations' or 'references'.
    include_venues : str
        Pattern matched against each paper's 'venue'. It is interpreted as a
        regular expression, so alternatives are separated by '|'.
    include_unk_ref : bool, default True
        When true, ``unk_ref`` is appended to the request URL.

    Returns
    -------
    pandas.DataFrame or list
        The rows whose venue matches ``include_venues``; an empty list when
        the response contains no entries of the requested type.

    Raises
    ------
    KeyError
        If the response has no ``type_of_papers`` field (for instance when
        the API answered with an error payload instead of a paper).
    """
    url = '{}{}'.format(partial_url, original_paper_id)
    if include_unk_ref:
        url += unk_ref

    r = requests.get(url=url)
    data = r.json()

    if type_of_papers not in data:
        # Same exception type as a plain lookup, but the message shows what
        # the API actually returned.
        raise KeyError("no '{}' field in the response for paper {}: {}".format(
            type_of_papers, original_paper_id, data))

    entries = data[type_of_papers]
    if len(entries) == 0:
        return list()

    df = pd.DataFrame.from_dict(entries)
    # na=False: a paper without a venue counts as "no match". Without it the
    # mask contains NaN and pandas refuses to index with it.
    return df[df['venue'].str.contains(include_venues, na=False)]


# Get details about each citation/reference

papers_from_venue: DataFrame computed with 'get_papers_from_original_paper'

__Return__: The number of remaining papers (Semantic Scholar only accepts a limited number of requests, 
so we have to ensure that all the papers were retrieved) __and__ a list of papers with details such as: paperId, title, number of citations and citation velocity. 

In [4]:
def get_papers_with_citation_velocity(papers_from_venue):
    """Fetch title, citation count and citation velocity for each listed paper.

    One request per row is sent to ``partial_url`` + paperId. The loop stops
    as soon as a request is refused, so the caller can wait and resume with
    the rows that were not processed.

    Parameters
    ----------
    papers_from_venue : pandas.DataFrame
        Frame with a 'paperId' column.

    Returns
    -------
    tuple of (int, list of dict)
        The number of rows that were not processed, and one dict per fetched
        paper with the keys 'paperId', 'title', 'nrCitations' and, when the
        response provides it, 'citationVelocity'.
    """
    total = len(papers_from_venue)
    print('Total number of papers: {}'.format(total))

    papers = list()
    processed = 0

    # iterrows() has no length, so tqdm needs `total` to draw a progress bar.
    for _, row in tqdm(papers_from_venue.iterrows(), total=total):
        r = requests.get(url='{}{}'.format(partial_url, row['paperId']))
        data = r.json()

        # HTTP 403/429 or the 'Forbidden' message are treated as "request
        # refused": stop here and report the rest as remaining.
        if r.status_code in (403, 429) or data.get('message') == 'Forbidden':
            break

        processed += 1

        if 'title' not in data:
            # Some other error payload: skip this paper instead of aborting
            # the whole batch with a KeyError.
            print('Skipping {}: unexpected response {}'.format(row['paperId'], data))
            continue

        paper_details = {
            'paperId': row['paperId'],
            'title': data['title'],
            'nrCitations': len(data.get('citations') or []),
        }
        if 'citationVelocity' in data:
            paper_details['citationVelocity'] = data['citationVelocity']

        papers.append(paper_details)

    remaining = total - processed
    print('Remaining papers: {}'.format(remaining))

    return remaining, papers

# Get details about all the citations/references 
Using the functions above, get the details about the papers 

type_of_papers: 'citations' or 'references'

__Return__: pandas DataFrame containing all the papers, or None when no paper from the selected venues was found

In [9]:
def get_papers(original_paper_id, type_of_papers, unk_ref=True, wait_seconds=90):
    """Collect the details of all citations or references of a paper.

    The papers from the selected ``venues`` are listed first, then their
    details are fetched in batches: whenever a batch stops early, the
    function sleeps and resumes with the rows that are still missing.

    Parameters
    ----------
    original_paper_id : str
        Id of the paper whose citations/references are wanted.
    type_of_papers : str
        'citations' or 'references'.
    unk_ref : bool, default True
        Forwarded as ``include_unk_ref``. Inside this function the name
        shadows the module-level ``unk_ref`` query string.
    wait_seconds : int or float, default 90
        Pause between two batches when requests are being refused.

    Returns
    -------
    pandas.DataFrame or None
        One row per fetched paper, or None when no paper was listed.
    """
    papers = get_papers_from_original_paper(original_paper_id, type_of_papers, venues, unk_ref)

    if len(papers) == 0:
        return None

    remaining = len(papers)
    all_papers = list()

    while remaining != 0:
        # The unprocessed rows are always the last `remaining` ones.
        pending = papers.iloc[-remaining:]
        remaining, papers_with_velocity = get_papers_with_citation_velocity(pending)
        all_papers.extend(papers_with_velocity)

        if remaining != 0:
            # Requests are blocked when too many are sent, so wait a bit.
            print('Sleep {}s'.format(wait_seconds))
            time.sleep(wait_seconds)

    return pd.DataFrame.from_dict(all_papers)

# Filter the papers and save them in csv file

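papers_df: The DataFrame computed with 'get_papers'
file_name: The name of the output csv file (without extension) 
original_paper_identifier: Prefix of the output file name (the file is written as '<identifier>_<file_name>.csv')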

__Return__: The filtered DataFrame

In [6]:
def filter_save_selected_papers(papers_df, file_name, original_paper_identifier):
    """Keep the papers with a positive citation velocity and save them as CSV.

    The kept rows are sorted by 'citationVelocity' in descending order and
    de-duplicated on 'title' (the first, i.e. highest-velocity, row wins).

    Parameters
    ----------
    papers_df : pandas.DataFrame
        Frame with at least a 'title' column, usually also 'citationVelocity'.
    file_name : str
        Second part of the output file name, without extension.
    original_paper_identifier : str
        First part of the output file name; the file is written to
        '<original_paper_identifier>_<file_name>.csv'.

    Returns
    -------
    pandas.DataFrame
        The filtered frame that was written to disk.
    """
    if 'citationVelocity' not in papers_df.columns:
        # The column is missing when no paper provided a velocity; treat that
        # as "no paper passes the filter" instead of failing with a KeyError.
        papers_df = papers_df.assign(citationVelocity=float('nan'))

    papers_with_citations = (
        papers_df[papers_df['citationVelocity'] > 0]
        .sort_values(by=['citationVelocity'], ascending=False)
        .drop_duplicates(['title'])
    )
    papers_with_citations.to_csv('{}_{}.csv'.format(original_paper_identifier, file_name), index=False)

    return papers_with_citations

# Select papers after manual inspection

file_name: The name of the file with the list of papers 
paper_ids: A list with the paper ids of the manually selected papers

__Return__: pandas DataFrame with the selected papers

In [15]:
def select_papers(file_name, paper_ids, out_name=None):
    """Extract the manually selected papers from a saved CSV file.

    Parameters
    ----------
    file_name : str
        CSV file with a 'paperId' column; '.csv' is appended when missing.
    paper_ids : list of str
        Ids of the papers to keep.
    out_name : str, optional
        The result is written to 'selected_<out_name>'. When omitted, the
        (extension-completed) input file name is used instead.

    Returns
    -------
    pandas.DataFrame
        The rows whose 'paperId' is in ``paper_ids``.
    """
    # Only a real '.csv' suffix counts: testing for the bare substring 'csv'
    # would treat a name such as 'csv_list' as already having the extension.
    if not file_name.endswith('.csv'):
        file_name = '{}.csv'.format(file_name)
    df = pd.read_csv(file_name)

    selected = df[df['paperId'].isin(paper_ids)]

    target = out_name if out_name else file_name
    selected.to_csv('selected_{}'.format(target), index=False)

    return selected

In [11]:
def get_year_venue(file):
    """Add (or refresh) the 'year' and 'venue' columns of a saved paper list.

    The CSV file is read, one request per row is sent to ``partial_url`` +
    paperId, and the file is overwritten in place with the two columns set
    from the responses.

    Parameters
    ----------
    file : str
        Path of a CSV file with a 'paperId' column.

    Raises
    ------
    KeyError
        If a response has no 'venue' or 'year' field; the file is left
        untouched in that case.
    """
    df = pd.read_csv(file)

    years = []
    venue_names = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        url = '{}{}'.format(partial_url, row['paperId'])
        r = requests.get(url=url)
        data = r.json()
        venue_names.append(data['venue'])
        years.append(data['year'])

    # Assign whole columns at once. Pre-filling them with 0 creates integer
    # columns, and writing venue strings into an integer column is an
    # incompatible-dtype assignment that recent pandas versions deprecate.
    df['year'] = years
    df['venue'] = venue_names

    df.to_csv(file, index=False)
        

In [18]:
from os import listdir
from os.path import isfile, join

path = '.'
# Regular files of the current directory whose name contains '_2'.
onlyfiles = [f for f in listdir(path) if isfile(join(path, f)) and '_2' in f]

# get_year_venue rewrites each file in place.
processed = [get_year_venue(f) for f in onlyfiles]

5it [00:06,  1.26s/it]
3it [00:03,  1.12s/it]


# Examples

## Schema matching

In [41]:
# Survey: "A survey of approaches to automatic schema matching"
schema_matching_survey = '580221d63ae75bdc7d68829916cf608e44a56b27'

# References are requested first, then citations.
references_df = get_papers(schema_matching_survey, type_of_papers='references')
citations_df = get_papers(schema_matching_survey, type_of_papers='citations')

# Citing papers come first in the combined frame; the filtered result is
# written to '<survey id>_schema_matching.csv'.
schema_matching_df = pd.concat([citations_df, references_df])
filtered_schema_matching = filter_save_selected_papers(
    papers_df=schema_matching_df,
    file_name='schema_matching',
    original_paper_identifier=schema_matching_survey,
)




0it [00:00, ?it/s][A[A

Total number of papers: 19




1it [00:00,  1.07it/s][A[A

2it [00:03,  1.33s/it][A[A

3it [00:04,  1.24s/it][A[A

4it [00:04,  1.06s/it][A[A

5it [00:08,  1.85s/it][A[A

6it [00:08,  1.39s/it][A[A

7it [00:10,  1.45s/it][A[A

8it [00:13,  1.92s/it][A[A

9it [00:15,  2.01s/it][A[A

10it [00:16,  1.64s/it][A[A

11it [00:18,  1.71s/it][A[A

12it [00:19,  1.52s/it][A[A

13it [00:20,  1.39s/it][A[A

14it [00:21,  1.37s/it][A[A

15it [00:22,  1.18s/it][A[A

16it [00:26,  1.99s/it][A[A

17it [00:27,  1.72s/it][A[A

18it [00:28,  1.58s/it][A[A

19it [00:30,  1.58s/it][A[A


Remaining papers: 0




0it [00:00, ?it/s][A[A

Total number of papers: 260




1it [00:00,  1.35it/s][A[A

2it [00:02,  1.09s/it][A[A

3it [00:03,  1.02s/it][A[A

4it [00:04,  1.11it/s][A[A

5it [00:04,  1.15it/s][A[A

6it [00:05,  1.19it/s][A[A

7it [00:06,  1.22it/s][A[A

8it [00:07,  1.01it/s][A[A

9it [00:08,  1.06it/s][A[A

10it [00:09,  1.30it/s][A[A

11it [00:11,  1.16s/it][A[A

12it [00:13,  1.53s/it][A[A

13it [00:14,  1.49s/it][A[A

14it [00:15,  1.33s/it][A[A

15it [00:17,  1.30s/it][A[A

16it [00:17,  1.15s/it][A[A

17it [00:19,  1.30s/it][A[A

18it [00:20,  1.19s/it][A[A

19it [00:21,  1.09s/it][A[A

20it [00:22,  1.10s/it][A[A

21it [00:23,  1.00s/it][A[A

22it [00:24,  1.01s/it][A[A

23it [00:25,  1.07s/it][A[A

24it [00:26,  1.15s/it][A[A

25it [00:28,  1.17s/it][A[A

26it [00:29,  1.15s/it][A[A

27it [00:31,  1.42s/it][A[A

28it [00:32,  1.29s/it][A[A

29it [00:32,  1.06s/it][A[A

30it [00:33,  1.10it/s][A[A

31it [00:33,  1.38it/s][A[A

32it [00:34,  1.33it/s][A[A

33it [00:35,  1

Remaining papers: 157
Sleep 90s





0it [00:00, ?it/s][A[A[A

Total number of papers: 157


0it [00:01, ?it/s]


Remaining papers: 157
Sleep 90s





0it [00:00, ?it/s][A[A[A

Total number of papers: 157





1it [00:00,  1.51it/s][A[A[A


2it [00:02,  1.02s/it][A[A[A


3it [00:03,  1.05it/s][A[A[A


4it [00:04,  1.12it/s][A[A[A


5it [00:04,  1.25it/s][A[A[A


6it [00:05,  1.13it/s][A[A[A


7it [00:07,  1.18s/it][A[A[A


8it [00:08,  1.05s/it][A[A[A


9it [00:09,  1.01s/it][A[A[A


10it [00:10,  1.06it/s][A[A[A


11it [00:10,  1.13it/s][A[A[A


12it [00:11,  1.41it/s][A[A[A


13it [00:12,  1.23it/s][A[A[A


14it [00:13,  1.12it/s][A[A[A


15it [00:13,  1.17it/s][A[A[A


16it [00:15,  1.08it/s][A[A[A


17it [00:16,  1.02s/it][A[A[A


18it [00:17,  1.01s/it][A[A[A


19it [00:17,  1.18it/s][A[A[A


20it [00:18,  1.19it/s][A[A[A


21it [00:19,  1.20it/s][A[A[A


22it [00:20,  1.20it/s][A[A[A


23it [00:21,  1.22it/s][A[A[A


24it [00:21,  1.23it/s][A[A[A


25it [00:22,  1.21it/s][A[A[A


26it [00:23,  1.31it/s][A[A[A


27it [00:24,  1.29it/s][A[A[A


28it [00:24,  1.46it/s][A[A[A


29it [00:25,  1.51it/s][A

Remaining papers: 0





In [43]:
# Survey: "Generic Schema Matching, Ten Years Later"
schema_matching_survey_2 = '55c186046feee5614cd15909dfcc587e0ff662d8'

# References are requested first, then citations.
references_df_2 = get_papers(schema_matching_survey_2, type_of_papers='references')
citations_df_2 = get_papers(schema_matching_survey_2, type_of_papers='citations')

# Citing papers come first in the combined frame; the filtered result is
# written to '<survey id>_schema_matching.csv'.
schema_matching_df_2 = pd.concat([citations_df_2, references_df_2])
filtered_schema_matching_2 = filter_save_selected_papers(
    papers_df=schema_matching_df_2,
    file_name='schema_matching',
    original_paper_identifier=schema_matching_survey_2,
)




0it [00:00, ?it/s][A[A[A

Total number of papers: 35





1it [00:01,  1.45s/it][A[A[A


2it [00:03,  1.59s/it][A[A[A


3it [00:04,  1.37s/it][A[A[A


4it [00:04,  1.05s/it][A[A[A


5it [00:05,  1.02s/it][A[A[A


6it [00:06,  1.12it/s][A[A[A


7it [00:09,  1.61s/it][A[A[A


8it [00:09,  1.22s/it][A[A[A


9it [00:10,  1.20s/it][A[A[A


10it [00:11,  1.14s/it][A[A[A


11it [00:13,  1.26s/it][A[A[A


12it [00:14,  1.28s/it][A[A[A


13it [00:15,  1.19s/it][A[A[A


14it [00:16,  1.13s/it][A[A[A


15it [00:17,  1.03s/it][A[A[A


16it [00:18,  1.00s/it][A[A[A


17it [00:19,  1.07it/s][A[A[A


18it [00:20,  1.05it/s][A[A[A


19it [00:21,  1.02s/it][A[A[A


20it [00:22,  1.19s/it][A[A[A


21it [00:23,  1.06s/it][A[A[A


22it [00:25,  1.18s/it][A[A[A


23it [00:26,  1.13s/it][A[A[A


24it [00:27,  1.08s/it][A[A[A


25it [00:28,  1.20s/it][A[A[A


26it [00:31,  1.59s/it][A[A[A


27it [00:32,  1.52s/it][A[A[A


28it [00:33,  1.29s/it][A[A[A


29it [00:34,  1.20s/it][A

Remaining papers: 0





0it [00:00, ?it/s][A[A[A

Total number of papers: 27





1it [00:00,  1.19it/s][A[A[A


2it [00:01,  1.10it/s][A[A[A


3it [00:03,  1.15s/it][A[A[A


4it [00:04,  1.05s/it][A[A[A


5it [00:04,  1.23it/s][A[A[A


6it [00:06,  1.04s/it][A[A[A


7it [00:07,  1.03it/s][A[A[A


8it [00:07,  1.05it/s][A[A[A


9it [00:08,  1.09it/s][A[A[A


10it [00:09,  1.29it/s][A[A[A


11it [00:10,  1.28it/s][A[A[A


12it [00:10,  1.26it/s][A[A[A


13it [00:11,  1.15it/s][A[A[A


14it [00:12,  1.44it/s][A[A[A


15it [00:13,  1.39it/s][A[A[A


16it [00:14,  1.20it/s][A[A[A


17it [00:14,  1.25it/s][A[A[A


18it [00:15,  1.12it/s][A[A[A


19it [00:16,  1.11it/s][A[A[A


20it [00:17,  1.04it/s][A[A[A


21it [00:18,  1.31it/s][A[A[A


22it [00:19,  1.22it/s][A[A[A


23it [00:20,  1.10it/s][A[A[A


24it [00:20,  1.36it/s][A[A[A


25it [00:21,  1.32it/s][A[A[A


26it [00:21,  1.64it/s][A[A[A


27it [00:22,  1.20it/s][A[A[A

Remaining papers: 0





### Union the papers

In [45]:
# Merge the results of both schema-matching surveys, then filter, sort and
# de-duplicate them again; the result is written to '_all_sch_match.csv'.
sch_match_df = filter_save_selected_papers(
    pd.concat([filtered_schema_matching, filtered_schema_matching_2]),
    'all_sch_match',
    '',
)


### Manually inspecting the papers 

In [46]:
# Hand-picked paper ids to look up in '_all_sch_match.csv'.
paper_ids = ['e6a7faf05f1284af3ecd925e3c1efd4c02f1989e',
            'f3e0afe5e0a5761cd53a18439696c09c3ce1625b', 
            '7ff9bf4d58358fc008b059028a3e33919d12b335',
            '1e2281603c94b690db0df65d09779e6ea5470114',
            'af1e1bee41d004a6c1fa608a9fe2a884f48c6e5f',
            'b4d2d42cc56e6a52ea1bc19febc2c4fcdd4f6dd8',
            '9011405b759b492b1132aea7b6165c9d1b0513e7',
            '3ad25d7ddaf1393198bfee58ccad6450532877c7',
            'afdb8e8ef5c2a0e91c9a5103e3ab01ad263d0130',
            '95e3e0cdffe7f4c19f42dff7674c6673f92ff960',
            'aa1158b0ba07da121e4f2ab039bff2c03c929616',
            'd7e5fb5fbc996ee646f8482d89318b4327321017']

# Single papers from the same file that are saved under separate names.
entity_matching = ['159edf7f1fdd662b386d5daa0138f882970e3544']
schema_mapping = ['48b25e7b893babdf01b98771d6fd312480e22013']

# Without an output name the result goes to 'selected__all_sch_match.csv';
# the other two calls write 'selected__entity_matching.csv' and
# 'selected__schema_mapping.csv'. The last call's return value is what the
# cell displays.
selected_sch_match = select_papers('_all_sch_match.csv', paper_ids)
select_papers('_all_sch_match', entity_matching, '_entity_matching.csv')
select_papers('_all_sch_match', schema_mapping, '_schema_mapping.csv')

Unnamed: 0,paperId,title,nrCitations,citationVelocity
53,48b25e7b893babdf01b98771d6fd312480e22013,Synthesizing Mapping Relationships Using Table...,12,4


## Schema mapping

In [16]:
# Survey: "On Evaluating Schema Matching and Mapping"
schema_mapping_survey = 'e5b50377a9e4864d9e262a428e018c01ebe67a8d'

# References are requested first, then citations.
ref_smap_df = get_papers(schema_mapping_survey, type_of_papers='references')
cit_smap_df = get_papers(schema_mapping_survey, type_of_papers='citations')

# get_papers returns None when nothing was found; in that case only the
# citing papers are used.
schema_map_df = cit_smap_df if ref_smap_df is None else pd.concat([cit_smap_df, ref_smap_df])

filtered_schema_mapping = filter_save_selected_papers(
    papers_df=schema_map_df,
    file_name='schema_mapping',
    original_paper_identifier=schema_mapping_survey,
)


0it [00:00, ?it/s][A

Total number of papers: 30



1it [00:00,  1.07it/s][A
2it [00:01,  1.03it/s][A
3it [00:03,  1.02s/it][A
4it [00:04,  1.00it/s][A
5it [00:04,  1.08it/s][A
6it [00:07,  1.32s/it][A
7it [00:08,  1.46s/it][A
8it [00:11,  1.78s/it][A
9it [00:12,  1.62s/it][A
10it [00:14,  1.55s/it][A
11it [00:15,  1.42s/it][A
12it [00:16,  1.29s/it][A
13it [00:17,  1.18s/it][A
14it [00:17,  1.11s/it][A
15it [00:18,  1.06s/it][A
16it [00:19,  1.22it/s][A
17it [00:20,  1.18it/s][A
18it [00:21,  1.02it/s][A
19it [00:22,  1.09it/s][A
20it [00:22,  1.26it/s][A
21it [00:23,  1.24it/s][A
22it [00:26,  1.41s/it][A
23it [00:27,  1.27s/it][A
24it [00:27,  1.02it/s][A
25it [00:30,  1.56s/it][A
26it [00:33,  2.13s/it][A
27it [00:34,  1.77s/it][A
28it [00:35,  1.58s/it][A
29it [00:36,  1.34s/it][A
30it [00:37,  1.25s/it][A


Remaining papers: 0



0it [00:00, ?it/s][A

Total number of papers: 4



1it [00:00,  1.23it/s][A
2it [00:01,  1.42it/s][A
3it [00:01,  1.60it/s][A
4it [00:02,  1.72it/s][A

Remaining papers: 0





In [15]:
# Survey: "Schema Mapping as Query Discovery"
schema_mapping_survey_2 = 'a0790bf8d16e0d5e0374566881dfdfb2f20faa88'

# References are requested first, then citations.
ref_smap_df_2 = get_papers(schema_mapping_survey_2, type_of_papers='references')
cit_smap_df_2 = get_papers(schema_mapping_survey_2, type_of_papers='citations')

# get_papers returns None when nothing was found; in that case only the
# citing papers are used.
schema_map_df_2 = cit_smap_df_2 if ref_smap_df_2 is None else pd.concat([cit_smap_df_2, ref_smap_df_2])

filtered_schema_mapping_2 = filter_save_selected_papers(
    papers_df=schema_map_df_2,
    file_name='schema_mapping',
    original_paper_identifier=schema_mapping_survey_2,
)


0it [00:00, ?it/s][A

Total number of papers: 91



1it [00:00,  1.17it/s][A
2it [00:01,  1.16it/s][A
3it [00:03,  1.19s/it][A
4it [00:05,  1.42s/it][A
5it [00:06,  1.18s/it][A
6it [00:07,  1.07s/it][A
7it [00:08,  1.05s/it][A
8it [00:09,  1.04s/it][A
9it [00:09,  1.03it/s][A
10it [00:10,  1.20it/s][A
11it [00:11,  1.28it/s][A
12it [00:12,  1.20it/s][A
13it [00:13,  1.14it/s][A
14it [00:13,  1.40it/s][A
15it [00:14,  1.40it/s][A
16it [00:14,  1.68it/s][A
17it [00:15,  1.39it/s][A
18it [00:15,  1.57it/s][A
19it [00:16,  1.47it/s][A
20it [00:16,  1.73it/s][A
21it [00:18,  1.35it/s][A
22it [00:19,  1.24it/s][A
23it [00:19,  1.23it/s][A
24it [00:21,  1.04it/s][A
25it [00:21,  1.23it/s][A
26it [00:22,  1.16it/s][A
27it [00:23,  1.20it/s][A
28it [00:24,  1.13it/s][A
29it [00:25,  1.19it/s][A
30it [00:25,  1.31it/s][A
31it [00:26,  1.22it/s][A
32it [00:28,  1.01it/s][A
33it [00:28,  1.17it/s][A
34it [00:29,  1.00it/s][A
35it [00:30,  1.17it/s][A
36it [00:32,  1.32s/it][A
37it [00:33,  1.23s/it][A
38it [00:

Remaining papers: 0





In [10]:
# Survey: "Data integration: a theoretical perspective"
schema_mapping_survey_3 = '014110225603dd5e0f848a43878afa240034bb19'

# References are requested first, then citations.
ref_smap_df_3 = get_papers(schema_mapping_survey_3, type_of_papers='references')
cit_smap_df_3 = get_papers(schema_mapping_survey_3, type_of_papers='citations')

# get_papers returns None when nothing was found; in that case only the
# citing papers are used.
schema_map_df_3 = cit_smap_df_3 if ref_smap_df_3 is None else pd.concat([cit_smap_df_3, ref_smap_df_3])

filtered_schema_mapping_3 = filter_save_selected_papers(
    papers_df=schema_map_df_3,
    file_name='schema_mapping',
    original_paper_identifier=schema_mapping_survey_3,
)

0it [00:00, ?it/s]

Total number of papers: 24


24it [00:30,  1.27s/it]


Remaining papers: 0


0it [00:00, ?it/s]

Total number of papers: 163


157it [02:03,  1.21it/s]

Remaining papers: 6
Sleep 90s



0it [00:00, ?it/s][A

Total number of papers: 6


0it [00:00, ?it/s]


Remaining papers: 6
Sleep 90s



0it [00:00, ?it/s][A

Total number of papers: 6


0it [00:00, ?it/s]


Remaining papers: 6
Sleep 90s



0it [00:00, ?it/s][A

Total number of papers: 6


0it [00:00, ?it/s]


Remaining papers: 6
Sleep 90s



0it [00:00, ?it/s][A

Total number of papers: 6



1it [00:01,  1.11s/it][A
2it [00:02,  1.07s/it][A
3it [00:03,  1.04s/it][A
4it [00:04,  1.02s/it][A
5it [00:05,  1.11s/it][A
6it [00:06,  1.06s/it][A

Remaining papers: 0





In [14]:
# Survey: "Schema mappings and data examples"
schema_mapping_survey_4 = 'a30f5a745f6b691927114cdaa48b6965f11058a2'

# References are requested first, then citations.
ref_smap_df_4 = get_papers(schema_mapping_survey_4, type_of_papers='references')
cit_smap_df_4 = get_papers(schema_mapping_survey_4, type_of_papers='citations')

# get_papers returns None when nothing was found; in that case only the
# citing papers are used.
schema_map_df_4 = cit_smap_df_4 if ref_smap_df_4 is None else pd.concat([cit_smap_df_4, ref_smap_df_4])

filtered_schema_mapping_4 = filter_save_selected_papers(
    papers_df=schema_map_df_4,
    file_name='schema_mapping',
    original_paper_identifier=schema_mapping_survey_4,
)


0it [00:00, ?it/s][A

Total number of papers: 12



1it [00:00,  1.04it/s][A
2it [00:02,  1.08s/it][A
3it [00:03,  1.01s/it][A
4it [00:05,  1.43s/it][A
5it [00:06,  1.24s/it][A
6it [00:07,  1.11s/it][A
7it [00:07,  1.10it/s][A
8it [00:08,  1.07it/s][A
9it [00:09,  1.11it/s][A
10it [00:10,  1.13it/s][A
11it [00:11,  1.04it/s][A
12it [00:12,  1.04s/it][A


Remaining papers: 0



0it [00:00, ?it/s][A

Total number of papers: 5



1it [00:00,  1.61it/s][A
2it [00:00,  1.89it/s][A
3it [00:01,  1.51it/s][A
4it [00:02,  1.39it/s][A
5it [00:03,  1.39it/s][A

Remaining papers: 0





### Union the papers

In [17]:
# Merge the schema-mapping results, then filter, sort and de-duplicate them
# again; the result is written to '_all_sch_map.csv'.
# NOTE(review): `filtered_schema_mapping` (the first survey, "On Evaluating
# Schema Matching and Mapping") is not part of this union - confirm whether
# that is intentional.
sch_map_df = filter_save_selected_papers(
    pd.concat([
        filtered_schema_mapping_2,
        filtered_schema_mapping_3,
        filtered_schema_mapping_4,
    ]),
    'all_sch_map',
    '',
)
display(sch_map_df)


Unnamed: 0,paperId,title,nrCitations,citationVelocity
148,71219c274777ea42e79180d05a9a377690207e07,Ontology Matching: State of the Art and Future...,796,86
3,014110225603dd5e0f848a43878afa240034bb19,Data integration: a theoretical perspective,1934,86
76,580221d63ae75bdc7d68829916cf608e44a56b27,A survey of approaches to automatic schema mat...,2632,65
41,aa1158b0ba07da121e4f2ab039bff2c03c929616,Similarity flooding: a versatile graph matchin...,953,40
55,1126ceee34acd741396c493c84d8b6072a18bfd7,Potter's Wheel: An Interactive Data Cleaning S...,444,34
11,a7209ca952aa3055e218cdb469d7ce01404d7462,Answering queries using views: A survey,1188,34
110,6a063509ea79ab01571e444b9e4a3cea38239fdd,Data integration: the teenage years,458,23
89,55c186046feee5614cd15909dfcc587e0ff662d8,"Generic Schema Matching, Ten Years Later",177,20
35,7ff9bf4d58358fc008b059028a3e33919d12b335,Generic Schema Matching with Cupid,1114,19
79,af1e1bee41d004a6c1fa608a9fe2a884f48c6e5f,Reducing Uncertainty of Schema Matching via Cr...,78,14


### Manually inspecting the papers

In [18]:
# Hand-picked paper ids to look up in '_all_sch_map.csv'.
# Note: this rebinds `paper_ids`, which the schema-matching section also uses.
paper_ids = ['0eb31f4a0981882a7f889ba373dab907e4631e77',
            '0bc3e0fa8da8e410e0b0700be04b9b59770decee',
            '9a059df30f9919578adb3ac466a734685401e68b',
            'af1e1bee41d004a6c1fa608a9fe2a884f48c6e5f']

# Without an output name the result goes to 'selected__all_sch_map.csv'.
selected_sch_map = select_papers('_all_sch_map.csv', paper_ids)

## Entity resolution

In [47]:
# Survey: "Evaluation of entity resolution approaches on real-world match problems"
entity_resolution_survey = 'c479e3fc832f8bdc63325525995710ed9c314ff8'

# References are requested first, then citations.
ref_er_df = get_papers(entity_resolution_survey, type_of_papers='references')
cit_er_df = get_papers(entity_resolution_survey, type_of_papers='citations')

# Citing papers come first in the combined frame; the filtered result is
# written to '<survey id>_entity_resolution.csv'.
entity_resolution_df = pd.concat([cit_er_df, ref_er_df])
filtered_entity_resolution = filter_save_selected_papers(
    papers_df=entity_resolution_df,
    file_name='entity_resolution',
    original_paper_identifier=entity_resolution_survey,
)





0it [00:00, ?it/s][A[A[A

Total number of papers: 16





1it [00:01,  1.37s/it][A[A[A


2it [00:01,  1.14s/it][A[A[A


3it [00:03,  1.20s/it][A[A[A


4it [00:03,  1.05it/s][A[A[A


5it [00:05,  1.16s/it][A[A[A


6it [00:05,  1.00it/s][A[A[A


7it [00:06,  1.01s/it][A[A[A


8it [00:09,  1.35s/it][A[A[A


9it [00:12,  1.94s/it][A[A[A


10it [00:13,  1.68s/it][A[A[A


11it [00:15,  1.68s/it][A[A[A


12it [00:16,  1.45s/it][A[A[A


13it [00:17,  1.30s/it][A[A[A


14it [00:18,  1.42s/it][A[A[A


15it [00:20,  1.61s/it][A[A[A


16it [00:22,  1.39s/it][A[A[A


Remaining papers: 0





0it [00:00, ?it/s][A[A[A

Total number of papers: 49





1it [00:01,  1.01s/it][A[A[A


2it [00:01,  1.06it/s][A[A[A


3it [00:02,  1.03it/s][A[A[A


4it [00:03,  1.08it/s][A[A[A


5it [00:04,  1.14it/s][A[A[A


6it [00:05,  1.07it/s][A[A[A


7it [00:06,  1.14it/s][A[A[A


8it [00:07,  1.18it/s][A[A[A


9it [00:07,  1.21it/s][A[A[A


10it [00:09,  1.22s/it][A[A[A


11it [00:10,  1.12s/it][A[A[A


12it [00:11,  1.03s/it][A[A[A


13it [00:12,  1.08s/it][A[A[A


14it [00:13,  1.03s/it][A[A[A


15it [00:14,  1.01s/it][A[A[A


16it [00:14,  1.26it/s][A[A[A


17it [00:15,  1.19it/s][A[A[A


18it [00:16,  1.20it/s][A[A[A


19it [00:17,  1.11it/s][A[A[A


20it [00:18,  1.25it/s][A[A[A


21it [00:19,  1.20it/s][A[A[A


22it [00:20,  1.11it/s][A[A[A


23it [00:21,  1.07it/s][A[A[A


24it [00:22,  1.02s/it][A[A[A


25it [00:23,  1.02it/s][A[A[A


26it [00:25,  1.25s/it][A[A[A


27it [00:26,  1.10s/it][A[A[A


28it [00:26,  1.09it/s][A[A[A


29it [00:27,  1.14it/s][A

Remaining papers: 0





In [10]:
# Survey: "Entity Resolution: Past, Present and Yet-to-Come"
entity_resolution_survey_2 = 'c86ebb69053348f7dce9cd155cbf84e4067dda45'

# References are requested first (unknown references explicitly included),
# then citations.
ref_er_df_2 = get_papers(entity_resolution_survey_2, type_of_papers='references', unk_ref=True)
cit_er_df_2 = get_papers(entity_resolution_survey_2, type_of_papers='citations')

# Citing papers come first in the combined frame; the filtered result is
# written to '<survey id>_entity_resolution.csv'.
entity_resolution_df_2 = pd.concat([cit_er_df_2, ref_er_df_2])
filtered_entity_resolution_2 = filter_save_selected_papers(
    papers_df=entity_resolution_df_2,
    file_name='entity_resolution',
    original_paper_identifier=entity_resolution_survey_2,
)



### Union the papers

In [None]:
# Merge the results of both entity-resolution surveys, then filter, sort and
# de-duplicate them again; the result is written to '_all_er.csv'.
er_df = filter_save_selected_papers(
    pd.concat([filtered_entity_resolution, filtered_entity_resolution_2]),
    'all_er',
    '',
)


### Manually selecting the papers

In [17]:
# Id of the survey whose saved '<id>_entity_resolution.csv' file is read
# below (same value as in the cell that fetched the survey).
entity_resolution_survey_2 = 'c86ebb69053348f7dce9cd155cbf84e4067dda45'


# Hand-picked paper ids, saved to 'selected__entity_matching_2.csv'.
index_em = ['c0a29cb35c2965930566d6a407da043e18431eaa', 
           '7f8a1ba888fc4ce551530914d68f23ac54ce265f',
           '8bc23235070ce181d34002e2a44e4b233beaa732',
           '52d2b8b64fe06b1f920c0d72e321a6f904800d5c']
select_papers('{}_entity_resolution.csv'
               .format(entity_resolution_survey_2), 
              index_em, '_entity_matching_2.csv')

# Hand-picked paper ids, saved to 'selected__entity_res_2.csv'.
index_entity_res = ['62e8cdd77f3494a48a48b55150a093833788386d',
                   '904aac3f0c8bf7f9d97a23d5f563c98fcfb6d104',
                   '4c63aee3ac3d136560c5a333520a9851333a9ded',
                   '814f90ef27bfe5a90e118a1df0e24488e75b7939',
                   '71dfda705992e69797115f94f6f00bc56e0923b5',
                   '8717e86b0b12010f51a6b939cf070989355802ee',
                   'ddf36ee57133fa5fdff848da15b21983dc893a5a']
selected_entity_res = select_papers('{}_entity_resolution.csv'
                                    .format(entity_resolution_survey_2), 
                                    index_entity_res, '_entity_res_2.csv')


## Entity matching

In [49]:
# Survey: "Frameworks for entity matching: A comparison"
entity_matching_survey = 'caa5f6292a065aa1604ff00186d48d2d59cffd63'

# References are requested first, then citations.
ref_em_df = get_papers(entity_matching_survey, type_of_papers='references')
cit_em_df = get_papers(entity_matching_survey, type_of_papers='citations')

# Citing papers come first in the combined frame; the filtered result is
# written to '<survey id>_entity_matching.csv'.
entity_matching_df = pd.concat([cit_em_df, ref_em_df])
filtered_entity_matching = filter_save_selected_papers(
    papers_df=entity_matching_df,
    file_name='entity_matching',
    original_paper_identifier=entity_matching_survey,
)





0it [00:00, ?it/s][A[A[A

Total number of papers: 36





1it [00:01,  1.25s/it][A[A[A


2it [00:02,  1.36s/it][A[A[A


3it [00:03,  1.16s/it][A[A[A


4it [00:05,  1.25s/it][A[A[A


5it [00:05,  1.14s/it][A[A[A


6it [00:09,  1.99s/it][A[A[A


7it [00:11,  1.93s/it][A[A[A


8it [00:12,  1.62s/it][A[A[A


9it [00:14,  1.60s/it][A[A[A


10it [00:16,  1.81s/it][A[A[A


11it [00:17,  1.64s/it][A[A[A


12it [00:23,  2.89s/it][A[A[A


13it [00:24,  2.37s/it][A[A[A


14it [00:25,  1.94s/it][A[A[A


15it [00:27,  1.81s/it][A[A[A


16it [00:27,  1.50s/it][A[A[A


17it [00:28,  1.16s/it][A[A[A


18it [00:29,  1.08s/it][A[A[A


19it [00:30,  1.08s/it][A[A[A


20it [00:30,  1.02it/s][A[A[A


21it [00:32,  1.02s/it][A[A[A


22it [00:33,  1.04s/it][A[A[A


23it [00:33,  1.09it/s][A[A[A


24it [00:35,  1.07s/it][A[A[A


25it [00:36,  1.03s/it][A[A[A


26it [00:37,  1.00it/s][A[A[A


27it [00:38,  1.01it/s][A[A[A


28it [00:38,  1.27it/s][A[A[A


29it [00:39,  1.21it/s][A

Remaining papers: 0





0it [00:00, ?it/s][A[A[A

Total number of papers: 38





1it [00:00,  1.04it/s][A[A[A


2it [00:02,  1.13s/it][A[A[A


3it [00:03,  1.06s/it][A[A[A


4it [00:05,  1.31s/it][A[A[A


5it [00:06,  1.14s/it][A[A[A


6it [00:06,  1.03it/s][A[A[A


7it [00:07,  1.01s/it][A[A[A


8it [00:08,  1.18it/s][A[A[A


9it [00:09,  1.01it/s][A[A[A


10it [00:10,  1.01it/s][A[A[A


11it [00:11,  1.08it/s][A[A[A


12it [00:12,  1.15it/s][A[A[A


13it [00:12,  1.11it/s][A[A[A


14it [00:13,  1.12it/s][A[A[A


15it [00:14,  1.11it/s][A[A[A


16it [00:15,  1.24it/s][A[A[A


17it [00:16,  1.15it/s][A[A[A


18it [00:17,  1.25it/s][A[A[A


19it [00:17,  1.25it/s][A[A[A


20it [00:18,  1.40it/s][A[A[A


21it [00:19,  1.22it/s][A[A[A


22it [00:20,  1.08it/s][A[A[A


23it [00:21,  1.14it/s][A[A[A


24it [00:22,  1.10it/s][A[A[A


25it [00:23,  1.14it/s][A[A[A


26it [00:24,  1.03it/s][A[A[A


27it [00:25,  1.01it/s][A[A[A


28it [00:25,  1.27it/s][A[A[A


29it [00:26,  1.23it/s][A

Remaining papers: 6
Sleep 90s






0it [00:00, ?it/s][A[A[A[A

Total number of papers: 6


0it [00:00, ?it/s]


Remaining papers: 6
Sleep 90s






0it [00:00, ?it/s][A[A[A[A

Total number of papers: 6






1it [00:00,  1.52it/s][A[A[A[A



2it [00:01,  1.52it/s][A[A[A[A



3it [00:02,  1.29it/s][A[A[A[A



4it [00:03,  1.21it/s][A[A[A[A



5it [00:04,  1.09it/s][A[A[A[A



6it [00:05,  1.14it/s][A[A[A[A

Remaining papers: 0





### Manually filter the papers

In [50]:
# Hand-picked paper ids to look up in '<survey id>_entity_matching.csv'.
# No output name is given, so the result is written to
# 'selected_<survey id>_entity_matching.csv'.
index_entity_match = ['0ab4f5f03665fadd1d838b22fc3991062fb91928',
                      '632eca15ed20f87490c60a6005c4c58f06bee61b',
                      'd5f1bf73988ade49a6358afa12b36041118bb042',
                      '761b151368e709b972a1fa2c6f672ad6c675f37a',
                      'e185c3798419512963996b49760ca0e48b3e57bf',
                      '8ff529534f4f83f93a1d0a100d089f2042e53f3c',
                      '7edf5ffa6213d5f8c44f07a10411cf617363d0a0']
selected_entity_match = select_papers(
    '{}_entity_matching.csv'.format(entity_matching_survey),
    index_entity_match)

# Papers from the same file that are saved to 'selected__entity_resolution.csv'.
index_er = ['904aac3f0c8bf7f9d97a23d5f563c98fcfb6d104',
           '4fff4cb2a07a14bf3e41188d094944e9d95a5737']
selected_er = select_papers('{}_entity_matching.csv'.format(entity_matching_survey), index_er,
                            '_entity_resolution.csv')