# Entity Linking

As a result of our knowledge extraction from scientific articles, we now know which software is mentioned in the articles. However, the problem remains that authors refer to the same software by different names. Therefore, to allow for the best reasoning, we need to be able to map the different names to one distinct software. This is implemented here.

First we need to import the list we generated in the previous steps. 

In [None]:
import csv
import re
import nltk
import pandas as pd

from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords

# Read the (name, count) rows produced by the previous step into one bucket per
# name. Each bucket starts with a single name; later steps merge buckets.
software_counter = []
# newline='' lets the csv module do its own newline handling, as its
# documentation recommends for files passed to csv.reader.
with open('software_reasoning_list_total.csv', 'r', newline='') as software_file:
    software_csv = csv.reader(software_file)
    next(software_csv, None)  # skip the header row
    for row in software_csv:
        software_counter.append({
            'names': [row[0]],
            'count': int(row[1]),
            'unique name': ''
        })

# Keep a backup whose buckets (and their name lists) are separate objects, so
# changes made to the buckets in software_counter do not show up in the backup.
software_counter_bak = [
    dict(bucket, names=list(bucket['names'])) for bucket in software_counter
]

# Shared helpers for the name normalization below.
stemmer = SnowballStemmer(language='english')
stops =  stopwords.words('english')

In [None]:
# Map every raw software name to its own occurrence count (before any linking).
# newline='' lets the csv module do its own newline handling, as its
# documentation recommends for files passed to csv.reader.
with open('software_reasoning_list_total.csv', 'r', newline='') as software_file:
    software_csv = csv.reader(software_file)
    next(software_csv, None)  # skip the header row
    # If a name occurs in several rows, the last row wins.
    base_software_counts = {row[0]: int(row[1]) for row in software_csv}

Next we start out with considering simple spelling deviations: casing, special characters and numbers. 
Each time we find a match we will combine the buckets in the list by appending their names. 
How the unique name is assigned will be considered later on.

We can also remove stopwords and try to stem 'normal' words appearing in software names. The stemming needs to be smart enough not to wrongly stem abbreviations or other 'unstemmable' words.  

We can also remove errors at this point, for example if only a single token was extracted and that token is a special character. In this case our transformation pipeline will just yield an empty string, so we know that we have found an error. 

We also take into account abbreviations which we build from the first letters and use to improve the linking.

In [None]:
%%time 
# Start from fresh copies of the backed-up buckets. match_buckets() appends to
# each bucket's 'names' list and adds to its 'count', so a shallow list copy
# would let it modify the backup as well and re-running this cell would no
# longer restore the original buckets.
software_counter = [
    dict(bucket, names=list(bucket['names'])) for bucket in software_counter_bak
]

def match_buckets(software_counter):
    """Merge buckets whose names agree after normalization.

    Every name is reduced to a matching key: casefolded, runs of
    non-alphanumeric characters replaced by a space, trailing digits and
    ' ,.' stripped, a trailing ' pro' removed, stopwords dropped and, for
    keys of more than one token, each token stemmed. Keys with more than two
    tokens also get an acronym built from the first letter of each unstemmed
    token.

    A bucket is merged into the first later bucket that shares a key with it,
    or whose key equals one of its acronyms (or vice versa). Merging appends
    the bucket's names to the later bucket and adds its count.

    The same normalization is applied to both buckets of a pair. Previously
    the ' pro' suffix was only removed for the earlier bucket, so whether
    e.g. 'X Pro' and 'X' were linked depended on their order in the list.

    Args:
        software_counter: list of dicts with a 'names' list and a 'count'
            int. The list and its dicts are modified in place.

    Returns:
        The same ``software_counter`` list with merged buckets removed.
    """
    non_alnum = re.compile('[^0-9a-zA-Z]+')
    stop_set = set(stops)

    def normalize(name):
        """Return (matching key, acronym or None) for one raw name."""
        key = non_alnum.sub(' ', name.casefold())
        key = key.rstrip('0123456789 ,.').lstrip(' ')
        if key.endswith(' pro'):
            key = key[:-4]
        tokens = [w for w in key.split() if w not in stop_set]
        # The acronym is built before stemming so it uses the real initials.
        acronym = ''.join(t[0] for t in tokens) if len(tokens) > 2 else None
        # Single tokens are left unstemmed (names, abbreviations).
        if len(tokens) > 1:
            tokens = [stemmer.stem(t) for t in tokens]
        return ' '.join(tokens), acronym

    # Normalize every name once instead of once per compared pair.
    # NOTE(review): names that normalize to the empty string (e.g. only
    # special characters or only stopwords) all share the key '' and are
    # therefore merged with each other - confirm this is intended.
    bucket_keys = []
    for bucket in software_counter:
        keys, acronyms = set(), set()
        for raw_name in bucket['names']:
            key, acronym = normalize(raw_name)
            keys.add(key)
            if acronym is not None:
                acronyms.add(acronym)
        bucket_keys.append((keys, acronyms))

    merged_indices = set()
    for cur_idx, cur_software in enumerate(software_counter):
        cur_keys, cur_acronyms = bucket_keys[cur_idx]
        for next_idx in range(cur_idx + 1, len(software_counter)):
            next_keys, next_acronyms = bucket_keys[next_idx]
            if (cur_keys.isdisjoint(next_keys)
                    and cur_acronyms.isdisjoint(next_keys)
                    and next_acronyms.isdisjoint(cur_keys)):
                continue
            target = software_counter[next_idx]
            target['names'].extend(cur_software['names'])
            target['count'] += cur_software['count']
            # The target now also carries the merged names, so it must be
            # matched by their keys and acronyms from here on.
            next_keys.update(cur_keys)
            next_acronyms.update(cur_acronyms)
            merged_indices.add(cur_idx)
            break

    if merged_indices:
        # Slice assignment keeps the caller's list object.
        software_counter[:] = [
            bucket for idx, bucket in enumerate(software_counter)
            if idx not in merged_indices
        ]

    return software_counter

In [None]:
# match_buckets() merges the buckets of software_counter in place and also
# returns the list, which becomes the value of this cell.
match_buckets(software_counter)

## Entity Linking with DBpedia

We created buckets of names which belong to the same entity. 
Next we map them to the names found in DBpedia and further match the buckets.
We iterate over the buckets and if a **single** name matches a DBpedia entry we map it. 

In [None]:
# Load the gzip-compressed DBpedia software table. The code below reads its
# 'unique', 'label' and 'developer' columns.
# NOTE(review): presumably 'unique' is the canonical DBpedia name and 'label'
# an alternative name for it - confirm against the file that produces this CSV.
dbpedia_names = pd.read_csv('dbpedia_software_long.csv.gz', compression='gzip')
# Set of all values in the 'unique' column for fast membership tests.
unique_db_labels = set(dbpedia_names['unique'].tolist())

In [None]:
%%time
from multiprocessing import Pool

def get_unique_name(software_list):
    """Look up the DBpedia 'unique' name for one bucket of software names.

    Three strategies are tried in order; the first one that finds anything
    decides the result:

    1. Direct match: a name is itself contained in ``unique_db_labels``.
    2. Label match: a name equals a value of the 'label' column of
       ``dbpedia_names``.
    3. Developer match: for the first table row (in table order) whose
       'developer' is a substring of a name and whose 'unique' or 'label'
       is a substring of that same name. Rows with a missing value in any
       of the three columns are skipped.

    If strategy 1 or 2 finds more than one distinct 'unique' value, the
    bucket is reported on stdout and treated as not matched.

    Args:
        software_list: the names of one bucket.

    Returns:
        The matched 'unique' value, or an empty list if there is no match or
        the match is ambiguous.
    """
    matched_names = [name for name in software_list if name in unique_db_labels]
    if matched_names:
        if len(set(matched_names)) > 1:
            print("Found multiple matches in direct match:")
            print(software_list)
            print(matched_names)
            return []
        return matched_names[0]

    alt_name_match = dbpedia_names.loc[dbpedia_names['label'].isin(software_list)]
    if len(alt_name_match.index) > 0:
        unique_name_list = alt_name_match['unique'].tolist()
        if len(set(unique_name_list)) > 1:
            print("Found multiple matches in indirect match:")
            print(software_list)
            print(unique_name_list)
            return []
        return unique_name_list[0]

    # Iterate the three columns in lockstep; this visits the rows in the same
    # order as iterrows() without building a Series object for every row.
    rows = zip(
        dbpedia_names['developer'],
        dbpedia_names['unique'],
        dbpedia_names['label'],
    )
    for developer, software_name, software_label in rows:
        if pd.isna(developer) or pd.isna(software_name) or pd.isna(software_label):
            continue
        for software in software_list:
            if developer in software and (software_name in software or software_label in software):
                return software_name

    return []

def get_unique_names_multi(software_list):
    """Adapter for ``Pool.map``: link a whole bucket dict.

    Despite its name, ``software_list`` is one bucket dict; only its 'names'
    entry is used. Returns whatever ``get_unique_name`` returns for it.
    """
    return get_unique_name(software_list['names'])

In [None]:
# Link all buckets in parallel. The with-block shuts the worker processes down
# once map() has returned; before, the 24 workers were never closed.
# unique_names[i] is the result for software_counter[i].
with Pool(processes=24) as p:
    unique_names = p.map(get_unique_names_multi, software_counter)

In [None]:
import pickle 
# Save the linking result so the long-running step above can be skipped later.
# The with-block makes sure the file is flushed and closed.
with open("unique_names_backup.p", "wb") as backup_file:
    pickle.dump(unique_names, backup_file)
# To resume from the backup instead of recomputing, uncomment:
#unique_names = pickle.load(open( "unique_names_backup.p", "rb"))

In [None]:
def combine_unique_names(software_buckets, mapped_names):
    """Combine buckets into a mapping keyed by their final unique name.

    A bucket with a truthy entry in ``mapped_names`` is stored under that
    name. Any other bucket is stored under the one of its own names with the
    highest count in ``base_software_counts`` (the first such name on ties).
    Buckets that end up with the same key are merged: their names are
    concatenated and their counts added.

    Compared to the previous version, the stored alias lists are copies, so
    merging no longer appends to the 'names' list of an input bucket, and an
    unmapped bucket is merged into an existing entry with the same key
    instead of replacing it.

    Args:
        software_buckets: list of dicts with a 'names' list and a 'count' int.
        mapped_names: one entry per bucket, in the same order; a falsy entry
            (e.g. an empty list) means the bucket was not mapped.

    Returns:
        A 7-tuple ``(software_mapping, distinct_mapped_count,
        names_mapped_count, total_mapped_count, distinct_not_mapped_count,
        names_not_mapped_count, total_not_mapped_count)``.
        ``software_mapping`` maps each unique name to a dict with 'alias'
        (list of names) and 'count'. The ``distinct_*`` values count buckets,
        the ``names_*`` values count names and the ``total_*`` values sum the
        bucket counts, separately for mapped and not mapped buckets.
    """
    software_mapping = {}
    distinct_mapped_count = 0
    names_mapped_count = 0
    total_mapped_count = 0
    distinct_not_mapped_count = 0
    names_not_mapped_count = 0
    total_not_mapped_count = 0

    for bucket, mapped_name in zip(software_buckets, mapped_names):
        if mapped_name:
            distinct_mapped_count += 1
            names_mapped_count += len(bucket['names'])
            total_mapped_count += bucket['count']
            unique_name = mapped_name
        else:
            distinct_not_mapped_count += 1
            names_not_mapped_count += len(bucket['names'])
            total_not_mapped_count += bucket['count']
            # No external name available: use the bucket's most frequent raw
            # name; max() returns the first one on ties.
            unique_name = max(
                bucket['names'],
                key=base_software_counts.__getitem__,
                default='',
            )

        entry = software_mapping.get(unique_name)
        if entry is None:
            software_mapping[unique_name] = {
                # Copy, so later merges do not modify the input bucket.
                'alias': list(bucket['names']),
                'count': bucket['count']
            }
        else:
            entry['alias'].extend(bucket['names'])
            entry['count'] += bucket['count']

    return software_mapping, distinct_mapped_count, names_mapped_count, total_mapped_count, distinct_not_mapped_count, names_not_mapped_count, total_not_mapped_count

# Build the final name mapping and report how much of it could be linked.
(
    final_mapping,
    p_count,
    n_mapped_count,
    tp_count,
    n_count,
    n_not_mapped_count,
    tn_count,
) = combine_unique_names(software_counter, unique_names)

# Buckets and summed counts, split into mapped and not mapped.
bucket_summary = "Mapped {} distinct positives to DBpedia amounting to {} total positives. {} distincs ({} total) were not mapped"
print(bucket_summary.format(p_count, tp_count, n_count, tn_count))

# Number of individual names in mapped and not mapped buckets.
name_summary = "{} and {} names were mapped and not mapped"
print(name_summary.format(n_mapped_count, n_not_mapped_count))

# Number of raw names versus number of entries in the final mapping.
reduction_summary = "Reduced number of buckets from {} to {}"
print(reduction_summary.format(len(base_software_counts), len(final_mapping)))

In [None]:
# Persist the final mapping; the with-block makes sure the file is flushed and
# closed.
with open("final_software_mapping.p", "wb") as mapping_file:
    pickle.dump(final_mapping, mapping_file)

We now have the final state of the linking.  
The outputs are now written to a file in which we plan to gather more information on the software.

In [None]:
# Reload the original name list as a DataFrame; the linked name is added to it
# as a new column below.
software_reasoning_list = pd.read_csv('software_reasoning_list_total.csv')

In [None]:
import json

# For every row of software_reasoning_list this cell collects
#   new_column:     the unique name whose alias list contains the row's name,
#                   or 'ERROR' if no alias list contains it
#   example_column: the identifier of the first article in the knowledge graph
#                   that mentions the name, or None if no article does
# Fixes over the previous version: `if found_article = True:` was a syntax
# error, json was never imported, and the DOI that was looked up was never
# stored anywhere.
new_column = []
example_column = []

with open("data/software_kg_with_pos_pre_s_3_gold_dev_d_04_l_00015_s_01_3.json", "r") as software_kg:
    kg = json.load(software_kg)

# alias -> unique name. setdefault keeps the first unique name (in mapping
# order) that lists the alias, like the linear search it replaces.
alias_to_unique = {}
for unique_name, entry in final_mapping.items():
    for alias in entry['alias']:
        alias_to_unique.setdefault(alias, unique_name)

# name -> identifier of the first article mentioning it. Built in one pass over
# the graph instead of rescanning the graph for every name; only names that
# are actually looked up are indexed.
wanted_names = set(software_reasoning_list['name'])
example_doi = {}
for article in kg['@graph']:
    for software in article['https://data.gesis.org/softwarekg/software']:
        mention = software['https://schema.org/name']
        if isinstance(mention, str) and mention in wanted_names and mention not in example_doi:
            example_doi[mention] = article['http://schema.org/identifier']

for s_name in software_reasoning_list['name']:
    example_column.append(example_doi.get(s_name))
    unique_name = alias_to_unique.get(s_name)
    if unique_name is None:
        print("This case points toward some error in the original data frame: name {} is not in the software list".format(s_name))
        new_column.append('ERROR')
    else:
        new_column.append(unique_name)

In [None]:
# Attach the linked names as a new (last) column ...
software_reasoning_list['linked_name'] = new_column

# ... and move that last column to the third position, keeping the order of
# all other columns.
cols = software_reasoning_list.columns.tolist()
first_two, last_one, in_between = cols[:2], cols[-1:], cols[2:-1]
cols = first_two + last_one + in_between
software_reasoning_list = software_reasoning_list[cols]

In [None]:
# Write the table including the linked names as a gzip-compressed CSV.
software_reasoning_list.to_csv("software_reasoning_list_linked_names.csv.gz", compression="gzip")

Get an example article for each software.

In [None]:
# Spot check: link only the buckets that have at least one name containing
# "IBM", group them by the unique name found and count the buckets for which
# no unique name was found.
missed_counter = 0
linked_software_list = {}
for idx, bucket in enumerate(software_counter):
    if not any("IBM" in s for s in bucket['names']):
        continue
    print(bucket['names'])
    if idx % 100 == 0:
        print("Processed {}".format(idx))
    unique_name = get_unique_name(bucket['names'])
    if not unique_name:
        missed_counter += 1
        continue
    entry = linked_software_list.get(unique_name)
    if entry is None:
        linked_software_list[unique_name] = {
            # Copy the list: extending it for a later bucket must not append
            # to the 'names' list of this bucket in software_counter.
            'names': list(bucket['names']),
            'count': bucket['count']
        }
    else:
        entry['names'].extend(bucket['names'])
        entry['count'] += bucket['count']

print(linked_software_list)
print("Missed {}".format(missed_counter))