# Software Disambiguation

In this notebook, we disambiguate some of the software mentions in the old CZI dataset. We do so using simple fuzzy string matching on the software names.

In [5]:
# Importing the relevant libraries
import pandas as pd
from thefuzz import fuzz
from thefuzz import process

In [3]:
# Load the old CZI software-mention dataset (gzip-compressed, tab-separated).
ROOT_DIR = 'old/'
# ROOT_DIR already ends with a separator, so strip it before appending the
# relative path; plain concatenation would yield 'old//disambiguated/...'.
czi_old_path = ROOT_DIR.rstrip('/') + "/disambiguated/comm_disambiguated.tsv.gz"
df_czi_old = pd.read_csv(czi_old_path, compression='gzip', sep='\t')
# Display the first row as a quick check of the available columns.
df_czi_old.head(1)

Unnamed: 0,license,location,pmcid,pmid,doi,pubdate,source,number,text,software,version,ID,curation_label,mapped_to_software
0,comm,comm/Micropl/PMC8475362.nxml,8475362,,10.1186/s43591-021-00017-9,2021,Particle selection and identification of polym...,7,"Then, all items were photographed under a bino...",Olympus CellSens,,SM0,not_curated,Olympus cellSens


In [6]:
# Build the candidate list for fuzzy matching: one entry per distinct
# 'mapped_to_software' value among the rows curated as 'software', sorted by name.
is_software = df_czi_old['curation_label'] == 'software'
software_rows = df_czi_old[is_software].sort_values(by='mapped_to_software')
unique_software_rows = software_rows.drop_duplicates(subset='mapped_to_software')
lst_software = unique_software_rows['mapped_to_software'].tolist()

In [7]:
# Report how many distinct software names were collected from the old dataset
n_unique_software = len(lst_software)
print("Number of unique softwares within the old dataset", n_unique_software)

Number of unique softwares within the old dataset 4639


In [8]:
# Build a mapping from each software name to its best fuzzy match, so that
# variant spellings of the same software can be standardized later.
# NOTE(review): every query is itself a member of lst_software, so the top hit
# can differ from the query only when another entry is ranked at least as high
# (the visible output shows only score-100 matches) — confirm this is intended.
s2s = {}  # software name -> top fuzzy-match result

for software in lst_software:
    # Ask for the single best-scoring candidate only.
    top_hit = process.extract(software, lst_software, limit=1)
    best_name = top_hit[0][0]
    # A name whose best match is itself needs no mapping entry.
    if best_name == software:
        continue
    # Log the pair and store the result exactly as returned (a one-element list).
    print(software, top_hit)
    s2s[software] = top_hit

s2s

3-matic [('3-Matic', 100)]
3D-Slicer [('3D Slicer', 100)]
ANSYS Fluent [('ANSYS FLUENT', 100)]
ANTs [('ANTS', 100)]
ASReml [('ASREML', 100)]
Abaqus [('ABAQUS', 100)]
Actilife [('ActiLife', 100)]
Adaboost [('AdaBoost', 100)]
Adonis [('ADONIS', 100)]
Amos [('AMOS', 100)]
Anaconda [('ANACONDA', 100)]
Ansys [('ANSYS', 100)]
ArcGIS [('ARCGIS', 100)]
ArcGis [('ARCGIS', 100)]
Arcgis [('ARCGIS', 100)]
Atlas.ti [('ATLAS.ti', 100)]
Avizo [('AVIZO', 100)]
BEAUti [('BEAUTi', 100)]
BWA mem [('BWA MEM', 100)]
BWA-MEM [('BWA MEM', 100)]
BWA-mem [('BWA MEM', 100)]
BayeScan [('BAYESCAN', 100)]
BayesAss [('BAYESASS', 100)]
Bayescan [('BAYESCAN', 100)]
Beagle [('BEAGLE', 100)]
BiNGO [('BINGO', 100)]
BinGO [('BINGO', 100)]
Bingo [('BINGO', 100)]
BioNJ [('BIONJ', 100)]
Bioclim [('BIOCLIM', 100)]
Bioperl [('BioPerl', 100)]
Biotools [('BioTools', 100)]
Biotyper [('BioTyper', 100)]
BootScan [('BOOTSCAN', 100)]
Bootscan [('BOOTSCAN', 100)]
Bottleneck [('BOTTLENECK', 100)]
CD-search [('CD-Search', 100)]
CELLQue

{'3-matic': [('3-Matic', 100)],
 '3D-Slicer': [('3D Slicer', 100)],
 'ANSYS Fluent': [('ANSYS FLUENT', 100)],
 'ANTs': [('ANTS', 100)],
 'ASReml': [('ASREML', 100)],
 'Abaqus': [('ABAQUS', 100)],
 'Actilife': [('ActiLife', 100)],
 'Adaboost': [('AdaBoost', 100)],
 'Adonis': [('ADONIS', 100)],
 'Amos': [('AMOS', 100)],
 'Anaconda': [('ANACONDA', 100)],
 'Ansys': [('ANSYS', 100)],
 'ArcGIS': [('ARCGIS', 100)],
 'ArcGis': [('ARCGIS', 100)],
 'Arcgis': [('ARCGIS', 100)],
 'Atlas.ti': [('ATLAS.ti', 100)],
 'Avizo': [('AVIZO', 100)],
 'BEAUti': [('BEAUTi', 100)],
 'BWA mem': [('BWA MEM', 100)],
 'BWA-MEM': [('BWA MEM', 100)],
 'BWA-mem': [('BWA MEM', 100)],
 'BayeScan': [('BAYESCAN', 100)],
 'BayesAss': [('BAYESASS', 100)],
 'Bayescan': [('BAYESCAN', 100)],
 'Beagle': [('BEAGLE', 100)],
 'BiNGO': [('BINGO', 100)],
 'BinGO': [('BINGO', 100)],
 'Bingo': [('BINGO', 100)],
 'BioNJ': [('BIONJ', 100)],
 'Bioclim': [('BIOCLIM', 100)],
 'Bioperl': [('BioPerl', 100)],
 'Biotools': [('BioTools', 100)]

In [10]:
# Turn the mapping dict into a two-column DataFrame: 'sft1' holds the dict key
# (the original name) and 'sft2' holds the value stored for it.
# NOTE(review): each s2s value is a one-element list wrapping a (name, score)
# tuple, so 'sft2' appears to receive the whole tuple rather than just the
# matched name — confirm this is what consumers of the CSV expect.
mapping_by_name = pd.DataFrame.from_dict(s2s, orient='index')
mapping_flat = mapping_by_name.reset_index()
df_mapping = mapping_flat.rename(columns={'index': 'sft1', 0: 'sft2'})

# Destination of the exported mapping
csv_file_path = '../../data/czi_mentions_old_disambiguation_mapping.csv'

# Write the table to disk, leaving out the row index
df_mapping.to_csv(csv_file_path, index=False)