# TRM Lexical Similarity Tests
This notebook tests *lexical* similarity between a scanned manufacturer name and a TRM manfacturer name. Since manufacturer names comprise single words or phrases and not sentences reflecting semantics or context, we do not perform *semantic* similarity using, for example, transformer-based models. Instead, we test some algorithms used specifically for computing lexical similarity. 

## Utility Functions
The following utility functions are used throughout the code.

In [1]:
# Remove company suffixes
from cleanco import basename

# Common technology company name suffixes (e.g., Systems, Software, Solutions, Techologies, etc.)
suffix_stopwords = ['Software','Solutions','Systems','Technologies','Tech','Services','Communications',
                    'Comms','Enterprises','Group','Networks','Associates','Assoc','Foundation','Organization','Org',
                    'Project','Proj','Partners','Foundation','Company','Co','Corporation','Corp','Incorporated','Inc'
                    ]

def preproc(names):
    preproc_list = []
    for n in names:
        # Remove common company suffixes (e.g., Corporation, Inc, etc.)
        cleaned = basename(n)
        # Remove common tech company suffixes (e.g., Software, Systems, etc.)
        words = cleaned.split()
        last_word = words[-1] # Get last word
        if last_word.lower() in (string.lower() for string in suffix_stopwords):
            cleaned = cleaned.rsplit(' ', 1)[0] # Remove last word
        cleaned = cleaned.strip() # Remove preceding/trailing whitespace
        preproc_list.append(cleaned)
    return preproc_list

# Test
#x = preproc(['The Test Proj'])
#print(f'Cleaned: {x}')

In [2]:
# Generate XLSX Results (not CSV, so we can later add formulas to XLSX)
from pathlib import Path
from datetime import datetime

results_dir = 'C:\\work\\trm\\results'

# Write results
def write_csv(df, filename):
    try:
        # Set column headers
        df.columns = ["Scanned", "Predicted", "Score", "Expected", "Match"]

        now = datetime.now() # current date and time
        date_time = now.strftime("%m%d%Y%H%M%S")        
        filepath = Path(results_dir + '\\' + filename + '_' + date_time + '.xlsx')  
        filepath.parent.mkdir(parents=True, exist_ok=True) 
        with pd.ExcelWriter(filepath) as writer:
            df.to_excel(writer)  
        #df.to_csv(filepath) 
        print(f'Generated results at: {filepath}')
    except Exception as e:
        print(e) 

In [3]:
# Generate Stats
def get_stats(results):
    num_0_matches = 0  # Num errors
    num_1_matches = 0  # Num correct
    num_2_matches = 0  # Num unknown (assess manually)
    for x in results:
        #print(f'scan={x[0]}, best={x[1]}, score={x[2]}, expected={x[3]}, match={x[4]}')
        if x[4] == 0:
            num_0_matches = num_0_matches + 1
        if x[4] == 1:
            num_1_matches = num_1_matches + 1
        elif x[4] == 2:
            num_2_matches = num_2_matches + 1

    total = len(results)
    perc_0 = num_0_matches / total
    perc_1 = num_1_matches / total
    perc_2 = num_2_matches / total
    print(f'Correct: {perc_1}%, Incorrect: {perc_0}%, Unknown: {perc_2}%')
    print(f'Final scores must be assessed manually.')

## Data
### Input: Official TRM Data
Get the manufacturer names and IDs from the official TRM dataset.

In [4]:
# Read official TRM XLSX into a Pandas DataFrame.
import pandas as pd

# Read official TRM data
df_trm = pd.read_excel('C:\\work\\trm\\ramya_files\\TRM_official.xlsx', sheet_name='1 - component-baseline')

# Remove rows with duplicate manufacturer name
df_trm_dedup = df_trm.drop_duplicates(subset=['Manufacturer Name'])

# Sort row by ascending manufacturer names
df_trm_dedup_sorted = df_trm_dedup.sort_values(by=['Manufacturer Name'])

# Get manufacturer names
trm_mfr_names = df_trm_dedup_sorted['Manufacturer Name']
# Convert to list to allow for indexing
trm_mfr_list = list(trm_mfr_names)
# Get preprocessed mfr names
trm_preproc_mfr = preproc(trm_mfr_names)

TRM rows (dedup): 3674


### Input: Scanned Data
Get the input data (i.e., manufacturer data scanned from the network) and the previously predicted results. We will use previously predicted results as the target we are expected to predict. *NOTE: The input file should be converted to an XLSX file and deduplicated on the scanned manufacturer column before ingesting.*

In [5]:
# Read the scanned manufacturer data
df_scan = pd.read_excel('C:\\work\\trm\\ramya_files\\Manufacturer_LRpredictions_2_dedup.xlsx')

# Get scanned manufacturer names
scan_mfr_names = df_scan['manufacturer']
# Convert to list to allow for indexing
scan_mfr_names_list = list(scan_mfr_names)

# Get preprocessed manufacturer names:
scan_preproc_mfr = preproc(scan_mfr_names)

# Get previously predicted results that we will use as our expected results
expected_mfr_names = df_scan['predict_manufacturer']
# Convert to list to allow for indexing
expected_mfr_names_list = list(expected_mfr_names)

## Lexical Similarity Analyses
### Config
Configuration parametes for all analyses.

In [6]:
# Configure tests
max_rows = None  # If no max, set to None

# Break loops on (perfect) score == 1.0 or dist == 0.0
break_on_perfect_score = True

### SpaCy
Note that SpaCy Doc.similarity requires target words to be 'included' in its internal dictionary (with the possibly of some minor spelling errors). Otherwise, it will not be able to generate word vectors for comparison and result with a similarity score of 0.0. Regardless, we test the performanc of SpaCy for comparison purposes.

WARNING: This SpaCy code takes several minutes to run.

In [20]:
import spacy

# SpaCy en_core_web_lg contains word vectors -- en_core_web_sm does not.
nlp = spacy.load("en_core_web_lg")

#### Doc.similarity

In [None]:
# Test
w1 = nlp('Microsft')
w2 = nlp('Microsoft')
sim_score = w1.similarity(w2)
sim2_score = w2.similarity(w1)
print(f'score: {sim2_score}')

In [21]:
# TRM
results = []
for i, scan_preproc in enumerate(scan_preproc_mfr):
    print(f'Analyzing scan word {i}', end='\r')
    best_sim_score = 0.0
    scan_index = -1
    best_mfr_name = None
    for j, trm_preproc in enumerate(trm_preproc_mfr):
        
        # Similarity
        scan_mfr_tokens = nlp(scan_preproc)
        trm_mfr_tokens = nlp(trm_preproc)
        sim_score = scan_mfr_tokens.similarity(trm_mfr_tokens)
        
        if sim_score > best_sim_score:
            best_sim_score = sim_score
            best_mfr_name = trm_mfr_list[j]
            scan_index = i
            if break_on_perfect_score and best_sim_score == 1.0:
                break
            
    matches_expected = 0
    if best_sim_score == 1.0:
        matches_expected = 1
    elif pd.isna(expected_mfr_names_list[scan_index]):
        matches_expected = 2
    elif best_mfr_name == expected_mfr_names_list[scan_index]:
        matches_expected = 1
        
    results.append([scan_mfr_names_list[i], best_mfr_name, best_sim_score, expected_mfr_names_list[scan_index], matches_expected])
    
    if max_rows != None and i == max_rows:
        break
    
df_results = pd.DataFrame.from_records(results)
write_csv(df_results, 'spacy_similarity')

get_stats(results)

Analyzing scan word 0

  sim_score = scan_mfr_tokens.similarity(trm_mfr_tokens)


Generated results at: C:\work\trm\results\spacy_similarity_02192022170858.xlsx
Correct: 0.5957446808510638%, Incorrect: 0.031914893617021274%, Unknown: 0.3723404255319149%
Final scores must be assessed manually.


### NLTK

In [7]:
import nltk

#### Levenshtein edit-distance

In [8]:
# Test
dist = nltk.edit_distance('asdfasdf', 'Microsoft')
print(f'dist: {dist}')

dist: 7


In [18]:
# TRM
results = []
for i, scan_preproc in enumerate(scan_preproc_mfr):
    print(f'Analyzing scan word {i}', end='\r')
    best_sim_score = 100.0
    scan_index = -1
    best_mfr_name = None
    for j, trm_preproc in enumerate(trm_preproc_mfr):
        
        # Similarity
        if len(scan_preproc.split()) < len(trm_preproc.split()):
            trm_name_array = trm_preproc.split()[:len(scan_preproc.split())]
            trm_name = ' '.join(trm_name_array)
        else:
            trm_name = trm_preproc
            
        dist = nltk.edit_distance(scan_preproc, trm_name)
        
        if dist < best_sim_score:
            best_sim_score = dist
            best_mfr_name = trm_mfr_list[j]
            scan_index = i
            if break_on_perfect_score and best_sim_score == 0.0:
                break
            
    matches_expected = 0
    if best_sim_score == 0.0:
        matches_expected = 1
    elif pd.isna(expected_mfr_names_list[scan_index]):
        matches_expected = 2
    elif best_mfr_name == expected_mfr_names_list[scan_index]:
        matches_expected = 1
        
    results.append([scan_mfr_names_list[i], best_mfr_name, best_sim_score, expected_mfr_names_list[scan_index], matches_expected])
    
    if max_rows != None and i == max_rows:
        break
    
df_results = pd.DataFrame.from_records(results)
write_csv(df_results, 'nltk_levenshtein')

get_stats(results)

Generated results at: C:\work\trm\results\nltk_levenshtein_02192022162225.xlsx
Correct: 0.6276595744680851%, Incorrect: 0.010638297872340425%, Unknown: 0.3617021276595745%
Final scores must be assessed manually.


#### Jaccard Distance

In [10]:
# Test
dist = nltk.jaccard_distance(set('asdfasdf'), set('Microsoft'))
print(f'dist: {dist}')

dist: 0.8


In [22]:
# TRM
results = []
for i, scan_preproc in enumerate(scan_preproc_mfr):
    print(f'Analyzing scan word {i}', end='\r')
    best_sim_score = 100.0
    scan_index = -1
    best_mfr_name = None
    for j, trm_preproc in enumerate(trm_preproc_mfr):
        
        # Similarity
        if len(scan_preproc.split()) < len(trm_preproc.split()):
            trm_name_array = trm_preproc.split()[:len(scan_preproc.split())]
            trm_name = ' '.join(trm_name_array)
        else:
            trm_name = trm_preproc
            
        dist = nltk.jaccard_distance(set(scan_preproc), set(trm_name))

        if dist < best_sim_score:
            best_sim_score = dist
            best_mfr_name = trm_mfr_list[j]
            scan_index = i
            if break_on_perfect_score and best_sim_score == 0.0:
                break
            
    matches_expected = 0
    if best_sim_score == 0.0:
        matches_expected = 1
    elif pd.isna(expected_mfr_names_list[scan_index]):
        matches_expected = 2
    elif best_mfr_name == expected_mfr_names_list[scan_index]:
        matches_expected = 1
        
    results.append([scan_mfr_names_list[i], best_mfr_name, best_sim_score, expected_mfr_names_list[scan_index], matches_expected])
    
    if max_rows != None and i == max_rows:
        break
    
df_results = pd.DataFrame.from_records(results)
write_csv(df_results, 'nltk_jaccard')

get_stats(results)

Generated results at: C:\work\trm\results\nltk_jaccard_02192022175112.xlsx
Correct: 0.6382978723404256%, Incorrect: 0.010638297872340425%, Unknown: 0.35106382978723405%
Final scores must be assessed manually.


### Word2Vec with Cosine similarity

In [12]:
# Functions
def word2vec(word):
    from collections import Counter
    from math import sqrt

    # count the characters in word
    cw = Counter(word)
    # precomputes a set of the different characters
    sw = set(cw)
    # precomputes the "length" of the word vector
    lw = sqrt(sum(c*c for c in cw.values()))

    # return a tuple
    return cw, sw, lw

def cosdis(v1, v2):
    # which characters are common to the two words?
    common = v1[1].intersection(v2[1])
    # by definition of cosine distance we have
    return sum(v1[0][ch]*v2[0][ch] for ch in common)/v1[2]/v2[2]

In [13]:
# Test
v1 = word2vec('Cisco')
v2 = word2vec('Cisco')
sim = cosdis(v1, v2)
print(f'sim: {sim}')

sim: 1.0


In [19]:
# TRM
results = []
for i, scan_preproc in enumerate(scan_preproc_mfr):
    print(f'Analyzing scan word {i}', end='\r')
    best_sim_score = 0.0
    scan_index = -1
    best_mfr_name = None
    for j, trm_preproc in enumerate(trm_preproc_mfr):
        
        # Similarity
        if len(scan_preproc.split()) < len(trm_preproc.split()):
            trm_name_array = trm_preproc.split()[:len(scan_preproc.split())]
            trm_name = ' '.join(trm_name_array)
        else:
            trm_name = trm_preproc
            
        scan_vec = word2vec(scan_preproc)
        trm_vec = word2vec(trm_name)
        sim = cosdis(scan_vec, trm_vec)
        
        if sim > best_sim_score:
            best_sim_score = sim
            best_mfr_name = trm_mfr_list[j]
            scan_index = i
            if break_on_perfect_score and best_sim_score == 1.0:
                break
            
    matches_expected = 0
    if best_sim_score >= 1:  # For some reason, some best scores are 1.0000000000000002
        matches_expected = 1    
    elif pd.isna(expected_mfr_names_list[scan_index]):
        matches_expected = 2
    elif best_mfr_name == expected_mfr_names_list[scan_index]:
        matches_expected = 1
        
    results.append([scan_mfr_names_list[i], best_mfr_name, best_sim_score, expected_mfr_names_list[scan_index], matches_expected])
    
    if max_rows != None and i == max_rows:
        break
    
df_results = pd.DataFrame.from_records(results)
write_csv(df_results, 'word2vec_cosine')

get_stats(results)


Generated results at: C:\work\trm\results\word2vec_cosine_02192022162424.xlsx
Correct: 0.6276595744680851%, Incorrect: 0.010638297872340425%, Unknown: 0.3617021276595745%
Final scores must be assessed manually.


### Difflib

In [26]:
import difflib

In [28]:
# Test
sm = difflib.SequenceMatcher(None)

sm.set_seq2('Microsoft')
test = 'Microsoft'
sm.set_seq1(test)
print(f' {test}, {sm.ratio()}')

 Microsoft, 1.0


In [30]:
# TRM
results = []
for i, scan_preproc in enumerate(scan_preproc_mfr):
    print(f'Analyzing scan word {i}', end='\r')
    best_sim_score = 0.0
    scan_index = -1
    best_mfr_name = None
    
    sm = difflib.SequenceMatcher(None)
    sm.set_seq2(scan_preproc)

    for j, trm_preproc in enumerate(trm_preproc_mfr):
        
        # Similarity
        sm.set_seq1(trm_preproc)
        sim = sm.ratio()
        
        if sim > best_sim_score:
            best_sim_score = sim
            best_mfr_name = trm_mfr_list[j]
            scan_index = i
            #print(f'scan: {scan_preproc}, trm: {best_mfr_name}, best_score: {best_sim_score}')
            if break_on_perfect_score and best_sim_score == 1.0:
                break
            
    matches_expected = 0
    if best_sim_score >= 1:  # For some reason, some best scores are 1.0000000000000002
        matches_expected = 1    
    elif pd.isna(expected_mfr_names_list[scan_index]):
        matches_expected = 2
    elif best_mfr_name == expected_mfr_names_list[scan_index]:
        matches_expected = 1
        
    results.append([scan_mfr_names_list[i], best_mfr_name, best_sim_score, expected_mfr_names_list[scan_index], matches_expected])
    
    if max_rows != None and i == max_rows:
        break
    
df_results = pd.DataFrame.from_records(results)
write_csv(df_results, 'difflib_dist')

get_stats(results)

Generated results at: C:\work\trm\results\difflib_dist_02192022180116.xlsx
Correct: 0.6276595744680851%, Incorrect: 0.010638297872340425%, Unknown: 0.3617021276595745%
Final scores must be assessed manually.
