# TRM Lexical Similarity Tests
This notebook tests *lexical* similarity between a scanned manfacturer name and a TRM manfacturer name. Since manufacturer names comprise single words or phrases and not sentences containing context, we do not perform *semantic* similarity using, for example, BERT. Further, we do not use any packages that rely on dictionaries for performing word similarity, such as gensim. 

## Read Data
### Official TRM Data
Get the manufacturer names and IDs from the official TRM dataset.

In [40]:
# Read official TRM XLSX into a Pandas DataFrame.
import pandas as pd

# Read official TRM data
df_trm = pd.read_excel('C:\\work\\trm\\ramya_files\\TRM_official.xlsx', sheet_name='1 - component-baseline')
#print(f'Column headings:\n{df_trm.columns}')
#print(f'Num rows: {len(df_trm.index)}')

# Remove rows with duplicate manufacturer name
df_trm_dedup = df_trm.drop_duplicates(subset=['Manufacturer Name'])
print(f'TRM rows (dedup): {len(df_trm_dedup.index)}')

# Sort row by ascending manufacturer names
df_trm_dedup_sorted = df_trm_dedup.sort_values(by=['Manufacturer Name'])

# Get Manufacturer Names
trm_mfr_names = df_trm_dedup_sorted['Manufacturer Name']
#print(f'trm_mfr_names: {trm_mfr_names}')
# Get Manufacturer IDs
trm_mfr_id = df_trm_dedup_sorted['Manufacturer ID']

TRM rows (dedup): 3675


### (Scanned) Input Data
Get the input data (scanned from the network) and Ramya's predicted results. We will use Ramya's predicted results as the results we are expected to get.

In [52]:
# Read the scanned manufacturer data
df_scan = pd.read_csv('C:\\work\\trm\\ramya_files\\Manufacturer_LRpredictions_2.csv')
#print(f'Column headings:\n{df_scan.columns}')
print(f'Scan rows (dedup): {len(df_scan.index)}')

# Remove rows with duplicate manufacturer name. Note columns G and H are predicted TRM values.
df_scan_dedup = df_scan.drop_duplicates(subset=['manufacturer'])
#print(f'Dedup num rows: {len(df_scan_dedup.index)}')

# Sort row by ascending manufacturer names
df_scan_dedup_sorted = df_scan_dedup.sort_values(by=['manufacturer'])

# Get scanned Manufacturer Names
scan_mfr_names = df_scan_dedup_sorted['manufacturer']
#print(f'scan_mfr_names: {scan_mfr_names}')

# Get Ramya's predicted results that we will use as our expected results
expected_mfr_names = df_scan_dedup_sorted['predict_manufacturer']
print(f'expected_mfr_names: {expected_mfr_names}')
# Convert to list for extraction of expected mfr names
expected_mfr_names_list = list(expected_mfr_names)

Scan rows (dedup): 875
expected_mfr_names: 7      Cisco Systems
776           McAfee
873              NaN
874              NaN
868              NaN
           ...      
70               NaN
65               NaN
0          Microsoft
59               NaN
60               NaN
Name: predict_manufacturer, Length: 94, dtype: object


## Lexical Similarity Analyses
### SpaCy

In [86]:
import spacy

# SpaCy en_core_web_lg contains word vectors -- en_core_web_sm does not.
nlp = spacy.load("en_core_web_lg")

results = []
rows = len(scan_mfr_names)
#for i, scan_mfr in enumerate(scan_mfr_names):
for i, scan_mfr in enumerate(scan_mfr_names):
    print(f'Analyzing scan word {i} of {rows}', end='\r')
    best_sim_score = 0.0
    scan_index = -1
    best_mfr_name = None
    for trm_mfr in trm_mfr_names:
        scan_mfr_tokens = nlp(scan_mfr)
        trm_mfr_tokens = nlp(trm_mfr)
        sim_score = scan_mfr_tokens.similarity(trm_mfr_tokens)
        if sim_score > best_sim_score:
            best_sim_score = sim_score
            best_mfr_name = trm_mfr
            scan_index = i
            
    matches_expected = 0
    #print(f"best_mfr_name: {best_mfr_name}")
    #print(f"scan_index: {scan_index}")
    #print(f"expected_mfr_names: {expected_mfr_names_list[scan_index]}")
    if best_mfr_name == expected_mfr_names_list[scan_index]:
        matches_expected = 1
    r = []
    r.append(scan_mfr)      # Scan mfr name
    r.append(best_mfr_name)  # Best mfr name
    r.append(best_sim_score) # Sim score for best mfr name
    r.append(expected_mfr_names_list[scan_index])    # Expected mfr name
    r.append(matches_expected)  # Best mfr name matched expected name
    results.append(r)
    
    if i == 1:
        break
    
for x in results:
    print(f'scan={x[0]}, pred={x[1]}, score={x[2]}, expected={x[3]}, match={x[4]}')

Analyzing scan word 0 of 94

  sim_score = scan_mfr_tokens.similarity(trm_mfr_tokens)


scan=Cisco, pred=Cisco Systems, score=0.8021419966775432, expected=Cisco Systems, match=1
scan=McAfee, pred=McAfee, score=1.0, expected=McAfee, match=1


### NLTK
#### Levenshtein edit-distance

In [96]:
import nltk

results = []
rows = len(scan_mfr_names)
for i, scan_mfr in enumerate(scan_mfr_names):
    print(f'Analyzing scan word {i} of {rows}', end='\r')
    best_dist = 100.0
    scan_index = -1
    best_mfr_name = None
    trm_name = None
    for trm_mfr in trm_mfr_names:
        # Since edit_distance() is affected 
        if len(scan_mfr.split()) < len(trm_mfr.split()):
            trm_name_array = trm_mfr.split()[:len(scan_mfr.split())]
            trm_name = ' '.join(trm_name_array)
        else:
            trm_name = trm_mfr
            
        dist = nltk.edit_distance(scan_mfr, trm_name)
        if 'Cisco' in trm_mfr:
            print(f'scan_mfr: {scan_mfr}, trm name: {trm_mfr}, trm name reduced: {trm_name}, dist: {dist}') 
        if dist <= best_dist:
            best_dist = dist
            best_mfr_name = trm_mfr
            scan_index = i
            
    matches_expected = 0
    #print(f"max_mfr_name: {max_mfr_name}")
    #print(f"scan_index: {scan_index}")
    #print(f"expected_mfr_names: {expected_mfr_names_list[scan_index]}")
    if best_mfr_name == expected_mfr_names_list[scan_index]:
        matches_expected = 1
    r = []
    r.append(scan_mfr)      # Scan mfr name
    r.append(best_mfr_name)  # Best mfr name
    r.append(best_dist)      # Min dist for best mfr name
    r.append(expected_mfr_names_list[scan_index])    # Expected mfr name
    r.append(matches_expected)  # Best mfr name matched expected name
    results.append(r)
    
    if i == 1:
        break
    
for x in results:
    print(f'scan={x[0]}, pred={x[1]}, score={x[2]}, expected={x[3]}, match={x[4]}')

scan_mfr: Cisco, trm name: Cisco Systems, trm name reduced: Cisco, dist: 0
scan_mfr: McAfee, trm name: Cisco Systems, trm name reduced: Cisco, dist: 6
scan=Cisco, pred=Cisco Systems, score=0, expected=Cisco Systems, match=1
scan=McAfee, pred=McAfee, score=0, expected=McAfee, match=1


#### Jaccard Distance

In [97]:
results = []
rows = len(scan_mfr_names)
for i, scan_mfr in enumerate(scan_mfr_names):
    print(f'Analyzing scan word {i} of {rows}', end='\r')
    best_dist = 100.0
    scan_index = -1
    best_mfr_name = None
    trm_name = None
    for trm_mfr in trm_mfr_names:
        # Since edit_distance() is affected 
        if len(scan_mfr.split()) < len(trm_mfr.split()):
            trm_name_array = trm_mfr.split()[:len(scan_mfr.split())]
            trm_name = ' '.join(trm_name_array)
        else:
            trm_name = trm_mfr
            
        dist = nltk.jaccard_distance(set(scan_mfr), set(trm_name))
        if 'Cisco' in trm_mfr:
            print(f'scan_mfr: {scan_mfr}, trm name: {trm_mfr}, trm name reduced: {trm_name}, dist: {dist}') 
        if dist <= best_dist:
            best_dist = dist
            best_mfr_name = trm_mfr
            scan_index = i
            
    matches_expected = 0
    #print(f"max_mfr_name: {max_mfr_name}")
    #print(f"scan_index: {scan_index}")
    #print(f"expected_mfr_names: {expected_mfr_names_list[scan_index]}")
    if best_mfr_name == expected_mfr_names_list[scan_index]:
        matches_expected = 1
    r = []
    r.append(scan_mfr)      # Scan mfr name
    r.append(best_mfr_name)  # Best mfr name
    r.append(best_dist)      # Min dist for best mfr name
    r.append(expected_mfr_names_list[scan_index])    # Expected mfr name
    r.append(matches_expected)  # Best mfr name matched expected name
    results.append(r)
    
    if i == 1:
        break
    
for x in results:
    print(f'scan={x[0]}, pred={x[1]}, score={x[2]}, expected={x[3]}, match={x[4]}')

scan_mfr: Cisco, trm name: Cisco Systems, trm name reduced: Cisco, dist: 0.0
scan_mfr: McAfee, trm name: Cisco Systems, trm name reduced: Cisco, dist: 0.8888888888888888
scan=Cisco, pred=Cisco Systems, score=0.0, expected=Cisco Systems, match=1
scan=McAfee, pred=McAfee, score=0.0, expected=McAfee, match=1


### Word2Vec with Cosign similarity

In [109]:
def word2vec(word):
    from collections import Counter
    from math import sqrt

    # count the characters in word
    cw = Counter(word)
    # precomputes a set of the different characters
    sw = set(cw)
    # precomputes the "length" of the word vector
    lw = sqrt(sum(c*c for c in cw.values()))

    # return a tuple
    return cw, sw, lw

def cosdis(v1, v2):
    # which characters are common to the two words?
    common = v1[1].intersection(v2[1])
    # by definition of cosine distance we have
    return sum(v1[0][ch]*v2[0][ch] for ch in common)/v1[2]/v2[2]


results = []
rows = len(scan_mfr_names)
for i, scan_mfr in enumerate(scan_mfr_names):
    print(f'Analyzing scan word {i} of {rows}', end='\r')
    best_dist = 0.0
    scan_index = -1
    best_mfr_name = None
    trm_name = None
    for trm_mfr in trm_mfr_names:
        # Since edit_distance() is affected 
        if len(scan_mfr.split()) < len(trm_mfr.split()):
            trm_name_array = trm_mfr.split()[:len(scan_mfr.split())]
            trm_name = ' '.join(trm_name_array)
        else:
            trm_name = trm_mfr
            
        scan_vec = word2vec(scan_mfr)
        trm_vec = word2vec(trm_name)
        sim = cosdis(scan_vec, trm_vec)
        if 'Cisco' in trm_mfr:
            print(f'scan_mfr: {scan_mfr}, trm name: {trm_mfr}, trm name reduced: {trm_name}, sim: {sim}, currebest: {best_dist}')
        if sim > best_dist:
            print(f'sim > best_dist: scan_mfr: {scan_mfr}, trm name: {trm_mfr}, trm name reduced: {trm_name}, dist: {sim}')

            best_dist = sim
            best_mfr_name = trm_mfr
            scan_index = i
            
    matches_expected = 0
    #print(f"max_mfr_name: {max_mfr_name}")
    #print(f"scan_index: {scan_index}")
    #print(f"expected_mfr_names: {expected_mfr_names_list[scan_index]}")
    if best_mfr_name == expected_mfr_names_list[scan_index]:
        matches_expected = 1
    r = []
    r.append(scan_mfr)      # Scan mfr name
    r.append(best_mfr_name)  # Best mfr name
    r.append(best_dist)      # Min dist for best mfr name
    r.append(expected_mfr_names_list[scan_index])    # Expected mfr name
    r.append(matches_expected)  # Best mfr name matched expected name
    results.append(r)
    
    if i == 1:
        break
    
for x in results:
    print(f'scan={x[0]}, pred={x[1]}, score={x[2]}, expected={x[3]}, match={x[4]}')


sim > best_dist: scan_mfr: Cisco, trm name: 2BrightSparks, trm name reduced: 2BrightSparks, dist: 0.2309401076758503
sim > best_dist: scan_mfr: Cisco, trm name: 3DVista, trm name reduced: 3DVista, dist: 0.3380617018914066
sim > best_dist: scan_mfr: Cisco, trm name: 4Bits Ltd., trm name reduced: 4Bits, dist: 0.39999999999999997
sim > best_dist: scan_mfr: Cisco, trm name: AccessData Group, trm name reduced: AccessData, dist: 0.4472135954999579
sim > best_dist: scan_mfr: Cisco, trm name: Accordion, trm name reduced: Accordion, dist: 0.6201736729460423
sim > best_dist: scan_mfr: Cisco, trm name: Acronis International GmbH, trm name reduced: Acronis, dist: 0.6761234037828132
sim > best_dist: scan_mfr: Cisco, trm name: Association of American Railroads, trm name reduced: Association, dist: 0.7592566023652966
sim > best_dist: scan_mfr: Cisco, trm name: Circos, trm name reduced: Circos, dist: 0.912870929175277
scan_mfr: Cisco, trm name: Cisco Systems, trm name reduced: Cisco, sim: 1.0, currebe

In [105]:
        scan_vec = word2vec('McAfee')
        trm_vec = word2vec('//SEIBERT/MEDIA')
        sim = cosdis(scan_vec, trm_vec)
        print(sim)

0.13130643285972254
