In [2]:
import csv
import pandas as pd
import numpy as np
import time
import warnings
# NOTE(review): this installs a blanket "ignore" filter, so warnings raised
# by any later cell are silenced as well. Consider narrowing it to a specific
# category once the noisy source is identified.
warnings.filterwarnings('ignore')

In [3]:
# Both input files are pipe-delimited.
# sTrain      -> names that need to be matched
# groundTruth -> reference list the names are matched against
sTrain = pd.read_csv("Datasets/STrain.csv", sep='|')
groundTruth = pd.read_csv("Datasets/G.csv", sep='|')

#### TF-IDF with N-grams

In [3]:
import re
def ngrams(string, n=3):
    """Split ``string`` into overlapping character n-grams.

    Before splitting, the characters ``,``, ``-``, ``.`` and ``/`` are removed
    (the class ``[,-./]`` is the range ``,``..``.`` plus ``/``), as is any
    whitespace character immediately followed by ``BD``.

    Args:
        string: Text to tokenise.
        n: Number of characters per gram (default 3).

    Returns:
        A list with every run of ``n`` consecutive characters of the cleaned
        text, in order. The list is empty when the cleaned text has fewer
        than ``n`` characters.
    """
    cleaned = re.sub(r'[,-./]|\sBD', r'', string)
    # Zipping the text against copies of itself shifted by 0..n-1 positions
    # lines up the characters of each window; zip stops at the shortest copy.
    shifted_views = (cleaned[offset:] for offset in range(n))
    return [''.join(window) for window in zip(*shifted_views)]

# Sanity check of the tokenizer on a sample name; the list returned by the
# last expression is rendered as the cell output.
print('All 3-grams in "McDonalds":')
ngrams('McDonalds')

All 3-grams in "McDonalds":


['McD', 'cDo', 'Don', 'ona', 'nal', 'ald', 'lds']

In [4]:
# Can be parallelized
from sklearn.feature_extraction.text import TfidfVectorizer
# The vectorizer tokenises with the custom character n-gram analyzer and is
# fitted on the ground-truth names only; other name lists are later projected
# onto this same vocabulary with `vectorizer.transform`.
vectorizer = TfidfVectorizer(analyzer=ngrams, min_df=1)
company_names = groundTruth['name']
ground_truth_matrix = vectorizer.fit_transform(company_names)

In [5]:
# Work on the first 10,000 training rows only. Take an explicit copy so that
# columns added to `sTrainFil` later are written to an independent frame
# rather than to a slice of `sTrain` (which triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous which frame is modified).
sTrainFil = sTrain.iloc[0:10000].copy()
# Project the training names onto the vocabulary learned from the ground truth.
train_matrix = vectorizer.transform(sTrainFil['name'])
train_matrix

<10000x78817 sparse matrix of type '<class 'numpy.float64'>'
	with 223152 stored elements in Compressed Sparse Row format>

In [6]:
# Below code calculates cosine similarities and return top results
# Implement LSA
def cosine_similarities(trainMat, groundTruthMat, top):
    """Return, for every row of ``trainMat``, its best-matching row of ``groundTruthMat``.

    The score is the dot product between a row of ``trainMat`` and a row of
    ``groundTruthMat``. That equals the cosine similarity only when the rows
    are L2-normalised -- assumed here, TODO confirm for the vectorizer in use.

    Args:
        trainMat: Sparse matrix with one row per query item.
        groundTruthMat: Sparse matrix with one row per candidate and the same
            number of columns as ``trainMat``.
        top: Currently unused -- only the single best match per row is
            returned. Kept so existing callers keep working.

    Returns:
        ``(best_index, best_score)`` as produced by ``np.argmax`` and
        ``np.max`` along axis 1 of the similarity matrix. ``best_index`` holds
        the column (i.e. ground-truth row) position of the maximum; the caller
        in this notebook densifies ``best_score`` with ``.toarray()``.
    """
    # `@` is matrix multiplication for every matrix/array type, whereas `*`
    # is a matrix product only for the legacy scipy ``spmatrix`` classes and
    # elementwise for sparse arrays.
    sim = trainMat @ groundTruthMat.T.tocsc()
    return np.argmax(sim, axis=1), np.max(sim, axis=1)

In [7]:
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted while the computation runs.
t1 = time.perf_counter()
match_id, score = cosine_similarities(train_matrix, ground_truth_matrix, 1)
t = time.perf_counter() - t1
print("Time taken for computing similarities:", t)

Time taken for computing similarities: 84.30719304084778


In [8]:
# Flatten the per-row results into 1-D arrays.
match_id_col = np.asarray(match_id).ravel()
match_score_col = score.toarray().ravel()
# `match_id_col` holds row *positions* (argmax output), so look them up with
# the positional indexer. Label-based `.loc` gives the same rows only while
# `groundTruth` keeps its default 0..n-1 index.
match_company_id = np.array(groundTruth['company_id'].iloc[match_id_col])
match_company_name = np.array(groundTruth['name'].iloc[match_id_col])

In [9]:
# Attach the match results as new columns. `assign` returns a new frame
# instead of writing into the existing one, so this avoids setting values on
# a slice of `sTrain` and the cell gives the same result when re-run.
sTrainFil = sTrainFil.assign(
    match_id=match_id_col,
    match_score=match_score_col,
    match_company_id=match_company_id,
    match_company_name=match_company_name,
)

In [10]:
# Preview the first rows: `company_id` can be compared by eye against
# `match_company_id` to judge the match quality.
sTrainFil.head()

Unnamed: 0,train_index,name,company_id,match_id,match_score,match_company_id,match_company_name
0,0,ATRION Immo bilien & Co. KG,-1,342766,0.498201,250537,ATRION Immobilien Verwaltung GmbH
1,1,MyTyme Inve stments Inc,356624,84431,0.901331,356624,MyTyme Investments Inc
2,2,Financial USI.,510805,427147,0.592161,152602,"DS Financial, LLC"
3,3,FlexShares Trust - FlexShares Morningstar Emer...,523467,432988,0.962004,523467,FlexShares Trust - FlexShares Morningstar Emer...
4,4,Health Sinai SF,231108,75219,0.541243,231108,Sinai Health System Foundation
