In [1]:
import pandas as pd
import numpy as np
import pickle
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

In [2]:
def ngrams(string, n=3):
    """Break ``string`` into its overlapping character n-grams.

    The characters ``,``, ``-``, ``.``, ``/`` and ``'`` are removed before
    tokenizing (inside the character class, ``,-.`` is a range that covers
    exactly ``,``, ``-`` and ``.``).  If fewer than ``n`` characters remain,
    the result is an empty list.

    Parameters
    ----------
    string : str
        Text to tokenize.
    n : int, default 3
        Number of characters in each gram.

    Returns
    -------
    list of str
        The n-grams, in left-to-right order.
    """
    stripped = re.sub(r'[,-./\']', r'', string)
    # Zipping the text against copies of itself shifted by 0..n-1 characters
    # lines up every window; zip stops at the shortest (most shifted) copy.
    shifted_copies = (stripped[offset:] for offset in range(n))
    return list(map(''.join, zip(*shifted_copies)))

In [3]:
# The functions below compute cosine similarities and return the top result per query.
def get_top_sim(sparse_row, threshold=0.60):
    """Return the best-scoring ground-truth match for one row of similarities.

    Parameters
    ----------
    sparse_row : scipy.sparse matrix
        A single sparse row of similarity scores.  Only ``getnnz()``,
        ``.data`` and ``.indices`` are used.
    threshold : float, default 0.60
        Minimum score the best entry must reach to count as a match.

    Returns
    -------
    tuple
        ``(score, name, company_id)`` of the best entry when its score is at
        least ``threshold``.  Otherwise a sentinel with score ``0.0`` and id
        ``-1`` is returned: ``(0.0, None, -1)`` when the row has no stored
        entries, ``(0.0, "NaN", -1)`` when the best score is below the
        threshold.  The two sentinel names differ on purpose to stay
        compatible with existing callers.

    Notes
    -----
    Reads the module-level ``groundTruth`` DataFrame.  The stored column
    index is used as a row *label* of ``groundTruth``; this presumably relies
    on ``groundTruth`` keeping its default 0..N-1 index -- confirm if the
    frame is ever re-indexed or filtered.
    """
    if sparse_row.getnnz() == 0:
        return (0.0, None, -1)

    # argmax is a single pass over the stored values; on exact ties it picks
    # the first maximal stored entry.
    best_pos = np.argmax(sparse_row.data)
    match_score = sparse_row.data[best_pos]
    if match_score < threshold:
        return (0.0, "NaN", -1)

    match_id = sparse_row.indices[best_pos]
    # .at reads one cell directly instead of materialising the whole row twice.
    return (match_score,
            groundTruth.at[match_id, 'name'],
            groundTruth.at[match_id, 'company_id'])
def cosine_similarities(trainMat, groundTruthMat):
    """Score every query row against every ground-truth row and keep the best.

    The score matrix is the product of ``trainMat`` with the transpose of
    ``groundTruthMat``; this equals cosine similarity provided the rows of
    both matrices are L2-normalised.

    Parameters
    ----------
    trainMat : sparse matrix
        One row per query.
    groundTruthMat : sparse matrix
        One row per ground-truth entry, with the same columns as ``trainMat``.

    Returns
    -------
    list
        The result of ``get_top_sim`` for each row of the score matrix, in
        query order.
    """
    similarity = trainMat.dot(groundTruthMat.T)
    return list(map(get_top_sim, similarity))

In [4]:
def execute_matching(sTestFil, match_df):
    """Match one chunk of query names and add the results to ``match_df``.

    Uses the module-level ``vectorizer`` and ``ground_truth_matrix``.

    Parameters
    ----------
    sTestFil : pandas.DataFrame
        Chunk of queries; must contain the columns ``name`` and
        ``test_index``.  It is not modified.
    match_df : pandas.DataFrame
        Results accumulated so far, with the columns ``test_index`` and
        ``match_company_id``.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``match_df`` followed by one row per
        query in ``sTestFil`` (``match_company_id`` is ``-1`` where
        ``cosine_similarities`` reported no match).
    """
    # An empty chunk has nothing to vectorize or match.
    if len(sTestFil) == 0:
        return match_df.copy()

    test_matrix = vectorizer.transform(sTestFil['name'])
    matches = cosine_similarities(test_matrix, ground_truth_matrix)
    # Each match is (score, name, company_id); only the id is reported.
    company_ids = np.array([company_id for _, _, company_id in matches])

    # assign() builds a new frame, so the caller's chunk (typically a slice of
    # a larger frame) is left untouched.
    chunk_result = sTestFil[["test_index"]].assign(match_company_id=company_ids)

    # DataFrame.append was removed in pandas 2.0; concat is its replacement.
    return pd.concat([match_df, chunk_result])

In [5]:
def index_range(nrows, chunk_size):
    """Return the row offsets at which ``nrows`` rows are cut into chunks.

    Cutting at every returned offset yields chunks of exactly ``chunk_size``
    rows, except the last one, which holds the remaining
    ``1..chunk_size`` rows.  No chunk is ever empty or larger than
    ``chunk_size``.

    Parameters
    ----------
    nrows : int
        Total number of rows.
    chunk_size : int
        Maximum number of rows per chunk; must be positive.

    Returns
    -------
    range
        The interior cut points ``chunk_size, 2*chunk_size, ...`` that are
        strictly below ``nrows`` (empty when everything fits in one chunk).

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # Stopping at nrows (exclusive) keeps the final cut, so the remainder
    # forms its own chunk instead of being folded into the previous one.
    return range(chunk_size, nrows, chunk_size)

def split(dfm, chunk_size):
    """Cut ``dfm`` into consecutive row chunks.

    The cut points come from ``index_range(dfm.shape[0], chunk_size)``.

    Parameters
    ----------
    dfm : pandas.DataFrame or array-like
        Data to split along its first axis.
    chunk_size : int
        Chunk size passed on to ``index_range``.

    Returns
    -------
    list
        The chunks in their original order; together they cover every row
        exactly once.
    """
    nrows = dfm.shape[0]
    indices = index_range(nrows, chunk_size)

    if not hasattr(dfm, "iloc"):
        # Plain arrays: np.split handles them natively.
        return np.split(dfm, indices)

    # np.split on a DataFrame goes through DataFrame.swapaxes, which pandas
    # deprecated in 2.1, so pandas objects are sliced positionally instead.
    bounds = [0, *indices, nrows]
    return [dfm.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]

In [7]:
# Ground-truth companies and the query names to match against them
# (both pipe-delimited).
groundTruth = pd.read_csv("Datasets/G.csv", sep='|')
sTest = pd.read_csv("Datasets/STest.csv", sep='|')

# SECURITY: unpickling can execute arbitrary code -- only load a pickle that
# was produced by a trusted run of this project.
# The pickled vectorizer uses the ngrams function as its analyzer, and pickle
# stores functions by reference, so ngrams must already be defined here.
with open("vectorizer.pickle", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
ground_truth_matrix = sparse.load_npz("ground_truth.npz")

# Match chunk by chunk so that only one chunk's similarity matrix is held in
# memory at a time.
slices = split(sTest, 10000)
df_ = pd.DataFrame(columns=["test_index", "match_company_id"])
for sTestFil in slices:
    df_ = execute_matching(sTestFil, df_)

In [10]:
# Display the configuration of the loaded vectorizer.
vectorizer

TfidfVectorizer(analyzer=<function ngrams at 0x1a0a3ce2f0>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), norm='l2',
        preprocessor=None, smooth_idf=True, stop_words=None,
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [8]:
# Write the matches as a pipe-delimited file with a header row and without
# the DataFrame index column.
df_.to_csv("result_fin.csv", sep='|', header=True, index=False)

In [9]:
# Preview the first rows of the match table.
df_.head()

Unnamed: 0,test_index,match_company_id
0,0,74004
1,1,-1
2,2,379427
3,3,-1
4,4,588856
