In [1]:
import re
from time import time
import pandas as pd
# from unidecode import unidecode
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from underthesea import word_tokenize
from operator import itemgetter
import pickle
from gensim import corpora, models, similarities


In [2]:
def clean_function(text):
    """Normalize a raw product name or search query and segment it into words.

    The text is NFC-normalized, lowercased, stripped of every character that is
    not ``a-z``, ``0-9`` or a lowercase Vietnamese letter, has leading zeros
    removed from each space-separated token (a token made only of zeros
    disappears), has runs of spaces collapsed, is trimmed, and is finally passed
    to ``word_tokenize(..., format='text')``.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    Whatever ``word_tokenize(text, format='text')`` returns for the cleaned text.
    """
    import unicodedata

    # Compose decomposed (NFD) sequences such as "e" + combining marks into the
    # single precomposed letters listed in the allow-list below. Without this,
    # the combining marks are treated as "special characters", replaced by
    # spaces, and Vietnamese words get split apart. No-op for NFC input.
    text = unicodedata.normalize('NFC', text)

    # Lowercase
    text = text.lower()

    # Replace special / non-alphanumeric characters with a space
    text = re.sub("[^a-z0-9àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễòóọỏõôồốộổỗơờớợởỡìíịỉĩùúụủũưừứựửữỳýỵỷỹđ]",' ', text)

    # Remove leading zeros from every space-separated token
    text = ' '.join(token.lstrip('0') for token in text.split(' '))

    # Replace runs of spaces with a single space
    text = re.sub(' +', ' ', text)

    # Remove whitespace at the beginning and end
    text = text.strip()

    # Word tokenization
    return word_tokenize(text, format='text')

In [3]:
from time import time
def nearestNeighbors_search(query):
    """Return the products nearest to ``query`` in the PCA-reduced TF-IDF space.

    Reads ``data/product_names_clean.csv`` and ``data/names_pca.csv``, loads the
    pickled models ``tfidf.pkl`` and ``pca.pkl``, vectorizes the cleaned query,
    projects it with the PCA model and looks up its nearest rows of
    ``names_pca`` with ``NearestNeighbors`` (Minkowski metric).

    Parameters
    ----------
    query : str
        Raw search text; it is passed through ``clean_function``.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_names`` and ``distance`` (numeric), nearest first,
        with up to 5 rows.

    Notes
    -----
    Prints the elapsed search time (file loading excluded).
    Rows of ``names_pca.csv`` are assumed to be in the same order as rows of
    ``product_names_clean.csv`` - TODO confirm.
    """
    # Load data
    product = pd.read_csv('data/product_names_clean.csv', index_col=0)
    names_pca = pd.read_csv('data/names_pca.csv', index_col=0)

    # Load models.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('tfidf.pkl', 'rb') as file:
        tfidf_model = pickle.load(file)
    with open('pca.pkl', 'rb') as file:
        pca_model = pickle.load(file)

    t0 = time()
    # Query cleaning
    q = clean_function(query)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old versions.
    if hasattr(tfidf_model, 'get_feature_names_out'):
        feature_names = tfidf_model.get_feature_names_out()
    else:
        feature_names = tfidf_model.get_feature_names()

    # Vectorize query
    q_tf = pd.DataFrame(tfidf_model.transform([q]).toarray(),
                        columns=feature_names)

    # Dimension reduction of the query
    q_pca = pca_model.transform(q_tf)

    # Find the 5 nearest neighbors (fewer if the data set is smaller)
    n_neighbors = min(5, names_pca.shape[0])
    neigh = NearestNeighbors(n_neighbors=n_neighbors, metric='minkowski', algorithm='auto')
    neigh.fit(names_pca.values)
    distance, idx = neigh.kneighbors(q_pca)

    # kneighbors returns row positions, so index positionally (iloc), not by label.
    names = product['names'].iloc[idx[0]].values
    results = pd.DataFrame({'product_names': names, 'distance': distance[0]})
    print('Time:', time() - t0)
    return results

In [4]:
def nearestNeighbors_search_noPCA(query, tfidf_matrix):
    """Return the products nearest to ``query`` in the full TF-IDF space.

    Reads ``data/product_names_clean.csv``, loads the pickled vectorizer
    ``tfidf.pkl``, vectorizes the cleaned query and looks up its nearest rows of
    ``tfidf_matrix`` with ``NearestNeighbors`` (Minkowski metric).

    Parameters
    ----------
    query : str
        Raw search text; it is passed through ``clean_function``.
    tfidf_matrix : array-like or sparse matrix
        TF-IDF representation of the products, one row per product.
        Presumably produced by the same vectorizer and in the same row order
        as ``product_names_clean.csv`` - TODO confirm against the caller.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_names`` and ``distance`` (numeric), nearest first,
        with up to 5 rows.

    Notes
    -----
    Prints the elapsed search time (file loading excluded).
    """
    # Load data
    product = pd.read_csv('data/product_names_clean.csv', index_col=0)

    # Load model.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('tfidf.pkl', 'rb') as file:
        tfidf_model = pickle.load(file)

    t0 = time()
    # Query cleaning
    q = clean_function(query)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old versions.
    if hasattr(tfidf_model, 'get_feature_names_out'):
        feature_names = tfidf_model.get_feature_names_out()
    else:
        feature_names = tfidf_model.get_feature_names()

    # Vectorize query
    q_tf = pd.DataFrame(tfidf_model.transform([q]).toarray(),
                        columns=feature_names)

    # Find the 5 nearest neighbors (fewer if the data set is smaller).
    # shape[0] is used because len() is not defined for sparse matrices.
    n_neighbors = min(5, tfidf_matrix.shape[0])
    neigh = NearestNeighbors(n_neighbors=n_neighbors, metric='minkowski', algorithm='auto')
    neigh.fit(tfidf_matrix)
    distance, idx = neigh.kneighbors(q_tf)

    # kneighbors returns row positions, so index positionally (iloc), not by label.
    names = product['names'].iloc[idx[0]].values
    results = pd.DataFrame({'product_names': names, 'distance': distance[0]})
    print('Time:', time() - t0)
    return results

In [5]:
def cosine_search(query):
    """Return the products most cosine-similar to ``query`` in PCA space.

    Loads the pickled models ``tfidf.pkl`` and ``pca.pkl``, reads
    ``data/product_names_clean.csv`` and ``data/names_pca.csv``, vectorizes the
    cleaned query, projects it with the PCA model and scores it against every
    row of the PCA matrix with ``cosine_similarity``.

    Parameters
    ----------
    query : str
        Raw search text; it is passed through ``clean_function``.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_names`` and ``sim_score`` for the 5 highest scores,
        in ascending score order (best match last).

    Notes
    -----
    Prints the elapsed search time (file loading excluded).
    """
    # Load models.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('tfidf.pkl', 'rb') as file:
        tfidf_model = pickle.load(file)
    with open('pca.pkl', 'rb') as file:
        pca_model = pickle.load(file)

    # Load data
    df_product = pd.read_csv('data/product_names_clean.csv', index_col=0)
    pca_matrix = pd.read_csv('data/names_pca.csv', index_col=0).values

    t0 = time()
    # Clean text
    query = clean_function(query)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old versions.
    if hasattr(tfidf_model, 'get_feature_names_out'):
        feature_names = tfidf_model.get_feature_names_out()
    else:
        feature_names = tfidf_model.get_feature_names()

    # Vectorize query
    query_tfidf = pd.DataFrame(tfidf_model.transform([query]).toarray(),
                               columns=feature_names)

    # Project the query with the PCA model
    query_pca = pca_model.transform(query_tfidf)

    # Calculate cosine similarity against every product
    cosine_similarities = cosine_similarity(query_pca, pca_matrix).flatten()

    # Positions of the 5 highest scores (ascending); read the scores at those
    # positions instead of sorting the whole array a second time in place.
    related_product_indices = cosine_similarities.argsort()[-5:]
    similarities_score = cosine_similarities[related_product_indices]

    result = pd.DataFrame({
        'product_names': df_product['names'].iloc[related_product_indices].values,
        'sim_score': similarities_score,
    })
    print('Time:', time() - t0)
    return result

In [6]:
def cosine_search_all(queries):
    """Run the PCA-space cosine search for every query and stack the results.

    Loads the pickled models ``tfidf.pkl`` and ``pca.pkl``, reads
    ``data/product_names_clean.csv`` and ``data/names_pca.csv``, then for each
    query vectorizes it, projects it with the PCA model and keeps the 5 highest
    ``cosine_similarity`` scores.

    Parameters
    ----------
    queries : iterable of str
        Query strings. They are vectorized as given.
        NOTE(review): unlike ``cosine_search`` this function does not call
        ``clean_function`` - presumably callers pass pre-cleaned text; confirm.

    Returns
    -------
    pandas.DataFrame
        Columns ``query``, ``product_names`` and ``sim_score``; for each query
        its top matches in ascending score order. Empty (with the same
        columns) when ``queries`` is empty.

    Notes
    -----
    Prints the elapsed search time (file loading excluded).
    """
    # Load models.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('tfidf.pkl', 'rb') as file:
        tfidf_model = pickle.load(file)
    with open('pca.pkl', 'rb') as file:
        pca_model = pickle.load(file)

    # Load data
    df_product = pd.read_csv('data/product_names_clean.csv', index_col=0)
    pca_matrix = pd.read_csv('data/names_pca.csv', index_col=0).values

    t0 = time()
    result_columns = ['query', 'product_names', 'sim_score']

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement. Looked up once because it does not depend on the query.
    if hasattr(tfidf_model, 'get_feature_names_out'):
        feature_names = tfidf_model.get_feature_names_out()
    else:
        feature_names = tfidf_model.get_feature_names()

    # Collect one small frame per query and concatenate once at the end;
    # concatenating inside the loop re-copies all previous rows every time.
    per_query_frames = []
    for q in queries:
        # Vectorize query
        query_tfidf = pd.DataFrame(tfidf_model.transform([q]).toarray(),
                                   columns=feature_names)

        # Project the query with the PCA model
        query_pca = pca_model.transform(query_tfidf)

        # Calculate cosine similarity against every product
        cosine_similarities = cosine_similarity(query_pca, pca_matrix).flatten()

        # Positions and values of the 5 highest scores (ascending)
        related_product_indices = cosine_similarities.argsort()[-5:]
        similarities_score = cosine_similarities[related_product_indices]

        per_query_frames.append(pd.DataFrame({
            # Sized to the actual hit count so all columns stay aligned even
            # when fewer than 5 products exist.
            'query': [q] * len(related_product_indices),
            'product_names': df_product['names'].iloc[related_product_indices].values,
            'sim_score': similarities_score,
        }, columns=result_columns))

    if per_query_frames:
        results = pd.concat(per_query_frames, ignore_index=True)
    else:
        results = pd.DataFrame(columns=result_columns)
    print('Time:', time() - t0)
    return results

In [7]:
def cosine_search_noPCA(query, tfidf_matrix):
    """Return the products most cosine-similar to ``query`` in TF-IDF space.

    Loads the pickled vectorizer ``tfidf.pkl``, reads
    ``data/product_names_clean.csv``, vectorizes the cleaned query and scores it
    against every row of ``tfidf_matrix`` with ``cosine_similarity``.

    Parameters
    ----------
    query : str
        Raw search text; it is passed through ``clean_function``.
    tfidf_matrix : array-like or sparse matrix
        TF-IDF representation of the products, one row per product.
        Presumably produced by the same vectorizer and in the same row order
        as ``product_names_clean.csv`` - TODO confirm against the caller.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_names`` and ``sim_score`` for the 5 highest scores,
        in ascending score order (best match last).

    Notes
    -----
    Prints the elapsed search time (file loading excluded).
    """
    # Load model.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('tfidf.pkl', 'rb') as file:
        tfidf_model = pickle.load(file)

    # Load data
    df_product = pd.read_csv('data/product_names_clean.csv', index_col=0)

    t0 = time()
    # Clean text
    query = clean_function(query)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old versions.
    if hasattr(tfidf_model, 'get_feature_names_out'):
        feature_names = tfidf_model.get_feature_names_out()
    else:
        feature_names = tfidf_model.get_feature_names()

    # Vectorize query
    query_tfidf = pd.DataFrame(tfidf_model.transform([query]).toarray(),
                               columns=feature_names)

    # Calculate cosine similarity against every product
    cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

    # Positions of the 5 highest scores (ascending); read the scores at those
    # positions instead of sorting the whole array a second time in place.
    related_product_indices = cosine_similarities.argsort()[-5:]
    similarities_score = cosine_similarities[related_product_indices]

    result = pd.DataFrame({
        'product_names': df_product['names'].iloc[related_product_indices].values,
        'sim_score': similarities_score,
    })
    print('Time:', time() - t0)
    return result

In [8]:
def jaccard_search(query):
    """Return the 5 products whose cleaned names best match ``query`` by Jaccard score.

    Reads ``data/product_names_clean.csv`` and compares the set of
    whitespace-separated words of the cleaned query with the word set of each
    row's ``clean_names`` value. The score is
    ``len(intersection) / len(union)``.

    Parameters
    ----------
    query : str
        Raw search text; it is passed through ``clean_function``.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_names`` and ``jaccard_score``, highest score first,
        at most 5 rows, indexed by the product's label in the CSV. Ties keep
        the CSV row order.

    Notes
    -----
    Prints the elapsed search time (file loading excluded).
    """
    # Load data
    df_product = pd.read_csv('data/product_names_clean.csv', index_col=0)

    t0 = time()
    query_words = set(clean_function(query).lower().split())

    # Iterate over the column values directly: this is independent of the
    # frame's index labels and avoids building one DataFrame per product.
    scores = []
    for clean_name in df_product['clean_names']:
        # Missing names are read by pandas as NaN (a float); treat as no words.
        if isinstance(clean_name, str):
            name_words = set(clean_name.lower().split())
        else:
            name_words = set()

        union = query_words | name_words
        # Both sets empty -> nothing in common; avoid dividing by zero.
        if union:
            scores.append(len(query_words & name_words) / len(union))
        else:
            scores.append(0.0)

    results = pd.DataFrame({'product_names': df_product['names'].values,
                            'jaccard_score': scores},
                           columns=['product_names', 'jaccard_score'],
                           index=df_product.index)
    print('Time:', time() - t0)
    # mergesort is stable, so equal scores are returned in a deterministic order.
    return results.sort_values('jaccard_score', ascending=False, kind='mergesort')[:5]

In [9]:
def gensim_search(query, dictionary, similarities_matrix, name_tfidf_model, name_lsi_model):
    """Return the top 5 products for ``query`` using a gensim similarity index.

    The cleaned query is split into tokens, converted to a bag-of-words with
    ``dictionary.doc2bow``, transformed by ``name_tfidf_model`` and then
    ``name_lsi_model``, and looked up in ``similarities_matrix``. Product names
    are read from ``data/product_names_clean.csv``.

    Parameters
    ----------
    query : str
        Raw search text; it is passed through ``clean_function``.
    dictionary
        Object providing ``doc2bow(tokens)``. Assumed to have been built from
        whitespace-split tokens of the cleaned product names - TODO confirm.
    similarities_matrix
        Index object supporting ``index[vector]``. Its ``num_best`` attribute
        is set to 5 by this function and left that way.
    name_tfidf_model, name_lsi_model
        Transformations applied in that order via ``model[vector]``.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_names`` and ``gensim_score`` (similarity * 100,
        rounded to 2 decimals), best match first.

    Notes
    -----
    Prints the elapsed time, which here includes reading the CSV.
    """
    t0 = time()

    # doc2bow expects a list of tokens. Wrapping the whole cleaned string in a
    # one-element list would make the entire query a single token that can only
    # match a dictionary entry equal to the full query text.
    query_tokens = clean_function(query).split()
    query_bow = dictionary.doc2bow(query_tokens)
    query_tfidf = name_tfidf_model[query_bow]
    query_lsi = name_lsi_model[query_tfidf]

    similarities_matrix.num_best = 5
    # Each hit is a (document position, score) pair; order best first.
    hits = sorted(similarities_matrix[query_lsi], key=itemgetter(1), reverse=True)

    product = pd.read_csv('data/product_names_clean.csv', index_col=0)
    names = product['names']

    # Hits carry document positions, so look names up positionally (iloc)
    # rather than by index label.
    product_names = [
        {'product_names': names.iloc[doc_position],
         'gensim_score': round((score * 100), 2)}
        for doc_position, score in hits[:similarities_matrix.num_best]
    ]
    print('Time:', time() - t0)
    return pd.DataFrame(product_names, columns=['product_names', 'gensim_score'])

In [10]:
def linearsimilarity_search(query, vectorizer, pca, product_names_pca, product):
    """Return the products with the highest linear-kernel score for ``query``.

    The cleaned query is vectorized with ``vectorizer``, projected with ``pca``
    and scored against every row of ``product_names_pca`` with
    ``linear_kernel``.

    Parameters
    ----------
    query : str
        Raw search text; it is passed through ``clean_function``.
    vectorizer
        Fitted vectorizer providing ``transform`` and feature names.
    pca
        Fitted model providing ``transform``.
    product_names_pca : array-like
        Reduced representation of the products, one row per product.
        Presumably in the same row order as ``product`` - TODO confirm.
    product : pandas.DataFrame
        Product table with a ``names`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_names`` and ``sim_score`` for the 5 highest scores,
        in ascending score order (best match last).
    """
    # linear_kernel is not among the notebook's top-level imports, so import it
    # here; otherwise this function fails with NameError.
    from sklearn.metrics.pairwise import linear_kernel

    # Clean text
    query = clean_function(query)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Vectorize query
    query_tfidf = pd.DataFrame(vectorizer.transform([query]).toarray(),
                               columns=feature_names)

    # Project the query with the PCA model
    query_pca = pca.transform(query_tfidf)

    # Calculate linear similarities against every product
    linear_similarities = linear_kernel(query_pca, product_names_pca).flatten()

    # Positions of the 5 highest scores (ascending); read the scores at those
    # positions instead of sorting the whole array a second time in place.
    related_product_indices = linear_similarities.argsort()[-5:]
    similarities_score = linear_similarities[related_product_indices]

    result = pd.DataFrame({
        'product_names': product['names'].iloc[related_product_indices].values,
        'sim_score': similarities_score,
    })
    return result