In [74]:
import collections
import os
import string, pickle
import time

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

pd.options.mode.chained_assignment = None

In [82]:
# Directory that holds the pickled input files.
# Set the DISSERTATION_DATA_DIR environment variable to point at your own copy;
# when it is unset, the original hard-coded location is used.
file_dir = os.environ.get('DISSERTATION_DATA_DIR', '/home/danyzix/Dissertation/Data')

In [83]:
# pickling functions to save and load lists
def pickle_save(data, name):
    """Serialize ``data`` to ``<file_dir>/<name>.pkl``.

    Args:
        data: Any picklable object.
        name: File stem; the ``.pkl`` extension is appended here.
    """
    path = os.path.join(file_dir, '{}.pkl'.format(name))
    # The ``with`` block closes the file on exit, so no explicit close() is needed.
    with open(path, "wb") as f:
        # HIGHEST_PROTOCOL is exactly what the previous literal -1 selected.
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
def pickle_open(name):
    """Load and return the object stored in ``<file_dir>/<name>.pkl``.

    Args:
        name: File stem; the ``.pkl`` extension is appended here.

    Returns:
        The unpickled object.
    """
    path = os.path.join(file_dir, '{}.pkl'.format(name))
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were produced by this project, never files from an untrusted source.
    with open(path, "rb") as f:
        # The ``with`` block closes the file on exit, so no explicit close() is needed.
        return pickle.load(f)

In [84]:
# Load the list stored in rb_pairs3.pkl (presumably retailer/brand pairs, judging by the name).
# Later cells read element 0 of each entry as the retailer sequence text and
# compare element 2 against the predicted product.
rb_pairs3 = pickle_open('rb_pairs3')

In [85]:
# function to get unique texts and update a Counter from a list
def get_unique_texts(data, row):
    """Count the texts found at position ``row`` of every record and list the distinct ones.

    Args:
        data: Sequence of records (each record is itself indexable).
        row: Index inside each record of the text to collect.

    Returns:
        A ``(counter, unique_text)`` tuple where ``counter`` is a
        ``collections.Counter`` of text frequencies and ``unique_text`` is the
        list of distinct texts.
    """
    texts = [record[row] for record in data]
    counter = collections.Counter(texts)
    # Take the distinct texts from the counter's keys (first-seen order) rather
    # than from set(): set order for strings changes between interpreter runs,
    # and this order decides each text's row position in the TF-IDF corpus.
    unique_text = list(counter)
    return counter, unique_text

In [90]:
# Collapse the retailer sequences (element 0 of every pair) into a frequency
# counter plus the list of distinct sequences.
retailer_sequence_counter, unique_retailer_sequence = get_unique_texts(data=rb_pairs3, row=0)

print('{} retailer sequences have only {} unique sequences'.format(
    len(rb_pairs3),
    len(unique_retailer_sequence),
))

67457 retailer sequences have only 60790 unique sequences


In [87]:
# Load the manufacturer sequence -> product ID mapping.
# The sample output below shows entries of the form [sequence_text, product_id].
text_product_map = pickle_open('unique_brand_text_product_mapping')
# Display the number of entries together with one sample entry.
len(text_product_map), text_product_map[1]

(24645,
 ['lg electronics 32 class qhd led ips monitor radeon freesync 31.5 diagonal 32qk500w',
  1531971])

In [91]:
# Evaluation corpus: the unique retailer sequences come first, followed by the
# manufacturer sequence texts in text_product_map order.
eval_corpus = unique_retailer_sequence + [entry[0] for entry in text_product_map]

len(unique_retailer_sequence), len(text_product_map), len(eval_corpus)

(60790, 24645, 85435)

In [89]:
# TFIDF_Box stores, for each unique retailer sequence, the four manufacturer sequences with the highest TF-IDF cosine similarity
def TFIDF_Box(corpus, queries=None, product_map=None, top_k=4):
    """Rank manufacturer sequences against each query text by TF-IDF cosine similarity.

    ``corpus`` must contain the query texts first, immediately followed by the
    manufacturer texts in the same order as ``product_map``; row positions in
    the fitted matrix are used to tell the two groups apart.

    Args:
        corpus: List of texts used to fit the TF-IDF vectorizer.
        queries: Texts occupying the head of ``corpus``. Defaults to the
            notebook-level ``unique_retailer_sequence``.
        product_map: List of ``[text, product_id]`` entries aligned with the
            tail of ``corpus``. Defaults to the notebook-level
            ``text_product_map``.
        top_k: Number of best matches kept per query. The default of 4
            reproduces the original ``argsort()[:-5:-1]`` slice, which yields
            four entries.

    Returns:
        A list with one entry per query:
        ``[query_text, related_products, cosine_similarities, related_docs_indices]``,
        where the last two are NumPy arrays ordered from best to worst match
        and ``related_docs_indices`` indexes into ``product_map``.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if queries is None:
        queries = unique_retailer_sequence
    if product_map is None:
        product_map = text_product_map
    n_queries = len(queries)

    # min_df=1 filters nothing, exactly like the original integer 0 (every
    # vocabulary term occurs in at least one document), but integer 0 is
    # rejected by the parameter validation of newer scikit-learn releases.
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1)
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

    # Slice the manufacturer rows once; re-slicing the sparse matrix on every
    # iteration was loop-invariant work.
    product_matrix = tfidf_matrix[n_queries:]

    start = time.time()
    print('number of rows to be processed: {}'.format(n_queries))
    box = []
    for i in range(n_queries):
        # TfidfVectorizer L2-normalises rows by default, so the linear kernel
        # (dot product) equals the cosine similarity.
        cosine_similarities = linear_kernel(tfidf_matrix[i], product_matrix).flatten()
        # Indices of the top_k highest scores, best first.
        related_docs_indices = cosine_similarities.argsort()[::-1][:top_k]
        related_products = [product_map[a][1] for a in related_docs_indices]
        cosine_similarities = cosine_similarities[related_docs_indices]
        box.append([queries[i], related_products, cosine_similarities, related_docs_indices])
        if i % 5000 == 0:
            print('processed {} products, time elapsed:{} seconds'.format(i+1, (time.time() - start)))
    print('complete')
    return box

In [77]:
# calculate accuracy and store correct and wrong predictions
def accuracy_calcluator(model_box, pred_pairs, lookup_pairs):
    """Split predictions into correct and wrong ones.

    For every pair, the top-ranked product stored in ``model_box`` for the
    pair's text (element 0) is compared with the expected product in
    ``lookup_pairs[i][2]``.

    Args:
        model_box: List of entries whose element 0 is a text and whose
            element 1 is the ranked list of predicted products for that text.
            Texts must be hashable (strings in this notebook).
        pred_pairs: Records whose element 0 is the text to look up.
        lookup_pairs: Records parallel to ``pred_pairs``; elements 0 and 1 are
            copied into the result and element 2 is the expected product.

    Returns:
        ``(correct_predictions, wrong_predictions)``; each item is
        ``[i, lookup[0], lookup[1], text, predicted_product, lookup[2]]``.

    Raises:
        IndexError: If a text from ``pred_pairs`` has no entry in ``model_box``
            (same exception type as the original list-scan implementation).
    """
    # Index model_box once instead of rescanning it for every pair (the scan
    # made the function O(len(pred_pairs) * len(model_box))). setdefault keeps
    # the first entry per text, matching the original "first match wins" rule.
    ranked_by_text = {}
    for entry in model_box:
        ranked_by_text.setdefault(entry[0], entry[1])

    correct_predictions = []
    wrong_predictions = []
    for i, pair in enumerate(pred_pairs):
        text = pair[0]
        if text not in ranked_by_text:
            raise IndexError('no model_box entry for text: {!r}'.format(text))
        predicted_product = ranked_by_text[text][0]
        lookup = lookup_pairs[i]
        record = [i, lookup[0], lookup[1], text, predicted_product, lookup[2]]
        if predicted_product == lookup[2]:
            correct_predictions.append(record)
        else:
            wrong_predictions.append(record)

    return correct_predictions, wrong_predictions

In [78]:
# Build TFIDF_box: one entry per unique retailer sequence holding its top-ranked
# manufacturer matches and their cosine-similarity scores.
# NOTE: the run recorded below took roughly 1355 seconds for 60790 rows.
TFIDF_box = TFIDF_Box(eval_corpus)

number of rows to be processed: 60790
processed 1 products, time elapsed:0.04127621650695801 seconds
processed 5001 products, time elapsed:119.12265872955322 seconds
processed 10001 products, time elapsed:235.84542512893677 seconds
processed 15001 products, time elapsed:354.39545941352844 seconds
processed 20001 products, time elapsed:489.09501481056213 seconds
processed 25001 products, time elapsed:606.6250429153442 seconds
processed 30001 products, time elapsed:720.4740612506866 seconds
processed 35001 products, time elapsed:828.1681640148163 seconds
processed 40001 products, time elapsed:934.6169023513794 seconds
processed 45001 products, time elapsed:1044.6419348716736 seconds
processed 50001 products, time elapsed:1149.4891266822815 seconds
processed 55001 products, time elapsed:1251.2585191726685 seconds
processed 60001 products, time elapsed:1355.2324039936066 seconds
complete


In [81]:
# Split the overall predictions into correct / wrong lists and report the
# percentage of pairs whose top prediction matches the expected product.
tfidf_overall_CP, tfidf_overall_WP = accuracy_calcluator(
    model_box=TFIDF_box, pred_pairs=rb_pairs3, lookup_pairs=rb_pairs3
)
print('accuracy: {0:.2f}%'.format(len(tfidf_overall_CP) / len(rb_pairs3) * 100))

accuracy: 47.71%
