In [109]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import pandas as pd, mysql.connector, nltk
import xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import string
from collections import Counter

import gensim
from gensim import corpora
from gensim import matutils, models
import scipy.sparse

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# 1. Data Preparation

In [None]:
# get the first file which has product titles and brand categoriws
df1 = pd.read_csv('\\Users\\Dan\\Documents\\Python\\Data\\Flix_products\\product_title.csv', engine='python')
df1.head(2)

In [None]:
# load second product file and load only the necessary columns
products_complete = pd.read_csv('\\Users\\Dan\\Documents\\Python\\Data\\Flix_products\\products.csv', sep='\t')
products= products_complete[['product_id','product_title','mpn_list','manufacturer_id', 'brand_title',
                            'flix_parent_category', 'flix_subCategory1', 'flix_subCategory2',
                            'benchmark_category', 'benchmark_category2']]
products.head(2)

In [None]:
# merge the two products file with an inner join to keep only the common products from both
products = pd.merge(products, df1, on='product_id', how='inner')
products.head(2)

In [None]:
# check how many products have unassigned categories
products['benchmark_category2'].isna().sum(), products['benchmark_category2'].isna().sum()/len(products.index)

In [None]:
# drop the unassugned category products
products.dropna(subset=['benchmark_category2'], inplace=True)

In [None]:
# let's look at the distribution of number of products and brands by product categories
group_products = products.groupby('benchmark_category2')['product_id'].count()
group_brand  = products.groupby('benchmark_category2')['brand_title'].nunique()
plt.figure(figsize=(15,5))
ax = sns.scatterplot(x=group_products, y=group_brand)
ax.set_ylabel('# of brands')
ax.set_xlabel('# of products')

In [None]:
# lets look at the top 30 categories by number of products
products.groupby('benchmark_category2')['product_id'].count().sort_values().tail(30).plot.barh(figsize=(8,8), color='purple')

In [None]:
products.info()

In [None]:
# lets subset by top 50 categories and use this data for further analysis
top_Ncats = products.groupby('benchmark_category2')['product_id'].count().nlargest(50).keys()
prod_Ncats = products[products['benchmark_category2'].isin(top_Ncats)].reset_index(drop=True)
prod_Ncats.info()

In [None]:
# portion of overall data in the top 50 categories
len(prod_Ncats)/len(products)*100

# 2. Categorisation with product titles and categories

In [None]:
prod_Ncats['Categories'].head(2)

In [None]:
# Remove the pipes '|' in the Categories
prod_Ncats['Categories'] = prod_Ncats['Categories'].str.replace('|', '')
prod_Ncats['Categories'].head(2)

In [None]:
# lets concatenate the two columns : product titles and categories
prod_Ncats['title&cat']=np.nan
for i in range(len(prod_Ncats)):
    if pd.isna(prod_Ncats.loc[i, 'product_title']):
        prod_Ncats.loc[i, 'title&cat'] =  prod_Ncats.loc[i, 'title'] + ' ' + prod_Ncats.loc[i, 'Categories']
    else:
        prod_Ncats.loc[i,'title&cat'] =  prod_Ncats.loc[i, 'product_title'] + ' ' + prod_Ncats.loc[i, 'Categories']

In [None]:
prod_Ncats.to_csv('prod_50cats.csv', sep='\t', encoding='utf-8', index=False)

In [11]:
prod_Ncats = pd.read_csv('prod_50cats.csv', sep='\t')

In [None]:
# check for empty strings in the two columns 
#prod_15cats[prod_15cats[['product_title','Categories']].isnull().values.any(1)]

# Bag of Word 

In [119]:
# functions to tokenize the whole doc and form a vocabulary

# clean texts
def tokenize_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [w.lower() for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = lemmatized_output = [' '.join([lemmatizer.lemmatize(w) for w in tokens])]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

# load text and add to vocab
def add_to_vocab(column, vocab):
    for i in range(len(column)):
        tokens = tokenize_doc(column[i])
        vocab.update(tokens)  

In [121]:
#create the vocabulary after tokenization of the whole doc
vocab = Counter()
add_to_vocab(prod_Ncats['title&cat'], vocab)

In [117]:
prod_Ncats['title&cat'].head()

0    HP EliteBook 840 G1 Notebook PC Laptops And Hy...
1                         Speedlight SB-N7 Speedlights
2    Electrolux EFC60465OX 60 cm Stainless Steel Ch...
3    Electrolux EFC90468OX 90 cm Stainless Steel Ch...
4            Oven EVY9841AOX Electrolux  NonStar  Oven
Name: title&cat, dtype: object

In [14]:
# length of the vocab and the most occuring words
len(vocab), print(vocab.most_common(20))
#print(vocab.most_common()[:len(vocab)-100:-1])  # to obtain the lower list

[('hp', 81516), ('pc', 34871), ('notebook', 24464), ('series', 20676), ('pavilion', 11978), ('home', 9793), ('desktop', 9730), ('pcs', 9598), ('workstation', 7968), ('compaq', 7286), ('mobile', 7160), ('philips', 7101), ('consumer', 6220), ('tv', 6122), ('ink', 5807), ('elitebook', 5680), ('led', 5631), ('probook', 5555), ('printer', 5503), ('cartridges', 4748)]


(11832, None)

In [15]:
# remove tokens with just one occurence to reduce the vocab size
tokens = [k for k,c in vocab.items() if c > 1]
print('Vocabulary size: {}'.format( len(tokens)))

Vocabulary size: 7373


In [None]:
# save the vocabulary to a file
def save_vocab(lines, filename):
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w', encoding="utf-8")
    file.write(data)
    file.close()
    
# save the vocabulary and tokens
save_vocab(tokens, 'vocab.txt')

In [9]:
with open('vocab.txt', encoding="utf-8") as f:
    lines = f.read().splitlines()

In [19]:
# function to tokenize each row
def tokenize_row(column, vocab):
    tokenz = []
    for i in range(len(column)):
        tokens = tokenize_doc(column[i])
        tokens = [w for w in tokens if w in vocab]
        tokenz.append(tokens)
    return tokenz

In [20]:
# tokenize each row
prod_Ncats['tokens'] = tokenize_row(prod_Ncats['title&cat'], vocab)
prod_Ncats['tokens'].head()

0    [hp, elitebook, notebook, pc, laptops, hybrids...
1                            [speedlight, speedlights]
2    [electrolux, cm, stainless, steel, chimney, de...
3    [electrolux, cm, stainless, steel, chimney, de...
4                    [oven, electrolux, nonstar, oven]
Name: tokens, dtype: object

In [None]:
# histogram of token lengths per product
token_length = []
for i in range(len(prod_Ncats['tokens'])):
    token_length.append(len(prod_Ncats['tokens'][i]))
    
plt.figure(figsize=(10,5))   
sns.distplot(token_length, bins=50, kde=False, rug=True)

In [None]:
# enumerate example
# for ele in enumerate(prod_Ncats['tokens'][1]): 
#     print(ele)
# for index, ele in enumerate(prod_Ncats['tokens'][1]): 
#     print (index,ele)

In [22]:
# convert tokens in the list to texts to make it vectorizing freindly
prod_Ncats['tokens']=[" ".join(tokens) for tokens in prod_Ncats['tokens'].values]
prod_Ncats['tokens'].head()

0    hp elitebook notebook pc laptops hybrids busin...
1                               speedlight speedlights
2    electrolux cm stainless steel chimney design h...
3    electrolux cm stainless steel chimney design h...
4                         oven electrolux nonstar oven
Name: tokens, dtype: object

In [None]:
# function to plot bar-chart of groupby agg counts
def bar_plot(df,group_column, agg):
    df.groupby(group_column)[agg].count().sort_values(ascending=False).plot.bar(figsize=(12,6), color='purple')

In [None]:
# plot the count of products by categories
bar_plot(prod_Ncats, 'benchmark_category2' ,'product_id')

# Train and Test data split with stratified sampling

In [None]:
# create train and test set with stratified sampling
model_data = prod_Ncats.copy()
y = model_data.pop('benchmark_category2')
x = model_data['tokens']
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.30, random_state=42, stratify=y)

# TF-IDF Vectors as features

In [None]:
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

In [None]:
# reverse transformation of LabelEncoder
#encoder.inverse_transform(y_test)
#print(list(encoder.classes_))

In [None]:
# generate the TF-IDF features and transform the tran and test datasets
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{2,}', stop_words='english', max_features=7000)
tfidf_vect.fit(prod_Ncats['tokens'])
x_train_tfidf =  tfidf_vect.transform(x_train)
x_test_tfidf =  tfidf_vect.transform(x_test)
len(tfidf_vect.vocabulary_), x_train_tfidf.shape

In [None]:
#x_train_tfidf[1,:150].toarray()

# One hot-encoding - Count Vectors as features

In [None]:
# generate the CountVector features and transform the tran and test datasets
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{2,}', stop_words='english')
count_vect.fit(prod_Ncats['tokens'])
x_train_CV =  count_vect.transform(x_train)
x_test_CV =  count_vect.transform(x_test)

In [None]:
#count_vect.vocabulary_

In [None]:
# document term matrix
print(x_train_CV[20:25,1:30].toarray())

In [None]:
count_vect.get_feature_names()[1:5]

# Prediction Models

In [None]:
class train_model():
    def __init__(self,classifier, train_data, test_data):
        self.train = classifier.fit(train_data, y_train)
        self.predict = classifier.predict(test_data)
        self.cm = confusion_matrix(y_test, self.predict)
        self.precision = np.diag(self.cm) / np.sum(self.cm, axis = 0)
        self.recall = np.diag(self.cm) / np.sum(self.cm, axis = 1)  
        
    def return_metrics(self):
        print('Accuracy: {}'.format(metrics.accuracy_score(self.predict, y_test)))
        print('Average Precision: {}'.format(np.average(self.precision)))
        print('Average Recall: {}'.format(np.average(self.recall)))
    
    def return_predictions(self):
        return self.predict        

In [None]:
# function to plot the count of wrong predictions by categories
def plot_wrong_predictions(model):
    # extract the predictions
    predicted = model.return_predictions()
    # subset the training data rows from the main dataframe
    train_data = prod_Ncats.iloc[list(x_test.index.values),:]
    # add the predicted columns to the train_data
    train_data['predicted'] = encoder.inverse_transform(predicted)
    # extract the wrong predictions
    wrong_predictions = train_data[train_data['benchmark_category2'] != train_data['predicted']]
    warnings.filterwarnings('ignore')
    # plot a count of the wrong categories
    #bar_plot(wrong_predictions, 'benchmark_category2' ,'product_id')  
    ax = wrong_predictions.groupby('benchmark_category2')['product_id'].nunique().sort_values(ascending=False).plot.bar(color='purple')
    ax.set_xlabel('categories')
    ax.set_ylabel('number of products')
    

In [None]:
# train models on the TF-IDF features
NB_tfidf = train_model(naive_bayes.MultinomialNB(), x_train_tfidf, x_test_tfidf)   # Naive Bayes
LR_tfidf = train_model(linear_model.LogisticRegression(), x_train_tfidf, x_test_tfidf)  # Logistic Regression
RF_tfidf = train_model(ensemble.RandomForestClassifier(), x_train_tfidf, x_test_tfidf)  # Random Forests

In [None]:
# measure the predictions
NB_tfidf.return_metrics()
LR_tfidf.return_metrics()
RF_tfidf.return_metrics()

In [None]:
print(classification_report(y_test, y_pred_NB))

In [None]:
# compare the wrong predictions between two models
f, axs = plt.subplots(1,2,figsize=(18,6))
plt.subplot(1,2,1)
plot_wrong_predictions(NB_tfidf)
plt.title('NB_tfidf wrong prdictions')

plt.subplot(1,2,2)
plot_wrong_predictions(RF_tfidf)
plt.title('RF_tfidf wrong prdictions')

In [None]:
# train models on the CountVector features
NB_CV = train_model(naive_bayes.MultinomialNB(), x_train_CV, x_test_CV)   # Naive Bayes
LR_CV = train_model(linear_model.LogisticRegression(), x_train_CV, x_test_CV)  # Logistic Regression
RF_CV = train_model(ensemble.RandomForestClassifier(), x_train_CV, x_test_CV)  # Random Forests

In [None]:
# measure the predictions
NB_CV.return_metrics()
LR_CV.return_metrics()
RF_CV.return_metrics()

In [None]:
# compare the wrong predictions between two models
f, axs = plt.subplots(1,2,figsize=(18,6))
plt.subplot(1,2,1)
plot_wrong_predictions(LR_CV)
plt.title('LR_CV wrong prdictions')

plt.subplot(1,2,2)
plot_wrong_predictions(RF_CV)
plt.title('RF_CV wrong prdictions')

In [None]:
# # extract the predictions
# predicted = NB_tfidf.return_predictions()
# # transform the predictions to the original categories
# encoder.inverse_transform(predicted)
# # # subset the training data rows from the main dataframe
# # train_data = prod_Ncats.iloc[list(x_test.index.values),:]
# # # add the predicted columns to the train_data
# # train_data['predicted'] = encoder.inverse_transform(predicted)
# # # extract the wrong predictions
# # wrong_predictions = train_data[train_data['benchmark_category2'] != train_data['predicted']]
# # # plot a count of the wrong categories
# # bar_plot(wrong_predictions, 'benchmark_category2' ,'product_id')

# Topic Modelling - Attempt 1

In [16]:
prod_Ncats['tokens'].shape, len(tokens), len(vocab)

((63349,), 7373, 11832)

In [106]:
# create a Term Matrix with CountVectorizer
cv = CountVectorizer(stop_words='english', vocabulary=list(tokens))
data_cv = cv.fit_transform(prod_Ncats['tokens'])
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = prod_Ncats.index
data_dtm.head()

Unnamed: 0,hp,elitebook,notebook,pc,laptops,hybrids,business,laptop,pcs,series,...,lanyard,ergofit,error,unable,compute,smartcook,mattel,ftz,freshcare,wfe
0,4,4,4,3,1,1,1,1,2,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# transpose the Term Matrix to a Term-Document Matrix
tdm = data_dtm.transpose()
tdm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,63339,63340,63341,63342,63343,63344,63345,63346,63347,63348
hp,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
elitebook,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
notebook,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
pc,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
laptops,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [186]:
# turn the Term-Document Matrix into a gensim format corpus, df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [193]:
print(list(corpus)[0:2])

[[(0, 4), (1, 4), (2, 4), (3, 3), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1)], [(10, 1), (11, 1)]]


In [167]:
# create the dictionary needed for LDA modelling
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [205]:
# Human readable format of corpus (term-frequency)
print([[(id2word[id], freq) for id, freq in cp] for cp in list(corpus)[:1]])

[[('hp', 4), ('elitebook', 4), ('notebook', 4), ('pc', 3), ('laptops', 1), ('hybrids', 1), ('business', 1), ('laptop', 1), ('pcs', 2), ('series', 1)]]


In [206]:
lda_5 = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=5, passes=20)
lda_5.print_topics()

[(0,
  '0.020*"epson" + 0.019*"cameras" + 0.018*"steam" + 0.015*"appliances" + 0.012*"storage" + 0.011*"vacuum" + 0.010*"tefal" + 0.010*"food" + 0.009*"kitchen" + 0.009*"kmix"'),
 (1,
  '0.041*"mobile" + 0.037*"hp" + 0.032*"printer" + 0.029*"printers" + 0.029*"accessories" + 0.026*"multifunction" + 0.026*"series" + 0.024*"laser" + 0.023*"lenovo" + 0.022*"galaxy"'),
 (2,
  '0.052*"tv" + 0.023*"hd" + 0.023*"monitors" + 0.021*"aeg" + 0.018*"cm" + 0.018*"oven" + 0.017*"home" + 0.016*"tvs" + 0.016*"nonstar" + 0.016*"ultra"'),
 (3,
  '0.231*"hp" + 0.102*"pc" + 0.074*"notebook" + 0.066*"pcs" + 0.053*"series" + 0.049*"desktop" + 0.037*"home" + 0.037*"pavilion" + 0.033*"laptop" + 0.024*"laptops"'),
 (4,
  '0.063*"workstation" + 0.037*"mobile" + 0.035*"lighting" + 0.030*"led" + 0.029*"headphones" + 0.028*"philips" + 0.026*"home" + 0.025*"zbook" + 0.021*"audio" + 0.021*"consumer"')]

In [56]:
# a POS tokenizer that only keeps nouns
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)  

In [62]:
# extract only the nouns from the data
data_nouns = pd.DataFrame(prod_Ncats['tokens'].apply(nouns))

In [75]:
# create a Term Matrix with CountVectorizer
cv_nouns = CountVectorizer(stop_words='english')
#cv_nouns._validate_vocabulary()
data_cv_nouns = cv_nouns.fit_transform(data_nouns['tokens'])
data_cv_nouns_dtm = pd.DataFrame(data_cv_nouns.toarray(), columns=cv_nouns.get_feature_names())
data_cv_nouns_dtm.index = data_nouns.index

In [207]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_cv_nouns_dtm.transpose()))
# Create the vocabulary dictionary
id2word_n = dict((v, k) for k, v in cv_nouns.vocabulary_.items())

In [78]:
# Let's try 40 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=40, id2word=id2word_n, passes=100)
ldan.print_topics()

[(1,
  '0.087*"combination" + 0.086*"stand" + 0.078*"whirlpool" + 0.064*"print" + 0.047*"standing" + 0.037*"connectivity" + 0.036*"hpe" + 0.034*"hpdefault" + 0.031*"unit" + 0.020*"link"'),
 (27,
  '0.211*"hp" + 0.202*"desktop" + 0.125*"pc" + 0.115*"pcs" + 0.084*"business" + 0.073*"series" + 0.061*"workstations" + 0.048*"prodesk" + 0.013*"server" + 0.009*"elitedesk"'),
 (16,
  '0.074*"food" + 0.059*"hand" + 0.046*"kitchen" + 0.040*"blenders" + 0.039*"blender" + 0.038*"ovens" + 0.034*"mixer" + 0.034*"kmix" + 0.028*"processor" + 0.028*"kenwood"'),
 (6,
  '0.371*"epson" + 0.070*"client" + 0.060*"hdr" + 0.054*"keyboard" + 0.053*"singlepack" + 0.045*"adapter" + 0.042*"base" + 0.041*"keyboards" + 0.039*"model" + 0.016*"cyan"'),
 (22,
  '0.124*"corsair" + 0.103*"phone" + 0.100*"support" + 0.081*"cases" + 0.074*"samsung" + 0.069*"case" + 0.058*"mobile" + 0.037*"view" + 0.034*"colour" + 0.030*"charger"'),
 (17,
  '0.227*"ink" + 0.155*"cartridges" + 0.122*"supplies" + 0.078*"cartridge" + 0.068*"r

In [103]:
# Let's take a look at which topics each transcript contains
corpus_transformed = ldan[corpusn]
list(zip([a for [(a,b)] in corpus_transformed], data_cv_nouns_dtm.index))

ValueError: too many values to unpack (expected 1)

In [101]:
# To find out how many topics exist in the doc
hdp_model= models.HdpModel(corpus=corpusn, id2word=dictionary)
hdp_model.show_topics()

[(0,
  '0.174*hp + 0.131*pc + 0.072*notebook + 0.057*series + 0.047*pavilion + 0.035*pcs + 0.023*workstation + 0.023*probook + 0.022*compaq + 0.019*desktop + 0.015*elitebook + 0.014*home + 0.014*form + 0.014*factor + 0.014*energy + 0.014*star + 0.013*workstations + 0.010*entertainment + 0.010*business + 0.009*hybrids'),
 (1,
  '0.040*tv + 0.027*monitors + 0.021*hp + 0.020*home + 0.019*philips + 0.017*hd + 0.016*consumer + 0.015*series + 0.015*pc + 0.011*monitor + 0.010*entertainment + 0.010*tvs + 0.009*cm + 0.009*vision + 0.008*televisions + 0.008*accessories + 0.007*audio + 0.007*notebook + 0.006*speakers + 0.006*video'),
 (2,
  '0.043*supplies + 0.043*hp + 0.040*cartridges + 0.031*ink + 0.026*toner + 0.020*printer + 0.019*laser + 0.015*inktonerpaperprinter + 0.015*color + 0.015*laserjet + 0.010*printing + 0.010*kits + 0.009*cartridge + 0.009*pc + 0.009*series + 0.008*paper + 0.006*printers + 0.006*print + 0.006*epson + 0.005*notebook'),
 (3,
  '0.059*hp + 0.053*printer + 0.047*series