In [2]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

In [3]:
import pandas as pd, mysql.connector
import xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


# 1. Dataset preparation

In [7]:
df1 = pd.read_excel('\\Users\\Daniel\\Python\\Data\\Flix_products\\SamsungReport.xlsx','Features', keep_default_na=False)
#df1 = pd.read_csv('features.csv', dtype={'title': object, 'html': object})

In [8]:
df1[['title', 'html']] = df1[['title', 'html']].astype(str)

In [9]:
df1.head()

Unnamed: 0,product_id,title,html
0,628714,xbox-supertemplate,xbox-supertemplate1111 kieskrreurig
1,653143,contrast enhancer,Applies one million criteria for contrast yet ...
2,653143,colour optimiser,Preserves the original level of brightness whi...
3,653143,detail enhancer,Selectively sharpens edges so that you see thi...
4,653143,3way burn-in protection,3 Types of Anti-Burn Program\nScreen burn-in i...


In [10]:
df1.apply(lambda x: x.count(), axis=0)

product_id    49215
title         49215
html          49215
dtype: int64

In [11]:
# concatenate title and html for the same product_id into one cell
df2 = df1.groupby('product_id')['title','html'].agg(lambda x: ' '.join(x)).reset_index()
df2['texts'] = df2['title'] + " " + df2['html']
df3 = df2.groupby('product_id')['texts'].agg(lambda x: ' '.join(x)).reset_index()

In [12]:
df3.shape

(6036, 2)

In [13]:
df3.head(2)

Unnamed: 0,product_id,texts
0,628714,xbox-supertemplate xbox-supertemplate1111 kies...
1,653143,contrast enhancer colour optimiser detail enha...


In [26]:
# database access details
config = {
    'user': 'crystal',
    'password': 'dF983zlw8qd',
    'host': '134.213.47.184',
    'database': 'pentaho_stage'
}
# function for database connection
def connect(config):
    cnx = mysql.connector.connect(**config)
    return cnx
    cursor = cnx.cursor()

In [27]:
# get product list from database, it contains product info such as names, categories, manufacturers, etc.
cnx = connect(config)
products = pd.read_sql("""select * from d_product_brand_flix_category_united""", con=cnx)
cnx.close()

In [17]:
#save the raw products in the directory to save running the long query again
#products.to_csv('products.csv', sep='\t', encoding='utf-8', index=False)
products = pd.read_csv('\\Users\\Daniel\\Python\\Data\\products.csv', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [18]:
# merge some product details
df3 = pd.merge(df3, products[['product_id', 'flix_parent_category', 'flix_subCategory1']],
                 on='product_id', how='left')

In [19]:
df3.astype(str).groupby('flix_parent_category')['product_id'].count()

flix_parent_category
Cameras                      280
Computing/Gaming            1461
Health & Beauty                2
Home Appliances              757
Home Entertainment          1110
Office Needs                   4
Other                          7
Phone/Mobiles               1519
Small Gadgets/Appliances      21
nan                          875
Name: product_id, dtype: int64

In [20]:
# drop products with unassigned categories
df3.dropna(subset=['flix_parent_category'], inplace=True)

In [21]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df3['texts'], df3['flix_parent_category'])

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [22]:
encoder.classes_

array(['Cameras', 'Computing/Gaming', 'Health & Beauty',
       'Home Appliances', 'Home Entertainment', 'Other', 'Phone/Mobiles',
       'Small Gadgets/Appliances'], dtype=object)

In [23]:
len(train_x),len(valid_x),

(3870, 1291)

# 2. Feature Engineering

2.1 Count Vectors

In [27]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', stop_words='english', token_pattern=r'\w{1,}')
count_vect.fit(df3['texts'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
        vocabulary=None)

In [30]:
# tokenised word vectors
#print(count_vect.vocabulary_)

In [29]:
# length of the word vectors
len(count_vect.vocabulary_)

14632

In [31]:
# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [32]:
# array version of the encoded vector showing occurrence for each word
xtrain_count[3152,1900:2000].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [33]:
xtrain_count.shape

(3870, 14632)

In [34]:
type(xtrain_count)

scipy.sparse.csr.csr_matrix

2.2 TF-IDF Vectors

In [78]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(df3['texts'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [223]:
xvalid_tfidf.shape, xvalid_count.shape

((1291, 5000), (1291, 13927))

In [204]:
print(tfidf_vect.idf_)

[2.89880033 4.81288093 5.59783566 ... 6.6046404  6.55334711 6.77649066]


In [197]:
xtrain_tfidf[15,100:150].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.04248429, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.03717055, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.01626553, 0.02562912, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]])

In [79]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(df3['texts'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

In [80]:
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(df3['texts'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

2.3 Word Embeddings

In [90]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open('wiki-news-300d-1M.vec', encoding="utf8")):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

In [92]:
# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(df3['texts'])
word_index = token.word_index

In [93]:
# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)

In [94]:
# create token-embedding mapping
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

2.4 NLP based features

In [99]:
df3['char_count'] = df3['texts'].apply(len)
df3['word_count'] = df3['texts'].apply(lambda x: len(x.split()))
df3['word_density'] = df3['char_count'] / (df3['word_count']+1)
df3['punctuation_count'] = df3['texts'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
df3['title_word_count'] = df3['texts'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
df3['upper_case_word_count'] = df3['texts'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [100]:
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

In [101]:
# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

In [102]:
df3['noun_count'] = df3['texts'].apply(lambda x: check_pos_tag(x, 'noun'))
df3['verb_count'] = df3['texts'].apply(lambda x: check_pos_tag(x, 'verb'))
df3['adj_count'] = df3['texts'].apply(lambda x: check_pos_tag(x, 'adj'))
df3['adv_count'] = df3['texts'].apply(lambda x: check_pos_tag(x, 'adv'))
df3['pron_count'] = df3['texts'].apply(lambda x: check_pos_tag(x, 'pron'))

In [236]:
df3.head()

Unnamed: 0,product_id,texts,flix_parent_category,flix_subCategory1,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,noun_count,verb_count,adj_count,adv_count,pron_count
0,628714,xbox-supertemplate xbox-supertemplate1111 kies...,Computing/Gaming,Software,54,3,13.5,2,0,0,1,0,2,0,0
1,653143,contrast enhancer colour optimiser detail enha...,Home Entertainment,TVs,2918,467,6.235043,77,66,18,206,57,39,16,14
2,653144,contrast enhancer colour optimiser pivot suppo...,Computing/Gaming,Monitors,2899,460,6.288503,77,73,19,204,57,38,16,14
3,653529,"Footnote nan nan Model Code: CC9N10B, CC9N10E<...",Cameras,Camera Accessories,172,29,5.733333,26,9,1,38,0,3,0,0
4,653536,Extraordinary photos in seconds with i-Functio...,Cameras,Digital SLR,3254,527,6.162879,101,54,27,185,67,64,30,37


2.5 Topic Models as features

In [103]:
# train a LDA Model
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_ 
vocab = count_vect.get_feature_names()

In [209]:
# view the topic models
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

In [224]:
#topic_summaries

# 3. Model Building

In [213]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

In [216]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print ("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.10611928737412858
NB, WordLevel TF-IDF:  0.11309062742060419
NB, N-Gram Vectors:  0.1386522075910147
NB, CharLevel Vectors:  0.10147172734314484


In [225]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print ("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("LR, CharLevel Vectors: ", accuracy)



LR, Count Vectors:  0.11463981409759876
LR, WordLevel TF-IDF:  0.1123160340821069
LR, N-Gram Vectors:  0.11154144074360961
LR, CharLevel Vectors:  0.10689388071262587


In [226]:
# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("SVM, N-Gram Vectors: ", accuracy)



SVM, N-Gram Vectors:  0.001549186676994578


In [227]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print ("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("RF, WordLevel TF-IDF: ", accuracy)



RF, Count Vectors:  0.1254841208365608




RF, WordLevel TF-IDF:  0.11696359411309062


In [229]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print ("Xgb, Count Vectors: ", accuracy)

# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print ("Xgb, WordLevel TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print ("Xgb, CharLevel Vectors: ", accuracy)

Xgb, Count Vectors:  0.09140201394268009
Xgb, WordLevel TF-IDF:  0.09604957397366383
Xgb, CharLevel Vectors:  0.11541440743609604


In [231]:
def create_model_architecture(input_size):
    # create input layer 
    input_layer = layers.Input((input_size, ), sparse=True)
    
    # create hidden layer
    hidden_layer = layers.Dense(100, activation="relu")(input_layer)
    
    # create output layer
    output_layer = layers.Dense(1, activation="sigmoid")(hidden_layer)

    classifier = models.Model(inputs = input_layer, outputs = output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return classifier 

classifier = create_model_architecture(xtrain_tfidf_ngram.shape[1])
accuracy = train_model(classifier, xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, is_neural_net=True)
print ("NN, Ngram Level TF IDF Vectors",  accuracy)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/1
NN, Ngram Level TF IDF Vectors 0.051897753679318356


In [232]:
def create_cnn():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_cnn()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print ("CNN, Word Embeddings",  accuracy)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/1
CNN, Word Embeddings 0.051897753679318356


In [234]:
def create_rnn_lstm():
# Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.LSTM(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_rnn_lstm()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print ("RNN-LSTM, Word Embeddings",  accuracy)

Epoch 1/1
RNN-LSTM, Word Embeddings 0.051897753679318356


In [235]:
def create_rcnn():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)
    
    # Add the recurrent layer
    rnn_layer = layers.Bidirectional(layers.GRU(50, return_sequences=True))(embedding_layer)
    
    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_rcnn()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print ("CNN, Word Embeddings",  accuracy)

Epoch 1/1
CNN, Word Embeddings 0.051897753679318356
