In [1]:
import pandas as pd
import numpy as np
import re
import difflib
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection, naive_bayes, svm
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
import logging
from keras.models import Sequential
from keras import layers
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# read in file from the color cleaning notebook
color_sub = pd.read_csv('subset_color.csv')

In [3]:
# drop the added column
color_sub = color_sub.drop(columns = 'Unnamed: 0')

In [4]:
color_sub.columns

Index(['product_id', 'name', 'description', 'details', 'brand_info',
       'product_color_id', 'attribute_name', 'attribute_value', 'is_black',
       'is_beige', 'is_burgundy', 'is_white', 'is_gray', 'is_gold', 'is_blue',
       'is_neutral', 'is_pink', 'is_orange', 'is_navy', 'is_brown', 'is_red',
       'is_yellow', 'is_multi', 'is_green', 'is_silver', 'is_teal',
       'is_purple'],
      dtype='object')

In [5]:
# initialize the tf-idf vectorizer
vectorizer = TfidfVectorizer(token_pattern=r'\b[a-zA-Z]{3,}\b',stop_words="english",binary = True,min_df = 0.005,max_df = 0.7,max_features =300)

In [6]:
# vectorize the data
columns = ['description','details','brand_info', 'name']
model_data=pd.DataFrame()
for j in columns:
    corpus = []
    for i in range(0,len(color_sub)):
        corpus.append(color_sub.loc[i,j])
    vect = vectorizer.fit_transform(corpus)
    terms = vectorizer.get_feature_names()
    c=pd.DataFrame(vect.toarray().transpose(), index=terms)
    model_data=pd.concat([model_data,c.T],axis = 1)

In [7]:
# check the dimensionality
model_data.shape

(39942, 1079)

#### Collocation to further reduce the dimensionality

In [8]:
# function to start collocation
stopwords_coll = set(stopwords.words('english') + [".",'.', ",",":", "''", "'s", "'", "``", "(", ")", "-","timestamp","varchar","html","digit"])
filter_stops = lambda w: len(w) < 3 or w in stopwords_coll
def collocation_list(data_col):
    new_list = []
    for i in range(0,len(data_col)):
        words = word_tokenize(data_col[i])
        res_words = []
        for word in words:
            if word not in stopwords_coll:
                res_words.append(word)
        new_list.append(res_words)
    return(new_list)

In [9]:
#creates n-list of n size n = no of columns
cols = ['brand_info', 'description', 'name','details']
n_list = []
for col in cols:
    n_list.append(collocation_list(color_sub[col]))

In [11]:
# obtain top 25 collocations by raw frequency for 'details' column
collocation_finder = BigramCollocationFinder.from_documents(n_list[3])
measures = BigramAssocMeasures()
collocation_finder.apply_word_filter(filter_stops)
collocation_finder.nbest(BigramAssocMeasures.likelihood_ratio, 25)

[('Dry', 'clean'),
 ('normal', 'size'),
 ('suggest', 'taking'),
 ('taking', 'normal'),
 ('sizeWe', 'suggest'),
 ('style', 'fit'),
 ('clean', 'Imported'),
 ('fit', 'true'),
 ('Fits', 'true'),
 ('hip', 'varcharShe'),
 ('height', 'bust'),
 ('true', 'sizeWe'),
 ('measurement', 'height'),
 ('Model', 'wearing'),
 ('stretchy', 'fabric'),
 ('bust', 'waist'),
 ('fastening', 'Composition'),
 ('Mid', 'weight'),
 ('waist', 'hip'),
 ('varcharShe', 'wearing'),
 ('true', 'size'),
 ('Model', 'measurement'),
 ('fabric', 'Model'),
 ('Made', 'Italy'),
 ('Machine', 'wash')]

In [10]:
# we find that dry clean, machine wash, 'fits true to size', 'made in italy' 
# should be collated together
col_list = [['Dry', 'clean'], ['Machine', 'wash'], ['true', 'size'],['fit', 'true_size'], ['Fits', 'true_size'],  
            ['Made', 'Italy']]
data_col = color_sub['details']
new_list = []
clean_list = ['Dry','wash','Italy', 'size']
for i in range(0,len(data_col)):
    words = word_tokenize(data_col[i])
    len_words = len(words)-1
    for i in range(0,len_words):
        bi_word = []
        j = i+1
        bi_word.append(words[i])
        bi_word.append(words[j])
        if(bi_word in col_list):
            sentence = "_".join(bi_word)
            words[i] = sentence
    for word in words:
        if word in clean_list:
            words.remove(word)
    res_word = []
    for word in words:
        res_word.append(word)
    sentence = " ".join(res_word)
    new_list.append(sentence)
color_sub['details'] = new_list

In [13]:
# obtain top 25 collocations by raw frequency for 'name' column
collocation_finder = BigramCollocationFinder.from_documents(n_list[2])
measures = BigramAssocMeasures()
collocation_finder.apply_word_filter(filter_stops)
collocation_finder.nbest(BigramAssocMeasures.likelihood_ratio, 25)

[('Wide', 'Leg'),
 ('supersoft', 'yarn'),
 ('Floral', 'Print'),
 ('ankle', 'boot'),
 ('High', 'Waist'),
 ('wide', 'leg'),
 ('Crew', 'Essential'),
 ('High', 'Rise'),
 ('Leg', 'Pants'),
 ('Les', 'Second'),
 ('Second', 'Medium'),
 ('leg', 'pant'),
 ('Mockneck', 'sweater'),
 ('Double', 'Faced'),
 ('Medium', 'NOIR'),
 ('everyday', 'cashmere'),
 ('straight', 'leg'),
 ('Small', 'Marcie'),
 ('Waist', 'Leggings'),
 ('sweater', 'supersoft'),
 ('Leg', 'Jeans'),
 ('track', 'pant'),
 ('Mini', 'Dress'),
 ('shoulder', 'bag'),
 ('snake', 'effect')]

In [12]:
col_list = [['wide','leg'], ['ankle', 'boot'], ['High', 'Waist'], ['Wide', 'Leg'], ['High', 'Rise'], 
            ['straight', 'leg'], ['shoulder', 'bag']]
data_col = color_sub['name']
new_list = []
clean_list = ['leg', 'boot', 'Waist', 'Leg', 'Rise','bag']
for i in range(0,len(data_col)):
    words = word_tokenize(data_col[i])
    len_words = len(words)-1
    for i in range(0,len_words):
        bi_word = []
        j = i+1
        bi_word.append(words[i])
        bi_word.append(words[j])
        if(bi_word in col_list):
            sentence = "_".join(bi_word)
            words[i] = sentence
    for word in words:
        if word in clean_list:
            words.remove(word)
    res_word = []
    for word in words:
        res_word.append(word)
    sentence = " ".join(res_# we find that wide leg, ankle boot, high waist, high rise, straight leg, and shoulder bag
#should be collocated together
word)
    new_list.append(sentence)
color_sub['name'] = new_list

In [15]:
# obtain top 25 collocations by raw frequency for 'description' column
collocation_finder = BigramCollocationFinder.from_documents(n_list[1])
measures = BigramAssocMeasures()
collocation_finder.apply_word_filter(filter_stops)
collocation_finder.nbest(BigramAssocMeasures.likelihood_ratio, 25)

[('Dry', 'clean'),
 ('Machine', 'wash'),
 ('measure', 'approximately'),
 ('approximately', 'inch'),
 ('Heel', 'measure'),
 ('Designer', 'color'),
 ('Made', 'Italy'),
 ('wide', 'leg'),
 ('best', 'selling'),
 ('zip', 'fastening'),
 ('high', 'waisted'),
 ('pencil', 'skirt'),
 ('hip', 'wearing'),
 ('clean', 'Imported'),
 ('High', 'Rise'),
 ('high', 'rise'),
 ('wearing', 'size'),
 ('Button', 'fastening'),
 ('crazy', 'soft'),
 ('bust', 'waist'),
 ('waist', 'hip'),
 ('speak', 'experience'),
 ('mockneck', 'sweater'),
 ('Hand', 'wash'),
 ('Size', 'Fit')]

In [14]:
# we find that dry clean, machine wash, made in italy, wide leg, best selling, high waisted
# pencil skirt, high rise, and hand wash should be collocated together

col_list = [['Dry','clean'], ['Machine', 'wash'], ['Made', 'Italy'], ['wide', 'leg'], ['best', 'selling'], 
            ['high', 'waisted'], ['pencil', 'skirt'], ['High', 'Rise'], ['high','rise'], ['Hand','wash']]
data_col = color_sub['description']
new_list = []
clean_list = ['clean', 'wash', 'Italy', 'leg', 'selling','waisted', 'skirt','Rise', 'rise']
for i in range(0,len(data_col)):
    words = word_tokenize(data_col[i])
    len_words = len(words)-1
    for i in range(0,len_words):
        bi_word = []
        j = i+1
        bi_word.append(words[i])
        bi_word.append(words[j])
        if(bi_word in col_list):
            sentence = "_".join(bi_word)
            words[i] = sentence
    for word in words:
        if word in clean_list:
            words.remove(word)
    res_word = []
    for word in words:
        res_word.append(word)
    sentence = " ".join(res_word)
    new_list.append(sentence)
color_sub['description'] = new_list

In [17]:
# obtain top 25 collocations by raw frequency for 'brand_info' column
collocation_finder = BigramCollocationFinder.from_documents(n_list[0])
measures = BigramAssocMeasures()
collocation_finder.apply_word_filter(filter_stops)
collocation_finder.nbest(BigramAssocMeasures.likelihood_ratio, 25)

[('net', 'porter'),
 ('woman', 'category'),
 ('pink', 'modaoperandi'),
 ('bananarepublic', 'gap'),
 ('gap', 'browse'),
 ('frame', 'store'),
 ('rag', 'bone'),
 ('CLOTHING', 'TOPS'),
 ('CLOTHING', 'TOPShtml'),
 ('goop', 'shop'),
 ('shop', 'goop'),
 ('Aleksandre', 'Akhalkatsishviliwomen'),
 ('browse', 'product'),
 ('product', 'pid'),
 ('Handbags', 'Handbags'),
 ('Crewshirts', 'topshtml'),
 ('HolidayGiftGuide19', 'GiftsForHersakstimestamp'),
 ('ralph', 'lauren'),
 ('TOPS', 'pink'),
 ('store', 'product'),
 ('CLOTHING', 'DRESSEShtml'),
 ('CLOTHING', 'DRESSES'),
 ('wide', 'leg'),
 ('little', 'girl'),
 ('materiel', 'ss20')]

In [16]:
# we find that wide leg should be collocated together

col_list = [['wide', 'leg']]
data_col = color_sub['brand_info']
new_list = []
clean_list = ['leg']
for i in range(0,len(data_col)):
    words = word_tokenize(data_col[i])
    len_words = len(words)-1
    for i in range(0,len_words):
        bi_word = []
        j = i+1
        bi_word.append(words[i])
        bi_word.append(words[j])
        if(bi_word in col_list):
            sentence = "_".join(bi_word)
            words[i] = sentence
    for word in words:
        if word in clean_list:
            words.remove(word)
    res_word = []
    for word in words:
        res_word.append(word)
    sentence = " ".join(res_word)
    new_list.append(sentence)
color_sub['brand_info'] = new_list

# Neural Network

Implement neural network on each of our dummy variables for our cleaned data

In [31]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
color_class = ['is_black', 'is_beige', 'is_burgundy', 'is_white', 'is_gray', 'is_gold', 'is_blue', 'is_neutral', 
               'is_pink', 'is_orange', 'is_navy', 'is_brown', 'is_red', 'is_yellow', 'is_multi', 'is_green', 
               'is_silver', 'is_teal', 'is_purple']

In [164]:
for i in color_class:
    print(i)
    X=model_data
    y=color_sub[i].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    print('\n0-1 distribution: \n') 
    print(color_sub[i].sum()/len(color_sub))
    size=[[64,8,2],[128,32,4]]
    epoch=[5,8,10,20]
    print("\nAccuracy Scores: \n")
    for j in size:
        for k in epoch:
            nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=j,max_iter=k).fit(X_train, y_train)
            y_pred_RF = nn.predict(X_test)
            print(accuracy_score(y_pred_RF,y_test))

is_black

0-1 distribution: 

0.4881578288518352

Accuracy Scores: 

0.5130804856677932
0.6942045312304419
0.7347602954061835
0.5130804856677932
0.6244836650394292
0.7258730754787833
0.8397796970834898
0.9405432469645763
is_beige

0-1 distribution: 

0.14796454859546343

Accuracy Scores: 

0.8590562022781324
0.8590562022781324
0.9087495306045813
0.8590562022781324
0.8590562022781324
0.8590562022781324
0.8949805983226937
0.973213168106146
is_burgundy

0-1 distribution: 

0.04726853938210405

Accuracy Scores: 

0.9514332206784328
0.9514332206784328
0.9514332206784328
0.9514332206784328
0.9514332206784328
0.9514332206784328
0.9514332206784328
0.9989986231067718
is_white

0-1 distribution: 

0.38207901457112814

Accuracy Scores: 

0.626110902490925
0.626110902490925
0.626110902490925
0.9091250469395419
0.6271122793841533
0.6326198522969082
0.8056077106020778
0.9059957441482038
is_gray

0-1 distribution: 

0.13789995493465526

Accuracy Scores: 

0.8645637751908875
0.8645637751908875
0.86456

Likely have overfitting for the cases where we have highly unbalaced classes

All are above 0.9

###### Black
Best is size = [128, 32, 4] epoch = 20
Dist: 48.8% 1
Best: 0.9405
###### Beige
Best is size = [128, 32, 4] epoch = 20
Dist: 14.7% 1
Best: 0.9732
###### Burgundy
Best is size = [128, 32, 4] and epoch = 20
Dist: 4.7%
Best: 0.9989
###### White
Best is size = [64, 8, 2] and epoch = 20
Dist: 38.2% 1
Best: 0.9091
###### Gray
Best is size = [64, 8, 2] and epoch = 20
Dist: 13.8% 1
Best: 0.9881
###### Gold
Best is size = [128, 32, 4] and epoch = 20
Dist: 7.5% 1
Best: 0.9879
###### Blue
Best is size = [64, 8, 2] epoch = 20
Dist: 18.7% 1
Best: 0.9775
###### Neutral
Best is size = [64, 8, 2] and epoch = 20
Dist: 4.9% 1
Best: 0.9759
###### Pink
Best is size = [128, 32, 4] and epoch = 20
Dist: 19.7% 1
Best: 0.9715
###### Orange
Best is size = [64, 8, 2] and epoch = 20
Dist: 6.2% 1
Best: 0.9649
###### Navy
Best is size = [128, 32, 4] and epoch = 20
Dist: 15.2% 1
Best: 0.9694
###### Brown
Best is size = [64, 8, 2] and epoch = 20
Dist: 15.4% 1
Best: 0.9628
###### Red
Best is size = [128, 32, 4] and epoch = 20
Dist: 11.3% 1
Best: 0.9778
###### Yellow
Best is size = [128, 32, 4] and epoch = 20
Dist: 6.9% 1
Best: 0.9976
###### Multi
Best is size = [128, 32, 4] and epoch = 20
Dist: 6.4% 1
Best: 0.9983
###### Green
Best is size = [64, 8, 2] and epoch = 20
Dist: 14.8% 1
Best: 0.9525
###### Silver
Best is size = [128,32,4] and epoch = 20
Dist: 1.3% 1
Best: 1
###### Teal
Best is size = [128,32,4] epoch = 20
Dist: 3.4% 1
Best: 1
###### Purple
Best is size = [128,32,4] and epoch = 20
Dist: 4.2% 1
Best: 0.9978

# Naive Bayes

implement naive bayes on each of our dummy variables for our cleaned data

Top value for each dummy variable is % of product ids with a '1' in that dummy variable

In [166]:
for i in color_class:
    print(i)
    X=model_data
    y=color_sub[i].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    print(color_sub[i].sum()/len(color_sub))
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    y_pred_NB = classifier.predict(X_test)
    print(accuracy_score(y_pred_NB,y_test))
    print('\n')

is_black
0.4881578288518352
0.7388909750907497


is_beige
0.14796454859546343
0.8362748779571911


is_burgundy
0.04726853938210405
0.9983727625485042


is_white
0.38207901457112814
0.7316309926148454


is_gray
0.13789995493465526
0.9270246589059957


is_gold
0.07490861749536828
0.9733383402177995


is_blue
0.18664563617245006
0.8634372261860057


is_neutral
0.049747133343347856
0.9821003880335462


is_pink
0.19653497571478645
0.8452872699962448


is_orange
0.061689449702068
0.9926148454124422


is_navy
0.15161984878073206
0.931405682813869


is_brown
0.15354764408392169
0.8539241457003379


is_red
0.11311401532221721
0.968206283640005


is_yellow
0.06894997746732763
0.9922393290774816


is_multi
0.06356717239997997
0.9856052071598448


is_green
0.14798958489810224
0.9339091250469396


is_silver
0.013169095188022633
0.9994993115533859


is_teal
0.03389915377297081
1.0


is_purple
0.0419858795253117
0.9993741394417324




# SVM

Implement SVM on each of our dummy variables for our cleaned data

Top value for each dummy variable is % of product ids with a '1' in that dummy variable

In [169]:
for i in color_class:
    print(i)
    X=model_data
    y=color_sub[i].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    print(color_sub[i].sum()/len(color_sub))
    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train,y_train)
    # predict the labels on validation dataset
    predictions_SVM = SVM.predict(X_test)
    # Use accuracy_score function to get the accuracy
    print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100, "\n")

is_black
0.4881578288518352
SVM Accuracy Score ->  99.78720741018901 

is_beige
0.14796454859546343
SVM Accuracy Score ->  100.0 

is_burgundy
0.04726853938210405
SVM Accuracy Score ->  99.94993115533859 

is_white
0.38207901457112814
SVM Accuracy Score ->  99.94993115533859 

is_gray
0.13789995493465526
SVM Accuracy Score ->  100.0 

is_gold
0.07490861749536828
SVM Accuracy Score ->  100.0 

is_blue
0.18664563617245006
SVM Accuracy Score ->  100.0 

is_neutral
0.049747133343347856
SVM Accuracy Score ->  99.94993115533859 

is_pink
0.19653497571478645
SVM Accuracy Score ->  99.94993115533859 

is_orange
0.061689449702068
SVM Accuracy Score ->  99.9749655776693 

is_navy
0.15161984878073206
SVM Accuracy Score ->  100.0 

is_brown
0.15354764408392169
SVM Accuracy Score ->  99.91237952184252 

is_red
0.11311401532221721
SVM Accuracy Score ->  100.0 

is_yellow
0.06894997746732763
SVM Accuracy Score ->  100.0 

is_multi
0.06356717239997997
SVM Accuracy Score ->  100.0 

is_green
0.14798958

# Logistic

Top value for each dummy variable is % of product ids with a '1' in that dummy variable

In [170]:
from sklearn.linear_model import LogisticRegression

In [171]:
for i in color_class:
    print(i)
    X=model_data
    y=color_sub[i].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    print(color_sub[i].sum()/len(color_sub))
    logreg=LogisticRegression(n_jobs=1, C=1e5)      
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    print('accuracy %s' % accuracy_score(y_pred, y_test))

is_black
0.4881578288518352
accuracy 1.0
is_beige
0.14796454859546343
accuracy 1.0
is_burgundy
0.04726853938210405
accuracy 1.0
is_white
0.38207901457112814
accuracy 1.0
is_gray
0.13789995493465526
accuracy 1.0
is_gold
0.07490861749536828
accuracy 1.0
is_blue
0.18664563617245006
accuracy 1.0
is_neutral
0.049747133343347856
accuracy 1.0
is_pink
0.19653497571478645
accuracy 1.0
is_orange
0.061689449702068
accuracy 1.0
is_navy
0.15161984878073206
accuracy 1.0
is_brown
0.15354764408392169
accuracy 1.0
is_red
0.11311401532221721
accuracy 1.0
is_yellow
0.06894997746732763
accuracy 1.0
is_multi
0.06356717239997997
accuracy 1.0
is_green
0.14798958489810224
accuracy 1.0
is_silver
0.013169095188022633
accuracy 1.0
is_teal
0.03389915377297081
accuracy 1.0
is_purple
0.0419858795253117
accuracy 1.0


# Word2Vec

In [25]:
import gensim
from gensim.models import Word2Vec
wv = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
wv.init_sims(replace=True)

In [26]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, post) for post in text_list ])

In [27]:
X=color_sub[['description','details','name','brand_info']]

In [28]:
def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens
names=['description','name','details','brand_info']
c=pd.DataFrame()
for i in names:
    X_new = X.apply(lambda r: w2v_tokenize_text(r[i]),axis=1).values
    X_word_average = word_averaging_list(wv,X_new)
    X_array=pd.DataFrame(X_word_average)
    c=pd.concat([c,X_array],axis=1)



























































































































































































































In [29]:
c.shape

(39942, 1200)

I ran the below word2vec model for over an hour before Jupyter notebooks crashed.

Due to time constraints, I did not re-run the code, but I will likely re-run it before we submit part II.
From what we can see from the code that did run, word2vec did not have as good of accuracy as earlier models

In [None]:
# Ran this for an hour then it crashed
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

for i in color_class:
    y=color_sub[i].values
    X_train, X_test, y_train, y_test = train_test_split(c.values, y, test_size = 0.2, random_state = 0)
    size=[(128,64),(100,50),(50,20)]
    for j in size:
        nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=j).fit(X_train, y_train)
        y_pred_RF = nn.predict(X_test)
        #print("Actual % of the data for tag",p,"is",sum(subset_for_model['attribute_value'])/len(subset_for_model)*100)
        print(accuracy_score(y_test,y_pred_RF),j,i)
        print("% for ", i, "is",max((1-(color_sub[i].sum()/len(color_sub))),(color_sub[i].sum()/len(color_sub))),'\n')

1.0 (128, 64) is_black
% for  is_black is 0.5118421711481649 

0.9836024533733884 (100, 50) is_black
% for  is_black is 0.5118421711481649 

1.0 (50, 20) is_black
% for  is_black is 0.5118421711481649 

1.0 (128, 64) is_beige
% for  is_beige is 0.8520354514045365 

1.0 (100, 50) is_beige
% for  is_beige is 0.8520354514045365 

1.0 (50, 20) is_beige
% for  is_beige is 0.8520354514045365 

1.0 (128, 64) is_burgundy
% for  is_burgundy is 0.9527314606178959 

1.0 (100, 50) is_burgundy
% for  is_burgundy is 0.9527314606178959 

1.0 (50, 20) is_burgundy
% for  is_burgundy is 0.9527314606178959 

1.0 (128, 64) is_white
% for  is_white is 0.6179209854288719 

0.8905995744148204 (100, 50) is_white
% for  is_white is 0.6179209854288719 

1.0 (50, 20) is_white
% for  is_white is 0.6179209854288719 

1.0 (128, 64) is_gray
% for  is_gray is 0.8621000450653448 

1.0 (100, 50) is_gray
% for  is_gray is 0.8621000450653448 

1.0 (50, 20) is_gray
% for  is_gray is 0.8621000450653448 

1.0 (128, 64) is_g

# Deep Learning keras with W2V

This model had perfect or almost perfect accuracy for every color class

In [33]:
for i in color_class:
    y=color_sub[i].values
    X_train, X_test, y_train, y_test = train_test_split(c.values, y, test_size = 0.2, random_state = 0)
    input_dim = X_train.shape[1]  # Number of features
    model = Sequential()
    model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=20,verbose=False,validation_data=(X_test, y_test),batch_size=10)
    loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
    print("Training Accuracy: {:.4f}".format(accuracy),"for",i)
    loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
    print("Testing Accuracy:  {:.4f}".format(accuracy),"for",i)

Training Accuracy: 1.0000 for is_black
Testing Accuracy:  1.0000 for is_black
Training Accuracy: 1.0000 for is_beige
Testing Accuracy:  1.0000 for is_beige
Training Accuracy: 1.0000 for is_burgundy
Testing Accuracy:  1.0000 for is_burgundy
Training Accuracy: 1.0000 for is_white
Testing Accuracy:  1.0000 for is_white
Training Accuracy: 1.0000 for is_gray
Testing Accuracy:  1.0000 for is_gray
Training Accuracy: 1.0000 for is_gold
Testing Accuracy:  1.0000 for is_gold
Training Accuracy: 1.0000 for is_blue
Testing Accuracy:  1.0000 for is_blue
Training Accuracy: 1.0000 for is_neutral
Testing Accuracy:  1.0000 for is_neutral
Training Accuracy: 1.0000 for is_pink
Testing Accuracy:  1.0000 for is_pink
Training Accuracy: 1.0000 for is_orange
Testing Accuracy:  1.0000 for is_orange
Training Accuracy: 1.0000 for is_navy
Testing Accuracy:  1.0000 for is_navy
Training Accuracy: 1.0000 for is_brown
Testing Accuracy:  1.0000 for is_brown
Training Accuracy: 1.0000 for is_red
Testing Accuracy:  1.0000

# Keras with our own embeddings

This model did not have as good of accuracy as other models

In [34]:
color_sub['all_data']=color_sub['details']+color_sub['name']+color_sub['description']+color_sub['brand_info']

In [35]:
#Keras Tokenizer
from keras.preprocessing.text import Tokenizer
columns=['brand_info','details','description','name']
for i in ['brand_info']:
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(color_sub[i])
    X = tokenizer.texts_to_sequences(color_sub[i])
    # Adding 1 because of reserved 0 index
    CD=tokenizer.word_index
    maxlen = 100
    X = pad_sequences(X, padding='post', maxlen=maxlen)
for i in ['details','description','name']:
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(color_sub[i])
    p = tokenizer.texts_to_sequences(color_sub[i])# Adding 1 because of reserved 0 index
    CD={**CD,**tokenizer.word_index}
    vocab_size=len(CD)
    p = pad_sequences(p, padding='post', maxlen=maxlen)
    X=np.concatenate((X,p),axis=1)   

In [36]:
vocab_size

5675

In [37]:
embedding_dim = 50
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_30"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 50)           283750    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 50)                0         
_________________________________________________________________
dense_57 (Dense)             (None, 10)                510       
_________________________________________________________________
dense_58 (Dense)             (None, 1)                 11        
Total params: 284,271
Trainable params: 284,271
Non-trainable params: 0
_________________________________________________________________


In [38]:
for i in color_class:
    y=color_sub[i].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    input_dim = X_train.shape[1]  # Number of features
    model = Sequential()
    model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=10,verbose=False,validation_data=(X_test, y_test),batch_size=5)
    loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
    print("Training Accuracy: {:.4f}".format(accuracy),"for",i)
    loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
    print("Testing Accuracy:  {:.4f}".format(accuracy),"for",i)

Training Accuracy: 0.6374 for is_black
Testing Accuracy:  0.6403 for is_black
Training Accuracy: 0.8796 for is_beige
Testing Accuracy:  0.8768 for is_beige
Training Accuracy: 0.9992 for is_burgundy
Testing Accuracy:  0.9994 for is_burgundy
Training Accuracy: 0.6966 for is_white
Testing Accuracy:  0.7028 for is_white
Training Accuracy: 0.9721 for is_gray
Testing Accuracy:  0.9698 for is_gray
Training Accuracy: 0.9892 for is_gold
Testing Accuracy:  0.9900 for is_gold
Training Accuracy: 0.9845 for is_blue
Testing Accuracy:  0.9845 for is_blue
Training Accuracy: 0.9976 for is_neutral
Testing Accuracy:  0.9970 for is_neutral
Training Accuracy: 0.9843 for is_pink
Testing Accuracy:  0.9847 for is_pink
Training Accuracy: 0.9986 for is_orange
Testing Accuracy:  0.9989 for is_orange
Training Accuracy: 0.9950 for is_navy
Testing Accuracy:  0.9949 for is_navy
Training Accuracy: 0.9164 for is_brown
Testing Accuracy:  0.9151 for is_brown
Training Accuracy: 0.9943 for is_red
Testing Accuracy:  0.9951

# Keras with pre-trained GloVe embeddings

This model had some errors with model setup due to the size of the array

In [39]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:

            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [40]:
embedding_dim = 50
embedding_matrix = create_embedding_matrix('glove.6B.100d.txt',tokenizer.word_index, embedding_dim)

In [41]:
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
nonzero_elements / vocab_size

0.1737444933920705

In [43]:
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, 
                           weights=[embedding_matrix], 
                           input_length=maxlen, 
                           trainable=False))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

ValueError: Layer weight shape (5675, 50) not compatible with provided weight shape (1079, 50)

In [44]:
for i in color_class:
    y=color_sub[i].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    input_dim = X_train.shape[1]  # Number of features
    model = Sequential()
    model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=20,verbose=False,validation_data=(X_test, y_test),batch_size=10)
    loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
    print("Training Accuracy: {:.4f}".format(accuracy),"for",i)
    loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
    print("Testing Accuracy:  {:.4f}".format(accuracy),"for",i)

Training Accuracy: 0.7771 for is_black
Testing Accuracy:  0.7766 for is_black
Training Accuracy: 0.8782 for is_beige
Testing Accuracy:  0.8787 for is_beige
Training Accuracy: 1.0000 for is_burgundy
Testing Accuracy:  1.0000 for is_burgundy
Training Accuracy: 0.6890 for is_white
Testing Accuracy:  0.6977 for is_white
Training Accuracy: 0.9792 for is_gray
Testing Accuracy:  0.9770 for is_gray
Training Accuracy: 0.9934 for is_gold
Testing Accuracy:  0.9935 for is_gold
Training Accuracy: 0.9602 for is_blue
Testing Accuracy:  0.9569 for is_blue
Training Accuracy: 0.9832 for is_neutral
Testing Accuracy:  0.9816 for is_neutral
Training Accuracy: 0.9684 for is_pink
Testing Accuracy:  0.9681 for is_pink
Training Accuracy: 0.9948 for is_orange
Testing Accuracy:  0.9952 for is_orange
Training Accuracy: 0.9854 for is_navy
Testing Accuracy:  0.9817 for is_navy
Training Accuracy: 0.8477 for is_brown
Testing Accuracy:  0.8530 for is_brown
Training Accuracy: 0.9995 for is_red
Testing Accuracy:  0.9992