In [1]:
#Load all libraries
import pandas as pd
import numpy as np
import en_core_web_sm
import spacy
from scipy.spatial.distance import cosine
nlp = en_core_web_sm.load()
from sklearn.model_selection import train_test_split
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
nlp_md = spacy.load("en_core_web_md")
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from nltk import word_tokenize
import gensim
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
import logging

In [2]:
#input the data
data_label = pd.read_csv('data_label.csv',index_col = 0)
print(data_label.shape)
print(data_label.head(2))

(3916, 17)
                   product_id           brand  \
0  01DTJCERF6F4NRZ2WSJFFA1EYS          theory   
1  01DVPBJ6464YKYGVAE0A1HMKGN  alexander_wang   

                                         description           brand_category  \
0  beige stretchsilk slip digit silk digit spande...  clothing top tank camis   
1  black velvet concealed hook zip fastening digi...      clothing dress mini   

                        name  \
0  teah stretchsilk camisole   
1        layered velvet mini   

                                             details  is_casual  is_modern  \
0  fit_true_size normal cut slightly loose fit li...        1.0        1.0   
1  fit_true_size normal designed fitted bust wais...        0.0        1.0   

   is_androgynous  is_romantic  is_boho  is_business casual  is_edgy  is_glam  \
0             0.0          1.0      0.0                 1.0      0.0      1.0   
1             0.0          0.0      0.0                 0.0      0.0      1.0   

   is_classic  is_ath

In [3]:
#This nulls are incoprated due to cleaning process in previous code
data_label.isnull().sum()

product_id              0
brand                   1
description             0
brand_category        190
name                    0
details               224
is_casual               0
is_modern               0
is_androgynous          0
is_romantic             0
is_boho                 0
is_business casual      0
is_edgy                 0
is_glam                 0
is_classic              0
is_athleisure           0
is_retro                0
dtype: int64

In [4]:
data_label.fillna('unknowntoken',inplace=True)
data_label.isnull().sum()

product_id            0
brand                 0
description           0
brand_category        0
name                  0
details               0
is_casual             0
is_modern             0
is_androgynous        0
is_romantic           0
is_boho               0
is_business casual    0
is_edgy               0
is_glam               0
is_classic            0
is_athleisure         0
is_retro              0
dtype: int64

### Pre-trained word2vec using en_core_web_sm

In [5]:
columns = ['brand', 'description', 'brand_category', 'name','details']
data_label_sub = data_label.loc[:,columns]
w2v_model_sm=pd.DataFrame()
for col in columns:
    new_list = []
    for i in range(0,len(data_label_sub)):
        new_list.append(nlp(data_label_sub.loc[i,col]).vector)
    X_array=pd.DataFrame(new_list)
    w2v_model_sm=pd.concat([w2v_model_sm,X_array],axis=1)

In [6]:
w2v_model_sm.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,-0.495816,1.007125,0.592011,-0.989226,1.161886,1.720696,6.638313,-0.599922,0.952174,2.253995,...,-0.089093,-1.037553,0.481995,0.864222,1.041488,-0.710707,-0.322838,1.966523,1.461966,0.629186
1,0.723979,-4.703322,0.231796,0.710519,3.448232,1.050524,1.041941,1.054857,1.878826,0.328565,...,0.40201,-1.576209,0.184507,1.262392,1.111494,0.486093,-0.532086,1.55006,1.258919,0.871626


### Pre-trained word2vec using en_core_web_md

In [7]:
columns = ['brand', 'description', 'brand_category', 'name','details']
data_label_sub = data_label.loc[:,columns]
w2v_model_md=pd.DataFrame()
for col in columns:
    new_list = []
    for i in range(0,len(data_label_sub)):
        new_list.append(nlp_md(data_label_sub.loc[i,col]).vector)
    X_array=pd.DataFrame(new_list)
    w2v_model_md=pd.concat([w2v_model_md,X_array],axis=1)

In [8]:
w2v_model_md.shape

(3916, 1500)

In [9]:
w2v_model_md.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.26227,0.14685,-0.31801,-0.15813,-0.66168,0.057471,-0.21232,0.29958,0.3585,2.0177,...,-0.076138,0.053004,0.157833,0.082269,-0.164426,-0.032932,-0.083052,-0.044259,0.148788,-0.027553
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.017492,0.02115,0.054696,0.09732,-0.116556,-0.080968,-0.054529,0.084784,0.139407,-0.041822


### Using TF-idf to weight differnt pre-trained words from these list of word2vec

In [10]:
vectorizer = TfidfVectorizer()

In [11]:
columns = ['brand', 'description', 'brand_category', 'name','details']
tf_idf_data=pd.DataFrame()
for j in columns:
    corpus = []
    for i in range(0,len(data_label)):
        corpus.append(data_label.loc[i,j])
    vect = vectorizer.fit_transform(corpus)
    terms = vectorizer.get_feature_names()
    c=pd.DataFrame(vect.toarray().transpose(), index=terms)
    tf_idf_data=pd.concat([tf_idf_data,c.T],axis = 1)
print(f'The Dimensionality of the data is:{tf_idf_data.shape}')

The Dimensionality of the data is:(3916, 11477)


In [12]:
DOCUMENT_SUM_COLUMN = "DOCUMENT_TF_IDF_SUM"

# sum the tf idf scores for each document
tf_idf_data[DOCUMENT_SUM_COLUMN] = tf_idf_data.sum(axis=1)
available_tf_idf_scores = tf_idf_data.columns # a list of all the columns we have
available_tf_idf_scores = list(map( lambda x: x.lower(), available_tf_idf_scores)) # lowercase everything

In [13]:
nlp_md(data_label_sub.loc[1,'brand']).text.lower() in available_tf_idf_scores

True

In [14]:
columns = ['brand','description','brand_category', 'name','details']
data_label_sub = data_label.loc[:,columns]
w2v_md_tfidf_model=pd.DataFrame()
for col in columns:
    new_list = []
    for i in range(0,len(data_label_sub)):
        tokens = nlp_md(data_label_sub.loc[i,col])
        # initially start a running total of tf-idf scores for a document
        total_tf_idf_score_per_document = 0
        # start a running total of initially all zeroes (300 is picked since that is the word embedding size used by word2vec)
        running_total_word_embedding = np.zeros(300) 
        for token in tokens: # iterate through each token
        # if the token has a pretrained word embedding it also has a tf-idf score
            if token.has_vector and token.text.lower() in available_tf_idf_scores:
                tf_idf_score = tf_idf_data.loc[i, token.text.lower()]
                tf_idf_score = tf_idf_score.mean() #there could be multiple tags
                #print(f"{token} has tf-idf score of {tf_idf_score}")
                #print(f"{token.vector} is the vector score")
                running_total_word_embedding += tf_idf_score * token.vector
                total_tf_idf_score_per_document += tf_idf_score
        # divide the total embedding by the total tf-idf score for each document
        if (running_total_word_embedding.sum() == 0):
            document_embedding = running_total_word_embedding
        else:
            document_embedding = running_total_word_embedding / total_tf_idf_score_per_document
        new_list.append(document_embedding)
    X_array=pd.DataFrame(new_list)
    w2v_md_tfidf_model=pd.concat([w2v_md_tfidf_model,X_array],axis=1)

In [15]:
w2v_md_tfidf_model.shape

(3916, 1500)

In [16]:
len(nlp(data_label.loc[0,'brand']).vector)

96

In [17]:
columns = ['brand','description','brand_category', 'name','details']
data_label_sub = data_label.loc[:,columns]
w2v_sm_tfidf_model=pd.DataFrame()
for col in columns:
    new_list = []
    for i in range(0,len(data_label_sub)):
        tokens = nlp(data_label_sub.loc[i,col])
        # initially start a running total of tf-idf scores for a document
        total_tf_idf_score_per_document = 0
        # start a running total of initially all zeroes (96 is picked since that is the word embedding size used by word2vec em)
        running_total_word_embedding = np.zeros(96) 
        for token in tokens: # iterate through each token
        # if the token has a pretrained word embedding it also has a tf-idf score
            if token.has_vector and token.text.lower() in available_tf_idf_scores:
                tf_idf_score = tf_idf_data.loc[i, token.text.lower()]
                tf_idf_score = tf_idf_score.mean() #there could be multiple tags
                #print(f"{token} has tf-idf score of {tf_idf_score}")
                #print(f"{token.vector} is the vector score")
                running_total_word_embedding += tf_idf_score * token.vector
                total_tf_idf_score_per_document += tf_idf_score
        # divide the total embedding by the total tf-idf score for each document
        if (running_total_word_embedding.sum() == 0):
            document_embedding = running_total_word_embedding
        else:
            document_embedding = running_total_word_embedding / total_tf_idf_score_per_document
        new_list.append(document_embedding)
    X_array=pd.DataFrame(new_list)
    w2v_sm_tfidf_model=pd.concat([w2v_sm_tfidf_model,X_array],axis=1)

In [18]:
w2v_sm_tfidf_model.shape

(3916, 480)

### As, we can see there is an improvement over training averaging word embeddings using tf-idf but there is still a lot to be desired

### We next implement word2vec using Gensim

In [19]:
import gensim
from gensim.models import Word2Vec
wv = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)
wv.init_sims(replace=True)

In [20]:
from itertools import islice
#list(islice(wv.vocab, 13030, 13050))

In [21]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        #logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, post) for post in text_list ])

In [22]:
def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

In [23]:
columns = ['brand','description','brand_category', 'name','details']
word2vec_genism_model =pd.DataFrame()
for i in columns:
    X_new = data_label.apply(lambda r: w2v_tokenize_text(r[i]),axis=1).values
    X_word_average = word_averaging_list(wv,X_new)
    X_array=pd.DataFrame(X_word_average)
    word2vec_genism_model=pd.concat([word2vec_genism_model,X_array],axis=1)
word2vec_genism_model.shape

(3916, 1500)

### Doc2_vec implementation

In [24]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


In [25]:
columns = ['brand','description','brand_category', 'name','details']
data_label_sub = data_label.loc[:,columns]
data_label_sub['X'] = data_label_sub['brand']+' '+data_label_sub['description']+' '+data_label_sub['brand_category']+' '+data_label_sub['name']+' '+data_label_sub['details']

In [26]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(data_label_sub['X'])]

In [27]:
model = Doc2Vec(documents, vector_size=250, window=6, min_count=2, workers=4)

In [28]:
columns = ['brand','description','brand_category', 'name','details']
data_label_sub = data_label.loc[:,columns]
doc2vec_gen_model = pd.DataFrame()
for col in columns:
    new_list = []
    for i in range(0,len(data_label_sub)):
        new_list.append(model.infer_vector([data_label_sub.loc[i,col]]))
    X_array=pd.DataFrame(new_list)
    doc2vec_gen_model=pd.concat([doc2vec_gen_model,X_array],axis=1)

In [29]:
doc2vec_gen_model.shape

(3916, 1250)

### Interpret all results

In [30]:
x_list = [w2v_model_sm,w2v_sm_tfidf_model,w2v_model_md,w2v_md_tfidf_model,word2vec_genism_model,doc2vec_gen_model]
name = ['word2_vec_sm','w2v_sm_tfidf_model','word2_vec_md','w2v_md_tfidf_model','word2vec_genism_model','doc2vec_gen_model']
cols = data_label.columns[7:9]
for col in cols:
    for i in range(0,len(x_list)):
        X=x_list[i]    
        y=data_label[col].values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0,stratify = y)
        rand = data_label[col].sum()/len(data_label)
        print(f'random for {col} is :{max(rand,1-rand)*100}')
        SVM = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto')
        SVM.fit(X_train,y_train)
        # predict the labels on validation dataset
        predictions_SVM = SVM.predict(X_test)
        # Use accuracy_score function to get the accuracy
        print(f'SVM Accuracy Score for model {name[i]} for columns {col} is:{accuracy_score(predictions_SVM, y_test)*100}')

random for is_modern is :53.166496424923395
SVM Accuracy Score for model word2_vec_sm for columns is_modern is:73.08673469387756
random for is_modern is :53.166496424923395
SVM Accuracy Score for model w2v_sm_tfidf_model for columns is_modern is:73.34183673469387
random for is_modern is :53.166496424923395
SVM Accuracy Score for model word2_vec_md for columns is_modern is:71.5561224489796
random for is_modern is :53.166496424923395
SVM Accuracy Score for model w2v_md_tfidf_model for columns is_modern is:71.93877551020408
random for is_modern is :53.166496424923395
SVM Accuracy Score for model word2vec_genism_model for columns is_modern is:53.18877551020408
random for is_modern is :53.166496424923395
SVM Accuracy Score for model doc2vec_gen_model for columns is_modern is:53.18877551020408
random for is_androgynous is :82.40551583248212
SVM Accuracy Score for model word2_vec_sm for columns is_androgynous is:85.20408163265306
random for is_androgynous is :82.40551583248212
SVM Accuracy Sc

### Using Keras to build a classification model - LSTM, RNN

In [31]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from typing import List

Using TensorFlow backend.


In [32]:
columns = ['brand','description','brand_category', 'name','details']
data_label_sub = data_label.loc[:,columns]
docs = data_label_sub['brand']+' '+data_label_sub['description']+' '+data_label_sub['brand_category']+' '+data_label_sub['name']+' '+data_label_sub['details']

In [33]:
import spacy
stopwords_removed_docs = list(
    map(lambda doc: " ".join([token.text for token in nlp(doc) if not token.is_stop]), docs))

In [34]:
#stopwords_removed_docs

In [35]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=5000, oov_token="unknowntoken")
tokenizer.fit_on_texts(stopwords_removed_docs)

In [36]:
def get_max_token_length_per_doc(docs: List[List[str]])-> int:
    return max(list(map(lambda x: len(x.split()), docs)))

In [37]:
MAX_SEQUENCE_LENGTH= get_max_token_length_per_doc(docs)

In [38]:
def integer_encode_documents(docs, tokenizer):
    return tokenizer.texts_to_sequences(docs)

# integer encode the documents
encoded_docs = integer_encode_documents(stopwords_removed_docs, tokenizer)
# this is a list of lists, the numbers represent the index position of that word.
# for instance, 33 means the 33rd word in the vocabulary
# Notice the last document has 4 numbers, since it is a 4 word document: Could have done better.
padded_docs = pad_sequences(encoded_docs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')


In [39]:
#Keras toolkit
from random import randint
from numpy import array, argmax, asarray, zeros
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding

In [40]:
VOCAB_SIZE = int(len(tokenizer.word_index) * 1.1)

## Load in GloVe Vectors

In [41]:
def load_glove_vectors():
    embeddings_index = {}
    with open('glove.6B.100d.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index


embeddings_index = load_glove_vectors()

Loaded 400000 word vectors.


In [42]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((VOCAB_SIZE, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: # check that it is an actual word that we have embeddings for
        embedding_matrix[i] = embedding_vector

In [43]:
len(embedding_matrix)

9164

In [44]:
from keras.layers.recurrent import SimpleRNN, LSTM
from keras.layers import Flatten, Masking
# define model

def make_binary_classification_rnn_model(plot=False):
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, 100, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    model.add(Masking(mask_value=0.0)) # masking layer, masks any words that don't have an embedding as 0s.
    model.add(SimpleRNN(units=64, input_shape=(1, MAX_SEQUENCE_LENGTH)))
    model.add(Dense(16))
    model.add(Dense(2, activation='softmax'))
    
    # Compile the model
    model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # summarize the model
    model.summary()
    
    if plot:
        plot_model(model, to_file='model.png', show_shapes=True)
    return model

def make_lstm_classification_model(plot=False):
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, 100, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    model.add(Masking(mask_value=0.0)) # masking layer, masks any words that don't have an embedding as 0s.
    model.add(LSTM(units=32, input_shape=(1, MAX_SEQUENCE_LENGTH)))
    model.add(Dense(16))
    model.add(Dense(2, activation='softmax'))
    
    # Compile the model
    model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # summarize the model
    model.summary()
    
    if plot:
        plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [45]:
model = make_lstm_classification_model()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 177, 100)          916400    
_________________________________________________________________
masking_1 (Masking)          (None, 177, 100)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                17024     
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
Total params: 933,986
Trainable params: 17,586
Non-trainable params: 916,400
_________________________________________________________________


In [46]:
### Changing Y variable can help output other variables

In [47]:
#data_label.columns

In [48]:
cols = 'is_casual'
labels=data_label[cols].values
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
labels = to_categorical(encoder.fit_transform(labels))
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.2,random_state=0)

In [49]:
# fit the model
history = model.fit(X_train, y_train,validation_split = 0.1, epochs=5, verbose=1)

Train on 2818 samples, validate on 314 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [50]:
import keras
from matplotlib import pyplot as plt

def plot_fit_history(history):
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()
plot_fit_history(history)

<Figure size 640x480 with 1 Axes>

In [51]:
# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)

labels=data_label[cols].values
rand = labels.sum()/len(labels)
print(f'random for is :{max(rand,1-rand)*100}')
print('Accuracy: %f' % (accuracy*100))

random for is :67.16036772216547
Accuracy: 79.336733


In [52]:
cols = 'is_modern'
labels=data_label[cols].values
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
labels = to_categorical(encoder.fit_transform(labels))
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.2,random_state=0)

In [53]:
# fit the model
history = model.fit(X_train, y_train,validation_split = 0.1, epochs=5, verbose=1)

Train on 2818 samples, validate on 314 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [54]:
# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)

labels=data_label[cols].values
rand = labels.sum()/len(labels)
print(f'random for is :{max(rand,1-rand)*100}')
print('Accuracy: %f' % (accuracy*100))

random for is :53.166496424923395
Accuracy: 73.214287


In [55]:
#checking for binary rnn

In [56]:
model = make_binary_classification_rnn_model()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 177, 100)          916400    
_________________________________________________________________
masking_2 (Masking)          (None, 177, 100)          0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 64)                10560     
_________________________________________________________________
dense_3 (Dense)              (None, 16)                1040      
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 34        
Total params: 928,034
Trainable params: 11,634
Non-trainable params: 916,400
_________________________________________________________________


In [57]:
cols = 'is_modern'
labels=data_label[cols].values
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
labels = to_categorical(encoder.fit_transform(labels))
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.2,random_state = 0)

In [58]:
# fit the model
history = model.fit(X_train, y_train,validation_split = 0.1, epochs=5, verbose=1)

Train on 2818 samples, validate on 314 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [59]:
# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)

labels=data_label[cols].values
rand = labels.sum()/len(labels)
print(f'random for is :{max(rand,1-rand)*100}')
print('Accuracy: %f' % (accuracy*100))

random for is :53.166496424923395
Accuracy: 70.280612


In [60]:
#Building model using simple RNN

In [61]:
#Define model
from keras.layers.recurrent import SimpleRNN
from keras.layers import Flatten, Masking
# define model
model = Sequential()
model.add(Embedding(VOCAB_SIZE, 100, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model.add(Masking(mask_value=0.0)) # masking layer, masks any words that don't have an embedding as 0s.
model.add(SimpleRNN(units=64, input_shape=(1, MAX_SEQUENCE_LENGTH)))
model.add(Dense(32))
model.add(Dense(2, activation='softmax'))

In [62]:
#compile model
from keras.utils.vis_utils import plot_model

# Compile the model
model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())
#plot_model(model, to_file='model.png', show_shapes=True)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 177, 100)          916400    
_________________________________________________________________
masking_3 (Masking)          (None, 177, 100)          0         
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 64)                10560     
_________________________________________________________________
dense_5 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 66        
Total params: 929,106
Trainable params: 12,706
Non-trainable params: 916,400
_________________________________________________________________
None


In [63]:
cols = 'is_modern'
labels=data_label[cols].values
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
labels = to_categorical(encoder.fit_transform(labels))
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.1)

In [64]:
model.fit(X_train, y_train, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x1758db978>

In [65]:
# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)

labels=data_label[cols].values
rand = labels.sum()/len(labels)
print(f'random for is :{max(rand,1-rand)*100}')
print('Accuracy: %f' % (accuracy*100))

random for is :53.166496424923395
Accuracy: 66.581631
