In [157]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas,  numpy, string#xgboost,textblob, 
pandas.set_option('max_colwidth',2000)
pandas.set_option('display.height', 100)
pandas.set_option('display.max_rows', 500)
pandas.set_option('display.max_columns', 500)
pandas.set_option('display.width', 10000)
import tensorflow as  tf
from tensorflow import keras as ks



# 1. Dataset preparation

For the purpose of this article, I am the using dataset of amazon reviews which can be downloaded at [this link](https://codeload.github.com/gist/ad1d9c58d338e20d09ff26bcc06c4235/zip/1d2261e2276cbb0257a2ed6e2f1f4320464c7c07). The dataset consists of 3.6M text reviews and their labels, we will use only a small fraction of data. To prepare the dataset, load the downloaded data into a pandas dataframe containing two columns – text and label.

In [41]:
# load the dataset
data = open('data/amazon_corpus').readlines()
labels, texts = [], []
for i, line in enumerate(data):
    content = line.split()
    labels.append(content[0])
    texts.append(' '.join(content[1:]))

# create a dataframe using texts and lables
trainDF = pandas.DataFrame()

trainDF['label'] = labels
trainDF['text'] = texts

In [42]:
trainDF[:2]

Unnamed: 0,label,text
0,__label__2,Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
1,__label__2,"The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny."


Next, we will split the dataset into training and validation sets so that we can train and test classifier. Also, we will encode our target column so that it can be used in machine learning models.

In [43]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [44]:
train_y[:5]

array([0, 1, 1, 1, 0], dtype=int64)

# 2. Feature Engineering

## 2.1 Count Vectors as features

Count Vector is a matrix notation of the dataset in which every row represents a document from the corpus, every column represents a term from the corpus, and every cell represents the frequency count of a particular term in a particular document.

In [45]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [74]:
print(train_x[:1])
print(count_vect.vocabulary_.get('songs'))
# songs in train_x count 2,so xtrain_count[1][26120]=2
xtrain_count[:1].toarray()[:,26120]


2038    Good songs, but I have heard them all already: The Dissapointed. Why do I need demos of all the songs already on Apple Venus 1. Let's bring on Apple Venus 2, after all, isn't that what we were promised?
Name: text, dtype: object
26120
12443


array([1], dtype=int64)

In [84]:
# i in train_x count 2,so xtrain_count[1][14032]=2
print(count_vect.vocabulary_.get('i'))
xtrain_count[:1].toarray()[:,14032]

14032


array([2], dtype=int64)

## 2.2 TF-IDF Vectors as features

TF-IDF score represents the relative importance of a term in the document and the entire corpus. TF-IDF score is composed by two terms: the first computes the normalized Term Frequency (TF), the second term is the Inverse Document Frequency (IDF), computed as the logarithm of the number of the documents in the corpus divided by the number of documents where the specific term appears.

TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

TF-IDF Vectors can be generated at different levels of input tokens (words, characters, n-grams)

a. Word Level TF-IDF : Matrix representing tf-idf scores of every term in different documents

b. N-gram Level TF-IDF : N-grams are the combination of N terms together. This Matrix representing tf-idf scores of N-grams

c. Character Level TF-IDF : Matrix representing tf-idf scores of character level n-grams in the corpu

In [48]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

In [80]:
print('len of tfidf_vect is %d'%len(tfidf_vect.vocabulary_.keys()))


len of tfidf_vect is 5000


In [103]:
numpy.where( xtrain_tfidf[:1].toarray() > 0 )

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64),
 array([   3,   31,  182,  209,  217,  292,  599,  642, 1303, 1312, 1963,
        2076, 2095, 2222, 2381, 2591, 2970, 3055, 3073, 3428, 3770, 4093,
        4339, 4426, 4428, 4433, 4727, 4829, 4849, 4855, 4875], dtype=int64))

In [118]:
for (k,v) in tfidf_vect.vocabulary_.items():
    if v==4727:
        print(k)

venus


In [116]:
print('the id-idf of word venus is {}'.format(xtrain_tfidf[:1].toarray()[:,4727]))

the id-idf of word venus is [0.48864014]


## 2.4 Text / NLP based features

A number of extra text based features can also be created which sometimes are helpful for improving text classification models. Some examples are:

    ** Word Count of the documents ** – total number of words in the documents
    Character Count of the documents – total number of characters in the documents
    Average Word Density of the documents – average length of the words used in the documents
    Puncutation Count in the Complete Essay – total number of punctuation marks in the documents
    Upper Case Count in the Complete Essay – total number of upper count words in the documents
    Title Word Count in the Complete Essay – total number of proper case (title) words in the documents
    Frequency distribution of Part of Speech Tags:
        Noun Count
        Verb Count
        Adjective Count
        Adverb Count
        Pronoun Count

These features are highly experimental ones and should be used according to the problem statement only.

In [124]:
trainDF['char_count'] = trainDF['text'].apply(len)
trainDF['word_count'] = trainDF['text'].apply(lambda x: len(x.split()))
trainDF['word_density'] = trainDF['char_count'] / (trainDF['word_count']+1)
trainDF['punctuation_count'] = trainDF['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
trainDF['title_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
trainDF['upper_case_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [126]:
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

trainDF['noun_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'noun'))
trainDF['verb_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'verb'))
trainDF['adj_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adj'))
trainDF['adv_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adv'))
trainDF['pron_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'pron'))


In [127]:
trainDF[:1]

Unnamed: 0,label,text,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,noun_count,verb_count,adj_count,adv_count,pron_count
0,__label__2,Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^,426,80,5.259259,11,10,3,0,0,0,0,0


## 2.5 Topic Models as features

Topic Modelling is a technique to identify the groups of words (called a topic) from a collection of documents that contains best information in the collection. I have used Latent Dirichlet Allocation for generating Topic Modelling Features. LDA is an iterative model which starts from a fixed number of topics. Each topic is represented as a distribution over words, and each document is then represented as a distribution over topics. Although the tokens themselves are meaningless, the probability distributions over words provided by the topics provide a sense of the different ideas contained in the documents. 

In [129]:
# train a LDA Model
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_ 
vocab = count_vect.get_feature_names()

# view the topic models
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))


# 3. Model Building

The final step in the text classification framework is to train a classifier using the features created in the previous step. There are many different choices of machine learning models which can be used to train a final model. We will implement following different classifiers for this purpose:

    Naive Bayes Classifier
    Linear Classifier
    Support Vector Machine
    Bagging Models
    Boosting Models
    Shallow Neural Networks
    Deep Neural Networks
        Convolutional Neural Network (CNN)
        Long Short Term Modelr (LSTM)
        Gated Recurrent Unit (GRU)
        Bidirectional RNN
        Recurrent Convolutional Neural Network (RCNN)
        Other Variants of Deep Neural Networks

Lets implement these models and understand their details. The following function is a utility function which can be used to train a model. It accepts the classifier, feature_vector of training data, labels of training data and feature vectors of valid data as inputs. Using these inputs, the model is trained and accuracy score is computed.


In [130]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

## 3.1 Naive Bayes

Implementing a naive bayes model using sklearn implementation with different features

Naive Bayes is a classification technique based on Bayes’ Theorem with an assumption of independence among predictors. A Naive Bayes classifier assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature

In [133]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print ("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ( "NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.8428
NB, WordLevel TF-IDF:  0.8456
NB, N-Gram Vectors:  0.8288
NB, CharLevel Vectors:  0.8052


## 3.2 Linear Classifier

Implementing a Linear Classifier (Logistic Regression)

Logistic regression measures the relationship between the categorical dependent variable and one or more independent variables by estimating probabilities using a logistic/sigmoid function. One can read more about logistic regression

In [160]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print ("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("LR, CharLevel Vectors: ", accuracy)

# accuracy = train_model(linear_model.LogisticRegression(), topic_summaries, train_y, xvalid_tfidf_ngram_chars)
# print ("LR, topic_summaries: ", accuracy)


LR, Count Vectors:  0.8624
LR, WordLevel TF-IDF:  0.8712
LR, N-Gram Vectors:  0.8284
LR, CharLevel Vectors:  0.8416


3.3 Implementing a SVM Model

Support Vector Machine (SVM) is a supervised machine learning algorithm which can be used for both classification or regression challenges. The model extracts a best possible hyper-plane / line that segregates the two classes. One can read more about it 

In [136]:
# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_count, train_y, xvalid_count)
print ("SVM, Count Vectors: ", accuracy)

accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("SVM, WordLevel TF-IDF: ", accuracy)

accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("SVM, N-Gram Vectors: ", accuracy)


SVM, Count Vectors:  0.6692
SVM, WordLevel TF-IDF:  0.5048
SVM, N-Gram Vectors:  0.5048


## 3.4 Bagging Model

Implementing a Random Forest Model

Random Forest models are a type of ensemble models, particularly bagging models. They are part of the tree based model family.

In [138]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print ("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("RF, WordLevel TF-IDF: ", accuracy)

RF, Count Vectors:  0.7424
RF, WordLevel TF-IDF:  0.76


## 3.5 Boosting Model

In [158]:
from sklearn.ensemble import GradientBoostingRegressor
# GBDT on Count Vectors
accuracy = train_model(ensemble.GradientBoostingClassifier(), xtrain_count, train_y, xvalid_count)
print ("GBDT, Count Vectors: ", accuracy)

# GBDT on Word Level TF IDF Vectors
accuracy = train_model(ensemble.GradientBoostingClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("GBDT, WordLevel TF-IDF: ", accuracy)

GBDT, Count Vectors:  0.7952
GBDT, WordLevel TF-IDF:  0.786


## 3.6 Shallow Neural Networks

In [None]:
layers, models, optimizers
from tensorflow.keras.preprocessing import text, sequence

In [171]:
def create_model_architecture(input_size):
    # create input layer 
    input_layer = ks.layers.Input((input_size,))
    
    # create hidden layer
    hidden_layer = ks.layers.Dense(100, activation="relu")(input_layer)
    
    # create output layer
    output_layer = ks.layers.Dense(1, activation="sigmoid")(hidden_layer)

    classifier = ks.models.Model(inputs = input_layer, outputs = output_layer)
    classifier.compile(optimizer=ks.optimizers.Adam(), loss='binary_crossentropy')
    return classifier 

In [173]:
classifier = create_model_architecture(xtrain_tfidf_ngram.shape[1])
accuracy = train_model(classifier, xtrain_tfidf, train_y, xvalid_tfidf, is_neural_net=True)
print ("NN, Ngram Level TF IDF Vectors",  accuracy)

Epoch 1/1

NN, Ngram Level TF IDF Vectors 0.5048


## 3.7.1 Convolutional Neural Network

In [175]:
def create_cnn():
    # Add an Input Layer
    input_layer = ks.layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = ks.layers.Embedding(len(word_index) + 1, 300)(input_layer)
    embedding_layer = ks.layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the convolutional Layer
    conv_layer = ks.layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = ks.layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = ks.layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = ks.layers.Dropout(0.25)(output_layer1)
    output_layer2 = ks.layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = ks.models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_cnn()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print ("CNN, Word Embeddings",  accuracy)

NameError: name 'word_index' is not defined

In [3]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import os
import pandas as pd
embed = hub.Module("https://tfhub.dev/google/nnlm-en-dim128/1")
messages=["cat is on the mat", "dog is in the fog"]
embeddings = embed(messages)
with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  message_embeddings = session.run(embeddings)

  for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
    print("Message: {}".format(messages[i]))
    print("Embedding size: {}".format(len(message_embedding)))
    message_embedding_snippet = ", ".join(
        (str(x) for x in message_embedding[:3]))
    print("Embedding: [{}, ...]\n".format(message_embedding_snippet))

INFO:tensorflow:Initialize variable module_2/embeddings/part_0:0 from checkpoint b'C:\\Users\\TANGGU~1\\AppData\\Local\\Temp\\tfhub_modules\\32f2b2259e1cc8ca58c876921748361283e73997\\variables\\variables' with embeddings
Message: cat is on the mat
Embedding size: 128
Embedding: [0.2710797190666199, -0.010550727136433125, -0.05728397145867348, ...]

Message: dog is in the fog
Embedding size: 128
Embedding: [0.24816465377807617, -0.06619296967983246, -0.07488654553890228, ...]

