Packages:
* [NLTK](http://www.nltk.org/howto/classify.html)
* [SpaCy](https://spacy.io/)
* [AllenNLP](https://allennlp.org/tutorials)

Articles:
* [A Comprehensive Guide to Understand and Implement Text Classification in Python](https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/)
* [Machine Learning, NLP: Text Classification using scikit-learn, python and NLTK](https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a)
* [State-of-the-Art Text Classification using BERT model: “Predict the Happiness” Challenge](https://appliedmachinelearning.blog/2019/03/04/state-of-the-art-text-classification-using-bert-model-predict-the-happiness-hackerearth-challenge/)

In [1]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from sklearn import model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from tqdm import tqdm
import numpy as np
import pandas as pd
import string
import time


stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    result_stemmed = []
    for token in simple_preprocess(text, min_len = 2):
        result.append(token)
#         if token not in STOPWORDS:
        result_stemmed.append(lemmatize_stemming(token))
    
    return (result, result_stemmed)

with open("./data/raw/Hygiene/hygiene.dat.labels") as f:
    LABELS = [int(l) for l in f.readlines() if l[0].isdigit()]

ALL_RAW_TEXTS = []
ALL_TEXTS = []
ALL_STEMMED_TEXTS = []
ALL_CONCAT_STEMMED_TEXTS = []
LABELED_TEXTS = []
LABELED_CONCAT_TEXTS = []
LABELED_STEMMED_TEXTS = []
LABELED_CONCAT_STEMMED_TEXTS = []

with open("./data/raw/Hygiene/hygiene.dat") as f:
    ALL_RAW_TEXTS = f.readlines()

for _text in tqdm(ALL_RAW_TEXTS):
    _result, _result_stemmed = preprocess(_text)
    ALL_TEXTS.append(_result)
    ALL_STEMMED_TEXTS.append(_result_stemmed)

ALL_CONCAT_STEMMED_TEXTS = [" ".join(_text) for _text in ALL_STEMMED_TEXTS]

LABELED_TEXTS = ALL_TEXTS[0:len(LABELS)]
LABELED_CONCAT_TEXTS = [" ".join(_text) for _text in LABELED_TEXTS]

LABELED_STEMMED_TEXTS = ALL_STEMMED_TEXTS[0:len(LABELS)]
LABELED_CONCAT_STEMMED_TEXTS = [" ".join(_text) for _text in LABELED_STEMMED_TEXTS]

FEATURE_MORE =pd.read_csv("./data/raw/Hygiene/hygiene.dat.additional", header=None)

100%|██████████| 13299/13299 [05:58<00:00, 37.14it/s]


In [2]:
# create a dataframe using texts and lables
labeled_df = pd.DataFrame()
labeled_df['concat_stemmed_text'] = LABELED_CONCAT_STEMMED_TEXTS
labeled_df['stemmed_text'] = LABELED_STEMMED_TEXTS
# labeled_df['concat_stemmed_text'] = LABELED_CONCAT_TEXTS
# labeled_df['stemmed_text'] = LABELED_TEXTS
labeled_df['label'] = LABELS

# split the dataset into training and validation datasets 
train_concat_stemmed_text, test_concat_stemmed_text, train_label, test_label = model_selection.train_test_split(labeled_df['concat_stemmed_text'], 
                                                                                  labeled_df['label'],
                                                                                  test_size = 0.2)
train_stemmed_text = labeled_df['stemmed_text'][train_concat_stemmed_text.index]
test_stemmed_text = labeled_df['stemmed_text'][test_concat_stemmed_text.index]

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_label = encoder.fit_transform(train_label)
test_label = encoder.fit_transform(test_label)

In [3]:
# dictionary = corpora.Dictionary(processed_docs)
# print("Before prunn:%d"%(len(dictionary)))
# dictionary.filter_extremes(no_below = 2, no_above = 0.5)
# print("After prunn:%d"%(len(dictionary)))
# corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Feature Engineering

## Count Vectors as features

In [4]:
class MyCountVectorizer:
    def __init__(self, max_df = 0.5):
        self.vect_model = CountVectorizer(analyzer='word', max_df = max_df)
        
    def fit(self, texts_concat):
        self.vect_model.fit(texts_concat)
    
    def transform(self, texts_concat):
        return self.vect_model.transform(texts_concat)

count_vect = MyCountVectorizer()
count_vect.fit(train_concat_stemmed_text)
train_count = count_vect.transform(train_concat_stemmed_text)
test_count =  count_vect.transform(test_concat_stemmed_text)

## TF-IDF Vectors as features

In [5]:
%%time

# word level tf-idf
class MyTfidfVectorizer(MyCountVectorizer):
    def __init__(self, analyzer='word', ngram_range = None, max_features=5000):
        if ngram_range is None:
            self.vect_model = TfidfVectorizer(analyzer = analyzer, max_features = max_features)
        else:
            self.vect_model = TfidfVectorizer(analyzer = analyzer, ngram_range = ngram_range, 
                                              max_features = max_features)
            
    def fit(self, texts_concat):   
        self.vect_model.fit(texts_concat)
        self.vocabulary = self.vect_model.vocabulary_


tfidf_vect = MyTfidfVectorizer()
tfidf_vect.fit(train_concat_stemmed_text)
train_tfidf =  tfidf_vect.transform(train_concat_stemmed_text)
test_tfidf =  tfidf_vect.transform(test_concat_stemmed_text)

# ngram level tf-idf 
tfidf_vect_ngram = MyTfidfVectorizer(ngram_range=(2,3))
tfidf_vect_ngram.fit(train_concat_stemmed_text)
train_tfidf_ngram =  tfidf_vect_ngram.transform(train_concat_stemmed_text)
test_tfidf_ngram =  tfidf_vect_ngram.transform(test_concat_stemmed_text)

# characters level tf-idf
tfidf_vect_ngram_chars = MyTfidfVectorizer(analyzer='char', ngram_range=(2,3))
tfidf_vect_ngram_chars.fit(train_concat_stemmed_text)
train_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_concat_stemmed_text) 
test_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_concat_stemmed_text) 

CPU times: user 13.4 s, sys: 174 ms, total: 13.5 s
Wall time: 12.9 s


## Word Embeddings

### Build from review corpus

In [6]:
%%time

from gensim.models.fasttext import FastText

class MyFastTextTfidfVectorizer(MyCountVectorizer):
    def __init__(self, tfidf_vectorizer, size = 100):
        self.embedding_size = size
        self.tfidf_vectorizer = tfidf_vectorizer
        self.fasttext_model = FastText(size = size, window = 5, min_count = 5)

    def tfidf2embedding(self, value_vector):
        _weighted_value = np.zeros(self.embedding_size)
        for key in self.tfidf_vectorizer.vocabulary:
            _index = self.tfidf_vectorizer.vocabulary[key]
            if value_vector[_index] != 0:
                _weighted_value += self.fasttext_model[key] * value_vector[_index]

        return _weighted_value
    
    def fit(self, texts):
        _texts_concat = [" ".join(_text) for _text in texts]
        self.tfidf_vectorizer = MyTfidfVectorizer()
        self.tfidf_vectorizer.fit(_texts_concat)
        
        self.fasttext_model.build_vocab(sentences = texts)
        self.fasttext_model.train(sentences = texts, 
                                  total_examples = len(texts), 
                                  epochs=10)
        
    def transform(self, texts):
        _texts_concat = [" ".join(_text) for _text in texts]
        _tfidf_values = self.tfidf_vectorizer.transform(_texts_concat)
        return np.asarray([self.tfidf2embedding(_value.toarray()[0]) for _value in _tfidf_values])

fasttext_tfidf_vect = MyFastTextTfidfVectorizer(tfidf_vect)
fasttext_tfidf_vect.fit(ALL_STEMMED_TEXTS)
train_fasttext_embedding = fasttext_tfidf_vect.transform(train_stemmed_text)
test_fasttext_embedding = fasttext_tfidf_vect.transform(test_stemmed_text)

  from ipykernel import kernelapp as app


CPU times: user 39min 59s, sys: 38 s, total: 40min 37s
Wall time: 1h 1min 41s


### Prebuilt Embedding

In [7]:
# %%time
# from keras.preprocessing import text, sequence
# from keras import layers, models, optimizers

# # load the pre-trained word-embedding vectors 
# embeddings_index = {}
# for i, line in enumerate(open('data/model/wiki-news-300d-1M.vec')):
#     values = line.split()
#     embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# # create a tokenizer 
# token = text.Tokenizer()
# token.fit_on_texts(LABELED_CONCAT_STEMMED_TEXTS)
# word_index = token.word_index

# # convert text to sequence of tokens and pad them to ensure equal length vectors 
# text_train_seq = sequence.pad_sequences(token.texts_to_sequences(train_text), maxlen=70)
# text_test_seq = sequence.pad_sequences(token.texts_to_sequences(test_text), maxlen=70)

# # create token-embedding mapping
# embedding_matrix = np.zeros((len(word_index) + 1, 300))
# for word, i in word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

## Text / NLP based features

In [8]:
# %%time
# trainDF['char_count'] = trainDF['text'].apply(len)
# trainDF['word_count'] = trainDF['text'].apply(lambda x: len(x.split()))
# trainDF['word_density'] = trainDF['char_count'] / (trainDF['word_count']+1)
# trainDF['punctuation_count'] = trainDF['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
# trainDF['title_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
# trainDF['upper_case_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [9]:
# %%time

# import textblob

# pos_family = {
#     'noun' : ['NN','NNS','NNP','NNPS'],
#     'pron' : ['PRP','PRP$','WP','WP$'],
#     'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
#     'adj' :  ['JJ','JJR','JJS'],
#     'adv' : ['RB','RBR','RBS','WRB']
# }

# # function to check and get the part of speech tag count of a words in a given sentence
# def check_pos_tag(x, flag):
#     cnt = 0
#     try:
#         wiki = textblob.TextBlob(x)
#         for tup in wiki.tags:
#             ppo = list(tup)[1]
#             if ppo in pos_family[flag]:
#                 cnt += 1
#     except:
#         pass
#     return cnt

# trainDF['noun_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'noun'))
# trainDF['verb_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'verb'))
# trainDF['adj_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adj'))
# trainDF['adv_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adv'))
# trainDF['pron_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'pron'))

## Topic Models as features

In [10]:
# # train a LDA Model
# lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
# X_topics = lda_model.fit_transform(text_train_count)
# topic_word = lda_model.components_
# vocab = count_vect.get_feature_names()

# # view the topic models
# n_top_words = 10
# topic_summaries = []
# for i, topic_dist in enumerate(topic_word):
#     topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
#     topic_summaries.append(' '.join(topic_words))

In [11]:
%%time

import os
from gensim import corpora, models

class MyLDAVectorizer(MyCountVectorizer):
    mallet_path = ".." + os.sep + "mallet-2.0.8"+ os.sep + "bin" + os.sep +"mallet"
    
    def __init__(self, TOPIC_COUNT = 100):
        self.topic_count = TOPIC_COUNT
    
    def fit(self, texts):
        self.dictionary = corpora.Dictionary(texts)
        _corpus = [self.dictionary.doc2bow(_doc) for _doc in texts]
        self.tfidf_model = models.TfidfModel(_corpus)
        _tfidf_corpus = self.tfidf_model[_corpus]

#         self.vect_model = models.LdaModel(_tfidf_corpus, 
#                             num_topics = self.topic_count, 
#                             id2word = self.dictionary,
#                             random_state = 100,
#                             eval_every = 5, 
#                             alpha = 'auto', 
#                             gamma_threshold = 0.01)
        
        self.vect_model = models.wrappers.LdaMallet(self.mallet_path, 
                                                     corpus = _corpus, 
                                                     num_topics = self.topic_count, 
                                                     id2word = self.dictionary)
    
    def toarray(self, doc_topics):
        _doc_vect  = np.zeros((len(doc_topics), self.topic_count))
        
        for i, _doc in enumerate(doc_topics):
            for _topic, _weight in _doc:
                _doc_vect[i][_topic] = _weight
        
        return _doc_vect
        
    def transform(self, texts):
        _corpus = [self.dictionary.doc2bow(_doc) for _doc in texts]
#         _tfidf_corpus = self.tfidf_model[_corpus]
        
        return self.toarray(self.vect_model[_corpus])

lda_vect = MyLDAVectorizer()
lda_vect.fit(train_stemmed_text)
train_lda = lda_vect.transform(train_stemmed_text)
test_lda = lda_vect.transform(test_stemmed_text)

CPU times: user 5.75 s, sys: 246 ms, total: 5.99 s
Wall time: 57.7 s


## Doc2Vec

In [12]:
%%time

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np

class MyDoc2Vectorizer(MyCountVectorizer):
    def __init__(self, size = 100):
        self.embedding_size = size
            
    def fit(self, texts):
        _docs = [TaggedDocument(_doc, [i]) for i, _doc in enumerate(texts)]
        self.vect_model = Doc2Vec(_docs, 
                                  vector_size = self.embedding_size, 
                                  window = 8,
                                  epochs=40, 
                                  workers=4)
        
    def transform(self, texts):
        return np.asarray([self.vect_model.infer_vector(_text) for _text in texts])

doc2vec_vect = MyDoc2Vectorizer(size = 400)
# doc2vec_vect.fit(ALL_STEMMED_TEXTS)
doc2vec_vect.fit(train_stemmed_text)
train_doc2vec = doc2vec_vect.transform(train_stemmed_text)
test_doc2vec = doc2vec_vect.transform(test_stemmed_text)

CPU times: user 39min 46s, sys: 23.6 s, total: 40min 10s
Wall time: 13min 37s


In [13]:
#Test quality

import collections

ranks = []
for doc_id in range(len(train_stemmed_text)):
    inferred_vector = doc2vec_vect.vect_model.infer_vector(train_stemmed_text[train_stemmed_text.index[doc_id]])
    sims = doc2vec_vect.vect_model.docvecs.most_similar([inferred_vector], topn=len(doc2vec_vect.vect_model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

collections.Counter(ranks)

Counter({10351: 1,
         8630: 1,
         9857: 1,
         522: 1,
         9457: 1,
         11979: 1,
         9759: 1,
         11165: 1,
         7415: 1,
         6445: 1,
         3062: 1,
         504: 1,
         3260: 1,
         3973: 1,
         3004: 1,
         8366: 1,
         4633: 1,
         11473: 1,
         12368: 1,
         413: 1,
         11545: 1,
         7815: 1,
         7079: 1,
         7713: 1,
         2217: 1,
         8586: 1,
         1008: 2,
         8247: 1,
         12221: 1,
         2184: 1,
         9873: 1,
         6551: 1,
         9359: 1,
         4441: 1,
         10253: 1,
         4816: 1,
         7403: 1,
         9394: 1,
         12814: 2,
         12097: 1,
         12835: 1,
         1893: 1,
         2233: 1,
         8998: 1,
         12056: 1,
         4840: 1,
         6394: 1,
         11186: 1,
         9851: 1,
         1570: 1,
         3777: 1,
         1494: 1,
         1178: 1,
         3667: 1,
         10589: 1,

# Models

In [58]:
from sklearn import linear_model, naive_bayes, metrics, svm

def show_score(classifier_name, score):
    print("Accuracy:%0.2f Precission:%0.2f Recall:%0.2f F1:%0.2f"%scores, "-> [%s]"%(classifier_name))
    
def train_model(classifier, train_feature, train_label, test_feature, test_label, is_neural_net=False):
#     print(train_feature)
    # fit the training dataset on the classifier
    classifier.fit(train_feature, train_label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(test_feature)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return (metrics.accuracy_score(predictions, test_label),
            metrics.precision_score(predictions, test_label),
            metrics.recall_score(predictions, test_label),
            metrics.f1_score(predictions, test_label))

def run_model(classfier_configs):
    for _name in classfier_configs:
#         print(classfier_configs[_name].train)
        scores = train_model(classfier_configs[_name][0], classfier_configs[_name][1], train_label, classfier_configs[_name][2], test_label)
        show_score(_name, scores)
        
class ClassifierConfig:
     def __init__(self, classifier, train, test):
            self.classifier = classifier
            self.train = train
            self.test = test

## Naive Bayes 

In [73]:
nb_classifiers = {
    "NB Count": (naive_bayes.MultinomialNB(), train_count, test_count),
    "NB TFIDF": (naive_bayes.MultinomialNB(), train_tfidf, test_tfidf),
    "NB TFIDF NGram": (naive_bayes.MultinomialNB(), train_tfidf_ngram, test_tfidf_ngram),
    "NB TFIDF NGram Chars": (naive_bayes.MultinomialNB(), train_tfidf_ngram_chars, test_tfidf_ngram_chars),
    "NB LDA": (naive_bayes.MultinomialNB(), train_lda, test_lda),
}

for name in nb_classifiers:
    scores = train_model(nb_classifiers[name][0], nb_classifiers[name][1], train_label, nb_classifiers[name][2], test_label)
    show_score(name, scores)
# run_model(nb_classifiers)

# print(train_model(naive_bayes.MultinomialNB(), train_count, train_label, test_count, test_label))
# print(train_model(naive_bayes.MultinomialNB(), train_tfidf_ngram, train_label, test_tfidf_ngram, test_label))

Accuracy:0.65 Precission:0.64 Recall:0.67 F1:0.65 -> [NB Count]
Accuracy:0.62 Precission:0.57 Recall:0.66 F1:0.61 -> [NB TFIDF]
Accuracy:0.62 Precission:0.60 Recall:0.65 F1:0.62 -> [NB TFIDF NGram]
Accuracy:0.62 Precission:0.38 Recall:0.79 F1:0.51 -> [NB TFIDF NGram Chars]
Accuracy:0.65 Precission:0.41 Recall:0.83 F1:0.55 -> [NB LDA]


##  Linear Classifier

In [63]:
linear_classifiers = {
    "Linear Count": (linear_model.LogisticRegression(), train_count, test_count),
    "Linear TFIDF": (linear_model.LogisticRegression(), train_tfidf, test_tfidf),
    "Linear TFIDF NGram": (linear_model.LogisticRegression(), train_tfidf_ngram, test_tfidf_ngram),
    "Linear TFIDF NGram Chars": (linear_model.LogisticRegression(), train_tfidf_ngram_chars, test_tfidf_ngram_chars),
    "Linear FastText Embedding": (linear_model.LogisticRegression(), train_fasttext_embedding, test_fasttext_embedding),
    "Linear LDA": (linear_model.LogisticRegression(), train_lda, test_lda),
    "Linear Doc2Vec": (linear_model.LogisticRegression(), train_doc2vec, test_doc2vec),
}

for name in linear_classifiers:
    scores = train_model(linear_classifiers[name][0], linear_classifiers[name][1], train_label, linear_classifiers[name][2], test_label)
    show_score(name, scores)
run_model(linear_classifiers)



Accuracy:0.54 Precission:0.40 Recall:0.59 F1:0.47 -> [Linear Count]
Accuracy:0.65 Precission:0.55 Recall:0.73 F1:0.63 -> [Linear TFIDF]
Accuracy:0.60 Precission:0.53 Recall:0.65 F1:0.58 -> [Linear TFIDF NGram]
Accuracy:0.65 Precission:0.59 Recall:0.69 F1:0.64 -> [Linear TFIDF NGram Chars]
Accuracy:0.59 Precission:0.57 Recall:0.62 F1:0.59 -> [Linear FastText Embedding]
Accuracy:0.65 Precission:0.41 Recall:0.83 F1:0.55 -> [Linear LDA]




Accuracy:0.50 Precission:0.48 Recall:0.53 F1:0.50 -> [Linear Doc2Vec]
Accuracy:0.50 Precission:0.48 Recall:0.53 F1:0.50 -> [Linear Count]
Accuracy:0.50 Precission:0.48 Recall:0.53 F1:0.50 -> [Linear TFIDF]
Accuracy:0.50 Precission:0.48 Recall:0.53 F1:0.50 -> [Linear TFIDF NGram]
Accuracy:0.50 Precission:0.48 Recall:0.53 F1:0.50 -> [Linear TFIDF NGram Chars]
Accuracy:0.50 Precission:0.48 Recall:0.53 F1:0.50 -> [Linear FastText Embedding]
Accuracy:0.50 Precission:0.48 Recall:0.53 F1:0.50 -> [Linear LDA]
Accuracy:0.50 Precission:0.48 Recall:0.53 F1:0.50 -> [Linear Doc2Vec]




## SVM

In [None]:
svm_classifiers = {
    "SVM Count": (svm.SVC(), train_count, test_count),
    "SVM TFIDF": (svm.SVC(), train_tfidf, test_tfidf),
    "SVM TFIDF NGram": (svm.SVC(), train_tfidf_ngram, test_tfidf_ngram),
    "SVM TFIDF NGram Chars": (svm.SVC(), train_tfidf_ngram_chars, test_tfidf_ngram_chars),
    "SVM FastText Embedding": (svm.SVC(), train_fasttext_embedding, test_fasttext_embedding),
    "SVM LDA": (svm.SVC(), train_lda, test_lda),
    "SVM Doc2Vec": (svm.SVC(), train_doc2vec, test_doc2vec),
}

run_model(svm_classifiers)