Packages:
* [NLTK](http://www.nltk.org/howto/classify.html)
* [SpaCy](https://spacy.io/)
* [AllenNLP](https://allennlp.org/tutorials)

Articles:
* [A Comprehensive Guide to Understand and Implement Text Classification in Python](https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/)
* [Machine Learning, NLP: Text Classification using scikit-learn, python and NLTK](https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a)
* [State-of-the-Art Text Classification using BERT model: “Predict the Happiness” Challenge](https://appliedmachinelearning.blog/2019/03/04/state-of-the-art-text-classification-using-bert-model-predict-the-happiness-hackerearth-challenge/)

In [1]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from tqdm import tqdm
import numpy as np
import pandas as pd
import string
import time


stemmer = SnowballStemmer('english')


def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with a WordNet
    lemmatizer; the resulting lemma is then passed through the module-level
    English Snowball ``stemmer``.

    Args:
        text: the token to normalise.

    Returns:
        The stemmed form of the verb lemma of ``text``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize one document and stem every token.

    Tokenization is delegated to gensim's ``simple_preprocess`` (called with
    ``min_len = 2``); each token is then normalised with
    ``lemmatize_stemming``.

    Args:
        text: the raw document string.

    Returns:
        A ``(tokens, stemmed_tokens)`` tuple of two lists of equal length.
    """
    tokens = list(simple_preprocess(text, min_len = 2))
    # Stop-word filtering (gensim STOPWORDS) is disabled: every token is stemmed.
    stemmed_tokens = [lemmatize_stemming(token) for token in tokens]
    return (tokens, stemmed_tokens)

# Load the class labels; lines whose first character is not a digit are skipped.
with open("./data/raw/Hygiene/hygiene.dat.labels") as f:
    LABELS = [int(line) for line in f.readlines() if line[0].isdigit()]

# Every line of the data file is treated as one document.
with open("./data/raw/Hygiene/hygiene.dat") as f:
    ALL_RAW_TEXTS = f.readlines()

# Token lists (plain and stemmed) for every document, in file order.
ALL_TEXTS = []
ALL_STEMMED_TEXTS = []
for _text in tqdm(ALL_RAW_TEXTS):
    _result, _result_stemmed = preprocess(_text)
    ALL_TEXTS.append(_result)
    ALL_STEMMED_TEXTS.append(_result_stemmed)

ALL_CONCAT_STEMMED_TEXTS = [" ".join(_text) for _text in ALL_STEMMED_TEXTS]

# Only the first len(LABELS) documents are paired with a label.
LABELED_TEXTS = ALL_TEXTS[0:len(LABELS)]
# Join the labeled subset only. This was previously built from ALL_TEXTS, which
# produced one entry per document in the corpus instead of one per label.
LABELED_CONCAT_TEXTS = [" ".join(_text) for _text in LABELED_TEXTS]

LABELED_STEMMED_TEXTS = ALL_STEMMED_TEXTS[0:len(LABELS)]
LABELED_CONCAT_STEMMED_TEXTS = [" ".join(_text) for _text in LABELED_STEMMED_TEXTS]

# Additional features, read as a header-less CSV.
FEATURE_MORE = pd.read_csv("./data/raw/Hygiene/hygiene.dat.additional", header=None)

100%|██████████| 13299/13299 [06:12<00:00, 35.72it/s]


In [2]:
# create a dataframe using texts and labels
labeled_df = pd.DataFrame()
labeled_df['concat_stemmed_text'] = LABELED_CONCAT_STEMMED_TEXTS
labeled_df['stemmed_text'] = LABELED_STEMMED_TEXTS
labeled_df['label'] = LABELS

# split the dataset into training and validation datasets
# (random_state is fixed so that Restart & Run All reproduces the same split)
train_concat_stemmed_text, test_concat_stemmed_text, train_label, test_label = model_selection.train_test_split(
    labeled_df['concat_stemmed_text'],
    labeled_df['label'],
    test_size = 0.2,
    random_state = 42)

# pick the token-list form of exactly the same rows via the split's index labels
train_stemmed_text = labeled_df.loc[train_concat_stemmed_text.index, 'stemmed_text']
test_stemmed_text = labeled_df.loc[test_concat_stemmed_text.index, 'stemmed_text']

# label encode the target variable
encoder = preprocessing.LabelEncoder()
train_label = encoder.fit_transform(train_label)
# Reuse the mapping learned on the training labels. Re-fitting on the test
# labels could assign different integer codes if a class is missing from one split.
test_label = encoder.transform(test_label)

In [3]:
# dictionary = corpora.Dictionary(processed_docs)
# print("Before prunn:%d"%(len(dictionary)))
# dictionary.filter_extremes(no_below = 2, no_above = 0.5)
# print("After prunn:%d"%(len(dictionary)))
# corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Count Vectors as features

In [4]:
class MyCountVectorizer:
    """Thin wrapper around scikit-learn's ``CountVectorizer``.

    Args:
        max_df: forwarded to ``CountVectorizer(max_df=...)``.
    """

    def __init__(self, max_df = 0.5):
        self.count_vect = CountVectorizer(analyzer='word', max_df = max_df)

    def fit(self, concat_stemmed_text):
        """Fit the wrapped vectorizer on an iterable of document strings.

        Returns:
            ``self``, so that ``fit(...).transform(...)`` can be chained.
        """
        self.count_vect.fit(concat_stemmed_text)
        return self

    def transform(self, concat_stemmed_text):
        """Return the wrapped vectorizer's transform of the given documents."""
        return self.count_vect.transform(concat_stemmed_text)

# Fit on the training split only, then encode both splits with it.
count_vect = MyCountVectorizer()
count_vect.fit(train_concat_stemmed_text)
train_count, test_count = [
    count_vect.transform(_texts)
    for _texts in (train_concat_stemmed_text, test_concat_stemmed_text)
]

# TF-IDF Vectors as features

In [5]:
%%time

# word level tf-idf
class MyTfidfVectorizer:
    """Thin wrapper around scikit-learn's ``TfidfVectorizer``.

    Args:
        analyzer: forwarded to ``TfidfVectorizer(analyzer=...)``.
        ngram_range: forwarded to ``TfidfVectorizer`` only when not ``None``;
            otherwise the library default is used.
        max_features: forwarded to ``TfidfVectorizer(max_features=...)``.

    After ``fit`` the learned term -> column-index mapping is exposed as
    ``self.vocabulary``.
    """

    def __init__(self, analyzer='word', ngram_range = None, max_features=5000):
        _kwargs = {'analyzer': analyzer, 'max_features': max_features}
        if ngram_range is not None:
            _kwargs['ngram_range'] = ngram_range
        self.tfidf_vect = TfidfVectorizer(**_kwargs)

    def fit(self, concat_stemmed_text):
        """Fit the wrapped vectorizer and publish its vocabulary.

        Returns:
            ``self``, so that ``fit(...).transform(...)`` can be chained.
        """
        self.tfidf_vect.fit(concat_stemmed_text)
        self.vocabulary = self.tfidf_vect.vocabulary_
        return self

    def transform(self, concat_stemmed_text):
        """Return the wrapped vectorizer's transform of the given documents."""
        return self.tfidf_vect.transform(concat_stemmed_text)


# Each vectorizer below is fitted on the training split only and then used
# to encode both the training and the test split.

# word level tf-idf
tfidf_vect = MyTfidfVectorizer()
tfidf_vect.fit(train_concat_stemmed_text)
train_tfidf, test_tfidf = [
    tfidf_vect.transform(_texts)
    for _texts in (train_concat_stemmed_text, test_concat_stemmed_text)
]

# ngram level tf-idf (word 2- and 3-grams)
tfidf_vect_ngram = MyTfidfVectorizer(ngram_range=(2,3))
tfidf_vect_ngram.fit(train_concat_stemmed_text)
train_tfidf_ngram, test_tfidf_ngram = [
    tfidf_vect_ngram.transform(_texts)
    for _texts in (train_concat_stemmed_text, test_concat_stemmed_text)
]

# characters level tf-idf (character 2- and 3-grams)
tfidf_vect_ngram_chars = MyTfidfVectorizer(analyzer='char', ngram_range=(2,3))
tfidf_vect_ngram_chars.fit(train_concat_stemmed_text)
train_tfidf_ngram_chars, test_tfidf_ngram_chars = [
    tfidf_vect_ngram_chars.transform(_texts)
    for _texts in (train_concat_stemmed_text, test_concat_stemmed_text)
]

CPU times: user 14.2 s, sys: 195 ms, total: 14.4 s
Wall time: 13.8 s


# Word Embeddings

## Build from review corpus

In [41]:
%%time

from gensim.models.fasttext import FastText

class MyFastTextTfidfVectorizer:
    """Document vectors built as a TF-IDF-weighted sum of FastText word vectors.

    Args:
        tfidf_vectorizer: object exposing ``vocabulary`` (term -> column index)
            and ``transform``. NOTE(review): ``fit`` replaces it with a fresh
            ``MyTfidfVectorizer``, so the injected instance is only used if
            ``tfidf2embedding``/``transform`` are called before ``fit`` --
            confirm whether that is intended.
        size: dimensionality of the FastText vectors and of the output vectors.
    """

    def __init__(self, tfidf_vectorizer, size = 100):
        self.embedding_size = size
        self.tfidf_vectorizer = tfidf_vectorizer
        self.fasttext_model = FastText(size = size, window = 5, min_count = 5)

    def tfidf2embedding(self, value_vector):
        """Combine one document's TF-IDF weights into a single embedding.

        Args:
            value_vector: 1-D sequence of TF-IDF weights indexed by the
                vectorizer's vocabulary column index.

        Returns:
            ``numpy.ndarray`` of length ``embedding_size``: the sum over all
            vocabulary terms with a non-zero weight of ``weight * word_vector``.
        """
        _weighted_value = np.zeros(self.embedding_size)
        # Look vectors up through `.wv`; indexing the model object itself is
        # deprecated in gensim 3.x and removed in gensim 4.
        _word_vectors = self.fasttext_model.wv
        for key, _index in self.tfidf_vectorizer.vocabulary.items():
            _weight = value_vector[_index]
            if _weight != 0:
                _weighted_value += _word_vectors[key] * _weight

        return _weighted_value

    def fit(self, stemmed_text):
        """Fit a new TF-IDF vectorizer and train FastText on the token lists.

        Args:
            stemmed_text: sequence of documents, each a list of tokens.

        Returns:
            ``self``, so that ``fit(...).transform(...)`` can be chained.
        """
        _concat_stemmed_text = [" ".join(_text) for _text in stemmed_text]
        self.tfidf_vectorizer = MyTfidfVectorizer()
        self.tfidf_vectorizer.fit(_concat_stemmed_text)

        self.fasttext_model.build_vocab(sentences = stemmed_text)
        self.fasttext_model.train(sentences = stemmed_text,
                                  total_examples = len(stemmed_text),
                                  epochs=10)
        return self

    def transform(self, stemmed_text):
        """Return one embedding row per document in ``stemmed_text``."""
        _concat_stemmed_text = [" ".join(_text) for _text in stemmed_text]
        _tfidf_values = self.tfidf_vectorizer.transform(_concat_stemmed_text)
        # Each sparse row is densified so it can be indexed by column number.
        return np.asarray([self.tfidf2embedding(_value.toarray()[0]) for _value in _tfidf_values])

# The embedding model is fitted on every document (ALL_STEMMED_TEXTS),
# then used to encode the labeled train / test splits.
fasttext_tfidf_vect = MyFastTextTfidfVectorizer(tfidf_vect)
fasttext_tfidf_vect.fit(ALL_STEMMED_TEXTS)
train_fasttext_embedding, test_fasttext_embedding = [
    fasttext_tfidf_vect.transform(_docs)
    for _docs in (train_stemmed_text, test_stemmed_text)
]

NameError: name 'EMBEDDING_SIZE' is not defined

## Prebuilt Embedding

In [7]:
# %%time
# from keras.preprocessing import text, sequence
# from keras import layers, models, optimizers

# # load the pre-trained word-embedding vectors 
# embeddings_index = {}
# for i, line in enumerate(open('data/model/wiki-news-300d-1M.vec')):
#     values = line.split()
#     embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# # create a tokenizer 
# token = text.Tokenizer()
# token.fit_on_texts(LABELED_CONCAT_STEMMED_TEXTS)
# word_index = token.word_index

# # convert text to sequence of tokens and pad them to ensure equal length vectors 
# text_train_seq = sequence.pad_sequences(token.texts_to_sequences(train_text), maxlen=70)
# text_test_seq = sequence.pad_sequences(token.texts_to_sequences(test_text), maxlen=70)

# # create token-embedding mapping
# embedding_matrix = np.zeros((len(word_index) + 1, 300))
# for word, i in word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

# Text / NLP based features

In [8]:
# %%time
# trainDF['char_count'] = trainDF['text'].apply(len)
# trainDF['word_count'] = trainDF['text'].apply(lambda x: len(x.split()))
# trainDF['word_density'] = trainDF['char_count'] / (trainDF['word_count']+1)
# trainDF['punctuation_count'] = trainDF['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
# trainDF['title_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
# trainDF['upper_case_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [9]:
# %%time

# import textblob

# pos_family = {
#     'noun' : ['NN','NNS','NNP','NNPS'],
#     'pron' : ['PRP','PRP$','WP','WP$'],
#     'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
#     'adj' :  ['JJ','JJR','JJS'],
#     'adv' : ['RB','RBR','RBS','WRB']
# }

# # function to check and get the part of speech tag count of a words in a given sentence
# def check_pos_tag(x, flag):
#     cnt = 0
#     try:
#         wiki = textblob.TextBlob(x)
#         for tup in wiki.tags:
#             ppo = list(tup)[1]
#             if ppo in pos_family[flag]:
#                 cnt += 1
#     except:
#         pass
#     return cnt

# trainDF['noun_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'noun'))
# trainDF['verb_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'verb'))
# trainDF['adj_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adj'))
# trainDF['adv_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adv'))
# trainDF['pron_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'pron'))

# Topic Models as features

In [10]:
# # train a LDA Model
# lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
# X_topics = lda_model.fit_transform(text_train_count)
# topic_word = lda_model.components_
# vocab = count_vect.get_feature_names()

# # view the topic models
# n_top_words = 10
# topic_summaries = []
# for i, topic_dist in enumerate(topic_word):
#     topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
#     topic_summaries.append(' '.join(topic_words))

In [16]:
%%time

from gensim import corpora, models

class MyLDAVectorizer:
    """Encode token-list documents as dense LDA topic-weight vectors.

    Args:
        TOPIC_COUNT: number of LDA topics, and the width of the output rows.
        random_state: forwarded to ``models.LdaModel(random_state=...)``.
    """

    def __init__(self, TOPIC_COUNT = 100, random_state = 100):
        self.topic_count = TOPIC_COUNT
        self.random_state = random_state

    def fit(self, stemmed_text):
        """Build the dictionary, the TF-IDF model and the LDA model.

        Args:
            stemmed_text: iterable of documents, each a list of tokens. It is
                iterated more than once.

        Returns:
            ``self``, so that ``fit(...).transform(...)`` can be chained.
        """
        self.dictionary = corpora.Dictionary(stemmed_text)
        _corpus = [self.dictionary.doc2bow(_doc) for _doc in stemmed_text]
        self.tfidf_model = models.TfidfModel(_corpus)
        _tfidf_corpus = self.tfidf_model[_corpus]

        # LDA is trained on the TF-IDF-weighted corpus, not on raw counts.
        self.lda_model = models.LdaModel(_tfidf_corpus,
                            num_topics = self.topic_count,
                            id2word = self.dictionary,
                            random_state = self.random_state,
                            eval_every = 5,
                            alpha = 'auto',
                            gamma_threshold = 0.01)
        return self

    def toarray(self, doc_topics):
        """Convert per-document ``(topic, weight)`` lists to a dense matrix.

        Args:
            doc_topics: sized iterable with one list of ``(topic, weight)``
                pairs per document.

        Returns:
            ``numpy.ndarray`` of shape ``(len(doc_topics), topic_count)``;
            topics missing from a document's list stay at 0.
        """
        _doc_vect = np.zeros((len(doc_topics), self.topic_count))

        for i, _doc in enumerate(doc_topics):
            for _topic, _weight in _doc:
                _doc_vect[i, _topic] = _weight

        return _doc_vect

    def transform(self, stemmed_text):
        """Return the dense topic-weight matrix for the given documents."""
        _corpus = [self.dictionary.doc2bow(_doc) for _doc in stemmed_text]
        _tfidf_corpus = self.tfidf_model[_corpus]

        return self.toarray(self.lda_model[_tfidf_corpus])

# Topic model is fitted on the training split only, then applied to both splits.
lda_vect = MyLDAVectorizer()
lda_vect.fit(train_stemmed_text)
train_lda, test_lda = [
    lda_vect.transform(_docs)
    for _docs in (train_stemmed_text, test_stemmed_text)
]

CPU times: user 24.7 s, sys: 401 ms, total: 25.1 s
Wall time: 7.85 s


# Doc2Vec

In [47]:
%%time

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np

class MyDoc2Vectorizer:
    """Encode token-list documents with a gensim ``Doc2Vec`` model.

    Args:
        size: dimensionality of the document vectors (``vector_size``).
        epochs: training epochs forwarded to ``Doc2Vec``.
        workers: worker-thread count forwarded to ``Doc2Vec``.
    """

    def __init__(self, size = 100, epochs = 40, workers = 4):
        self.embedding_size = size
        self.epochs = epochs
        self.workers = workers

    def fit(self, stemmed_text):
        """Train the Doc2Vec model on the given documents.

        Each document is tagged with its position in ``stemmed_text``.

        Returns:
            ``self``, so that ``fit(...).transform(...)`` can be chained.
        """
        _docs = [TaggedDocument(_doc, [i]) for i, _doc in enumerate(stemmed_text)]
        self.doc2vec_model = Doc2Vec(_docs,
                                     vector_size = self.embedding_size,
                                     epochs = self.epochs,
                                     workers = self.workers)
        return self

    def transform(self, stemmed_text):
        """Infer one vector per document and stack them into an array."""
        return np.asarray([self.doc2vec_model.infer_vector(_text) for _text in stemmed_text])

# 200-dimensional document vectors; the model is trained on the training split
# only and vectors are then inferred for both splits.
doc2vec_vect = MyDoc2Vectorizer(size = 200)
doc2vec_vect.fit(train_stemmed_text)
train_doc2vec, test_doc2vec = [
    doc2vec_vect.transform(_docs)
    for _docs in (train_stemmed_text, test_stemmed_text)
]

CPU times: user 48.7 s, sys: 621 ms, total: 49.3 s
Wall time: 18.8 s
