Packages:
* [NLTK](http://www.nltk.org/howto/classify.html)
* [SpaCy](https://spacy.io/)
* [AllenNLP](https://allennlp.org/tutorials)

Articles:
* [A Comprehensive Guide to Understand and Implement Text Classification in Python](https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/)
* [Machine Learning, NLP: Text Classification using scikit-learn, python and NLTK](https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a)
* [State-of-the-Art Text Classification using BERT model: “Predict the Happiness” Challenge](https://appliedmachinelearning.blog/2019/03/04/state-of-the-art-text-classification-using-bert-model-predict-the-happiness-hackerearth-challenge/)

In [129]:
import functools
import string
import time

import numpy as np
import pandas as pd
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn import decomposition, ensemble
from sklearn import model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tqdm import tqdm


stemmer = SnowballStemmer('english')
# Built once and shared: the original created a new WordNetLemmatizer for every token.
lemmatizer = WordNetLemmatizer()

@functools.lru_cache(maxsize=None)
def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then Snowball-stem the lemma.

    The result depends only on the token, and tokens repeat heavily across the
    corpus, so results are memoized; each distinct token is processed once.

    Args:
        text: one token (a hashable string).

    Returns:
        The stemmed lemma as a string.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize one document and derive its stemmed, stop-word-free variant.

    Args:
        text: raw document string.

    Returns:
        A ``(tokens, stemmed_tokens)`` tuple. ``tokens`` holds every token
        produced by ``simple_preprocess`` (minimum length 2); ``stemmed_tokens``
        holds ``lemmatize_stemming(token)`` for the tokens not in ``STOPWORDS``.
    """
    tokens = list(simple_preprocess(text, min_len = 2))
    stemmed_tokens = [lemmatize_stemming(_token)
                      for _token in tokens
                      if _token not in STOPWORDS]
    return (tokens, stemmed_tokens)

# Labels: one integer per line; lines that do not start with a digit are skipped.
with open("./data/raw/Hygiene/hygiene.dat.labels") as f:
    LABELS = [int(_line) for _line in f if _line[0].isdigit()]

# Raw documents, one per line (trailing newline kept).
with open("./data/raw/Hygiene/hygiene.dat") as f:
    ALL_RAW_TEXTS = f.readlines()

# Tokenize every document once, then split the (tokens, stemmed tokens) pairs.
_processed = [preprocess(_text) for _text in tqdm(ALL_RAW_TEXTS)]
ALL_TEXTS = [_pair[0] for _pair in _processed]
ALL_STEMMED_TEXTS = [_pair[1] for _pair in _processed]
ALL_CONCAT_STEMMED_TEXTS = [" ".join(_stems) for _stems in ALL_STEMMED_TEXTS]

# Only the first len(LABELS) documents are used as the labeled set.
_n_labeled = len(LABELS)

LABELED_TEXTS = ALL_TEXTS[0:_n_labeled]
LABELED_CONCAT_TEXTS = [" ".join(_tokens) for _tokens in LABELED_TEXTS]

LABELED_STEMMED_TEXTS = ALL_STEMMED_TEXTS[0:_n_labeled]
LABELED_CONCAT_STEMMED_TEXTS = [" ".join(_stems) for _stems in LABELED_STEMMED_TEXTS]

100%|██████████| 13299/13299 [03:48<00:00, 58.26it/s]


In [29]:
### Handle Extra Features

In [130]:
# code_map = 'abcdefghij'
code_map = '0123456789'

def encode_zipcode(zip_code):
    """Spell a non-negative integer digit by digit using ``code_map``.

    Each decimal digit ``d`` of ``zip_code`` is replaced by ``code_map[d]``,
    most significant digit first. With the default digit map this is the plain
    decimal string; the alternative letter map turns the code into a token with
    no numeric characters.

    Integer ``divmod`` is used instead of float division, so arbitrarily large
    integers keep all their digits. Zero encodes as ``code_map[0]`` instead of
    an empty string.

    Args:
        zip_code: integer-like value (Python int or numpy integer).

    Returns:
        The encoded string; empty for values that are not ``>= 0``
        (negative numbers, NaN).
    """
    if zip_code == 0:
        return code_map[0]

    _digits = []
    _rest = zip_code
    while _rest > 0:
        _rest, _digit = divmod(_rest, 10)
        _digits.append(code_map[_digit])

    # digits were collected least-significant first
    return ''.join(reversed(_digits))

# Additional per-document attributes, addressed by column position as used below:
# 0 -> cuisine text, 1 -> ZIP code, 2 -> review count, 3 -> star rating.
FEATURE_MORE = pd.read_csv("./data/raw/Hygiene/hygiene.dat.additional", header=None)
EXTRA_FEATURE = pd.DataFrame()

EXTRA_FEATURE['Cuisines'] = [simple_preprocess(_text) for _text in FEATURE_MORE[0]]

# Stars are bucketed relative to one standard deviation around the mean:
# below -> PoorStars, within -> StandardStars, above -> GoodStars.
# Assignments go through .loc: chained indexing (df[col][mask] = ...) writes to a
# temporary under pandas copy-on-write and raises SettingWithCopyWarning before that.
_stars = FEATURE_MORE[3]
star_std_range = (_stars.mean() - _stars.std(), _stars.mean() + _stars.std())
EXTRA_FEATURE['Stars'] = 'PoorStars'
EXTRA_FEATURE.loc[_stars >= star_std_range[0], 'Stars'] = 'StandardStars'
EXTRA_FEATURE.loc[_stars > star_std_range[1], 'Stars'] = 'GoodStars'

# Review counts are bucketed by fixed thresholds; later (larger) thresholds
# overwrite earlier ones, and anything with a count <= 2 stays 'NoReviews'.
_review_counts = FEATURE_MORE[2]
EXTRA_FEATURE['ReviewCount'] = 'NoReviews'
for _threshold, _bucket in ((2, "FewReviews"),
                            (6, "SomeReviews"),
                            (13, "ManyReviews"),
                            (50, "LotReviews")):
    EXTRA_FEATURE.loc[_review_counts > _threshold, 'ReviewCount'] = _bucket

EXTRA_FEATURE['ZIPCode'] = [encode_zipcode(_code) for _code in FEATURE_MORE[1]]

# One token list per row: cuisine tokens followed by the three categorical tokens.
# Columns are zipped by name instead of indexing iterrows() rows by position.
EXTRA_FEATURE['MergedText'] = [
    list(_cuisines) + [_star_bucket, _review_bucket, _zip_token]
    for _cuisines, _star_bucket, _review_bucket, _zip_token in zip(
        EXTRA_FEATURE['Cuisines'],
        EXTRA_FEATURE['Stars'],
        EXTRA_FEATURE['ReviewCount'],
        EXTRA_FEATURE['ZIPCode'])
]

In [156]:
",".join(EXTRA_FEATURE['MergedText'][0])

'vietnamese,sandwiches,restaurants,StandardStars,FewReviews,98118'

### Add Extra Features to the original text

In [157]:
LABELED_EXTRA_TEXTS = EXTRA_FEATURE['MergedText'].iloc[0:len(LABELS)]
LABELED_CONCAT_EXTRA_TEXTS = [",".join(_text) for _text in LABELED_EXTRA_TEXTS]

# Every labeled document needs exactly one row of extra features.
assert len(LABELED_EXTRA_TEXTS) == len(LABELS), "extra features do not cover all labeled documents"

# The combined lists are rebuilt from ALL_TEXTS / ALL_STEMMED_TEXTS rather than
# from LABELED_TEXTS / LABELED_STEMMED_TEXTS themselves, so re-running this cell
# does not prepend the extra-feature tokens a second time.
LABELED_TEXTS = [_extra + _text
                 for _extra, _text in zip(LABELED_EXTRA_TEXTS, ALL_TEXTS[0:len(LABELS)])]
LABELED_CONCAT_TEXTS = [" ".join(_text) for _text in LABELED_TEXTS]

LABELED_STEMMED_TEXTS = [_extra + _text
                         for _extra, _text in zip(LABELED_EXTRA_TEXTS, ALL_STEMMED_TEXTS[0:len(LABELS)])]
LABELED_CONCAT_STEMMED_TEXTS = [" ".join(_text) for _text in LABELED_STEMMED_TEXTS]

In [158]:
# "<extra features>|<raw review text>" for every labeled document, then split by label.
_labeled_raw_texts = [LABELED_CONCAT_EXTRA_TEXTS[i] + "|" + _text
                      for i, _text in enumerate(ALL_RAW_TEXTS[0:len(LABELS)])]
LABLED_RAW_TEXTS_1 = [_text for i, _text in enumerate(_labeled_raw_texts) if LABELS[i] == 1]
LABLED_RAW_TEXTS_0 = [_text for i, _text in enumerate(_labeled_raw_texts) if LABELS[i] == 0]

In [159]:
def _shorter_than(texts, limit = 1000):
    """Return the texts whose length in characters is below ``limit``."""
    return [_text for _text in texts if len(_text) < limit]

# Short examples of each class, for manual inspection.
SMALL_RAW_TEXTS_1 = _shorter_than(LABLED_RAW_TEXTS_1)
SMALL_RAW_TEXTS_0 = _shorter_than(LABLED_RAW_TEXTS_0)
print(len(SMALL_RAW_TEXTS_1), len(SMALL_RAW_TEXTS_0))

27 52


In [160]:
# Show one short example with label 1.
SMALL_RAW_TEXTS_1[24]

"breakfast,brunch,greek,restaurants,StandardStars,NoReviews,98199|Tickled to find a good breakfast in Magnolia - because there aren't any others. Great home cooking - thin potatoes and two great eggs. Coffee, orange juice and the Greek language.Perfect early morning.\n"

In [161]:
# Show one short example with label 0.
SMALL_RAW_TEXTS_0[24]

"pizza,restaurants,GoodStars,NoReviews,98109|Pizza Hut Internet Delivery is uncomplicated. They take cash and credit orders. Gave me an estimated time of delivery. And emailed the receipt. Really simple.A timely delivery in rain was enough to impress me. But the delivery woman was smiling and courteous. She gave me parmesan and pepper packets. Thanks.Pizza Hut couldn't stop there.... The Veggie-Lovers Pizza came piping hot. It was adorned by tons of fresh chopped veggies like more than I expected. So much so that the sauce couldn't even peek. Fer real.And today is Pizza Hut's Wing Wednesday. Thus decided to add some boneless wings with ranch dressing to my pizza order. Too much food. But it'll make enjoyable leftovers. Also they gave me a free liter of the new Pepsi Max on top of their newly lowered pizza prices. Sinful spinster.... I am!\n"

In [162]:
# Assemble the labeled texts and their labels into one dataframe.
# Alternative inputs tried for the two text columns:
#   LABELED_CONCAT_TEXTS / LABELED_TEXTS                (unstemmed tokens)
#   LABELED_CONCAT_EXTRA_TEXTS / LABELED_EXTRA_TEXTS    (extra features only)
labeled_df = pd.DataFrame()
labeled_df['concat_stemmed_text'] = LABELED_CONCAT_STEMMED_TEXTS
labeled_df['stemmed_text'] = LABELED_STEMMED_TEXTS
labeled_df['label'] = LABELS

# Hold out 20% of the documents for validation; the seed is fixed for repeatability.
_split = model_selection.train_test_split(
    labeled_df['concat_stemmed_text'],
    labeled_df['label'],
    test_size = 0.2,
    random_state = 10,
)
train_concat_stemmed_text, test_concat_stemmed_text, train_label, test_label = _split

# The token-list column is not passed through the splitter; it is selected by the
# row labels of the split so both text representations cover the same documents.
train_stemmed_text = labeled_df.loc[train_concat_stemmed_text.index, 'stemmed_text']
test_stemmed_text = labeled_df.loc[test_concat_stemmed_text.index, 'stemmed_text']

# No LabelEncoder is applied: the labels are already integers parsed from the labels file.

In [163]:
# dictionary = corpora.Dictionary(processed_docs)
# print("Before prunn:%d"%(len(dictionary)))
# dictionary.filter_extremes(no_below = 2, no_above = 0.5)
# print("After prunn:%d"%(len(dictionary)))
# corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Feature Engineering

## Count Vectors as features

In [164]:
class MyCountVectorizer:
    """Minimal fit/transform wrapper around sklearn's ``CountVectorizer``.

    The other vectorizers in this notebook subclass it and keep their underlying
    model in ``self.vect_model``.
    """

    def __init__(self, max_df = 0.5):
        """Create a word-level count vectorizer with the given ``max_df`` setting."""
        _model = CountVectorizer(analyzer='word', max_df = max_df)
        self.vect_model = _model

    def fit(self, texts_concat):
        """Fit the wrapped model on an iterable of document strings. Returns None."""
        self.vect_model.fit(texts_concat)

    def transform(self, texts_concat):
        """Return the wrapped model's transform of an iterable of document strings."""
        _features = self.vect_model.transform(texts_concat)
        return _features

# Fit the vocabulary on the training split only, then encode both splits with it.
count_vect = MyCountVectorizer()
count_vect.fit(train_concat_stemmed_text)
train_count = count_vect.transform(train_concat_stemmed_text)
test_count =  count_vect.transform(test_concat_stemmed_text)

## TF-IDF Vectors as features

In [165]:
%%time

# word level tf-idf
class MyTfidfVectorizer(MyCountVectorizer):
    """TF-IDF counterpart of ``MyCountVectorizer``; ``transform`` is inherited."""

    def __init__(self, analyzer='word', ngram_range = None, max_features=5000):
        """Build the wrapped ``TfidfVectorizer``.

        ``ngram_range`` is forwarded only when given, so sklearn's own default
        applies otherwise.
        """
        _options = {'analyzer': analyzer, 'max_features': max_features}
        if ngram_range is not None:
            _options['ngram_range'] = ngram_range
        self.vect_model = TfidfVectorizer(**_options)

    def fit(self, texts_concat):
        """Fit on document strings and expose the learned vocabulary as ``self.vocabulary``."""
        _model = self.vect_model
        _model.fit(texts_concat)
        self.vocabulary = _model.vocabulary_


# Three TF-IDF feature sets; each vectorizer is fitted on the training split only
# and then applied to both splits.

# word unigrams (default analyzer and n-gram range)
tfidf_vect = MyTfidfVectorizer()
tfidf_vect.fit(train_concat_stemmed_text)
train_tfidf =  tfidf_vect.transform(train_concat_stemmed_text)
test_tfidf =  tfidf_vect.transform(test_concat_stemmed_text)

# ngram level tf-idf 
# word 2- and 3-grams
tfidf_vect_ngram = MyTfidfVectorizer(ngram_range=(2,3))
tfidf_vect_ngram.fit(train_concat_stemmed_text)
train_tfidf_ngram =  tfidf_vect_ngram.transform(train_concat_stemmed_text)
test_tfidf_ngram =  tfidf_vect_ngram.transform(test_concat_stemmed_text)

# characters level tf-idf
# character 2- and 3-grams
tfidf_vect_ngram_chars = MyTfidfVectorizer(analyzer='char', ngram_range=(2,3))
tfidf_vect_ngram_chars.fit(train_concat_stemmed_text)
train_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_concat_stemmed_text) 
test_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_concat_stemmed_text) 

CPU times: user 8.44 s, sys: 180 ms, total: 8.62 s
Wall time: 8.16 s


## Word Embeddings

### Build from review corpus

In [166]:
# %%time

# from gensim.models.fasttext import FastText

# class MyFastTextTfidfVectorizer(MyCountVectorizer):
#     def __init__(self, tfidf_vectorizer, size = 100):
#         self.embedding_size = size
#         self.tfidf_vectorizer = tfidf_vectorizer
#         self.fasttext_model = FastText(size = size, window = 5, min_count = 5)

#     def tfidf2embedding(self, value_vector):
#         _weighted_value = np.zeros(self.embedding_size)
#         for key in self.tfidf_vectorizer.vocabulary:
#             _index = self.tfidf_vectorizer.vocabulary[key]
#             if value_vector[_index] != 0:
#                 _weighted_value += self.fasttext_model[key] * value_vector[_index]

#         return _weighted_value
    
#     def fit(self, texts):
#         _texts_concat = [" ".join(_text) for _text in texts]
#         self.tfidf_vectorizer = MyTfidfVectorizer()
#         self.tfidf_vectorizer.fit(_texts_concat)
        
#         self.fasttext_model.build_vocab(sentences = texts)
#         self.fasttext_model.train(sentences = texts, 
#                                   total_examples = len(texts), 
#                                   epochs=10)
        
#     def transform(self, texts):
#         _texts_concat = [" ".join(_text) for _text in texts]
#         _tfidf_values = self.tfidf_vectorizer.transform(_texts_concat)
#         return np.asarray([self.tfidf2embedding(_value.toarray()[0]) for _value in _tfidf_values])

# fasttext_tfidf_vect = MyFastTextTfidfVectorizer(tfidf_vect)
# fasttext_tfidf_vect.fit(ALL_STEMMED_TEXTS)
# train_fasttext_embedding = fasttext_tfidf_vect.transform(train_stemmed_text)
# test_fasttext_embedding = fasttext_tfidf_vect.transform(test_stemmed_text)

### Prebuilt Embedding

In [167]:
# %%time
# from keras.preprocessing import text, sequence
# from keras import layers, models, optimizers

# # load the pre-trained word-embedding vectors 
# embeddings_index = {}
# for i, line in enumerate(open('data/model/wiki-news-300d-1M.vec')):
#     values = line.split()
#     embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# # create a tokenizer 
# token = text.Tokenizer()
# token.fit_on_texts(LABELED_CONCAT_STEMMED_TEXTS)
# word_index = token.word_index

# # convert text to sequence of tokens and pad them to ensure equal length vectors 
# text_train_seq = sequence.pad_sequences(token.texts_to_sequences(train_text), maxlen=70)
# text_test_seq = sequence.pad_sequences(token.texts_to_sequences(test_text), maxlen=70)

# # create token-embedding mapping
# embedding_matrix = np.zeros((len(word_index) + 1, 300))
# for word, i in word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

## Text / NLP based features

In [168]:
# %%time
# trainDF['char_count'] = trainDF['text'].apply(len)
# trainDF['word_count'] = trainDF['text'].apply(lambda x: len(x.split()))
# trainDF['word_density'] = trainDF['char_count'] / (trainDF['word_count']+1)
# trainDF['punctuation_count'] = trainDF['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
# trainDF['title_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
# trainDF['upper_case_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [169]:
# %%time

# import textblob

# pos_family = {
#     'noun' : ['NN','NNS','NNP','NNPS'],
#     'pron' : ['PRP','PRP$','WP','WP$'],
#     'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
#     'adj' :  ['JJ','JJR','JJS'],
#     'adv' : ['RB','RBR','RBS','WRB']
# }

# # function to check and get the part of speech tag count of a words in a given sentence
# def check_pos_tag(x, flag):
#     cnt = 0
#     try:
#         wiki = textblob.TextBlob(x)
#         for tup in wiki.tags:
#             ppo = list(tup)[1]
#             if ppo in pos_family[flag]:
#                 cnt += 1
#     except:
#         pass
#     return cnt

# trainDF['noun_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'noun'))
# trainDF['verb_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'verb'))
# trainDF['adj_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adj'))
# trainDF['adv_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adv'))
# trainDF['pron_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'pron'))

## Topic Models as features

In [170]:
# # train a LDA Model
# lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
# X_topics = lda_model.fit_transform(text_train_count)
# topic_word = lda_model.components_
# vocab = count_vect.get_feature_names()

# # view the topic models
# n_top_words = 10
# topic_summaries = []
# for i, topic_dist in enumerate(topic_word):
#     topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
#     topic_summaries.append(' '.join(topic_words))

In [171]:
%%time

import os
from gensim import corpora, models

class MyLDAVectorizer(MyCountVectorizer):
    """Document-topic features from an LDA model trained through gensim's MALLET wrapper.

    Unlike the sklearn-backed vectorizers, ``fit`` and ``transform`` take
    tokenised documents (iterables of token lists), not joined strings.
    """
    # MALLET launcher, relative to the working directory.
    mallet_path = os.path.join("..", "mallet-2.0.8", "bin", "mallet")

    def __init__(self, TOPIC_COUNT = 100):
        """Remember how many topics the model should have."""
        self.topic_count = TOPIC_COUNT

    def fit(self, texts):
        """Build the dictionary and train the topic model on bag-of-words counts.

        A TF-IDF model is fitted as well and kept in ``self.tfidf_model``, but the
        MALLET model is trained on the raw count corpus. (A gensim ``LdaModel``
        trained on the TF-IDF corpus was an earlier, disabled alternative.)
        """
        self.dictionary = corpora.Dictionary(texts)
        bow_corpus = [self.dictionary.doc2bow(_tokens) for _tokens in texts]
        self.tfidf_model = models.TfidfModel(bow_corpus)

        self.vect_model = models.wrappers.LdaMallet(
            self.mallet_path,
            corpus = bow_corpus,
            num_topics = self.topic_count,
            id2word = self.dictionary,
        )

    def toarray(self, doc_topics):
        """Convert per-document ``(topic, weight)`` pairs into a dense 2-D array.

        Returns an array of shape ``(len(doc_topics), self.topic_count)``; topics
        missing from a document stay at zero.
        """
        dense = np.zeros((len(doc_topics), self.topic_count))

        for _row, _topic_weights in enumerate(doc_topics):
            for _topic, _weight in _topic_weights:
                dense[_row, _topic] = _weight

        return dense

    def transform(self, texts):
        """Return the dense topic-weight matrix for tokenised documents."""
        bow_corpus = [self.dictionary.doc2bow(_tokens) for _tokens in texts]
        doc_topics = self.vect_model[bow_corpus]
        return self.toarray(doc_topics)

# Train the topic model on the training token lists only, then encode both splits.
lda_vect = MyLDAVectorizer()
lda_vect.fit(train_stemmed_text)
train_lda = lda_vect.transform(train_stemmed_text)
test_lda = lda_vect.transform(test_stemmed_text)

CPU times: user 3.13 s, sys: 115 ms, total: 3.24 s
Wall time: 55.4 s


## Doc2Vec

In [172]:
%%time

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np

class MyDoc2Vectorizer(MyCountVectorizer):
    """Paragraph-vector features from gensim's ``Doc2Vec``.

    ``fit`` and ``transform`` take tokenised documents (iterables of token lists).
    """

    def __init__(self, size = 100):
        """Remember the embedding dimensionality."""
        self.embedding_size = size

    def fit(self, texts):
        """Train Doc2Vec; every document is tagged with its position in ``texts``."""
        tagged_docs = []
        for _position, _tokens in enumerate(texts):
            tagged_docs.append(TaggedDocument(_tokens, [_position]))

        self.vect_model = Doc2Vec(
            tagged_docs,
            vector_size = self.embedding_size,
            epochs = 40,
            workers = 4,
        )

    def transform(self, texts):
        """Infer one vector per document and stack them into an array."""
        vectors = [self.vect_model.infer_vector(_tokens) for _tokens in texts]
        return np.asarray(vectors)

# 200-dimensional document vectors, trained on the training token lists only.
doc2vec_vect = MyDoc2Vectorizer(size = 200)
# Alternative: train on the whole (labeled + unlabeled) corpus instead.
# doc2vec_vect.fit(ALL_STEMMED_TEXTS)
doc2vec_vect.fit(train_stemmed_text)
train_doc2vec = doc2vec_vect.transform(train_stemmed_text)
test_doc2vec = doc2vec_vect.transform(test_stemmed_text)

CPU times: user 29.1 s, sys: 546 ms, total: 29.7 s
Wall time: 11.9 s


In [173]:
#Test quality

import collections

# Sanity check: re-infer a vector for each training document and record at which
# rank the document itself appears among the most similar trained vectors.
_d2v_model = doc2vec_vect.vect_model

ranks = []
for doc_id in range(len(train_stemmed_text)):
    # Documents were tagged by position in fit(), so they are looked up by position here.
    inferred_vector = _d2v_model.infer_vector(train_stemmed_text.iloc[doc_id])
    sims = _d2v_model.docvecs.most_similar([inferred_vector], topn=len(_d2v_model.docvecs))
    ranked_ids = [docid for docid, sim in sims]
    ranks.append(ranked_ids.index(doc_id))

collections.Counter(ranks)

Counter({0: 436})

# Models

In [174]:
from sklearn import linear_model, naive_bayes, metrics, svm
from sklearn.model_selection import GridSearchCV

def show_score(classifier_name, scores):
    """Print one summary line for a classifier.

    Args:
        classifier_name: label shown in square brackets at the end of the line.
        scores: 4-tuple ``(accuracy, precision, recall, f1)``.
    """
    _metrics_text = "Accuracy:%0.2f Precission:%0.2f Recall:%0.2f F1:%0.2f" % scores
    _name_text = "-> [%s]" % (classifier_name)
    print(_metrics_text, _name_text)
    
def train_model(classifier, train_feature, train_label, test_feature, test_label, is_neural_net=False):
    """Fit a classifier, print its test confusion matrix and return its scores.

    Args:
        classifier: estimator exposing ``fit`` and ``predict``.
        train_feature, train_label: training features and labels.
        test_feature, test_label: held-out features and labels.
        is_neural_net: when true, ``predict`` output is reduced with
            ``argmax(axis=-1)`` before scoring.

    Returns:
        ``(accuracy, precision, recall, f1)`` computed on the held-out data.
    """
    classifier.fit(train_feature, train_label)
    predictions = classifier.predict(test_feature)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    print(metrics.confusion_matrix(test_label, predictions))

    _scorers = (metrics.accuracy_score,
                metrics.precision_score,
                metrics.recall_score,
                metrics.f1_score)
    return tuple(_scorer(test_label, predictions) for _scorer in _scorers)

def run_model(classfier_configs):
    """Train and score every configured classifier, printing one summary line each.

    Args:
        classfier_configs: dict mapping a display name to a
            ``(classifier, train_features, test_features)`` sequence.

    Labels come from the notebook-level ``train_label`` / ``test_label``.
    """
    for _name, _config in classfier_configs.items():
        _classifier, _train_feature, _test_feature = _config[0], _config[1], _config[2]
        scores = train_model(_classifier, _train_feature, train_label, _test_feature, test_label)
        show_score(_name, scores)
        
class ClassifierConfig:
    """Plain holder for a classifier together with its train and test features."""

    def __init__(self, classifier, train, test):
        self.classifier, self.train, self.test = classifier, train, test

## Naive Bayes 

In [175]:
# Multinomial Naive Bayes, one fresh estimator per feature set.
# Each entry: display name -> (classifier, train features, test features).
# NOTE(review): Doc2Vec features are not listed here, presumably because
# MultinomialNB expects non-negative inputs — confirm.
nb_config = {
    "NB Count": (naive_bayes.MultinomialNB(), train_count, test_count),
    "NB TFIDF": (naive_bayes.MultinomialNB(), train_tfidf, test_tfidf),
    "NB TFIDF NGram": (naive_bayes.MultinomialNB(), train_tfidf_ngram, test_tfidf_ngram),
    "NB TFIDF NGram Chars": (naive_bayes.MultinomialNB(), train_tfidf_ngram_chars, test_tfidf_ngram_chars),
    "NB LDA": (naive_bayes.MultinomialNB(), train_lda, test_lda),
}

run_model(nb_config)

[[34 20]
 [20 36]]
Accuracy:0.64 Precission:0.64 Recall:0.64 F1:0.64 -> [NB Count]
[[29 25]
 [17 39]]
Accuracy:0.62 Precission:0.61 Recall:0.70 F1:0.65 -> [NB TFIDF]
[[34 20]
 [23 33]]
Accuracy:0.61 Precission:0.62 Recall:0.59 F1:0.61 -> [NB TFIDF NGram]
[[17 37]
 [ 8 48]]
Accuracy:0.59 Precission:0.56 Recall:0.86 F1:0.68 -> [NB TFIDF NGram Chars]
[[42 12]
 [27 29]]
Accuracy:0.65 Precission:0.71 Recall:0.52 F1:0.60 -> [NB LDA]


In [145]:
nb_model = naive_bayes.MultinomialNB()
nb_model.fit(train_tfidf, train_label)
predictions = nb_model.predict(test_tfidf)
predictions_probs = nb_model.predict_proba(test_tfidf)

# Custom decision rule: predict 0 whenever the first probability column exceeds
# 0.4, otherwise 1 (instead of the model's own decision in `predictions`).
new_predictions = [0 if _probs[0] > 0.4 else 1 for _probs in predictions_probs]

# Report the default decision first, then the thresholded one.
for _predicted in (predictions, new_predictions):
    print(metrics.confusion_matrix(test_label, _predicted))
    print(metrics.accuracy_score(test_label, _predicted),
          metrics.precision_score(test_label, _predicted),
          metrics.recall_score(test_label, _predicted),
          metrics.f1_score(test_label, _predicted))

[[29 25]
 [17 39]]
0.6181818181818182 0.609375 0.6964285714285714 0.65
[[44 10]
 [26 30]]
0.6727272727272727 0.75 0.5357142857142857 0.6250000000000001


Accuracy:0.55 Precission:0.62 Recall:0.52 F1:0.56 -> [NB Count]
Accuracy:0.51 Precission:0.88 Recall:0.49 F1:0.63 -> [NB TFIDF]
Accuracy:0.52 Precission:0.58 Recall:0.49 F1:0.53 -> [NB TFIDF NGram]
Accuracy:0.49 Precission:0.96 Recall:0.48 F1:0.64 -> [NB TFIDF NGram Chars]
Accuracy:0.55 Precission:0.69 Recall:0.52 F1:0.60 -> [NB LDA]

##  Linear Classifier

In [146]:
from sklearn.linear_model import LogisticRegression

# linear_parameters = {'penalty':('l1', 'l2'), 'C':[10, 1, 0.1, 0.01]}
# linear_model = GridSearchCV(LogisticRegression(solver='saga'), scoring = "f1",param_grid = linear_parameters, cv=5)

# linear_model.fit(train_count, train_label)

# predictions = linear_model.predict(test_count)

# (metrics.accuracy_score(predictions, test_label),
#             metrics.precision_score(predictions, test_label),
#             metrics.recall_score(predictions, test_label),
#             metrics.f1_score(predictions, test_label))

# print(linear_model.best_params_)

In [147]:
# Logistic regression per feature set.
# Each entry: display name -> (classifier, train features, test features).
# The sparse text features use the saga solver with explicit C / penalty / tol;
# the dense LDA and Doc2Vec features use lbfgs with default regularisation.
linear_config = {
    "Linear Count": (LogisticRegression(C= 0.1, penalty='l1', tol=0.1, solver='saga'), train_count, test_count)
    , "Linear TFIDF": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), train_tfidf, test_tfidf)
    , "Linear TFIDF NGram": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), train_tfidf_ngram, test_tfidf_ngram)
    , "Linear TFIDF NGram Chars": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), train_tfidf_ngram_chars, test_tfidf_ngram_chars)
#     , "Linear FastText Embedding": (LogisticRegression(solver='lbfgs', max_iter=int(1e6)), train_fasttext_embedding, test_fasttext_embedding)
    , "Linear LDA": (LogisticRegression(solver='lbfgs'), train_lda, test_lda)
    , "Linear Doc2Vec": (LogisticRegression(solver='lbfgs'), train_doc2vec, test_doc2vec)
}

# Earlier baseline with all-default estimators, kept for reference.
# linear_classifiers = {
#     "Linear Count": (LogisticRegression(), train_count, test_count)
#     , "Linear TFIDF": (LogisticRegression(), train_tfidf, test_tfidf)
#     , "Linear TFIDF NGram": (LogisticRegression(), train_tfidf_ngram, test_tfidf_ngram)
#     , "Linear TFIDF NGram Chars": (LogisticRegression(), train_tfidf_ngram_chars, test_tfidf_ngram_chars)
#     , "Linear FastText Embedding": (LogisticRegression(), train_fasttext_embedding, test_fasttext_embedding)
#     , "Linear LDA": (LogisticRegression(), train_lda, test_lda)
#     , "Linear Doc2Vec": (LogisticRegression(), train_doc2vec, test_doc2vec)
# }

run_model(linear_config)

[[ 9 45]
 [10 46]]
Accuracy:0.50 Precission:0.51 Recall:0.82 F1:0.63 -> [Linear Count]
[[42 12]
 [25 31]]
Accuracy:0.66 Precission:0.72 Recall:0.55 F1:0.63 -> [Linear TFIDF]
[[36 18]
 [23 33]]
Accuracy:0.63 Precission:0.65 Recall:0.59 F1:0.62 -> [Linear TFIDF NGram]
[[21 33]
 [14 42]]
Accuracy:0.57 Precission:0.56 Recall:0.75 F1:0.64 -> [Linear TFIDF NGram Chars]
[[42 12]
 [25 31]]
Accuracy:0.66 Precission:0.72 Recall:0.55 F1:0.63 -> [Linear LDA]
[[29 25]
 [31 25]]
Accuracy:0.49 Precission:0.50 Recall:0.45 F1:0.47 -> [Linear Doc2Vec]


## SVM

In [148]:
from sklearn.svm import SVC

# svm_parameters = {'kernel':('linear', 'rbf'), 'C':[0.1, 1, 10, 100], "gamma": np.logspace(-2, 2, 5)}
# svm_model = GridSearchCV(SVC(), scoring = "f1", param_grid = svm_parameters, cv=5)

# svm_model.fit(train_tfidf_ngram_chars, train_label)

# predictions = svm_model.predict(test_tfidf_ngram_chars)

# print(metrics.accuracy_score(predictions, test_label),
#             metrics.precision_score(predictions, test_label),
#             metrics.recall_score(predictions, test_label),
#             metrics.f1_score(predictions, test_label))
# print(svm_model.best_params_)

In [149]:
# Support vector classifiers per feature set, with hand-set C / gamma / kernel.
# Each entry: display name -> (classifier, train features, test features).
# NOTE(review): the gamma passed to the linear-kernel entry should have no
# effect on that model — confirm against the SVC documentation.
svm_config = {
    "SVM Count": (SVC(C = 1, gamma = 0.1, kernel='rbf'), train_count, test_count),
    "SVM TFIDF": (SVC(C = 10, gamma = 0.01, kernel='rbf'), train_tfidf, test_tfidf),
    "SVM TFIDF NGram": (SVC(C = 1, gamma = 0.01, kernel='linear'), train_tfidf_ngram, test_tfidf_ngram),
    "SVM TFIDF NGram Chars": (SVC(C = 10, gamma = 0.01, kernel='rbf'), train_tfidf_ngram_chars, test_tfidf_ngram_chars),
#     "SVM FastText Embedding": (SVC(C = 1, gamma = 0.1, kernel='rbf'), train_fasttext_embedding, test_fasttext_embedding),
    "SVM LDA": (SVC(C = 1, gamma = 0.1, kernel='rbf'), train_lda, test_lda),
    "SVM Doc2Vec": (SVC(C = 1, gamma = 0.1, kernel='rbf'), train_doc2vec, test_doc2vec),
}
    
run_model(svm_config)

[[ 2 52]
 [ 3 53]]
Accuracy:0.50 Precission:0.50 Recall:0.95 F1:0.66 -> [SVM Count]
[[46  8]
 [28 28]]
Accuracy:0.67 Precission:0.78 Recall:0.50 F1:0.61 -> [SVM TFIDF]
[[34 20]
 [22 34]]
Accuracy:0.62 Precission:0.63 Recall:0.61 F1:0.62 -> [SVM TFIDF NGram]
[[13 41]
 [ 8 48]]
Accuracy:0.55 Precission:0.54 Recall:0.86 F1:0.66 -> [SVM TFIDF NGram Chars]
[[54  0]
 [56  0]]
Accuracy:0.49 Precission:0.00 Recall:0.00 F1:0.00 -> [SVM LDA]
[[15 39]
 [10 46]]
Accuracy:0.55 Precission:0.54 Recall:0.82 F1:0.65 -> [SVM Doc2Vec]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Boosting

In [150]:
# %%time

from xgboost import XGBClassifier

# xgb_parameters = {
#         'min_child_weight': [1, 5, 10],
#         'gamma': [0.5, 1, 1.5, 2, 5],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'max_depth': [3, 4, 5]
#         }

# xgb_model = GridSearchCV(XGBClassifier(), scoring = "f1", param_grid = xgb_parameters, cv=5, verbose = 3)

# xgb_model.fit(train_count, train_label)

# predictions = xgb_model.predict(test_count)

# print(metrics.accuracy_score(predictions, test_label),
#             metrics.precision_score(predictions, test_label),
#             metrics.recall_score(predictions, test_label),
#             metrics.f1_score(predictions, test_label))

# print(xgb_model.best_params_)

In [151]:
# xgboost_config = {
#     "XGBoost Count": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_count, test_count)
#     , "XGBoost TFIDF": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_tfidf, test_tfidf)
#     , "XGBoost TFIDF NGram": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_tfidf_ngram.tocsc(), test_tfidf_ngram.tocsc())
#     , "XGBoost TFIDF NGram Chars": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_tfidf_ngram_chars, test_tfidf_ngram_chars)
#     , "XGBoost LDA": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_lda, test_lda)
#     , "XGBoost Doc2Vec": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_doc2vec, test_doc2vec)
# }

# Gradient-boosted trees with default hyper-parameters, one estimator per feature set.
# Each entry: display name -> (classifier, train features, test features).
# The word n-gram matrices are converted to CSC before being handed to XGBoost;
# the other sparse matrices are passed as produced by the vectorizers.
xgboost_config = {
    "XGBoost Count": (XGBClassifier(), train_count, test_count)
    , "XGBoost TFIDF": (XGBClassifier(), train_tfidf, test_tfidf)
    , "XGBoost TFIDF NGram": (XGBClassifier(), train_tfidf_ngram.tocsc(), test_tfidf_ngram.tocsc())
    , "XGBoost TFIDF NGram Chars": (XGBClassifier(), train_tfidf_ngram_chars, test_tfidf_ngram_chars)
    , "XGBoost LDA": (XGBClassifier(), train_lda, test_lda)
    , "XGBoost Doc2Vec": (XGBClassifier(), train_doc2vec, test_doc2vec)
}

run_model(xgboost_config)

[[37 17]
 [23 33]]
Accuracy:0.64 Precission:0.66 Recall:0.59 F1:0.62 -> [XGBoost Count]
[[31 23]
 [20 36]]
Accuracy:0.61 Precission:0.61 Recall:0.64 F1:0.63 -> [XGBoost TFIDF]
[[37 17]
 [30 26]]
Accuracy:0.57 Precission:0.60 Recall:0.46 F1:0.53 -> [XGBoost TFIDF NGram]
[[24 30]
 [24 32]]
Accuracy:0.51 Precission:0.52 Recall:0.57 F1:0.54 -> [XGBoost TFIDF NGram Chars]
[[40 14]
 [28 28]]
Accuracy:0.62 Precission:0.67 Recall:0.50 F1:0.57 -> [XGBoost LDA]
[[33 21]
 [26 30]]
Accuracy:0.57 Precission:0.59 Recall:0.54 F1:0.56 -> [XGBoost Doc2Vec]


In [26]:
from nltk.classify import NaiveBayesClassifier

# NLTK Naive Bayes on gensim bag-of-words features.
# Statement order matters in this cell: train_doc_count / test_doc_count are first
# bag-of-words lists, are used in that form to fit and apply the TF-IDF model, and
# are only afterwards rebound to the featureset form that NLTK consumes.
_dictionary = corpora.Dictionary(train_stemmed_text)
train_doc_count = [_dictionary.doc2bow(_doc) for _doc in train_stemmed_text]
test_doc_count = [_dictionary.doc2bow(_doc) for _doc in test_stemmed_text]

# TF-IDF featuresets: one dict per document, paired with its label for training.
tfidf_model = models.TfidfModel(train_doc_count)
train_doc_tfidf = [dict(_doc) for _doc in tfidf_model[train_doc_count]]
# train_label keeps the dataframe's row labels, so the i-th label is looked up via its index.
train_doc_tfidf = [(_doc, train_label[train_label.index[i]]) for i, _doc in enumerate(train_doc_tfidf)]
test_doc_tfidf = [dict(_doc) for _doc in tfidf_model[test_doc_count]]

# Count featuresets, built the same way (rebinding the bag-of-words lists).
train_doc_count = [(dict(_doc), train_label[train_label.index[i]]) for i, _doc in enumerate(train_doc_count)]
test_doc_count = [dict(_doc) for _doc in test_doc_count]

NB_classifier_count = NaiveBayesClassifier.train(train_doc_count)

predictions = NB_classifier_count.classify_many(test_doc_count)

print(metrics.confusion_matrix(test_label, predictions))
print(metrics.accuracy_score(test_label, predictions),
            metrics.precision_score(test_label, predictions),
            metrics.recall_score(test_label, predictions),
            metrics.f1_score(test_label, predictions))

[[ 3 51]
 [ 5 51]]
0.4909090909090909 0.5 0.9107142857142857 0.6455696202531646


In [27]:
# Same NLTK Naive Bayes classifier, trained on the TF-IDF featuresets built in the previous cell.
NB_classifier_tfidf = NaiveBayesClassifier.train(train_doc_tfidf)
predictions = NB_classifier_tfidf.classify_many(test_doc_tfidf)

print(metrics.confusion_matrix(test_label, predictions))
print(metrics.accuracy_score(test_label, predictions),
            metrics.precision_score(test_label, predictions),
            metrics.recall_score(test_label, predictions),
            metrics.f1_score(test_label, predictions))

[[ 0 54]
 [ 0 56]]
0.509090909090909 0.509090909090909 1.0 0.6746987951807228


In [28]:
from nltk.classify import DecisionTreeClassifier


# NLTK decision tree on the TF-IDF featuresets.
# NOTE(review): despite the "_count" suffix, this classifier is trained and
# evaluated on the TF-IDF features, not the count features.
DT_classifier_count = DecisionTreeClassifier.train(train_doc_tfidf)
predictions = DT_classifier_count.classify_many(test_doc_tfidf)

print(metrics.confusion_matrix(test_label, predictions))
print(metrics.accuracy_score(test_label, predictions),
            metrics.precision_score(test_label, predictions),
            metrics.recall_score(test_label, predictions),
            metrics.f1_score(test_label, predictions))

[[54  0]
 [56  0]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.4909090909090909 0.0 0.0 0.0
