Packages:
* [NLTK](http://www.nltk.org/howto/classify.html)
* [SpaCy](https://spacy.io/)
* [AllenNLP](https://allennlp.org/tutorials)

Articles:
* [A Comprehensive Guide to Understand and Implement Text Classification in Python](https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/)
* [Machine Learning, NLP: Text Classification using scikit-learn, python and NLTK](https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a)
* [State-of-the-Art Text Classification using BERT model: “Predict the Happiness” Challenge](https://appliedmachinelearning.blog/2019/03/04/state-of-the-art-text-classification-using-bert-model-predict-the-happiness-hackerearth-challenge/)

In [98]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from sklearn import model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from tqdm import tqdm
import numpy as np
import pandas as pd
import string
import time


stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    result_stemmed = []
    for token in simple_preprocess(text, min_len = 2):
        result.append(token)
        if token not in STOPWORDS:
            result_stemmed.append(lemmatize_stemming(token))
    
    return (result, result_stemmed)

with open("./data/raw/Hygiene/hygiene.dat.labels") as f:
    LABELS = [int(l) for l in f.readlines() if l[0].isdigit()]

ALL_RAW_TEXTS = []
ALL_TEXTS = []
ALL_STEMMED_TEXTS = []
ALL_CONCAT_STEMMED_TEXTS = []
LABELED_TEXTS = []
LABELED_CONCAT_TEXTS = []
LABELED_STEMMED_TEXTS = []
LABELED_CONCAT_STEMMED_TEXTS = []

with open("./data/raw/Hygiene/hygiene.dat") as f:
    ALL_RAW_TEXTS = f.readlines()

for _text in tqdm(ALL_RAW_TEXTS):
    _result, _result_stemmed = preprocess(_text)
    ALL_TEXTS.append(_result)
    ALL_STEMMED_TEXTS.append(_result_stemmed)

ALL_CONCAT_STEMMED_TEXTS = [" ".join(_text) for _text in ALL_STEMMED_TEXTS]

LABELED_TEXTS = ALL_TEXTS[0:len(LABELS)]
LABELED_CONCAT_TEXTS = [" ".join(_text) for _text in LABELED_TEXTS]

LABELED_STEMMED_TEXTS = ALL_STEMMED_TEXTS[0:len(LABELS)]
LABELED_CONCAT_STEMMED_TEXTS = [" ".join(_text) for _text in LABELED_STEMMED_TEXTS]

100%|█████████████████████████████████████████████████████████████████████████████████████████| 13299/13299 [03:30<00:00, 63.32it/s]


In [177]:
### Handle Extra Features

In [99]:
code_map = 'abcdefghij'
# code_map = '0123456789'

def encode_zipcode(zip_code):
    _code = ''
    _rest = zip_code    
    while _rest > 0:
        _code = code_map[_rest % 10] + _code
        _rest = int(_rest / 10)

    return _code

FEATURE_MORE =pd.read_csv("./data/raw/Hygiene/hygiene.dat.additional", header=None)
EXTRA_FEATURE = pd.DataFrame()

EXTRA_FEATURE['Cuisines'] = [simple_preprocess(_text) for _text in FEATURE_MORE[0]]

EXTRA_FEATURE['Stars'] = ['PoorStars' for _star in FEATURE_MORE[3]]
star_std_range = (FEATURE_MORE[3].mean() - FEATURE_MORE[3].std(), FEATURE_MORE[3].mean() + FEATURE_MORE[3].std())
# EXTRA_FEATURE['Stars'][FEATURE_MORE[3] < star_std_range[0]] = 'PoorStars'
EXTRA_FEATURE['Stars'][(FEATURE_MORE[3] >= star_std_range[0])] = 'StandardStars'
EXTRA_FEATURE['Stars'][FEATURE_MORE[3] > star_std_range[1]] = 'GoodStars'

EXTRA_FEATURE['ReviewCount'] = ['NoReviews' for _star in FEATURE_MORE[2]]
EXTRA_FEATURE['ReviewCount'][FEATURE_MORE[2] > 2] = "FewReviews"
EXTRA_FEATURE['ReviewCount'][FEATURE_MORE[2] > 6] = "SomeReviews"
EXTRA_FEATURE['ReviewCount'][FEATURE_MORE[2] > 13] = "ManyReviews"
EXTRA_FEATURE['ReviewCount'][FEATURE_MORE[2] > 50] = "LotReviews"

EXTRA_FEATURE['ZIPCode'] = [encode_zipcode(_code) for _code in FEATURE_MORE[1]]

EXTRA_FEATURE['MergedText'] = [ [_cuisine for _cuisine in _record[1][0]]
                                + [_record[1][1]] + [_record[1][2]] + [_record[1][3]]
                               for _record in EXTRA_FEATURE.iterrows()]

In [224]:
vect = CountVectorizer(analyzer='word')

vect.fit(FEATURE_MORE[0].replace('Restaurants', '', regex=True))
CUISINES_FEATURE = vect.transform(FEATURE_MORE[0]).todense()

_zipcode = [str(i) for i in FEATURE_MORE[1]]
vect.fit(_zipcode)
ZIPCODE_FEATURE = vect.transform(_zipcode).todense()

REVIEW_COUNT_FEATURE = np.array(FEATURE_MORE[2]).reshape((len(FEATURE_MORE[2]),1))
RATING_FEATURE = np.array(FEATURE_MORE[3]).reshape((len(FEATURE_MORE[3]),1))

ALL_EXTRA_FEATURE = np.hstack((CUISINES_FEATURE, ZIPCODE_FEATURE, REVIEW_COUNT_FEATURE, RATING_FEATURE))

### Add Extra Features to the original text

In [183]:
EXTRA_FEATURES = EXTRA_FEATURE['MergedText'][0:len(LABELS)]
EXTRA_CONCAT_FEATURES = [" ".join(_text) for _text in EXTRA_FEATURES]
                  
LABELED_EXTRA_TEXTS =  [EXTRA_FEATURES[i] + _text for i, _text in enumerate(LABELED_TEXTS)]
LABELED_EXTRA_CONCAT_TEXTS = [" ".join(_text) for _text in LABELED_EXTRA_TEXTS]

LABELED_EXTRA_STEMMED_TEXTS = [LABELED_EXTRA_TEXTS[i] + _text for i, _text in enumerate(LABELED_STEMMED_TEXTS)]
LABELED_EXTRA_CONCAT_STEMMED_TEXTS = [" ".join(_text) for _text in LABELED_EXTRA_STEMMED_TEXTS]

In [185]:
LABLED_RAW_TEXTS_1 = [EXTRA_CONCAT_FEATURES[i] + "|" + _text for i, _text in enumerate(ALL_RAW_TEXTS[0:len(LABELS)]) if LABELS[i] == 1]
LABLED_RAW_TEXTS_0 = [EXTRA_CONCAT_FEATURES[i] + "|" + _text for i, _text in enumerate(ALL_RAW_TEXTS[0:len(LABELS)]) if LABELS[i] == 0]

In [186]:
SMALL_RAW_TEXTS_1 = [_text for _text in LABLED_RAW_TEXTS_1 if len(_text) < 1000]
SMALL_RAW_TEXTS_0 = [_text for _text in LABLED_RAW_TEXTS_0 if len(_text) < 1000]
print(len(SMALL_RAW_TEXTS_1), len(SMALL_RAW_TEXTS_0))

27 52


In [187]:
SMALL_RAW_TEXTS_1[24]

"breakfast brunch greek restaurants StandardStars NoReviews jibjj|Tickled to find a good breakfast in Magnolia - because there aren't any others. Great home cooking - thin potatoes and two great eggs. Coffee, orange juice and the Greek language.Perfect early morning.\n"

In [188]:
SMALL_RAW_TEXTS_0[24]

"pizza restaurants GoodStars NoReviews jibaj|Pizza Hut Internet Delivery is uncomplicated. They take cash and credit orders. Gave me an estimated time of delivery. And emailed the receipt. Really simple.A timely delivery in rain was enough to impress me. But the delivery woman was smiling and courteous. She gave me parmesan and pepper packets. Thanks.Pizza Hut couldn't stop there.... The Veggie-Lovers Pizza came piping hot. It was adorned by tons of fresh chopped veggies like more than I expected. So much so that the sauce couldn't even peek. Fer real.And today is Pizza Hut's Wing Wednesday. Thus decided to add some boneless wings with ranch dressing to my pizza order. Too much food. But it'll make enjoyable leftovers. Also they gave me a free liter of the new Pepsi Max on top of their newly lowered pizza prices. Sinful spinster.... I am!\n"

## Build train/test dataset

In [189]:
# create a dataframe using texts and lables
labeled_df = pd.DataFrame()
labeled_df['concat_stemmed_text'] = LABELED_CONCAT_STEMMED_TEXTS
labeled_df['stemmed_text'] = LABELED_STEMMED_TEXTS
# labeled_df['concat_stemmed_text'] = LABELED_CONCAT_TEXTS
# labeled_df['stemmed_text'] = LABELED_TEXTS
# labeled_df['concat_stemmed_text'] = LABELED_CONCAT_EXTRA_TEXTS
# labeled_df['stemmed_text'] = LABELED_EXTRA_TEXTS
labeled_df['label'] = LABELS

# split the dataset into training and validation datasets 
train_concat_stemmed_text, test_concat_stemmed_text, train_label, test_label = model_selection.train_test_split(labeled_df['concat_stemmed_text'], 
                                                                                  labeled_df['label'],
                                                                                  test_size = 0.2,
                                                                                  random_state = 10)
train_stemmed_text = labeled_df['stemmed_text'][train_concat_stemmed_text.index]
test_stemmed_text = labeled_df['stemmed_text'][test_concat_stemmed_text.index]

# # label encode the target variable 
# encoder = preprocessing.LabelEncoder()
# train_label = encoder.fit_transform(train_label)
# test_label = encoder.fit_transform(test_label)

In [190]:
# dictionary = corpora.Dictionary(processed_docs)
# print("Before prunn:%d"%(len(dictionary)))
# dictionary.filter_extremes(no_below = 2, no_above = 0.5)
# print("After prunn:%d"%(len(dictionary)))
# corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Feature Engineering

## Count Vectors as features

In [247]:
class MyCountVectorizer:
    def __init__(self, max_df = 0.5):
        self.vect_model = CountVectorizer(analyzer='word', max_df = max_df)
        
    def fit(self, texts_concat):
        self.vect_model.fit(texts_concat)
    
    def transform(self, texts_concat):
        return self.vect_model.transform(texts_concat)

count_vect = MyCountVectorizer()
count_vect.fit(train_concat_stemmed_text)
train_count = count_vect.transform(train_concat_stemmed_text)
test_count =  count_vect.transform(test_concat_stemmed_text)
labeled_count = count_vect.transform(LABELED_CONCAT_STEMMED_TEXTS)
labeled_count_extra = np.hstack([labeled_count.todense(), ALL_EXTRA_FEATURE[0:len(LABELS)]])

## TF-IDF Vectors as features

In [248]:
%%time

# word level tf-idf
class MyTfidfVectorizer(MyCountVectorizer):
    def __init__(self, analyzer='word', ngram_range = None, max_features=5000):
        if ngram_range is None:
            self.vect_model = TfidfVectorizer(analyzer = analyzer, max_features = max_features)
        else:
            self.vect_model = TfidfVectorizer(analyzer = analyzer, ngram_range = ngram_range, 
                                              max_features = max_features)
            
    def fit(self, texts_concat):   
        self.vect_model.fit(texts_concat)
        self.vocabulary = self.vect_model.vocabulary_


tfidf_vect = MyTfidfVectorizer(max_features = 10000)
tfidf_vect.fit(train_concat_stemmed_text)
train_tfidf =  tfidf_vect.transform(train_concat_stemmed_text)
test_tfidf =  tfidf_vect.transform(test_concat_stemmed_text)
labeled_tfidf =  tfidf_vect.transform(LABELED_CONCAT_STEMMED_TEXTS)
labeled_tfidf_extra = np.hstack([labeled_tfidf.todense(), ALL_EXTRA_FEATURE[0:len(LABELS)]])

# ngram level tf-idf 
tfidf_vect_ngram = MyTfidfVectorizer(ngram_range=(2,3), max_features = 10000)
tfidf_vect_ngram.fit(train_concat_stemmed_text)
train_tfidf_ngram =  tfidf_vect_ngram.transform(train_concat_stemmed_text)
test_tfidf_ngram =  tfidf_vect_ngram.transform(test_concat_stemmed_text)
labeled_tfidf_ngram = tfidf_vect_ngram.transform(LABELED_CONCAT_STEMMED_TEXTS)
labeled_tfidf_ngram_extra = np.hstack([labeled_tfidf_ngram.todense(), ALL_EXTRA_FEATURE[0:len(LABELS)]])

# characters level tf-idf
tfidf_vect_ngram_chars = MyTfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features = 10000)
tfidf_vect_ngram_chars.fit(train_concat_stemmed_text)
train_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_concat_stemmed_text) 
test_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_concat_stemmed_text)
labeled_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(LABELED_CONCAT_STEMMED_TEXTS)
labeled_tfidf_ngram_chars_extra = np.hstack([labeled_tfidf_ngram_chars.todense(), ALL_EXTRA_FEATURE[0:len(LABELS)]])

Wall time: 14.2 s


## Word Embeddings

### Build from review corpus

In [193]:
# %%time

# from gensim.models.fasttext import FastText

# class MyFastTextTfidfVectorizer(MyCountVectorizer):
#     def __init__(self, tfidf_vectorizer, size = 100):
#         self.embedding_size = size
#         self.tfidf_vectorizer = tfidf_vectorizer
#         self.fasttext_model = FastText(size = size, window = 5, min_count = 5)

#     def tfidf2embedding(self, value_vector):
#         _weighted_value = np.zeros(self.embedding_size)
#         for key in self.tfidf_vectorizer.vocabulary:
#             _index = self.tfidf_vectorizer.vocabulary[key]
#             if value_vector[_index] != 0:
#                 _weighted_value += self.fasttext_model[key] * value_vector[_index]

#         return _weighted_value
    
#     def fit(self, texts):
#         _texts_concat = [" ".join(_text) for _text in texts]
#         self.tfidf_vectorizer = MyTfidfVectorizer()
#         self.tfidf_vectorizer.fit(_texts_concat)
        
#         self.fasttext_model.build_vocab(sentences = texts)
#         self.fasttext_model.train(sentences = texts, 
#                                   total_examples = len(texts), 
#                                   epochs=10)
        
#     def transform(self, texts):
#         _texts_concat = [" ".join(_text) for _text in texts]
#         _tfidf_values = self.tfidf_vectorizer.transform(_texts_concat)
#         return np.asarray([self.tfidf2embedding(_value.toarray()[0]) for _value in _tfidf_values])

# fasttext_tfidf_vect = MyFastTextTfidfVectorizer(tfidf_vect)
# fasttext_tfidf_vect.fit(ALL_STEMMED_TEXTS)
# train_fasttext_embedding = fasttext_tfidf_vect.transform(train_stemmed_text)
# test_fasttext_embedding = fasttext_tfidf_vect.transform(test_stemmed_text)

### Prebuilt Embedding

In [194]:
# %%time
# from keras.preprocessing import text, sequence
# from keras import layers, models, optimizers

# # load the pre-trained word-embedding vectors 
# embeddings_index = {}
# for i, line in enumerate(open('data/model/wiki-news-300d-1M.vec')):
#     values = line.split()
#     embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# # create a tokenizer 
# token = text.Tokenizer()
# token.fit_on_texts(LABELED_CONCAT_STEMMED_TEXTS)
# word_index = token.word_index

# # convert text to sequence of tokens and pad them to ensure equal length vectors 
# text_train_seq = sequence.pad_sequences(token.texts_to_sequences(train_text), maxlen=70)
# text_test_seq = sequence.pad_sequences(token.texts_to_sequences(test_text), maxlen=70)

# # create token-embedding mapping
# embedding_matrix = np.zeros((len(word_index) + 1, 300))
# for word, i in word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

## Text / NLP based features

In [195]:
# %%time
# trainDF['char_count'] = trainDF['text'].apply(len)
# trainDF['word_count'] = trainDF['text'].apply(lambda x: len(x.split()))
# trainDF['word_density'] = trainDF['char_count'] / (trainDF['word_count']+1)
# trainDF['punctuation_count'] = trainDF['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
# trainDF['title_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
# trainDF['upper_case_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [196]:
# %%time

# import textblob

# pos_family = {
#     'noun' : ['NN','NNS','NNP','NNPS'],
#     'pron' : ['PRP','PRP$','WP','WP$'],
#     'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
#     'adj' :  ['JJ','JJR','JJS'],
#     'adv' : ['RB','RBR','RBS','WRB']
# }

# # function to check and get the part of speech tag count of a words in a given sentence
# def check_pos_tag(x, flag):
#     cnt = 0
#     try:
#         wiki = textblob.TextBlob(x)
#         for tup in wiki.tags:
#             ppo = list(tup)[1]
#             if ppo in pos_family[flag]:
#                 cnt += 1
#     except:
#         pass
#     return cnt

# trainDF['noun_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'noun'))
# trainDF['verb_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'verb'))
# trainDF['adj_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adj'))
# trainDF['adv_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adv'))
# trainDF['pron_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'pron'))

## Topic Models as features

In [197]:
# # train a LDA Model
# lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
# X_topics = lda_model.fit_transform(text_train_count)
# topic_word = lda_model.components_
# vocab = count_vect.get_feature_names()

# # view the topic models
# n_top_words = 10
# topic_summaries = []
# for i, topic_dist in enumerate(topic_word):
#     topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
#     topic_summaries.append(' '.join(topic_words))

In [249]:
%%time

import os
from gensim import corpora, models

class MyLDAVectorizer(MyCountVectorizer):
    mallet_path = ".." + os.sep + "mallet-2.0.8"+ os.sep + "bin" + os.sep +"mallet"
    
    def __init__(self, TOPIC_COUNT = 100):
        self.topic_count = TOPIC_COUNT
    
    def fit(self, texts):
        self.dictionary = corpora.Dictionary(texts)
        _corpus = [self.dictionary.doc2bow(_doc) for _doc in texts]
        self.tfidf_model = models.TfidfModel(_corpus)
        _tfidf_corpus = self.tfidf_model[_corpus]

#         self.vect_model = models.LdaModel(_tfidf_corpus, 
#                             num_topics = self.topic_count, 
#                             id2word = self.dictionary,
#                             random_state = 100,
#                             eval_every = 5, 
#                             alpha = 'auto', 
#                             gamma_threshold = 0.01)
        
        self.vect_model = models.wrappers.LdaMallet(self.mallet_path, 
                                                     corpus = _corpus, 
                                                     num_topics = self.topic_count, 
                                                     id2word = self.dictionary)
    
    def toarray(self, doc_topics):
        _doc_vect  = np.zeros((len(doc_topics), self.topic_count))
        
        for i, _doc in enumerate(doc_topics):
            for _topic, _weight in _doc:
                _doc_vect[i][_topic] = _weight
        
        return _doc_vect
        
    def transform(self, texts):
        _corpus = [self.dictionary.doc2bow(_doc) for _doc in texts]
#         _tfidf_corpus = self.tfidf_model[_corpus]
        
        return self.toarray(self.vect_model[_corpus])

lda_vect = MyLDAVectorizer(TOPIC_COUNT = 200)
# lda_vect.fit(train_stemmed_text)
lda_vect.fit(LABELED_STEMMED_TEXTS)
train_lda = lda_vect.transform(train_stemmed_text)
test_lda = lda_vect.transform(test_stemmed_text)
labeled_lda = lda_vect.transform(LABELED_STEMMED_TEXTS)
labeled_lda_extra = np.hstack([labeled_lda, ALL_EXTRA_FEATURE[0:len(LABELS)]])

Wall time: 1min 2s


## Doc2Vec

In [250]:
%%time

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np

class MyDoc2Vectorizer(MyCountVectorizer):
    def __init__(self, size = 100):
        self.embedding_size = size
            
    def fit(self, texts):
        _docs = [TaggedDocument(_doc, [i]) for i, _doc in enumerate(texts)]
        self.vect_model = Doc2Vec(_docs, 
                                  vector_size = self.embedding_size, 
                                  window = 5,
                                  min_count = 3,
                                  epochs = 40, 
                                  workers = 4)
        
    def transform(self, texts):
        return np.asarray([self.vect_model.infer_vector(_text) for _text in texts])

doc2vec_vect = MyDoc2Vectorizer(size = 200)
doc2vec_vect.fit(ALL_STEMMED_TEXTS)
# doc2vec_vect.fit(train_stemmed_text)
train_doc2vec = doc2vec_vect.transform(train_stemmed_text)
test_doc2vec = doc2vec_vect.transform(test_stemmed_text)
labeled_doc2vec = doc2vec_vect.transform(LABELED_STEMMED_TEXTS)
labeled_doc2vec_extra = np.hstack([labeled_doc2vec, ALL_EXTRA_FEATURE[0:len(LABELS)]])

Wall time: 4min 24s


In [251]:
#Test quality

import collections

ranks = []
for doc_id in range(len(train_stemmed_text)):
    inferred_vector = doc2vec_vect.vect_model.infer_vector(train_stemmed_text[train_stemmed_text.index[doc_id]])
    sims = doc2vec_vect.vect_model.docvecs.most_similar([inferred_vector], topn=len(doc2vec_vect.vect_model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

collections.Counter(ranks)

Counter({8802: 1,
         9236: 1,
         1051: 1,
         215: 2,
         6014: 1,
         1446: 1,
         2724: 1,
         6238: 1,
         6541: 1,
         8432: 1,
         5117: 1,
         615: 1,
         11350: 1,
         12215: 1,
         8371: 1,
         3863: 1,
         613: 1,
         10199: 1,
         13138: 1,
         1596: 1,
         2245: 1,
         9925: 1,
         12054: 1,
         7801: 1,
         6037: 1,
         3205: 1,
         7688: 2,
         3554: 1,
         12447: 1,
         7015: 1,
         1377: 1,
         11017: 1,
         5868: 1,
         11183: 1,
         5240: 1,
         5527: 1,
         4615: 1,
         10800: 1,
         7770: 1,
         12497: 1,
         12264: 1,
         7155: 1,
         2548: 1,
         3375: 1,
         7069: 1,
         11131: 1,
         13202: 1,
         7906: 1,
         8522: 1,
         12349: 1,
         13061: 1,
         5026: 1,
         2595: 1,
         10909: 1,
         4138: 

# Models

In [262]:
from sklearn import linear_model, naive_bayes, metrics, svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

def show_score(classifier_name, scores):
    print("Accuracy:%0.2f Precission:%0.2f Recall:%0.2f F1:%0.2f"%scores, "-> [%s]"%(classifier_name))
    
def train_model(classifier, train_feature, train_label, test_feature, test_label, is_neural_net=False):
#     print(train_feature)
    # fit the training dataset on the classifier
    classifier.fit(train_feature, train_label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(test_feature)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    print(metrics.confusion_matrix(test_label, predictions))
    return (metrics.accuracy_score(test_label, predictions),
            metrics.precision_score(test_label, predictions),
            metrics.recall_score(test_label, predictions),
            metrics.f1_score(test_label, predictions))

def run_model(classfier_configs):
    for _name in classfier_configs:
#         print(classfier_configs[_name].train)
        scores = train_model(classfier_configs[_name][0], classfier_configs[_name][1], train_label, classfier_configs[_name][2], test_label)
        show_score(_name, scores)

def eval_model(classfier_configs):
    for _name in classfier_configs:
        cv_results = cross_validate(classfier_configs[_name][0], 
                                    classfier_configs[_name][1], 
                                    classfier_configs[_name][2],
                                    scoring = 'f1',
                                    cv = 5)
        print(cv_results['test_score'], np.mean(cv_results['test_score']), '\t', _name)

## Naive Bayes 

In [263]:
# nb_config = {
#     "NB Count": (naive_bayes.MultinomialNB(), train_count, test_count),
#     "NB TFIDF": (naive_bayes.MultinomialNB(), train_tfidf, test_tfidf),
#     "NB TFIDF NGram": (naive_bayes.MultinomialNB(), train_tfidf_ngram, test_tfidf_ngram),
#     "NB TFIDF NGram Chars": (naive_bayes.MultinomialNB(), train_tfidf_ngram_chars, test_tfidf_ngram_chars),
#     "NB LDA": (naive_bayes.MultinomialNB(), train_lda, test_lda),
# }

# run_model(nb_config)

nb_config = {
    "NB Extra": (naive_bayes.MultinomialNB(), labeled_extra, LABELS)
    , "NB Count": (naive_bayes.MultinomialNB(), labeled_count, LABELS)
    , "NB Count + Extra": (naive_bayes.MultinomialNB(), labeled_count_extra, LABELS)
    , "NB TFIDF": (naive_bayes.MultinomialNB(), labeled_tfidf, LABELS)
    , "NB TFIDF + Extra": (naive_bayes.MultinomialNB(), labeled_tfidf_extra, LABELS)
    , "NB TFIDF NGram": (naive_bayes.MultinomialNB(), labeled_tfidf_ngram, LABELS)
    , "NB TFIDF NGram + Extra": (naive_bayes.MultinomialNB(), labeled_tfidf_ngram_extra, LABELS)
    , "NB TFIDF NGram Chars": (naive_bayes.MultinomialNB(), labeled_tfidf_ngram_chars, LABELS)
    , "NB TFIDF NGram Chars + Extra": (naive_bayes.MultinomialNB(), labeled_tfidf_ngram_chars_extra, LABELS)
    , "NB LDA": (naive_bayes.MultinomialNB(), labeled_lda, LABELS)
    , "NB LDA + Extra": (naive_bayes.MultinomialNB(), labeled_lda_extra, LABELS)
}

eval_model(nb_config)

[0.61818182 0.68468468 0.54054054 0.64864865 0.53763441] 0.6059380201315685 	 NB Extra
[0.57943925 0.72881356 0.7008547  0.67241379 0.62385321] 0.6610749033251612 	 NB Count
[0.59615385 0.73043478 0.69642857 0.67826087 0.61386139] 0.663027891178989 	 NB Count + Extra
[0.62295082 0.69230769 0.70503597 0.71532847 0.67768595] 0.6826617801538706 	 NB TFIDF
[0.72058824 0.73134328 0.69503546 0.66666667 0.70491803] 0.7037103358645334 	 NB TFIDF + Extra
[0.57627119 0.66115702 0.66666667 0.6557377  0.56363636] 0.6246937892910258 	 NB TFIDF NGram
[0.69064748 0.71111111 0.68493151 0.66197183 0.6984127 ] 0.6894149258746858 	 NB TFIDF NGram + Extra
[0.62745098 0.6835443  0.66666667 0.65789474 0.64383562] 0.6558784608273507 	 NB TFIDF NGram Chars
[0.703125   0.72307692 0.65671642 0.64661654 0.67241379] 0.6803897350888406 	 NB TFIDF NGram Chars + Extra
[0.60377358 0.69724771 0.67857143 0.66037736 0.56      ] 0.6399940156779347 	 NB LDA
[0.60714286 0.7079646  0.59130435 0.63716814 0.52631579] 0.613979

In [264]:
# nb_model = naive_bayes.MultinomialNB()
# nb_model.fit(train_tfidf, train_label)
# predictions = nb_model.predict(test_tfidf)     
# predictions_probs = nb_model.predict_proba(test_tfidf)

# new_predictions = []
# for p in predictions_probs:
#     if p[0] > 0.4:
#          new_predictions.append(0)
#     else:
#         new_predictions.append(1)
        
# print(metrics.confusion_matrix(test_label, predictions))
# print (metrics.accuracy_score(test_label, predictions),
#             metrics.precision_score(test_label, predictions),
#             metrics.recall_score(test_label, predictions),
#             metrics.f1_score(test_label, predictions))

# print(metrics.confusion_matrix(test_label, new_predictions))
# print (metrics.accuracy_score(test_label, new_predictions),
#             metrics.precision_score(test_label, new_predictions),
#             metrics.recall_score(test_label, new_predictions),
#             metrics.f1_score(test_label, new_predictions))

Accuracy:0.55 Precission:0.62 Recall:0.52 F1:0.56 -> [NB Count]
Accuracy:0.51 Precission:0.88 Recall:0.49 F1:0.63 -> [NB TFIDF]
Accuracy:0.52 Precission:0.58 Recall:0.49 F1:0.53 -> [NB TFIDF NGram]
Accuracy:0.49 Precission:0.96 Recall:0.48 F1:0.64 -> [NB TFIDF NGram Chars]
Accuracy:0.55 Precission:0.69 Recall:0.52 F1:0.60 -> [NB LDA]

##  Linear Classifier

In [265]:
from sklearn.linear_model import LogisticRegression

# linear_parameters = {'penalty':('l1', 'l2'), 'C':[10, 1, 0.1, 0.01]}
# linear_model = GridSearchCV(LogisticRegression(solver='saga'), scoring = "f1",param_grid = linear_parameters, cv=5)

# linear_model.fit(train_count, train_label)

# predictions = linear_model.predict(test_count)

# (metrics.accuracy_score(predictions, test_label),
#             metrics.precision_score(predictions, test_label),
#             metrics.recall_score(predictions, test_label),
#             metrics.f1_score(predictions, test_label))

# print(linear_model.best_params_)

In [266]:
# linear_config = {
#     "Linear Count": (LogisticRegression(C= 0.1, penalty='l1', tol=0.1, solver='saga'), train_count, test_count)
#     , "Linear TFIDF": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), train_tfidf, test_tfidf)
#     , "Linear TFIDF NGram": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), train_tfidf_ngram, test_tfidf_ngram)
#     , "Linear TFIDF NGram Chars": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), train_tfidf_ngram_chars, test_tfidf_ngram_chars)
# #     , "Linear FastText Embedding": (LogisticRegression(solver='lbfgs', max_iter=int(1e6)), train_fasttext_embedding, test_fasttext_embedding)
#     , "Linear LDA": (LogisticRegression(solver='lbfgs'), train_lda, test_lda)
#     , "Linear Doc2Vec": (LogisticRegression(solver='lbfgs'), train_doc2vec, test_doc2vec)
# }

# run_model(linear_config)

linear_config = {
    "Linear Extra": (LogisticRegression(C= 0.1, penalty='l1', tol=0.1, solver='saga'), labeled_extra, LABELS)
    , "Linear Count": (LogisticRegression(C= 0.1, penalty='l1', tol=0.1, solver='saga'), labeled_count, LABELS)
    , "Linear Count Extra": (LogisticRegression(C= 0.1, penalty='l1', tol=0.1, solver='saga'), labeled_count_extra, LABELS)
    , "Linear TFIDF": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), labeled_tfidf, LABELS)
    , "Linear TFIDF Extra": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), labeled_tfidf_extra, LABELS)
    , "Linear TFIDF NGram": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), labeled_tfidf_ngram_extra, LABELS)
    , "Linear TFIDF NGram Extra": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), labeled_tfidf_ngram, LABELS)
    , "Linear TFIDF NGram Chars": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), labeled_tfidf_ngram_chars, LABELS)
    , "Linear TFIDF NGram Chars Extra": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), labeled_tfidf_ngram_chars_extra, LABELS)
    , "Linear LDA": (LogisticRegression(solver='lbfgs'), labeled_lda, LABELS)
    , "Linear LDA Extra": (LogisticRegression(solver='lbfgs'), labeled_lda, LABELS)
    , "Linear Doc2Vec": (LogisticRegression(solver='lbfgs'), labeled_doc2vec, LABELS)
    , "Linear Doc2Vec Extra": (LogisticRegression(solver='lbfgs'), labeled_doc2vec_extra, LABELS)
}

eval_model(linear_config)

[0.6407767  0.59813084 0.51546392 0.53465347 0.45454545] 0.5487140755136768 	 Linear Extra
[0.64285714 0.67532468 0.65079365 0.65306122 0.65693431] 0.6557942000069216 	 Linear Count
[0.61538462 0.73381295 0.59047619 0.67153285 0.66666667] 0.6555746537766177 	 Linear Count Extra
[0.60606061 0.65306122 0.59615385 0.60674157 0.52747253] 0.5978979554420967 	 Linear TFIDF
[0.64220183 0.59813084 0.54716981 0.58715596 0.58585859] 0.5921034072931948 	 Linear TFIDF Extra
[0.64761905 0.59813084 0.5        0.59813084 0.55670103] 0.5801163521579747 	 Linear TFIDF NGram
[0.59183673 0.52873563 0.62809917 0.56818182 0.47619048] 0.5586087669607599 	 Linear TFIDF NGram Extra
[0.66101695 0.64661654 0.63076923 0.671875   0.67226891] 0.6565093257676363 	 Linear TFIDF NGram Chars
[0.63461538 0.59813084 0.51020408 0.58715596 0.55670103] 0.577361460320024 	 Linear TFIDF NGram Chars Extra
[0.60606061 0.63265306 0.63461538 0.6        0.54945055] 0.604555920270206 	 Linear LDA
[0.60606061 0.63265306 0.63461538 



[0.57657658 0.55357143 0.59130435 0.50909091 0.46153846] 0.5384163447206924 	 Linear Doc2Vec Extra


## SVM

In [206]:
from sklearn.svm import SVC

# svm_parameters = {'kernel':('linear', 'rbf'), 'C':[0.1, 1, 10, 100], "gamma": np.logspace(-2, 2, 5)}
# svm_model = GridSearchCV(SVC(), scoring = "f1", param_grid = svm_parameters, cv=5)

# svm_model.fit(train_tfidf_ngram_chars, train_label)

# predictions = svm_model.predict(test_tfidf_ngram_chars)

# print(metrics.accuracy_score(predictions, test_label),
#             metrics.precision_score(predictions, test_label),
#             metrics.recall_score(predictions, test_label),
#             metrics.f1_score(predictions, test_label))
# print(svm_model.best_params_)

In [267]:
# svm_config = {
#     "SVM Count": (SVC(C = 1, gamma = 0.1, kernel='rbf'), train_count, test_count),
#     "SVM TFIDF": (SVC(C = 10, gamma = 0.01, kernel='rbf'), train_tfidf, test_tfidf),
#     "SVM TFIDF NGram": (SVC(C = 1, gamma = 0.01, kernel='linear'), train_tfidf_ngram, test_tfidf_ngram),
#     "SVM TFIDF NGram Chars": (SVC(C = 10, gamma = 0.01, kernel='rbf'), train_tfidf_ngram_chars, test_tfidf_ngram_chars),
# #     "SVM FastText Embedding": (SVC(C = 1, gamma = 0.1, kernel='rbf'), train_fasttext_embedding, test_fasttext_embedding),
#     "SVM LDA": (SVC(C = 1, gamma = 0.1, kernel='rbf'), train_lda, test_lda),
#     "SVM Doc2Vec": (SVC(C = 1, gamma = 0.1, kernel='rbf'), train_doc2vec, test_doc2vec),
# }
    
# run_model(svm_config)

svm_config = {
    "SVM Extra Only": (SVC(C = 1, gamma = 0.1, kernel='rbf'), labeled_extra, LABELS)
    , "SVM Count": (SVC(C = 1, gamma = 0.1, kernel='rbf'), labeled_count, LABELS)
    , "SVM Count Extra": (SVC(C = 1, gamma = 0.1, kernel='rbf'), labeled_count_extra, LABELS)
    , "SVM TFIDF": (SVC(C = 10, gamma = 0.01, kernel='rbf'), labeled_tfidf, LABELS)
    , "SVM TFIDF Extra": (SVC(C = 10, gamma = 0.01, kernel='rbf'), labeled_tfidf_extra, LABELS)
    , "SVM TFIDF NGram": (SVC(C = 1, gamma = 0.01, kernel='linear'), labeled_tfidf_ngram, LABELS)
    , "SVM TFIDF NGram Extra": (SVC(C = 1, gamma = 0.01, kernel='linear'), labeled_tfidf_ngram_extra, LABELS)
    , "SVM TFIDF NGram Chars": (SVC(C = 10, gamma = 0.01, kernel='rbf'), labeled_tfidf_ngram_chars, LABELS)
    , "SVM TFIDF NGram Chars Extra": (SVC(C = 10, gamma = 0.01, kernel='rbf'), labeled_tfidf_ngram_chars_extra, LABELS)
    , "SVM LDA": (SVC(C = 1, gamma = 0.1, kernel='rbf'), labeled_lda, LABELS)
    , "SVM LDA Extra": (SVC(C = 1, gamma = 0.1, kernel='rbf'), labeled_lda_extra, LABELS)
    , "SVM Doc2Vec": (SVC(C = 1, gamma = 0.1, kernel='rbf'), labeled_doc2vec, LABELS)
    , "SVM Doc2Vec Extra": (SVC(C = 1, gamma = 0.1, kernel='rbf'), labeled_doc2vec_extra, LABELS)
}

eval_model(svm_config)

[0.59649123 0.54       0.49122807 0.59677419 0.57692308] 0.5602833137434157 	 SVM Extra Only
[0.675      0.6625     0.66242038 0.66216216 0.65771812] 0.6639601330266273 	 SVM Count
[0.675      0.6625     0.65822785 0.65771812 0.65771812] 0.6622328179424009 	 SVM Count Extra
[0.58695652 0.56818182 0.55913978 0.60674157 0.42666667] 0.549537272913512 	 SVM TFIDF
[0.59047619 0.66666667 0.55855856 0.62295082 0.54545455] 0.5968213561656184 	 SVM TFIDF Extra
[0.64285714 0.6        0.63157895 0.54545455 0.57142857] 0.5982638414217363 	 SVM TFIDF NGram
[0.60162602 0.67961165 0.63492063 0.58928571 0.59405941] 0.6199006843785086 	 SVM TFIDF NGram Extra
[0.67096774 0.67114094 0.64335664 0.64383562 0.67625899] 0.6611119868267108 	 SVM TFIDF NGram Chars
[0.59047619 0.66       0.55855856 0.61788618 0.55445545] 0.5962752746882183 	 SVM TFIDF NGram Chars Extra
[0.58064516 0.54320988 0.47619048 0.45945946 0.34782609] 0.481466212087998 	 SVM LDA
[0.59649123 0.55445545 0.49122807 0.59677419 0.57692308] 0.

## Random Forest

In [None]:
rf_parameters = {'n_estimators':[100, 200, 500], 'max_features':[None, 0.25, 0.5, 0.75],
                'max_depth': [None, 5, 10], 'min_samples_leaf': [0.0005, 0.01, 0.05, 0.1],
                 'min_samples_split':[2, 5, 10]}
rf_model = GridSearchCV(RandomForestClassifier(), scoring = "f1", param_grid = rf_parameters, cv=5)

rf_model.fit(labeled_doc2vec, LABELS)

print(rf_model.best_score_)
print(rf_model.best_params_)
# predictions = rf_model.predict(test_tfidf_ngram_chars)

# print(metrics.accuracy_score(predictions, test_label),
#             metrics.precision_score(predictions, test_label),
#             metrics.recall_score(predictions, test_label),
#             metrics.f1_score(predictions, test_label))
# print(rf_model.best_params_)

In [268]:
from sklearn.ensemble import RandomForestClassifier

rf_config = {
    "RF Extra Only": (RandomForestClassifier(criterion = 'entropy', n_estimators = 100), labeled_extra, LABELS)
    ,"RF Count": (RandomForestClassifier(criterion = 'entropy', n_estimators = 100), labeled_count, LABELS)
    ,"RF Count Extra": (RandomForestClassifier(criterion = 'entropy', n_estimators = 100), labeled_count_extra, LABELS)
    , "RF TFIDF": (RandomForestClassifier(criterion = 'entropy', n_estimators = 200), labeled_tfidf, LABELS)
    , "RF TFIDF Extra": (RandomForestClassifier(criterion = 'entropy', n_estimators = 200), labeled_tfidf_extra, LABELS)
    , "RF TFIDF NGram": (RandomForestClassifier(criterion = 'gini', n_estimators = 200), labeled_tfidf_ngram, LABELS)
    , "RF TFIDF NGram Extra": (RandomForestClassifier(criterion = 'gini', n_estimators = 200), labeled_tfidf_ngram_extra, LABELS)
    , "RF TFIDF NGram Chars": (RandomForestClassifier(criterion = 'entropy', n_estimators = 200), labeled_tfidf_ngram_chars, LABELS)
    , "RF TFIDF NGram Chars Extra": (RandomForestClassifier(criterion = 'entropy', n_estimators = 200), labeled_tfidf_ngram_chars_extra, LABELS)
    , "RF LDA": (RandomForestClassifier(criterion = 'gini', n_estimators = 300), labeled_lda, LABELS)
    , "RF LDA Extra": (RandomForestClassifier(criterion = 'gini', n_estimators = 300), labeled_lda_extra, LABELS)
    , "RF Doc2Vec": (RandomForestClassifier(criterion = 'gini', n_estimators = 300), labeled_doc2vec, LABELS)
    , "RF Doc2Vec Extra": (RandomForestClassifier(criterion = 'gini', n_estimators = 300), labeled_doc2vec_extra, LABELS)
}

eval_model(rf_config)

[0.57142857 0.66       0.63793103 0.65454545 0.60416667] 0.6256143454246903 	 RF Extra Only
[0.65486726 0.67857143 0.57391304 0.59813084 0.59813084] 0.6207226821859697 	 RF Count
[0.63461538 0.66071429 0.57657658 0.61818182 0.55319149] 0.6086559108899534 	 RF Count Extra
[0.64220183 0.68518519 0.60550459 0.59615385 0.53333333] 0.6124757573381426 	 RF TFIDF
[0.62385321 0.62962963 0.59813084 0.61111111 0.57142857] 0.6068306728599964 	 RF TFIDF Extra
[0.63551402 0.69026549 0.60344828 0.65       0.45454545] 0.6067546471649552 	 RF TFIDF NGram
[0.62857143 0.64814815 0.59813084 0.66055046 0.52747253] 0.6125746808058391 	 RF TFIDF NGram Extra
[0.66666667 0.62608696 0.58928571 0.6        0.54736842] 0.6058815517053503 	 RF TFIDF NGram Chars
[0.57943925 0.61946903 0.55357143 0.6407767  0.50526316] 0.5797039128760826 	 RF TFIDF NGram Chars Extra
[0.58181818 0.63636364 0.55357143 0.67857143 0.48421053] 0.586907040328093 	 RF LDA
[0.61403509 0.64814815 0.55855856 0.66055046 0.53846154] 0.603950758

## Boosting

In [208]:
# %%time

from xgboost import XGBClassifier

# xgb_parameters = {
#         'min_child_weight': [1, 5, 10],
#         'gamma': [0.5, 1, 1.5, 2, 5],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'max_depth': [3, 4, 5]
#         }

# xgb_model = GridSearchCV(XGBClassifier(), scoring = "f1", param_grid = xgb_parameters, cv=5, verbose = 3)

# xgb_model.fit(train_count, train_label)

# predictions = xgb_model.predict(test_count)

# print(metrics.accuracy_score(predictions, test_label),
#             metrics.precision_score(predictions, test_label),
#             metrics.recall_score(predictions, test_label),
#             metrics.f1_score(predictions, test_label))

# print(xgb_model.best_params_)

In [269]:
# xgboost_config = {
#     "XGBoost Count": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_count, test_count)
#     , "XGBoost TFIDF": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_tfidf, test_tfidf)
#     , "XGBoost TFIDF NGram": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_tfidf_ngram.tocsc(), test_tfidf_ngram.tocsc())
#     , "XGBoost TFIDF NGram Chars": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_tfidf_ngram_chars, test_tfidf_ngram_chars)
#     , "XGBoost LDA": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_lda, test_lda)
#     , "XGBoost Doc2Vec": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_doc2vec, test_doc2vec)
# }

# xgboost_config = {
#     "XGBoost Count": (XGBClassifier(), train_count, test_count)
#     , "XGBoost TFIDF": (XGBClassifier(), train_tfidf, test_tfidf)
#     , "XGBoost TFIDF NGram": (XGBClassifier(), train_tfidf_ngram.tocsc(), test_tfidf_ngram.tocsc())
#     , "XGBoost TFIDF NGram Chars": (XGBClassifier(), train_tfidf_ngram_chars, test_tfidf_ngram_chars)
#     , "XGBoost LDA": (XGBClassifier(), train_lda, test_lda)
#     , "XGBoost Doc2Vec": (XGBClassifier(), train_doc2vec, test_doc2vec)
# }

# run_model(xgboost_config)

xgboost_config = {
    "XGBoost Extra Only": (XGBClassifier(), labeled_count, LABELS)
    , "XGBoost Count": (XGBClassifier(), labeled_count, LABELS)
    , "XGBoost Count Extra": (XGBClassifier(), labeled_count_extra, LABELS)
    , "XGBoost TFIDF": (XGBClassifier(), labeled_tfidf, LABELS)
    , "XGBoost TFIDF Extra": (XGBClassifier(), labeled_tfidf_extra, LABELS)
    , "XGBoost TFIDF NGram": (XGBClassifier(), labeled_tfidf_ngram, LABELS)
    , "XGBoost TFIDF NGram Extra": (XGBClassifier(), labeled_tfidf_ngram_extra, LABELS)
    , "XGBoost TFIDF NGram Chars": (XGBClassifier(), labeled_tfidf_ngram_chars, LABELS)
    , "XGBoost TFIDF NGram Chars Extra": (XGBClassifier(), labeled_tfidf_ngram_chars_extra, LABELS)
    , "XGBoost LDA": (XGBClassifier(), labeled_lda, LABELS)
    , "XGBoost LDA Extra": (XGBClassifier(), labeled_lda_extra, LABELS)
    , "XGBoost Doc2Vec": (XGBClassifier(), labeled_doc2vec, LABELS)
    , "XGBoost Doc2Vec Extra": (XGBClassifier(), labeled_doc2vec_extra, LABELS)
}

eval_model(xgboost_config)

[0.67924528 0.59047619 0.52941176 0.66037736 0.57142857] 0.6061878336240156 	 XGBoost Extra Only
[0.67924528 0.59047619 0.52941176 0.66037736 0.57142857] 0.6061878336240156 	 XGBoost Count
[0.62809917 0.57692308 0.57425743 0.62385321 0.53061224] 0.5867490264253007 	 XGBoost Count Extra
[0.66071429 0.59259259 0.57657658 0.67857143 0.56      ] 0.6136909766909767 	 XGBoost TFIDF
[0.66055046 0.65420561 0.62385321 0.63636364 0.56842105] 0.6286787932393243 	 XGBoost TFIDF Extra
[0.62264151 0.59615385 0.50980392 0.53608247 0.47191011] 0.5473183727485582 	 XGBoost TFIDF NGram
[0.65420561 0.63636364 0.57142857 0.52525253 0.53191489] 0.5838330468276779 	 XGBoost TFIDF NGram Extra
[0.56363636 0.52173913 0.56896552 0.57142857 0.59405941] 0.5639657977363383 	 XGBoost TFIDF NGram Chars
[0.57407407 0.56637168 0.59459459 0.62745098 0.58064516] 0.5886272983534154 	 XGBoost TFIDF NGram Chars Extra
[0.61946903 0.66071429 0.58928571 0.61261261 0.55855856] 0.6081280395439688 	 XGBoost LDA
[0.62608696 0.672

In [26]:
from nltk.classify import NaiveBayesClassifier

_dictionary = corpora.Dictionary(train_stemmed_text)
train_doc_count = [_dictionary.doc2bow(_doc) for _doc in train_stemmed_text]
test_doc_count = [_dictionary.doc2bow(_doc) for _doc in test_stemmed_text]

tfidf_model = models.TfidfModel(train_doc_count)
train_doc_tfidf = [dict(_doc) for _doc in tfidf_model[train_doc_count]]
train_doc_tfidf = [(_doc, train_label[train_label.index[i]]) for i, _doc in enumerate(train_doc_tfidf)]
test_doc_tfidf = [dict(_doc) for _doc in tfidf_model[test_doc_count]]

train_doc_count = [(dict(_doc), train_label[train_label.index[i]]) for i, _doc in enumerate(train_doc_count)]
test_doc_count = [dict(_doc) for _doc in test_doc_count]

NB_classifier_count = NaiveBayesClassifier.train(train_doc_count)

predictions = NB_classifier_count.classify_many(test_doc_count)

print(metrics.confusion_matrix(test_label, predictions))
print(metrics.accuracy_score(test_label, predictions),
            metrics.precision_score(test_label, predictions),
            metrics.recall_score(test_label, predictions),
            metrics.f1_score(test_label, predictions))

[[ 3 51]
 [ 5 51]]
0.4909090909090909 0.5 0.9107142857142857 0.6455696202531646


In [27]:
NB_classifier_tfidf = NaiveBayesClassifier.train(train_doc_tfidf)
predictions = NB_classifier_tfidf.classify_many(test_doc_tfidf)

print(metrics.confusion_matrix(test_label, predictions))
print(metrics.accuracy_score(test_label, predictions),
            metrics.precision_score(test_label, predictions),
            metrics.recall_score(test_label, predictions),
            metrics.f1_score(test_label, predictions))

[[ 0 54]
 [ 0 56]]
0.509090909090909 0.509090909090909 1.0 0.6746987951807228


In [28]:
from nltk.classify import DecisionTreeClassifier


DT_classifier_count = DecisionTreeClassifier.train(train_doc_tfidf)
predictions = DT_classifier_count.classify_many(test_doc_tfidf)

print(metrics.confusion_matrix(test_label, predictions))
print(metrics.accuracy_score(test_label, predictions),
            metrics.precision_score(test_label, predictions),
            metrics.recall_score(test_label, predictions),
            metrics.f1_score(test_label, predictions))

[[54  0]
 [56  0]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.4909090909090909 0.0 0.0 0.0
