Packages:
* [NLTK](http://www.nltk.org/howto/classify.html)
* [SpaCy](https://spacy.io/)
* [AllenNLP](https://allennlp.org/tutorials)

Articles:
* [A Comprehensive Guide to Understand and Implement Text Classification in Python](https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/)
* [Machine Learning, NLP: Text Classification using scikit-learn, python and NLTK](https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a)
* [State-of-the-Art Text Classification using BERT model: “Predict the Happiness” Challenge](https://appliedmachinelearning.blog/2019/03/04/state-of-the-art-text-classification-using-bert-model-predict-the-happiness-hackerearth-challenge/)

In [232]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from sklearn import model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from tqdm import tqdm
import numpy as np
import pandas as pd
import string
import time


stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    result_stemmed = []
    for token in simple_preprocess(text, min_len = 2):
        result.append(token)
        if token not in STOPWORDS:
            result_stemmed.append(lemmatize_stemming(token))
    
    return (result, result_stemmed)

with open("./data/raw/Hygiene/hygiene.dat.labels") as f:
    LABELS = [int(l) for l in f.readlines() if l[0].isdigit()]

ALL_RAW_TEXTS = []
ALL_TEXTS = []
ALL_STEMMED_TEXTS = []
ALL_CONCAT_STEMMED_TEXTS = []
LABELED_TEXTS = []
LABELED_CONCAT_TEXTS = []
LABELED_STEMMED_TEXTS = []
LABELED_CONCAT_STEMMED_TEXTS = []

with open("./data/raw/Hygiene/hygiene.dat") as f:
    ALL_RAW_TEXTS = f.readlines()

for _text in tqdm(ALL_RAW_TEXTS):
    _result, _result_stemmed = preprocess(_text)
    ALL_TEXTS.append(_result)
    ALL_STEMMED_TEXTS.append(_result_stemmed)

ALL_CONCAT_STEMMED_TEXTS = [" ".join(_text) for _text in ALL_STEMMED_TEXTS]

LABELED_TEXTS = ALL_TEXTS[0:len(LABELS)]
LABELED_CONCAT_TEXTS = [" ".join(_text) for _text in LABELED_TEXTS]

LABELED_STEMMED_TEXTS = ALL_STEMMED_TEXTS[0:len(LABELS)]
LABELED_CONCAT_STEMMED_TEXTS = [" ".join(_text) for _text in LABELED_STEMMED_TEXTS]

100%|██████████| 13299/13299 [03:49<00:00, 58.04it/s]


In [233]:
### Handle Extra Features

In [243]:
code_map = 'abcdefghij'
# code_map = '0123456789'

def encode_zipcode(zip_code):
    _code = ''
    _rest = zip_code    
    while _rest > 0:
        _code = code_map[_rest % 10] + _code
        _rest = int(_rest / 10)

    return _code

FEATURE_MORE =pd.read_csv("./data/raw/Hygiene/hygiene.dat.additional", header=None)
EXTRA_FEATURE = pd.DataFrame()

EXTRA_FEATURE['Cuisines'] = [simple_preprocess(_text) for _text in FEATURE_MORE[0]]

EXTRA_FEATURE['Stars'] = ['PoorStars' for _star in FEATURE_MORE[3]]
star_std_range = (FEATURE_MORE[3].mean() - FEATURE_MORE[3].std(), FEATURE_MORE[3].mean() + FEATURE_MORE[3].std())
# EXTRA_FEATURE['Stars'][FEATURE_MORE[3] < star_std_range[0]] = 'PoorStars'
EXTRA_FEATURE['Stars'][(FEATURE_MORE[3] >= star_std_range[0])] = 'StandardStars'
EXTRA_FEATURE['Stars'][FEATURE_MORE[3] > star_std_range[1]] = 'GoodStars'

EXTRA_FEATURE['ReviewCount'] = ['NoReviews' for _star in FEATURE_MORE[2]]
EXTRA_FEATURE['ReviewCount'][FEATURE_MORE[2] > 2] = "FewReviews"
EXTRA_FEATURE['ReviewCount'][FEATURE_MORE[2] > 6] = "SomeReviews"
EXTRA_FEATURE['ReviewCount'][FEATURE_MORE[2] > 13] = "ManyReviews"
EXTRA_FEATURE['ReviewCount'][FEATURE_MORE[2] > 50] = "LotReviews"

EXTRA_FEATURE['ZIPCode'] = [encode_zipcode(_code) for _code in FEATURE_MORE[1]]

EXTRA_FEATURE['MergedText'] = [ [_cuisine for _cuisine in _record[1][0]]
                                + [_record[1][1]] + [_record[1][2]] + [_record[1][3]]
                               for _record in EXTRA_FEATURE.iterrows()]

In [244]:
vect = CountVectorizer(analyzer='word')

vect.fit(FEATURE_MORE[0].replace('Restaurants', '', regex=True))
CUISINES_FEATURE = vect.transform(FEATURE_MORE[0]).todense()

_zipcode = [str(i) for i in FEATURE_MORE[1]]
vect.fit(_zipcode)
ZIPCODE_FEATURE = vect.transform(_zipcode).todense()

REVIEW_COUNT_FEATURE = np.array(FEATURE_MORE[2]).reshape((len(FEATURE_MORE[2]),1))
RATING_FEATURE = np.array(FEATURE_MORE[3]).reshape((len(FEATURE_MORE[3]),1))

ALL_EXTRA_FEATURE = np.hstack((CUISINES_FEATURE, ZIPCODE_FEATURE, REVIEW_COUNT_FEATURE, RATING_FEATURE))
COUSINE_ZIPCODE_EXTRA_FEATURE = np.hstack((CUISINES_FEATURE, ZIPCODE_FEATURE))

### Add Extra Features to the original text

In [245]:
# EXTRA_FEATURES = EXTRA_FEATURE['MergedText'][0:len(LABELS)]
# EXTRA_CONCAT_FEATURES = [" ".join(_text) for _text in EXTRA_FEATURES]
                  
# LABELED_EXTRA_TEXTS =  [EXTRA_FEATURES[i] + _text for i, _text in enumerate(LABELED_TEXTS)]
# LABELED_EXTRA_CONCAT_TEXTS = [" ".join(_text) for _text in LABELED_EXTRA_TEXTS]

# LABELED_EXTRA_STEMMED_TEXTS = [LABELED_EXTRA_TEXTS[i] + _text for i, _text in enumerate(LABELED_STEMMED_TEXTS)]
# LABELED_EXTRA_CONCAT_STEMMED_TEXTS = [" ".join(_text) for _text in LABELED_EXTRA_STEMMED_TEXTS]

# LABLED_RAW_TEXTS_1 = [EXTRA_CONCAT_FEATURES[i] + "|" + _text for i, _text in enumerate(ALL_RAW_TEXTS[0:len(LABELS)]) if LABELS[i] == 1]
# LABLED_RAW_TEXTS_0 = [EXTRA_CONCAT_FEATURES[i] + "|" + _text for i, _text in enumerate(ALL_RAW_TEXTS[0:len(LABELS)]) if LABELS[i] == 0]

# SMALL_RAW_TEXTS_1 = [_text for _text in LABLED_RAW_TEXTS_1 if len(_text) < 1000]
# SMALL_RAW_TEXTS_0 = [_text for _text in LABLED_RAW_TEXTS_0 if len(_text) < 1000]
# print(len(SMALL_RAW_TEXTS_1), len(SMALL_RAW_TEXTS_0))

## Build train/test dataset

In [275]:
# create a dataframe using texts and lables
labeled_df = pd.DataFrame()
labeled_df['concat_stemmed_text'] = LABELED_CONCAT_STEMMED_TEXTS
labeled_df['stemmed_text'] = LABELED_STEMMED_TEXTS
# labeled_df['concat_stemmed_text'] = LABELED_CONCAT_TEXTS
# labeled_df['stemmed_text'] = LABELED_TEXTS
# labeled_df['concat_stemmed_text'] = LABELED_CONCAT_EXTRA_TEXTS
# labeled_df['stemmed_text'] = LABELED_EXTRA_TEXTS
labeled_df['label'] = LABELS
labeled_extra_all = ALL_EXTRA_FEATURE[0:len(LABELS)]
labeled_extra_cuisine_zipcode = COUSINE_ZIPCODE_EXTRA_FEATURE[0:len(LABELS)]

# split the dataset into training and validation datasets 
train_concat_stemmed_text, test_concat_stemmed_text, train_label, test_label = model_selection.train_test_split(labeled_df['concat_stemmed_text'], 
                                                                                  labeled_df['label'],
                                                                                  test_size = 0.2,
                                                                                  random_state = 10)
train_stemmed_text = labeled_df['stemmed_text'][train_concat_stemmed_text.index]
test_stemmed_text = labeled_df['stemmed_text'][test_concat_stemmed_text.index]

# # label encode the target variable 
# encoder = preprocessing.LabelEncoder()
# train_label = encoder.fit_transform(train_label)
# test_label = encoder.fit_transform(test_label)

In [276]:
# dictionary = corpora.Dictionary(processed_docs)
# print("Before prunn:%d"%(len(dictionary)))
# dictionary.filter_extremes(no_below = 2, no_above = 0.5)
# print("After prunn:%d"%(len(dictionary)))
# corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Feature Engineering

## Count Vectors as features

In [277]:
class MyCountVectorizer:
    def __init__(self, max_df = 0.5):
        self.vect_model = CountVectorizer(analyzer='word', max_df = max_df)
        
    def fit(self, texts_concat):
        self.vect_model.fit(texts_concat)
    
    def transform(self, texts_concat):
        return self.vect_model.transform(texts_concat)

count_vect = MyCountVectorizer()
count_vect.fit(train_concat_stemmed_text)
train_count = count_vect.transform(train_concat_stemmed_text)
test_count =  count_vect.transform(test_concat_stemmed_text)
labeled_count = count_vect.transform(LABELED_CONCAT_STEMMED_TEXTS)
labeled_count_extra_all = np.hstack([labeled_count.todense(), labeled_extra_all])
labeled_count_extra_cuisine_zipcode = np.hstack([labeled_count.todense(), labeled_extra_cuisine_zipcode])

## TF-IDF Vectors as features

In [283]:
%%time

# word level tf-idf
class MyTfidfVectorizer(MyCountVectorizer):
    def __init__(self, analyzer='word', ngram_range = None, max_features=5000):
        if ngram_range is None:
            self.vect_model = TfidfVectorizer(analyzer = analyzer, max_features = max_features)
        else:
            self.vect_model = TfidfVectorizer(analyzer = analyzer, ngram_range = ngram_range, 
                                              max_features = max_features)
            
    def fit(self, texts_concat):   
        self.vect_model.fit(texts_concat)
        self.vocabulary = self.vect_model.vocabulary_


tfidf_vect = MyTfidfVectorizer(max_features = 10000)
tfidf_vect.fit(train_concat_stemmed_text)
train_tfidf =  tfidf_vect.transform(train_concat_stemmed_text)
test_tfidf =  tfidf_vect.transform(test_concat_stemmed_text)
labeled_tfidf =  tfidf_vect.transform(LABELED_CONCAT_STEMMED_TEXTS)
labeled_tfidf_extra_all = np.hstack([labeled_tfidf.todense(), labeled_extra_all])
labeled_tfidf_extra_cuisine_zipcode = np.hstack([labeled_tfidf.todense(), labeled_extra_cuisine_zipcode])

# ngram level tf-idf 
tfidf_vect_ngram = MyTfidfVectorizer(ngram_range=(2,3), max_features = 10000)
tfidf_vect_ngram.fit(train_concat_stemmed_text)
train_tfidf_ngram =  tfidf_vect_ngram.transform(train_concat_stemmed_text)
test_tfidf_ngram =  tfidf_vect_ngram.transform(test_concat_stemmed_text)
labeled_tfidf_ngram = tfidf_vect_ngram.transform(LABELED_CONCAT_STEMMED_TEXTS)
labeled_tfidf_ngram_extra_all = np.hstack([labeled_tfidf_ngram.todense(), labeled_extra_all])
labeled_tfidf_ngram_extra_cuisine_zipcode = np.hstack([labeled_tfidf_ngram.todense(), labeled_extra_cuisine_zipcode])

# characters level tf-idf
tfidf_vect_ngram_chars = MyTfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features = 10000)
tfidf_vect_ngram_chars.fit(train_concat_stemmed_text)
train_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_concat_stemmed_text) 
test_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_concat_stemmed_text)
labeled_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(LABELED_CONCAT_STEMMED_TEXTS)
labeled_tfidf_ngram_chars_extra_all = np.hstack([labeled_tfidf_ngram_chars.todense(), labeled_extra_all])
labeled_tfidf_ngram_chars_extra_cuisine_zipcode = np.hstack([labeled_tfidf_ngram_chars.todense(), labeled_extra_cuisine_zipcode])

CPU times: user 12.6 s, sys: 421 ms, total: 13 s
Wall time: 12.6 s


## Word Embeddings

### Build from review corpus

In [256]:
# %%time

# from gensim.models.fasttext import FastText

# class MyFastTextTfidfVectorizer(MyCountVectorizer):
#     def __init__(self, tfidf_vectorizer, size = 100):
#         self.embedding_size = size
#         self.tfidf_vectorizer = tfidf_vectorizer
#         self.fasttext_model = FastText(size = size, window = 5, min_count = 5)

#     def tfidf2embedding(self, value_vector):
#         _weighted_value = np.zeros(self.embedding_size)
#         for key in self.tfidf_vectorizer.vocabulary:
#             _index = self.tfidf_vectorizer.vocabulary[key]
#             if value_vector[_index] != 0:
#                 _weighted_value += self.fasttext_model[key] * value_vector[_index]

#         return _weighted_value
    
#     def fit(self, texts):
#         _texts_concat = [" ".join(_text) for _text in texts]
#         self.tfidf_vectorizer = MyTfidfVectorizer()
#         self.tfidf_vectorizer.fit(_texts_concat)
        
#         self.fasttext_model.build_vocab(sentences = texts)
#         self.fasttext_model.train(sentences = texts, 
#                                   total_examples = len(texts), 
#                                   epochs=10)
        
#     def transform(self, texts):
#         _texts_concat = [" ".join(_text) for _text in texts]
#         _tfidf_values = self.tfidf_vectorizer.transform(_texts_concat)
#         return np.asarray([self.tfidf2embedding(_value.toarray()[0]) for _value in _tfidf_values])

# fasttext_tfidf_vect = MyFastTextTfidfVectorizer(tfidf_vect)
# fasttext_tfidf_vect.fit(ALL_STEMMED_TEXTS)
# train_fasttext_embedding = fasttext_tfidf_vect.transform(train_stemmed_text)
# test_fasttext_embedding = fasttext_tfidf_vect.transform(test_stemmed_text)

### Prebuilt Embedding

In [257]:
# %%time
# from keras.preprocessing import text, sequence
# from keras import layers, models, optimizers

# # load the pre-trained word-embedding vectors 
# embeddings_index = {}
# for i, line in enumerate(open('data/model/wiki-news-300d-1M.vec')):
#     values = line.split()
#     embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# # create a tokenizer 
# token = text.Tokenizer()
# token.fit_on_texts(LABELED_CONCAT_STEMMED_TEXTS)
# word_index = token.word_index

# # convert text to sequence of tokens and pad them to ensure equal length vectors 
# text_train_seq = sequence.pad_sequences(token.texts_to_sequences(train_text), maxlen=70)
# text_test_seq = sequence.pad_sequences(token.texts_to_sequences(test_text), maxlen=70)

# # create token-embedding mapping
# embedding_matrix = np.zeros((len(word_index) + 1, 300))
# for word, i in word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

## Text / NLP based features

In [258]:
# %%time
# trainDF['char_count'] = trainDF['text'].apply(len)
# trainDF['word_count'] = trainDF['text'].apply(lambda x: len(x.split()))
# trainDF['word_density'] = trainDF['char_count'] / (trainDF['word_count']+1)
# trainDF['punctuation_count'] = trainDF['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
# trainDF['title_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
# trainDF['upper_case_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [259]:
# %%time

# import textblob

# pos_family = {
#     'noun' : ['NN','NNS','NNP','NNPS'],
#     'pron' : ['PRP','PRP$','WP','WP$'],
#     'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
#     'adj' :  ['JJ','JJR','JJS'],
#     'adv' : ['RB','RBR','RBS','WRB']
# }

# # function to check and get the part of speech tag count of a words in a given sentence
# def check_pos_tag(x, flag):
#     cnt = 0
#     try:
#         wiki = textblob.TextBlob(x)
#         for tup in wiki.tags:
#             ppo = list(tup)[1]
#             if ppo in pos_family[flag]:
#                 cnt += 1
#     except:
#         pass
#     return cnt

# trainDF['noun_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'noun'))
# trainDF['verb_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'verb'))
# trainDF['adj_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adj'))
# trainDF['adv_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adv'))
# trainDF['pron_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'pron'))

## Topic Models as features

In [260]:
# # train a LDA Model
# lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
# X_topics = lda_model.fit_transform(text_train_count)
# topic_word = lda_model.components_
# vocab = count_vect.get_feature_names()

# # view the topic models
# n_top_words = 10
# topic_summaries = []
# for i, topic_dist in enumerate(topic_word):
#     topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
#     topic_summaries.append(' '.join(topic_words))

In [279]:
%%time

import os
from gensim import corpora, models

class MyLDAVectorizer(MyCountVectorizer):
    mallet_path = ".." + os.sep + "mallet-2.0.8"+ os.sep + "bin" + os.sep +"mallet"
    
    def __init__(self, TOPIC_COUNT = 100):
        self.topic_count = TOPIC_COUNT
    
    def fit(self, texts):
        self.dictionary = corpora.Dictionary(texts)
        _corpus = [self.dictionary.doc2bow(_doc) for _doc in texts]
        self.tfidf_model = models.TfidfModel(_corpus)
        _tfidf_corpus = self.tfidf_model[_corpus]

#         self.vect_model = models.LdaModel(_tfidf_corpus, 
#                             num_topics = self.topic_count, 
#                             id2word = self.dictionary,
#                             random_state = 100,
#                             eval_every = 5, 
#                             alpha = 'auto', 
#                             gamma_threshold = 0.01)
        
        self.vect_model = models.wrappers.LdaMallet(self.mallet_path, 
                                                     corpus = _corpus, 
                                                     num_topics = self.topic_count, 
                                                     id2word = self.dictionary)
    
    def toarray(self, doc_topics):
        _doc_vect  = np.zeros((len(doc_topics), self.topic_count))
        
        for i, _doc in enumerate(doc_topics):
            for _topic, _weight in _doc:
                _doc_vect[i][_topic] = _weight
        
        return _doc_vect
        
    def transform(self, texts):
        _corpus = [self.dictionary.doc2bow(_doc) for _doc in texts]
#         _tfidf_corpus = self.tfidf_model[_corpus]
        
        return self.toarray(self.vect_model[_corpus])

lda_vect = MyLDAVectorizer(TOPIC_COUNT = 200)
# lda_vect.fit(train_stemmed_text)
lda_vect.fit(LABELED_STEMMED_TEXTS)
train_lda = lda_vect.transform(train_stemmed_text)
test_lda = lda_vect.transform(test_stemmed_text)
labeled_lda = lda_vect.transform(LABELED_STEMMED_TEXTS)
labeled_lda_extra_all = np.hstack([labeled_lda, labeled_extra_all])
labeled_lda_extra_cuisine_zipcode = np.hstack([labeled_lda, labeled_extra_cuisine_zipcode])

CPU times: user 4.99 s, sys: 275 ms, total: 5.27 s
Wall time: 1min 4s


## Doc2Vec

In [280]:
%%time

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np

class MyDoc2Vectorizer(MyCountVectorizer):
    def __init__(self, size = 100):
        self.embedding_size = size
            
    def fit(self, texts):
        _docs = [TaggedDocument(_doc, [i]) for i, _doc in enumerate(texts)]
        self.vect_model = Doc2Vec(_docs, 
                                  vector_size = self.embedding_size, 
                                  window = 5,
                                  min_count = 3,
                                  epochs = 40, 
                                  workers = 4)
        
    def transform(self, texts):
        return np.asarray([self.vect_model.infer_vector(_text) for _text in texts])

doc2vec_vect = MyDoc2Vectorizer(size = 200)
doc2vec_vect.fit(ALL_STEMMED_TEXTS)
# doc2vec_vect.fit(train_stemmed_text)
train_doc2vec = doc2vec_vect.transform(train_stemmed_text)
test_doc2vec = doc2vec_vect.transform(test_stemmed_text)
labeled_doc2vec = doc2vec_vect.transform(LABELED_STEMMED_TEXTS)
labeled_doc2vec_extra_all = np.hstack([labeled_doc2vec, labeled_extra_all])
labeled_doc2vec_extra_cuisine_zipcode = np.hstack([labeled_doc2vec, labeled_extra_cuisine_zipcode])

CPU times: user 14min 2s, sys: 14.8 s, total: 14min 17s
Wall time: 4min 35s


In [265]:
#Test quality

import collections

ranks = []
for doc_id in range(len(train_stemmed_text)):
    inferred_vector = doc2vec_vect.vect_model.infer_vector(train_stemmed_text[train_stemmed_text.index[doc_id]])
    sims = doc2vec_vect.vect_model.docvecs.most_similar([inferred_vector], topn=len(doc2vec_vect.vect_model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

collections.Counter(ranks)

Counter({11596: 1,
         8485: 1,
         3093: 1,
         221: 1,
         8521: 2,
         1237: 1,
         5879: 1,
         6731: 1,
         1611: 1,
         12589: 1,
         5535: 1,
         5: 1,
         5310: 1,
         10324: 1,
         7754: 1,
         4890: 1,
         391: 1,
         1976: 1,
         11106: 1,
         2432: 1,
         1014: 1,
         9043: 1,
         8783: 1,
         3411: 1,
         3377: 1,
         8359: 1,
         4268: 1,
         1599: 1,
         9125: 1,
         1594: 1,
         8707: 1,
         10676: 1,
         11748: 1,
         8219: 1,
         3273: 1,
         3302: 1,
         4692: 1,
         7485: 1,
         7201: 1,
         7834: 1,
         8897: 1,
         7142: 1,
         610: 1,
         6360: 1,
         10809: 1,
         9684: 1,
         13025: 1,
         8871: 1,
         12817: 1,
         8566: 1,
         12674: 1,
         9265: 1,
         633: 1,
         9753: 1,
         7830: 1,
       

# Models

In [332]:
from sklearn import linear_model, naive_bayes, metrics, svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate

def show_score(classifier_name, scores):
    print("Accuracy:%0.2f Precission:%0.2f Recall:%0.2f F1:%0.2f"%scores, "-> [%s]"%(classifier_name))
    
def train_model(classifier, train_feature, train_label, test_feature, test_label, is_neural_net=False):
#     print(train_feature)
    # fit the training dataset on the classifier
    classifier.fit(train_feature, train_label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(test_feature)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    print(metrics.confusion_matrix(test_label, predictions))
    return (metrics.accuracy_score(test_label, predictions),
            metrics.precision_score(test_label, predictions),
            metrics.recall_score(test_label, predictions),
            metrics.f1_score(test_label, predictions))

def run_model(classfier_configs):
    for _name in classfier_configs:
#         print(classfier_configs[_name].train)
        scores = train_model(classfier_configs[_name][0], classfier_configs[_name][1], train_label, classfier_configs[_name][2], test_label)
        show_score(_name, scores)

def eval_model(classfier_configs, param_distributions = None, do_grid_search = False):
    test_scores = []
    best_params = []
    model_names = []
    
    for _name in classfier_configs:
        model_names.append(_name)
        if not (param_distributions is None):
            if do_grid_search:
                best_model = GridSearchCV(classfier_configs[_name][0], 
                                             scoring = "f1", 
                                             param_grid = param_distributions, 
                                             cv = 5)
            else:
                best_model = RandomizedSearchCV(classfier_configs[_name][0], 
                                             scoring = "f1", 
                                             param_distributions = param_distributions, 
                                           cv = 5)
            
            best_model.fit(classfier_configs[_name][1], classfier_configs[_name][2])
            _score = best_model.best_score_
            _params = best_model.best_params_

        else:
            cv_results = cross_validate(classfier_configs[_name][0], 
                                        classfier_configs[_name][1], 
                                        classfier_configs[_name][2],
                                        scoring = 'f1',
                                        cv = 5)
            
            _score = np.mean(cv_results['test_score'])
            _params = None
        
        print(_score, '\t', _params, '\t', _name)
        test_scores.append(_score)
        best_params.append(_params)
    
    best_of_best = np.argmax(test_scores)
    print('Best Model:[', model_names[best_of_best], '] Test Score:', "%0.3f"%(test_scores[best_of_best]))
        
    return {'test_scores': test_scores, 'best_parameters': best_params}

## Naive Bayes 

In [333]:
# nb_config = {
#     "NB Count": (naive_bayes.MultinomialNB(), train_count, test_count),
#     "NB TFIDF": (naive_bayes.MultinomialNB(), train_tfidf, test_tfidf),
#     "NB TFIDF NGram": (naive_bayes.MultinomialNB(), train_tfidf_ngram, test_tfidf_ngram),
#     "NB TFIDF NGram Chars": (naive_bayes.MultinomialNB(), train_tfidf_ngram_chars, test_tfidf_ngram_chars),
#     "NB LDA": (naive_bayes.MultinomialNB(), train_lda, test_lda),
# }

# run_model(nb_config)

nb_config = {
    "NB Extra All": (naive_bayes.MultinomialNB(), labeled_extra_all, LABELS)
    , "NB Extra Cuisine Zipcode": (naive_bayes.MultinomialNB(), labeled_extra_cuisin_zipcode, LABELS)
    , "NB Count": (naive_bayes.MultinomialNB(), labeled_count, LABELS)
    , "NB Count + Extra All": (naive_bayes.MultinomialNB(), labeled_count_extra_all, LABELS)
    , "NB Count + Extra Cuisine Zipcode": (naive_bayes.MultinomialNB(), labeled_count_extra_cuisine_zipcode, LABELS)
    , "NB TFIDF": (naive_bayes.MultinomialNB(), labeled_tfidf, LABELS)
    , "NB TFIDF + Extra All": (naive_bayes.MultinomialNB(), labeled_tfidf_extra_all, LABELS)
    , "NB TFIDF + Extra Cuisine Zipcode": (naive_bayes.MultinomialNB(), labeled_tfidf_extra_cuisine_zipcode, LABELS)
    , "NB TFIDF NGram": (naive_bayes.MultinomialNB(), labeled_tfidf_ngram, LABELS)
    , "NB TFIDF NGram + Extra All": (naive_bayes.MultinomialNB(), labeled_tfidf_ngram_extra, LABELS)
    , "NB TFIDF NGram + Extra Cuisine ZipCode": (naive_bayes.MultinomialNB(), labeled_tfidf_ngram_extra_cuisine_zipcode, LABELS)
#     , "NB TFIDF NGram Chars": (naive_bayes.MultinomialNB(), labeled_tfidf_ngram_chars, LABELS)
#     , "NB TFIDF NGram Chars + Extra": (naive_bayes.MultinomialNB(), labeled_tfidf_ngram_chars_extra, LABELS)
    , "NB LDA": (naive_bayes.MultinomialNB(), labeled_lda, LABELS)
    , "NB LDA + Extra All": (naive_bayes.MultinomialNB(), labeled_lda_extra_all, LABELS)
    , "NB LDA + Extra Cuisine Zipcode": (naive_bayes.MultinomialNB(), labeled_lda_extra_cuisine_zipcode, LABELS)
}

nb_eval = eval_model(nb_config)

0.6059380201315685 	 None 	 NB Extra All
0.6272072595274806 	 None 	 NB Extra Cuisine Zipcode
0.657405178554519 	 None 	 NB Count
0.6618923600434579 	 None 	 NB Count + Extra All
0.6525285916293603 	 None 	 NB Count + Extra Cuisine Zipcode
0.6816912232452474 	 None 	 NB TFIDF
0.7037103358645334 	 None 	 NB TFIDF + Extra All
0.6760530479848176 	 None 	 NB TFIDF + Extra Cuisine Zipcode
0.6502677467936558 	 None 	 NB TFIDF NGram
0.6928174173727925 	 None 	 NB TFIDF NGram + Extra All
0.6623921796651286 	 None 	 NB TFIDF NGram + Extra Cuisine ZipCode
0.6466715366588932 	 None 	 NB LDA
0.6139791475610921 	 None 	 NB LDA + Extra All
0.6454713229354379 	 None 	 NB LDA + Extra Cuisine Zipcode
Best Model:[ NB TFIDF + Extra All ] Test Score: 0.704


In [334]:
# nb_model = naive_bayes.MultinomialNB()
# nb_model.fit(train_tfidf, train_label)
# predictions = nb_model.predict(test_tfidf)     
# predictions_probs = nb_model.predict_proba(test_tfidf)

# new_predictions = []
# for p in predictions_probs:
#     if p[0] > 0.4:
#          new_predictions.append(0)
#     else:
#         new_predictions.append(1)
        
# print(metrics.confusion_matrix(test_label, predictions))
# print (metrics.accuracy_score(test_label, predictions),
#             metrics.precision_score(test_label, predictions),
#             metrics.recall_score(test_label, predictions),
#             metrics.f1_score(test_label, predictions))

# print(metrics.confusion_matrix(test_label, new_predictions))
# print (metrics.accuracy_score(test_label, new_predictions),
#             metrics.precision_score(test_label, new_predictions),
#             metrics.recall_score(test_label, new_predictions),
#             metrics.f1_score(test_label, new_predictions))

Accuracy:0.55 Precission:0.62 Recall:0.52 F1:0.56 -> [NB Count]
Accuracy:0.51 Precission:0.88 Recall:0.49 F1:0.63 -> [NB TFIDF]
Accuracy:0.52 Precission:0.58 Recall:0.49 F1:0.53 -> [NB TFIDF NGram]
Accuracy:0.49 Precission:0.96 Recall:0.48 F1:0.64 -> [NB TFIDF NGram Chars]
Accuracy:0.55 Precission:0.69 Recall:0.52 F1:0.60 -> [NB LDA]

##  Linear Classifier

In [335]:
from sklearn.linear_model import LogisticRegression

# linear_parameters = {'penalty':('l1', 'l2'), 'C':[10, 1, 0.1, 0.01]}
# linear_model = GridSearchCV(LogisticRegression(solver='saga'), scoring = "f1",param_grid = linear_parameters, cv=5)

# linear_model.fit(train_count, train_label)

# predictions = linear_model.predict(test_count)

# (metrics.accuracy_score(predictions, test_label),
#             metrics.precision_score(predictions, test_label),
#             metrics.recall_score(predictions, test_label),
#             metrics.f1_score(predictions, test_label))

# print(linear_model.best_params_)

In [None]:
%% time
# linear_config = {
#     "Linear Count": (LogisticRegression(C= 0.1, penalty='l1', tol=0.1, solver='saga'), train_count, test_count)
#     , "Linear TFIDF": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), train_tfidf, test_tfidf)
#     , "Linear TFIDF NGram": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), train_tfidf_ngram, test_tfidf_ngram)
#     , "Linear TFIDF NGram Chars": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), train_tfidf_ngram_chars, test_tfidf_ngram_chars)
# #     , "Linear FastText Embedding": (LogisticRegression(solver='lbfgs', max_iter=int(1e6)), train_fasttext_embedding, test_fasttext_embedding)
#     , "Linear LDA": (LogisticRegression(solver='lbfgs'), train_lda, test_lda)
#     , "Linear Doc2Vec": (LogisticRegression(solver='lbfgs'), train_doc2vec, test_doc2vec)
# }

# run_model(linear_config)

linear_parameters = {'penalty':('l1', 'l2'), 'C':[10, 1, 0.1, 0.01], 'solver':['saga']}

linear_config = {
    "Linear Extra All": (LogisticRegression(), labeled_extra_all, LABELS)
    , "Linear Extra Cuisine Zipcode": (LogisticRegression(), labeled_extra_cuisin_zipcode, LABELS)
    , "Linear Count": (LogisticRegression(), labeled_count, LABELS)
    , "Linear Count Extra All": (LogisticRegression(), labeled_count_extra_all, LABELS)
    , "Linear Count Extra Cuisine+Zipcode": (LogisticRegression(), labeled_count_extra_cuisine_zipcode, LABELS)
    , "Linear TFIDF": (LogisticRegression(), labeled_tfidf, LABELS)
    , "Linear TFIDF Extra All": (LogisticRegression(), labeled_tfidf_extra_all, LABELS)
    , "Linear TFIDF Extra Cuisine Zipcode": (LogisticRegression(), labeled_tfidf_extra_cuisine_zipcode, LABELS)
    , "Linear TFIDF NGram": (LogisticRegression(), labeled_tfidf_ngram, LABELS)
    , "Linear TFIDF NGram Extra All": (LogisticRegression(), labeled_tfidf_ngram_extra_all, LABELS)
    , "Linear TFIDF NGram Extra Cuisine Zipcode": (LogisticRegression(), labeled_tfidf_ngram_extra_cuisine_zipcode, LABELS)

#     , "Linear TFIDF NGram Chars": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), labeled_tfidf_ngram_chars, LABELS)
#     , "Linear TFIDF NGram Chars Extra": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), labeled_tfidf_ngram_chars_extra, LABELS)
    , "Linear LDA": (LogisticRegression(), labeled_lda, LABELS)
    , "Linear LDA Extra All": (LogisticRegression(), labeled_lda_extra_all, LABELS)
    , "Linear LDA Extra Cuisine Zipcode": (LogisticRegression(), labeled_lda_extra_cuisine_zipcode, LABELS)
    , "Linear Doc2Vec": (LogisticRegression(), labeled_doc2vec, LABELS)
    , "Linear Doc2Vec Extra All": (LogisticRegression(), labeled_doc2vec_extra_all, LABELS)
    , "Linear Doc2Vec Extra Cuisine Zipcode": (LogisticRegression(), labeled_doc2vec_extra_all, LABELS)
}

linear_eval = eval_model(linear_config, param_distributions = linear_parameters, do_grid_search = True)





0.5735167000739209 	 {'C': 1, 'penalty': 'l1', 'solver': 'saga'} 	 Linear Extra All


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.642433672505511 	 {'C': 1, 'penalty': 'l2', 'solver': 'saga'} 	 Linear Extra Cuisine Zipcode




0.6515652424608699 	 {'C': 10, 'penalty': 'l2', 'solver': 'saga'} 	 Linear Count






0.6524011955097618 	 {'C': 0.01, 'penalty': 'l1', 'solver': 'saga'} 	 Linear Count Extra All




In [337]:
linear_eval

{'test_scores': [0.626801630824723,
  0.6449984819134719,
  0.578211846856285,
  0.5722496078405415,
  0.5815346549774638,
  0.6262137864319106,
  0.649781461802015,
  0.6525754468816669,
  0.6133425467129582,
  0.6487110688085156,
  0.6508696294298575,
  0.6070709000705748,
  0.6256979013438977,
  0.6407545503102018,
  0.5443858998144713,
  0.558356583800378,
  0.558356583800378],
 'best_parameters': [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None]}

## SVM

In [206]:
from sklearn.svm import SVC

# svm_parameters = {'kernel':('linear', 'rbf'), 'C':[0.1, 1, 10, 100], "gamma": np.logspace(-2, 2, 5)}
# svm_model = GridSearchCV(SVC(), scoring = "f1", param_grid = svm_parameters, cv=5)

# svm_model.fit(train_tfidf_ngram_chars, train_label)

# predictions = svm_model.predict(test_tfidf_ngram_chars)

# print(metrics.accuracy_score(predictions, test_label),
#             metrics.precision_score(predictions, test_label),
#             metrics.recall_score(predictions, test_label),
#             metrics.f1_score(predictions, test_label))
# print(svm_model.best_params_)

In [270]:
# svm_config = {
#     "SVM Count": (SVC(C = 1, gamma = 0.1, kernel='rbf'), train_count, test_count),
#     "SVM TFIDF": (SVC(C = 10, gamma = 0.01, kernel='rbf'), train_tfidf, test_tfidf),
#     "SVM TFIDF NGram": (SVC(C = 1, gamma = 0.01, kernel='linear'), train_tfidf_ngram, test_tfidf_ngram),
#     "SVM TFIDF NGram Chars": (SVC(C = 10, gamma = 0.01, kernel='rbf'), train_tfidf_ngram_chars, test_tfidf_ngram_chars),
# #     "SVM FastText Embedding": (SVC(C = 1, gamma = 0.1, kernel='rbf'), train_fasttext_embedding, test_fasttext_embedding),
#     "SVM LDA": (SVC(C = 1, gamma = 0.1, kernel='rbf'), train_lda, test_lda),
#     "SVM Doc2Vec": (SVC(C = 1, gamma = 0.1, kernel='rbf'), train_doc2vec, test_doc2vec),
# }
    
# run_model(svm_config)

svm_config = {
    "SVM Extra Only": (SVC(C = 1, gamma = 0.1, kernel='rbf'), labeled_extra_all, LABELS)
    , "SVM Extra Cuisin & Zipcode": (SVC(C = 1, gamma = 0.1, kernel='rbf'), labeled_extra_cuisin_zipcode, LABELS)
    , "SVM Count": (SVC(C = 1, gamma = 0.1, kernel='rbf'), labeled_count, LABELS)
    , "SVM Count Extra All": (SVC(C = 1, gamma = 0.1, kernel='rbf'), labeled_count_extra_all, LABELS)
    , "SVM Count Extra Cuisine Zipcode": (SVC(C = 1, gamma = 0.1, kernel='rbf'), labeled_count_extra_cuisine_zipcode, LABELS)
    , "SVM TFIDF": (SVC(C = 10, gamma = 0.01, kernel='rbf'), labeled_tfidf, LABELS)
    , "SVM TFIDF Extra": (SVC(C = 10, gamma = 0.01, kernel='rbf'), labeled_tfidf_extra, LABELS)
    , "SVM TFIDF NGram": (SVC(C = 1, gamma = 0.01, kernel='linear'), labeled_tfidf_ngram, LABELS)
    , "SVM TFIDF NGram Extra": (SVC(C = 1, gamma = 0.01, kernel='linear'), labeled_tfidf_ngram_extra, LABELS)
    , "SVM TFIDF NGram Chars": (SVC(C = 10, gamma = 0.01, kernel='rbf'), labeled_tfidf_ngram_chars, LABELS)
    , "SVM TFIDF NGram Chars Extra": (SVC(C = 10, gamma = 0.01, kernel='rbf'), labeled_tfidf_ngram_chars_extra, LABELS)
    , "SVM LDA": (SVC(C = 1, gamma = 0.1, kernel='rbf'), labeled_lda, LABELS)
    , "SVM LDA Extra": (SVC(C = 1, gamma = 0.1, kernel='rbf'), labeled_lda_extra, LABELS)
    , "SVM Doc2Vec": (SVC(C = 1, gamma = 0.1, kernel='rbf'), labeled_doc2vec, LABELS)
    , "SVM Doc2Vec Extra": (SVC(C = 1, gamma = 0.1, kernel='rbf'), labeled_doc2vec_extra, LABELS)
}

eval_model(svm_config)

[0.59649123 0.54       0.49122807 0.59677419 0.57692308] 0.5602833137434157 	 SVM Extra Only
[0.6        0.71028037 0.66071429 0.60952381 0.58333333] 0.632770360480641 	 SVM Extra Cuisin & Zipcode
[0.675      0.6625     0.65822785 0.66216216 0.65771812] 0.6631216262137595 	 SVM Count
[0.675      0.6625     0.65822785 0.65771812 0.65771812] 0.6622328179424009 	 SVM Count Extra
[0.58695652 0.56818182 0.55913978 0.60674157 0.42666667] 0.549537272913512 	 SVM TFIDF
[0.59047619 0.66666667 0.55855856 0.62295082 0.54545455] 0.5968213561656184 	 SVM TFIDF Extra
[0.58333333 0.62626263 0.71428571 0.56       0.56179775] 0.6091358853381326 	 SVM TFIDF NGram
[0.63247863 0.72897196 0.65       0.63247863 0.58585859] 0.6459575626865347 	 SVM TFIDF NGram Extra
[0.67096774 0.67114094 0.63380282 0.64827586 0.67647059] 0.6601315897476935 	 SVM TFIDF NGram Chars
[0.59047619 0.66       0.55855856 0.61788618 0.55445545] 0.5962752746882183 	 SVM TFIDF NGram Chars Extra
[0.57471264 0.30555556 0.48192771 0.4166

## Random Forest

In [290]:
%%time

rf_parameters = {'n_estimators':[100, 200, 500], 'max_features':[None, 0.25, 0.5, 0.75],
                'max_depth': [None, 5, 10], 'min_samples_leaf': [0.0005, 0.01, 0.05, 0.1],
                 'min_samples_split':[2, 5, 10]}
labeled_data = [labeled_extra_all, labeled_extra_cuisin_zipcode, 
                , labeled_count, labeled_count_extra_all, labeled_count_extra_cuisin_zipcode
                , labeled_tfidf, labeled_tfidf_extra_all, labeled_tfidf_extra_cuisin_zipcode,
                , labeled_tfidf_ngram, labeled_tfidf_ngram_extra_all, labeled_tfidf_ngram_extra_cuisin_zipcode,
                , labeled_lda, labeled_lda_extra_all, labeled_lda_extra__cuisin_zipcode
                , labeled_doc2vec, labeled_doc2vec_extra_all, labeled_lda_extra__cuisin_zipcode]

rf_model = RandomizedSearchCV(RandomForestClassifier(), n_iter = 100, scoring = "f1", param_distributions = rf_parameters, cv=5)

for _data in labeled_data:
    %time rf_model.fit(_data, LABELS)
    print(rf_model.best_score_)
    print(rf_model.best_params_)
    
# predictions = rf_model.predict(test_tfidf_ngram_chars)

# print(metrics.accuracy_score(predictions, test_label),
#             metrics.precision_score(predictions, test_label),
#             metrics.recall_score(predictions, test_label),
#             metrics.f1_score(predictions, test_label))
# print(rf_model.best_params_)

KeyboardInterrupt: 

In [271]:
from sklearn.ensemble import RandomForestClassifier

rf_config = {
    "RF Extra Only": (RandomForestClassifier(criterion = 'entropy', n_estimators = 500), 
                      labeled_extra_all, LABELS)
    , "RF Extra Cuisine + Zipcode": (RandomForestClassifier(criterion = 'entropy', n_estimators = 100), labeled_extra_cuisin_zipcode, LABELS)
    , "RF Count": (RandomForestClassifier(criterion = 'entropy', n_estimators = 100), labeled_count, LABELS)
    , "RF Count Extra": (RandomForestClassifier(criterion = 'entropy', n_estimators = 100), labeled_count_extra, LABELS)
    , "RF TFIDF": (RandomForestClassifier(criterion = 'entropy', n_estimators = 200), labeled_tfidf, LABELS)
    , "RF TFIDF Extra": (RandomForestClassifier(criterion = 'entropy', n_estimators = 200), labeled_tfidf_extra, LABELS)
    , "RF TFIDF NGram": (RandomForestClassifier(criterion = 'gini', n_estimators = 200), labeled_tfidf_ngram, LABELS)
    , "RF TFIDF NGram Extra": (RandomForestClassifier(criterion = 'gini', n_estimators = 200), labeled_tfidf_ngram_extra, LABELS)
    , "RF TFIDF NGram Chars": (RandomForestClassifier(criterion = 'entropy', n_estimators = 200), labeled_tfidf_ngram_chars, LABELS)
    , "RF TFIDF NGram Chars Extra": (RandomForestClassifier(criterion = 'entropy', n_estimators = 200), labeled_tfidf_ngram_chars_extra, LABELS)
    , "RF LDA": (RandomForestClassifier(criterion = 'gini', n_estimators = 300), labeled_lda, LABELS)
    , "RF LDA Extra": (RandomForestClassifier(criterion = 'gini', n_estimators = 300), labeled_lda_extra, LABELS)
    , "RF Doc2Vec": (RandomForestClassifier(criterion = 'gini', n_estimators = 300), labeled_doc2vec, LABELS)
    , "RF Doc2Vec Extra": (RandomForestClassifier(criterion = 'gini', n_estimators = 300), labeled_doc2vec_extra, LABELS)
}

eval_model(rf_config)

[0.58715596 0.64705882 0.62608696 0.60377358 0.6       ] 0.6128150656519127 	 RF Extra Only
[0.63247863 0.69724771 0.65546218 0.61403509 0.59405941] 0.6386566034868986 	 RF Extra Cuisine + Zipcode
[0.65420561 0.58       0.5840708  0.64814815 0.5625    ] 0.6057849104169921 	 RF Count
[0.65486726 0.69565217 0.57657658 0.66055046 0.55670103] 0.6288694993540439 	 RF Count Extra
[0.55238095 0.67857143 0.60194175 0.64150943 0.61363636] 0.6176079852247649 	 RF TFIDF
[0.59047619 0.64912281 0.65420561 0.64761905 0.59183673] 0.626652077456659 	 RF TFIDF Extra
[0.56470588 0.55319149 0.53763441 0.53488372 0.44444444] 0.5269719891382941 	 RF TFIDF NGram
[0.64583333 0.63366337 0.59183673 0.59574468 0.45783133] 0.5849818881032226 	 RF TFIDF NGram Extra
[0.62962963 0.61403509 0.59130435 0.61403509 0.56842105] 0.6034850411051784 	 RF TFIDF NGram Chars
[0.57692308 0.63247863 0.55045872 0.66037736 0.51612903] 0.5872733631493341 	 RF TFIDF NGram Chars Extra
[0.58490566 0.6504065  0.55652174 0.63636364 0.4

## Boosting

In [208]:
# %%time

from xgboost import XGBClassifier

# xgb_parameters = {
#         'min_child_weight': [1, 5, 10],
#         'gamma': [0.5, 1, 1.5, 2, 5],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'max_depth': [3, 4, 5]
#         }

# xgb_model = GridSearchCV(XGBClassifier(), scoring = "f1", param_grid = xgb_parameters, cv=5, verbose = 3)

# xgb_model.fit(train_count, train_label)

# predictions = xgb_model.predict(test_count)

# print(metrics.accuracy_score(predictions, test_label),
#             metrics.precision_score(predictions, test_label),
#             metrics.recall_score(predictions, test_label),
#             metrics.f1_score(predictions, test_label))

# print(xgb_model.best_params_)

In [272]:
# xgboost_config = {
#     "XGBoost Count": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_count, test_count)
#     , "XGBoost TFIDF": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_tfidf, test_tfidf)
#     , "XGBoost TFIDF NGram": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_tfidf_ngram.tocsc(), test_tfidf_ngram.tocsc())
#     , "XGBoost TFIDF NGram Chars": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_tfidf_ngram_chars, test_tfidf_ngram_chars)
#     , "XGBoost LDA": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_lda, test_lda)
#     , "XGBoost Doc2Vec": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_doc2vec, test_doc2vec)
# }

# xgboost_config = {
#     "XGBoost Count": (XGBClassifier(), train_count, test_count)
#     , "XGBoost TFIDF": (XGBClassifier(), train_tfidf, test_tfidf)
#     , "XGBoost TFIDF NGram": (XGBClassifier(), train_tfidf_ngram.tocsc(), test_tfidf_ngram.tocsc())
#     , "XGBoost TFIDF NGram Chars": (XGBClassifier(), train_tfidf_ngram_chars, test_tfidf_ngram_chars)
#     , "XGBoost LDA": (XGBClassifier(), train_lda, test_lda)
#     , "XGBoost Doc2Vec": (XGBClassifier(), train_doc2vec, test_doc2vec)
# }

# run_model(xgboost_config)

xgboost_config = {
    "XGBoost Extra Only": (XGBClassifier(), labeled_extra_all, LABELS)
    , "XGBoost Extra Cuisine + Zipcode": (XGBClassifier(), labeled_extra_cuisin_zipcode, LABELS)
    , "XGBoost Count": (XGBClassifier(), labeled_count, LABELS)
    , "XGBoost Count Extra": (XGBClassifier(), labeled_count_extra, LABELS)
    , "XGBoost TFIDF": (XGBClassifier(), labeled_tfidf, LABELS)
    , "XGBoost TFIDF Extra": (XGBClassifier(), labeled_tfidf_extra, LABELS)
    , "XGBoost TFIDF NGram": (XGBClassifier(), labeled_tfidf_ngram, LABELS)
    , "XGBoost TFIDF NGram Extra": (XGBClassifier(), labeled_tfidf_ngram_extra, LABELS)
    , "XGBoost TFIDF NGram Chars": (XGBClassifier(), labeled_tfidf_ngram_chars, LABELS)
    , "XGBoost TFIDF NGram Chars Extra": (XGBClassifier(), labeled_tfidf_ngram_chars_extra, LABELS)
    , "XGBoost LDA": (XGBClassifier(), labeled_lda, LABELS)
    , "XGBoost LDA Extra": (XGBClassifier(), labeled_lda_extra, LABELS)
    , "XGBoost Doc2Vec": (XGBClassifier(), labeled_doc2vec, LABELS)
    , "XGBoost Doc2Vec Extra": (XGBClassifier(), labeled_doc2vec_extra, LABELS)
}

eval_model(xgboost_config)

[0.66666667 0.63265306 0.61261261 0.60176991 0.57446809] 0.6176340674229154 	 XGBoost Extra Only
[0.61818182 0.71559633 0.63793103 0.60714286 0.5625    ] 0.6282704080165327 	 XGBoost Extra Cuisine + Zipcode
[0.66666667 0.61111111 0.5631068  0.65420561 0.56      ] 0.6110180362741835 	 XGBoost Count
[0.62184874 0.57692308 0.56862745 0.63551402 0.49462366] 0.5795073884009668 	 XGBoost Count Extra
[0.625      0.60377358 0.59459459 0.62385321 0.58585859] 0.606615995273603 	 XGBoost TFIDF
[0.62385321 0.59405941 0.59813084 0.64864865 0.60416667] 0.6137717546773158 	 XGBoost TFIDF Extra
[0.63366337 0.62857143 0.62264151 0.58333333 0.61538462] 0.6167188506119947 	 XGBoost TFIDF NGram
[0.62626263 0.60784314 0.59615385 0.59405941 0.62745098] 0.6103539992008251 	 XGBoost TFIDF NGram Extra
[0.4952381  0.62711864 0.55855856 0.55238095 0.59405941] 0.5654711312371995 	 XGBoost TFIDF NGram Chars
[0.54054054 0.57657658 0.57391304 0.55769231 0.56565657] 0.5628758067888503 	 XGBoost TFIDF NGram Chars Extr