###### Packages:
* [NLTK](http://www.nltk.org/howto/classify.html)
* [SpaCy](https://spacy.io/)
* [AllenNLP](https://allennlp.org/tutorials)

Articles:
* [A Comprehensive Guide to Understand and Implement Text Classification in Python](https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/)
* [Machine Learning, NLP: Text Classification using scikit-learn, python and NLTK](https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a)
* [State-of-the-Art Text Classification using BERT model: “Predict the Happiness” Challenge](https://appliedmachinelearning.blog/2019/03/04/state-of-the-art-text-classification-using-bert-model-predict-the-happiness-hackerearth-challenge/)

In [1]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from sklearn import model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from tqdm import tqdm
import numpy as np
import pandas as pd
import string
import time

stemmer = SnowballStemmer('english')
# One shared lemmatizer: this helper runs once per token of the whole corpus,
# so building a new WordNetLemmatizer on every call is avoidable overhead.
_lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatize `text` as a verb (pos='v'), then stem it with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        A single token.

    Returns
    -------
    str
        The stemmed form of the lemmatized token.
    """
    return stemmer.stem(_lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize one raw document and derive its stop-word-free stemmed form.

    Returns a pair ``(tokens, stemmed_tokens)``:
    * ``tokens``: every token produced by ``simple_preprocess`` with
      ``min_len=2``;
    * ``stemmed_tokens``: the same tokens with gensim ``STOPWORDS`` removed,
      each passed through ``lemmatize_stemming``.
    """
    tokens = list(simple_preprocess(text, min_len = 2))
    stemmed_tokens = [lemmatize_stemming(_tok) for _tok in tokens if _tok not in STOPWORDS]

    return (tokens, stemmed_tokens)

# Labels: one integer per line; lines that do not start with a digit are skipped.
with open("./data/raw/Hygiene/hygiene.dat.labels") as f:
    LABELS = [int(_line) for _line in f if _line[0].isdigit()]

# Raw documents, one per line.
with open("./data/raw/Hygiene/hygiene.dat") as f:
    ALL_RAW_TEXTS = list(f)

# Tokenize every document once, keeping both the plain and the stemmed tokens.
ALL_TEXTS = []
ALL_STEMMED_TEXTS = []
for _raw_text in tqdm(ALL_RAW_TEXTS):
    _tokens, _stems = preprocess(_raw_text)
    ALL_TEXTS.append(_tokens)
    ALL_STEMMED_TEXTS.append(_stems)

ALL_CONCAT_STEMMED_TEXTS = [" ".join(_doc) for _doc in ALL_STEMMED_TEXTS]

# The first len(LABELS) documents carry a label; everything after them is the
# submission set.
_n_labeled = len(LABELS)

LABELED_TEXTS = ALL_TEXTS[:_n_labeled]
LABELED_STEMMED_TEXTS = ALL_STEMMED_TEXTS[:_n_labeled]
SUBMIT_TEXTS = ALL_TEXTS[_n_labeled:]
SUBMIT_STEMMED_TEXTS = ALL_STEMMED_TEXTS[_n_labeled:]

# Whitespace-joined versions, as expected by the scikit-learn vectorizers.
LABELED_CONCAT_TEXTS = [" ".join(_doc) for _doc in LABELED_TEXTS]
LABELED_CONCAT_STEMMED_TEXTS = [" ".join(_doc) for _doc in LABELED_STEMMED_TEXTS]
SUBMIT_CONCAT_TEXTS = [" ".join(_doc) for _doc in SUBMIT_TEXTS]
SUBMIT_CONCAT_STEMMED_TEXTS = [" ".join(_doc) for _doc in SUBMIT_STEMMED_TEXTS]

100%|████████████████████████████████████████████████████████████████████████████| 13299/13299 [03:19<00:00, 66.66it/s]


### Handle Extra Features

In [2]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Additional per-record features; the file has no header row, so columns are
# addressed by integer position.
FEATURE_MORE = pd.read_csv(
    "./data/raw/Hygiene/hygiene.dat.additional",
    header=None,
)
# Starts empty; only the commented-out experiments below populate it.
EXTRA_FEATURE = pd.DataFrame()


# ALL_SENTIMENT = []
# sid = SentimentIntensityAnalyzer()
# for _text in tqdm(ALL_RAW_TEXTS):
#     _s = sid.polarity_scores(_text)
#     ALL_SENTIMENT.append([_s['neg'], _s['pos']]) 

# code_map = 'abcdefghij'
# # code_map = '0123456789'

# def encode_zipcode(zip_code):
#     _code = ''
#     _rest = zip_code    
#     while _rest > 0:
#         _code = code_map[_rest % 10] + _code
#         _rest = int(_rest / 10)

#     return _code

# EXTRA_FEATURE['Cuisines'] = [simple_preprocess(_text) for _text in FEATURE_MORE[0]]

# EXTRA_FEATURE['Stars'] = ['PoorStars' for _star in FEATURE_MORE[3]]
# star_std_range = (FEATURE_MORE[3].mean() - FEATURE_MORE[3].std(), FEATURE_MORE[3].mean() + FEATURE_MORE[3].std())
# # EXTRA_FEATURE['Stars'][FEATURE_MORE[3] < star_std_range[0]] = 'PoorStars'
# EXTRA_FEATURE['Stars'][(FEATURE_MORE[3] >= star_std_range[0])] = 'StandardStars'
# EXTRA_FEATURE['Stars'][FEATURE_MORE[3] > star_std_range[1]] = 'GoodStars'

# EXTRA_FEATURE['ReviewCount'] = ['NoReviews' for _star in FEATURE_MORE[2]]
# EXTRA_FEATURE['ReviewCount'][FEATURE_MORE[2] > 2] = "FewReviews"
# EXTRA_FEATURE['ReviewCount'][FEATURE_MORE[2] > 6] = "SomeReviews"
# EXTRA_FEATURE['ReviewCount'][FEATURE_MORE[2] > 13] = "ManyReviews"
# EXTRA_FEATURE['ReviewCount'][FEATURE_MORE[2] > 50] = "LotReviews"

# EXTRA_FEATURE['ZIPCode'] = [encode_zipcode(_code) for _code in FEATURE_MORE[1]]

# EXTRA_FEATURE['MergedText'] = [ [_cuisine for _cuisine in _record[1][0]]
#                                 + [_record[1][1]] + [_record[1][2]] + [_record[1][3]]
#                                for _record in EXTRA_FEATURE.iterrows()]

In [3]:
vect = CountVectorizer(analyzer='word')

# Learn the cuisine vocabulary from column 0 with the literal substring
# 'Restaurants' removed, then encode the unmodified column against it.
vect.fit(FEATURE_MORE[0].replace('Restaurants', '', regex=True))
# .toarray() gives a plain ndarray; .todense() would return the deprecated
# np.matrix type, which then propagates through every np.hstack below.
CUISINES_FEATURE = vect.transform(FEATURE_MORE[0]).toarray()

# Zip codes (column 1) are encoded as categorical tokens. `vect` is re-fitted
# here, which is safe because the cuisine matrix has already been computed.
_zipcode = [str(i) for i in FEATURE_MORE[1]]
vect.fit(_zipcode)
ZIPCODE_FEATURE = vect.transform(_zipcode).toarray()

# Columns 2 and 3 are used as-is, reshaped into single-column 2-D arrays so
# they can be stacked next to the encoded features.
REVIEW_COUNT_FEATURE = np.array(FEATURE_MORE[2]).reshape(-1, 1)
RATING_FEATURE = np.array(FEATURE_MORE[3]).reshape(-1, 1)

ALL_EXTRA_FEATURE = np.hstack((CUISINES_FEATURE, ZIPCODE_FEATURE, REVIEW_COUNT_FEATURE, RATING_FEATURE))
COUSINE_ZIPCODE_EXTRA_FEATURE = np.hstack((CUISINES_FEATURE, ZIPCODE_FEATURE))
# ALL_EXTRA_FEATURE = np.hstack((CUISINES_FEATURE, ZIPCODE_FEATURE, REVIEW_COUNT_FEATURE, RATING_FEATURE, ALL_SENTIMENT))
# COUSINE_ZIPCODE_EXTRA_FEATURE = np.hstack((CUISINES_FEATURE, ZIPCODE_FEATURE, ALL_SENTIMENT))

### Add Extra Features to the original text

In [4]:
# EXTRA_FEATURES = EXTRA_FEATURE['MergedText'][0:len(LABELS)]
# EXTRA_CONCAT_FEATURES = [" ".join(_text) for _text in EXTRA_FEATURES]
                  
# LABELED_EXTRA_TEXTS =  [EXTRA_FEATURES[i] + _text for i, _text in enumerate(LABELED_TEXTS)]
# LABELED_EXTRA_CONCAT_TEXTS = [" ".join(_text) for _text in LABELED_EXTRA_TEXTS]

# LABELED_EXTRA_STEMMED_TEXTS = [LABELED_EXTRA_TEXTS[i] + _text for i, _text in enumerate(LABELED_STEMMED_TEXTS)]
# LABELED_EXTRA_CONCAT_STEMMED_TEXTS = [" ".join(_text) for _text in LABELED_EXTRA_STEMMED_TEXTS]

# LABLED_RAW_TEXTS_1 = [EXTRA_CONCAT_FEATURES[i] + "|" + _text for i, _text in enumerate(ALL_RAW_TEXTS[0:len(LABELS)]) if LABELS[i] == 1]
# LABLED_RAW_TEXTS_0 = [EXTRA_CONCAT_FEATURES[i] + "|" + _text for i, _text in enumerate(ALL_RAW_TEXTS[0:len(LABELS)]) if LABELS[i] == 0]

# SMALL_RAW_TEXTS_1 = [_text for _text in LABLED_RAW_TEXTS_1 if len(_text) < 1000]
# SMALL_RAW_TEXTS_0 = [_text for _text in LABLED_RAW_TEXTS_0 if len(_text) < 1000]
# print(len(SMALL_RAW_TEXTS_1), len(SMALL_RAW_TEXTS_0))

## Build train/test dataset

In [5]:
# Create a dataframe from the labelled texts and their labels.
labeled_df = pd.DataFrame()
labeled_df['concat_stemmed_text'] = LABELED_CONCAT_STEMMED_TEXTS
labeled_df['stemmed_text'] = LABELED_STEMMED_TEXTS
# Alternative inputs that were tried for the two text columns:
# labeled_df['concat_stemmed_text'] = LABELED_CONCAT_TEXTS
# labeled_df['stemmed_text'] = LABELED_TEXTS
# labeled_df['concat_stemmed_text'] = LABELED_CONCAT_EXTRA_TEXTS
# labeled_df['stemmed_text'] = LABELED_EXTRA_TEXTS
labeled_df['label'] = LABELS

# Split the extra-feature matrices at the same boundary as the texts:
# the first len(LABELS) rows are labelled, the remainder is the submission set.
_n_labeled = len(LABELS)

labeled_extra_all = ALL_EXTRA_FEATURE[:_n_labeled]
submit_extra_all = ALL_EXTRA_FEATURE[_n_labeled:]

labeled_extra_cuisine_zipcode = COUSINE_ZIPCODE_EXTRA_FEATURE[:_n_labeled]
submit_extra_cuisine_zipcode = COUSINE_ZIPCODE_EXTRA_FEATURE[_n_labeled:]

# # split the dataset into training and validation datasets 
# train_concat_stemmed_text, test_concat_stemmed_text, train_label, test_label = model_selection.train_test_split(labeled_df['concat_stemmed_text'], 
#                                                                                   labeled_df['label'],
#                                                                                   test_size = 0.2,
#                                                                                   random_state = 10)
# train_stemmed_text = labeled_df['stemmed_text'][train_concat_stemmed_text.index]
# test_stemmed_text = labeled_df['stemmed_text'][test_concat_stemmed_text.index]

# # label encode the target variable 
# encoder = preprocessing.LabelEncoder()
# train_label = encoder.fit_transform(train_label)
# test_label = encoder.fit_transform(test_label)

In [6]:
# dictionary = corpora.Dictionary(processed_docs)
# print("Before prunn:%d"%(len(dictionary)))
# dictionary.filter_extremes(no_below = 2, no_above = 0.5)
# print("After prunn:%d"%(len(dictionary)))
# corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Feature Engineering

## Count Vectors as features

In [7]:
class MyCountVectorizer:
    """Minimal fit/transform wrapper around a word-level ``CountVectorizer``.

    The wrapped model lives in ``vect_model``; ``fit`` and ``transform`` simply
    delegate to it.
    """

    def __init__(self, max_df = 0.5):
        # `max_df` is forwarded unchanged to CountVectorizer.
        self.vect_model = CountVectorizer(max_df = max_df, analyzer='word')

    def fit(self, texts_concat):
        """Fit the wrapped model on an iterable of whitespace-joined documents."""
        model = self.vect_model
        model.fit(texts_concat)

    def transform(self, texts_concat):
        """Return the wrapped model's encoding of `texts_concat`."""
        features = self.vect_model.transform(texts_concat)
        return features

# Bag-of-words counts, fitted on the labelled documents only.
count_vect = MyCountVectorizer()
count_vect.fit(LABELED_CONCAT_STEMMED_TEXTS)

labeled_count = count_vect.transform(LABELED_CONCAT_STEMMED_TEXTS)
# Densify once and reuse: the original converted the same sparse matrix twice.
# .toarray() returns an ndarray rather than the deprecated np.matrix.
_count_dense = labeled_count.toarray()
labeled_count_extra_all = np.hstack([_count_dense, labeled_extra_all])
labeled_count_extra_cuisine_zipcode = np.hstack([_count_dense, labeled_extra_cuisine_zipcode])

submit_count = count_vect.transform(SUBMIT_CONCAT_STEMMED_TEXTS)
_count_dense = submit_count.toarray()
submit_count_extra_all = np.hstack([_count_dense, submit_extra_all])
submit_count_extra_cuisine_zipcode = np.hstack([_count_dense, submit_extra_cuisine_zipcode])

# Release the temporary dense copy; only the stacked matrices are needed later.
del _count_dense

## TF-IDF Vectors as features

In [27]:
%%time

# word level tf-idf
class MyTfidfVectorizer(MyCountVectorizer):
    """TF-IDF counterpart of ``MyCountVectorizer``.

    ``ngram_range`` is only passed to ``TfidfVectorizer`` when it is given, so
    ``None`` leaves the library default in effect. After ``fit`` the learned
    term-to-index mapping is exposed as ``vocabulary``.
    """

    def __init__(self, analyzer='word', ngram_range = None, max_features=5000):
        options = {'analyzer': analyzer, 'max_features': max_features}
        if ngram_range is not None:
            options['ngram_range'] = ngram_range
        self.vect_model = TfidfVectorizer(**options)

    def fit(self, texts_concat):
        """Fit the TF-IDF model and keep a reference to its vocabulary."""
        model = self.vect_model
        model.fit(texts_concat)
        self.vocabulary = model.vocabulary_

# NOTE: every sparse TF-IDF matrix below is densified exactly once with
# .toarray() (ndarray) instead of twice with .todense() (deprecated np.matrix).

# Small (200-feature) 1-3 gram TF-IDF block, later stacked onto the LDA and
# Doc2Vec features.
top_word_tfidf_vect = MyTfidfVectorizer(ngram_range=(1, 3), max_features = 200)
top_word_tfidf_vect.fit(LABELED_CONCAT_STEMMED_TEXTS)
LABELED_TOPWORD_FEATURE = top_word_tfidf_vect.transform(LABELED_CONCAT_STEMMED_TEXTS).toarray()
SUBMIT_TOPWORD_FEATURE = top_word_tfidf_vect.transform(SUBMIT_CONCAT_STEMMED_TEXTS).toarray()

# word level tf-idf
tfidf_vect = MyTfidfVectorizer(max_features = 10000)
tfidf_vect.fit(LABELED_CONCAT_STEMMED_TEXTS)
labeled_tfidf = tfidf_vect.transform(LABELED_CONCAT_STEMMED_TEXTS)
_tfidf_dense = labeled_tfidf.toarray()
labeled_tfidf_extra_all = np.hstack([_tfidf_dense, labeled_extra_all])
labeled_tfidf_extra_cuisine_zipcode = np.hstack([_tfidf_dense, labeled_extra_cuisine_zipcode])

submit_tfidf = tfidf_vect.transform(SUBMIT_CONCAT_STEMMED_TEXTS)
_tfidf_dense = submit_tfidf.toarray()
submit_tfidf_extra_all = np.hstack([_tfidf_dense, submit_extra_all])
submit_tfidf_extra_cuisine_zipcode = np.hstack([_tfidf_dense, submit_extra_cuisine_zipcode])

# ngram level tf-idf
tfidf_vect_ngram = MyTfidfVectorizer(ngram_range=(1, 3), max_features = 10000)
tfidf_vect_ngram.fit(LABELED_CONCAT_STEMMED_TEXTS)
labeled_tfidf_ngram = tfidf_vect_ngram.transform(LABELED_CONCAT_STEMMED_TEXTS)
_tfidf_dense = labeled_tfidf_ngram.toarray()
labeled_tfidf_ngram_extra_all = np.hstack([_tfidf_dense, labeled_extra_all])
labeled_tfidf_ngram_extra_cuisine_zipcode = np.hstack([_tfidf_dense, labeled_extra_cuisine_zipcode])

submit_tfidf_ngram = tfidf_vect_ngram.transform(SUBMIT_CONCAT_STEMMED_TEXTS)
_tfidf_dense = submit_tfidf_ngram.toarray()
submit_tfidf_ngram_extra_all = np.hstack([_tfidf_dense, submit_extra_all])
submit_tfidf_ngram_extra_cuisine_zipcode = np.hstack([_tfidf_dense, submit_extra_cuisine_zipcode])

# characters level tf-idf (labelled set only; no submission counterpart is built)
tfidf_vect_ngram_chars = MyTfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features = 10000)
tfidf_vect_ngram_chars.fit(LABELED_CONCAT_STEMMED_TEXTS)
labeled_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(LABELED_CONCAT_STEMMED_TEXTS)
_tfidf_dense = labeled_tfidf_ngram_chars.toarray()
labeled_tfidf_ngram_chars_extra_all = np.hstack([_tfidf_dense, labeled_extra_all])
labeled_tfidf_ngram_chars_extra_cuisine_zipcode = np.hstack([_tfidf_dense, labeled_extra_cuisine_zipcode])

# Release the last temporary dense copy.
del _tfidf_dense

Wall time: 1min


## Word Embeddings

### Build from review corpus

In [None]:
# %%time

# from gensim.models.fasttext import FastText

# class MyFastTextTfidfVectorizer(MyCountVectorizer):
#     def __init__(self, tfidf_vectorizer, size = 100):
#         self.embedding_size = size
#         self.tfidf_vectorizer = tfidf_vectorizer
#         self.fasttext_model = FastText(size = size, window = 5, min_count = 5)

#     def tfidf2embedding(self, value_vector):
#         _weighted_value = np.zeros(self.embedding_size)
#         for key in self.tfidf_vectorizer.vocabulary:
#             _index = self.tfidf_vectorizer.vocabulary[key]
#             if value_vector[_index] != 0:
#                 _weighted_value += self.fasttext_model[key] * value_vector[_index]

#         return _weighted_value
    
#     def fit(self, texts):
#         _texts_concat = [" ".join(_text) for _text in texts]
#         self.tfidf_vectorizer = MyTfidfVectorizer()
#         self.tfidf_vectorizer.fit(_texts_concat)
        
#         self.fasttext_model.build_vocab(sentences = texts)
#         self.fasttext_model.train(sentences = texts, 
#                                   total_examples = len(texts), 
#                                   epochs=10)
        
#     def transform(self, texts):
#         _texts_concat = [" ".join(_text) for _text in texts]
#         _tfidf_values = self.tfidf_vectorizer.transform(_texts_concat)
#         return np.asarray([self.tfidf2embedding(_value.toarray()[0]) for _value in _tfidf_values])

# fasttext_tfidf_vect = MyFastTextTfidfVectorizer(tfidf_vect)
# fasttext_tfidf_vect.fit(ALL_STEMMED_TEXTS)
# train_fasttext_embedding = fasttext_tfidf_vect.transform(train_stemmed_text)
# test_fasttext_embedding = fasttext_tfidf_vect.transform(test_stemmed_text)

### Prebuilt Embedding

In [None]:
# %%time
# from keras.preprocessing import text, sequence
# from keras import layers, models, optimizers

# # load the pre-trained word-embedding vectors 
# embeddings_index = {}
# for i, line in enumerate(open('data/model/wiki-news-300d-1M.vec')):
#     values = line.split()
#     embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# # create a tokenizer 
# token = text.Tokenizer()
# token.fit_on_texts(LABELED_CONCAT_STEMMED_TEXTS)
# word_index = token.word_index

# # convert text to sequence of tokens and pad them to ensure equal length vectors 
# text_train_seq = sequence.pad_sequences(token.texts_to_sequences(train_text), maxlen=70)
# text_test_seq = sequence.pad_sequences(token.texts_to_sequences(test_text), maxlen=70)

# # create token-embedding mapping
# embedding_matrix = np.zeros((len(word_index) + 1, 300))
# for word, i in word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

## Text / NLP based features

In [None]:
# %%time
# trainDF['char_count'] = trainDF['text'].apply(len)
# trainDF['word_count'] = trainDF['text'].apply(lambda x: len(x.split()))
# trainDF['word_density'] = trainDF['char_count'] / (trainDF['word_count']+1)
# trainDF['punctuation_count'] = trainDF['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
# trainDF['title_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
# trainDF['upper_case_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [None]:
# %%time

# import textblob

# pos_family = {
#     'noun' : ['NN','NNS','NNP','NNPS'],
#     'pron' : ['PRP','PRP$','WP','WP$'],
#     'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
#     'adj' :  ['JJ','JJR','JJS'],
#     'adv' : ['RB','RBR','RBS','WRB']
# }

# # function to check and get the part of speech tag count of a words in a given sentence
# def check_pos_tag(x, flag):
#     cnt = 0
#     try:
#         wiki = textblob.TextBlob(x)
#         for tup in wiki.tags:
#             ppo = list(tup)[1]
#             if ppo in pos_family[flag]:
#                 cnt += 1
#     except:
#         pass
#     return cnt

# trainDF['noun_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'noun'))
# trainDF['verb_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'verb'))
# trainDF['adj_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adj'))
# trainDF['adv_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adv'))
# trainDF['pron_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'pron'))

## Topic Models as features

In [None]:
from gensim.models import AuthorTopicModel
from gensim import corpora, models

class MyAuthorLDAVectorizer(MyCountVectorizer):
    """Author-topic-model vectorizer that uses the two label values as "authors".

    Documents labelled 0 are assigned to the author "passed" and documents
    labelled 1 to the author "failed".

    NOTE(review): ``transform`` indexes the fitted ``AuthorTopicModel`` with a
    bag-of-words corpus, whereas gensim documents ``AuthorTopicModel.__getitem__``
    as taking author names. This path looks unlikely to work as written;
    confirm before relying on this class.
    """

    def __init__(self, labels, TOPIC_COUNT = 100):
        self.topic_count = TOPIC_COUNT
        # author name -> positions of the documents attributed to that author
        passed_ids = [_pos for _pos, _label in enumerate(labels) if _label == 0]
        failed_ids = [_pos for _pos, _label in enumerate(labels) if _label == 1]
        self.author2doc = {"passed": passed_ids, "failed": failed_ids}

    def fit(self, texts):
        """Build the dictionary and train the author-topic model on `texts` (token lists)."""
        self.dictionary = corpora.Dictionary(texts)
        bow_corpus = [self.dictionary.doc2bow(_tokens) for _tokens in texts]

        self.vect_model = AuthorTopicModel(bow_corpus,
                                           author2doc = self.author2doc,
                                           num_topics = self.topic_count,
                                           id2word = self.dictionary,
                                           random_state = 100)

    def toarray(self, doc_topics):
        """Expand per-document ``(topic, weight)`` pairs into a dense (docs x topics) array."""
        dense = np.zeros((len(doc_topics), self.topic_count))

        for _row, _pairs in enumerate(doc_topics):
            for _topic_id, _weight in _pairs:
                dense[_row][_topic_id] = _weight

        return dense

    def transform(self, texts):
        """Convert `texts` to bag-of-words and return the model output as a dense array."""
        bow_corpus = [self.dictionary.doc2bow(_tokens) for _tokens in texts]
        doc_topics = self.vect_model[bow_corpus]

        return self.toarray(doc_topics)


In [None]:
# author2doc = {"passed":[i for i in range(len(LABELS)) if LABELS[i] == 0], 
#                            "failed":[i for i in range(len(LABELS)) if LABELS[i] == 1]}

# dictionary = corpora.Dictionary(LABELED_STEMMED_TEXTS)
# corpus = [dictionary.doc2bow(_doc) for _doc in LABELED_STEMMED_TEXTS]

# %time vect_model = AuthorTopicModel(corpus,author2doc = author2doc, num_topics = 2,id2word = dictionary,random_state = 100)

# vect_model.show_topic(1)
# vect_model['passed']
# vect_model['failed']

In [29]:
%%time

import os
from gensim import corpora, models

class MyLDAVectorizer(MyCountVectorizer):
    """Topic-distribution vectorizer backed by gensim's MALLET LDA wrapper.

    ``fit`` and ``transform`` take tokenized documents (lists of tokens), not
    whitespace-joined strings.

    NOTE(review): ``models.wrappers`` is, as far as I know, no longer shipped
    with gensim 4.x; this class presumably targets gensim 3.x. Confirm the
    installed version.
    """

    # MALLET launcher, relative to the current working directory.
    mallet_path = os.path.join("..", "mallet-2.0.8", "bin", "mallet")

    def __init__(self, TOPIC_COUNT = 100):
        self.topic_count = TOPIC_COUNT

    def fit(self, texts):
        """Build the dictionary from `texts` and train the MALLET LDA model.

        A ``TfidfModel`` is also fitted and stored in ``tfidf_model``, but the
        topic model itself is trained on the plain bag-of-words corpus. (An
        earlier, now-removed variant trained ``models.LdaModel`` on the TF-IDF
        corpus.)
        """
        self.dictionary = corpora.Dictionary(texts)
        _corpus = [self.dictionary.doc2bow(_doc) for _doc in texts]
        # Kept as an attribute for backward compatibility; not used for training.
        self.tfidf_model = models.TfidfModel(_corpus)

        self.vect_model = models.wrappers.LdaMallet(self.mallet_path,
                                                     corpus = _corpus,
                                                     num_topics = self.topic_count,
                                                     id2word = self.dictionary)

    def toarray(self, doc_topics):
        """Expand per-document ``(topic, weight)`` pairs into a dense (docs x topics) array.

        Topics missing from a document's list keep the initial weight 0.
        """
        _doc_vect = np.zeros((len(doc_topics), self.topic_count))

        for i, _doc in enumerate(doc_topics):
            for _topic, _weight in _doc:
                _doc_vect[i][_topic] = _weight

        return _doc_vect

    def transform(self, texts):
        """Return the dense topic distribution of every tokenized document in `texts`."""
        _corpus = [self.dictionary.doc2bow(_doc) for _doc in texts]

        return self.toarray(self.vect_model[_corpus])

# 200-topic LDA, fitted on the labelled documents only.
lda_vect = MyLDAVectorizer(TOPIC_COUNT = 200)
lda_vect.fit(LABELED_STEMMED_TEXTS)

# --- labelled set ---
labeled_lda = lda_vect.transform(LABELED_STEMMED_TEXTS)
labeled_lda_extra_all = np.hstack((labeled_lda, labeled_extra_all))
labeled_lda_extra_cuisine_zipcode = np.hstack((labeled_lda, labeled_extra_cuisine_zipcode))
# "TT" = topic vector stacked with the top-word TF-IDF block.
labeled_TT = np.hstack((labeled_lda, LABELED_TOPWORD_FEATURE))
labeled_TT_extra_all = np.hstack((labeled_lda, LABELED_TOPWORD_FEATURE, labeled_extra_all))
labeled_TT_extra_cuisine_zipcode = np.hstack(
    (labeled_lda, LABELED_TOPWORD_FEATURE, labeled_extra_cuisine_zipcode)
)

# --- submission set (same column layout as above) ---
submit_lda = lda_vect.transform(SUBMIT_STEMMED_TEXTS)
submit_lda_extra_all = np.hstack((submit_lda, submit_extra_all))
submit_lda_extra_cuisine_zipcode = np.hstack((submit_lda, submit_extra_cuisine_zipcode))
submit_TT = np.hstack((submit_lda, SUBMIT_TOPWORD_FEATURE))
submit_TT_extra_all = np.hstack((submit_lda, SUBMIT_TOPWORD_FEATURE, submit_extra_all))
submit_TT_extra_cuisine_zipcode = np.hstack(
    (submit_lda, SUBMIT_TOPWORD_FEATURE, submit_extra_cuisine_zipcode)
)

Wall time: 3min 26s


## Doc2Vec

In [30]:
%%time

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np

class MyDoc2Vectorizer(MyCountVectorizer):
    """Doc2Vec-based vectorizer operating on tokenized documents."""

    def __init__(self, size = 100):
        # Dimensionality of the document vectors.
        self.embedding_size = size

    def fit(self, texts):
        """Train a Doc2Vec model, tagging each document with its position in `texts`."""
        tagged_docs = [TaggedDocument(_tokens, [_tag]) for _tag, _tokens in enumerate(texts)]
        self.vect_model = Doc2Vec(
            tagged_docs,
            vector_size = self.embedding_size,
            window = 5,
            min_count = 3,
            epochs = 40,
            workers = 4,
        )

    def transform(self, texts):
        """Infer one vector per tokenized document and return them as a 2-D array."""
        inferred = [self.vect_model.infer_vector(_tokens) for _tokens in texts]
        return np.asarray(inferred)

# 200-dimensional Doc2Vec, trained on every document (labelled and submission).
doc2vec_vect = MyDoc2Vectorizer(size = 200)
doc2vec_vect.fit(ALL_STEMMED_TEXTS)

# --- labelled set ---
labeled_doc2vec = doc2vec_vect.transform(LABELED_STEMMED_TEXTS)
labeled_doc2vec_extra_all = np.hstack((labeled_doc2vec, labeled_extra_all))
labeled_doc2vec_extra_cuisine_zipcode = np.hstack((labeled_doc2vec, labeled_extra_cuisine_zipcode))
# "DT" = document vector stacked with the top-word TF-IDF block.
labeled_DT = np.hstack((labeled_doc2vec, LABELED_TOPWORD_FEATURE))
labeled_DT_extra_all = np.hstack((labeled_doc2vec, LABELED_TOPWORD_FEATURE, labeled_extra_all))
labeled_DT_extra_cuisine_zipcode = np.hstack(
    (labeled_doc2vec, LABELED_TOPWORD_FEATURE, labeled_extra_cuisine_zipcode)
)

# --- submission set (same column layout as above) ---
submit_doc2vec = doc2vec_vect.transform(SUBMIT_STEMMED_TEXTS)
submit_doc2vec_extra_all = np.hstack((submit_doc2vec, submit_extra_all))
submit_doc2vec_extra_cuisine_zipcode = np.hstack((submit_doc2vec, submit_extra_cuisine_zipcode))
submit_DT = np.hstack((submit_doc2vec, SUBMIT_TOPWORD_FEATURE))
submit_DT_extra_all = np.hstack((submit_doc2vec, SUBMIT_TOPWORD_FEATURE, submit_extra_all))
submit_DT_extra_cuisine_zipcode = np.hstack(
    (submit_doc2vec, SUBMIT_TOPWORD_FEATURE, submit_extra_cuisine_zipcode)
)

Wall time: 5min 29s


In [None]:
# #Test quality

# import collections

# ranks = []
# for doc_id in range(len(train_stemmed_text)):
#     inferred_vector = doc2vec_vect.vect_model.infer_vector(train_stemmed_text[train_stemmed_text.index[doc_id]])
#     sims = doc2vec_vect.vect_model.docvecs.most_similar([inferred_vector], topn=len(doc2vec_vect.vect_model.docvecs))
#     rank = [docid for docid, sim in sims].index(doc_id)
#     ranks.append(rank)

# collections.Counter(ranks)

# Models

In [50]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate

# def show_score(classifier_name, scores):
#     print("Accuracy:%0.2f Precission:%0.2f Recall:%0.2f F1:%0.2f"%scores, "-> [%s]"%(classifier_name))
    
# def train_model(classifier, train_feature, train_label, test_feature, test_label, is_neural_net=False):
# #     print(train_feature)
#     # fit the training dataset on the classifier
#     classifier.fit(train_feature, train_label)
    
#     # predict the labels on validation dataset
#     predictions = classifier.predict(test_feature)
    
#     if is_neural_net:
#         predictions = predictions.argmax(axis=-1)

#     print(metrics.confusion_matrix(test_label, predictions))
#     return (metrics.accuracy_score(test_label, predictions),
#             metrics.precision_score(test_label, predictions),
#             metrics.recall_score(test_label, predictions),
#             metrics.f1_score(test_label, predictions))

# def run_model(classfier_configs):
#     for _name in classfier_configs:
# #         print(classfier_configs[_name].train)
#         scores = train_model(classfier_configs[_name][0], classfier_configs[_name][1], train_label, classfier_configs[_name][2], test_label)
#         show_score(_name, scores)

ALIAS = "xj9"
SUBMISSION_FOLDER = "./data/submit"
def make_submit_file(file_name, predictions):
    """Write a submission file into ``SUBMISSION_FOLDER``.

    The file contains ``ALIAS`` on the first line, followed by one prediction
    per line (each converted with ``str``).

    Parameters
    ----------
    file_name : str
        Name of the file to create inside ``SUBMISSION_FOLDER``.
    predictions : iterable
        Predicted values, written in iteration order.
    """
    # Local import so the function does not depend on which notebook cells ran.
    import os

    # Create the output folder on demand: without this, a missing folder only
    # surfaces as an error after the (slow) model evaluation has finished.
    os.makedirs(SUBMISSION_FOLDER, exist_ok=True)

    with open(os.path.join(SUBMISSION_FOLDER, file_name), 'w') as f:
        f.write(ALIAS + "\n")
        f.writelines(str(_value) + "\n" for _value in predictions)
    
    
def eval_model(classfier_configs, model, model_name, param_distributions = None, do_grid_search = False):
    """Score `model` on every feature set and write one submission file per set.

    For each entry of `classfier_configs` the model is evaluated with 5-fold
    cross-validation on the F1 score, fitted, and used to predict the
    submission features; the predictions are written with ``make_submit_file``
    to ``"<model_name>_<file_tag>_submit.txt"``.

    Parameters
    ----------
    classfier_configs : dict
        Maps a display name to a tuple
        ``(labeled_features, labels, file_tag, submit_features)``.
    model : estimator
        A scikit-learn style estimator. Without a parameter search this very
        instance is fitted in place, once per feature set.
    model_name : str
        Prefix of the submission file names.
    param_distributions : dict or None
        Hyper-parameter space. ``None`` means plain cross-validation of
        `model` with no search.
    do_grid_search : bool
        With a parameter space: ``True`` uses ``GridSearchCV``, ``False`` uses
        ``RandomizedSearchCV``.

    Returns
    -------
    dict
        ``{'test_scores': [...], 'best_parameters': [...]}``, one item per
        feature set in iteration order. Both lists are empty when
        `classfier_configs` is empty.
    """
    test_scores = []
    best_params = []
    model_names = []

    for _name, _config in classfier_configs.items():
        _features, _labels, _tag, _submit_features = _config[:4]
        model_names.append(_name)

        if param_distributions is not None:
            if do_grid_search:
                best_model = GridSearchCV(model,
                                          scoring = "f1",
                                          param_grid = param_distributions,
                                          cv = 5)
            else:
                best_model = RandomizedSearchCV(model,
                                                scoring = "f1",
                                                param_distributions = param_distributions,
                                                cv = 5)

            best_model.fit(_features, _labels)
            _score = best_model.best_score_
            _params = best_model.best_params_

        else:
            best_model = model
            cv_results = cross_validate(best_model,
                                        _features,
                                        _labels,
                                        scoring = 'f1',
                                        cv = 5)

            # Cross-validation does not leave `best_model` fitted, so fit it on
            # the full labelled set before predicting the submission set.
            best_model.fit(_features, _labels)
            _score = np.mean(cv_results['test_score'])
            _params = None

        print(_score, '\t', _params, '\t', _name)
        test_scores.append(_score)
        best_params.append(_params)

        make_submit_file("%s_%s_submit.txt"%(model_name, _tag), best_model.predict(_submit_features))

    # np.argmax raises on an empty sequence, so only report a winner when at
    # least one feature set was evaluated.
    if test_scores:
        best_of_best = np.argmax(test_scores)
        print('Best Model:[', model_names[best_of_best], '] Test Score:', "%0.3f"%(test_scores[best_of_best]))

    return {'test_scores': test_scores, 'best_parameters': best_params}

In [51]:
# Feature sets shared by the classifiers below.
# Every value is (labeled_features, labels, file_tag, submit_features);
# eval_model reads the tuple by position, so the order matters.
model_config = {
    "Extra All": (labeled_extra_all, LABELS, "ea", submit_extra_all),
    "Extra Cuisine Zipcode": (labeled_extra_cuisine_zipcode, LABELS, "ecz", submit_extra_cuisine_zipcode),
    "Count": (labeled_count, LABELS, "c", submit_count),
    "Count + Extra All": (labeled_count_extra_all, LABELS, "c_ea", submit_count_extra_all),
    "Count + Extra Cuisine+Zipcode": (labeled_count_extra_cuisine_zipcode, LABELS, "c_ecz",
                                      submit_count_extra_cuisine_zipcode),
    "TFIDF": (labeled_tfidf, LABELS, "tfidf", submit_tfidf),
    "TFIDF + Extra All": (labeled_tfidf_extra_all, LABELS, "tfidf_ea", submit_tfidf_extra_all),
    "TFIDF + Extra Cuisine+Zipcode": (labeled_tfidf_extra_cuisine_zipcode, LABELS, "tfidf_ecz",
                                      submit_tfidf_extra_cuisine_zipcode),
    "TFIDF NGram": (labeled_tfidf_ngram, LABELS, "tfidf_ng", submit_tfidf_ngram),
    "TFIDF NGram + Extra All": (labeled_tfidf_ngram_extra_all, LABELS, "tfidf_ng_ea", submit_tfidf_ngram_extra_all),
    "TFIDF NGram + Extra Cuisine+ZipCode": (labeled_tfidf_ngram_extra_cuisine_zipcode, LABELS, "tfidf_ng_ecz",
                                            submit_tfidf_ngram_extra_cuisine_zipcode),
    # Fixed: the file tag and the submission matrix were swapped in this entry.
    "LDA": (labeled_lda, LABELS, "lda", submit_lda),
    "LDA + Extra All": (labeled_lda_extra_all, LABELS, "lda_ea", submit_lda_extra_all),
    "LDA + Extra Cuisine+Zipcode": (labeled_lda_extra_cuisine_zipcode, LABELS, "lda_ecz",
                                    submit_lda_extra_cuisine_zipcode),
    "Doc2Vec": (labeled_doc2vec, LABELS, "d2v", submit_doc2vec),
    "Doc2Vec Extra All": (labeled_doc2vec_extra_all, LABELS, "d2v_ea", submit_doc2vec_extra_all),
    "Doc2Vec Extra Cuisine+Zipcode": (labeled_doc2vec_extra_cuisine_zipcode, LABELS, "d2v_ecz",
                                      submit_doc2vec_extra_cuisine_zipcode),
    "TT": (labeled_TT, LABELS, "tt", submit_TT),
    "TT + Extra All": (labeled_TT_extra_all, LABELS, "tt_ea", submit_TT_extra_all),
    "TT + Extra Cuisine+Zipcode": (labeled_TT_extra_cuisine_zipcode, LABELS, "tt_ecz",
                                   submit_TT_extra_cuisine_zipcode),
    "DT": (labeled_DT, LABELS, "dt", submit_DT),
    "DT Extra All": (labeled_DT_extra_all, LABELS, "dt_ea", submit_DT_extra_all),
    "DT Extra Cuisine+Zipcode": (labeled_DT_extra_cuisine_zipcode, LABELS, "dt_ecz",
                                 submit_DT_extra_cuisine_zipcode),
}

## Naive Bayes 

In [52]:
from sklearn.naive_bayes import MultinomialNB

# Feature sets evaluated with Naive Bayes. Same tuple layout as model_config:
# (labeled_features, labels, file_tag, submit_features). The Doc2Vec-based
# sets are not listed here.
nb_config = {
    "Extra All": (labeled_extra_all, LABELS, "ea", submit_extra_all),
    "Extra Cuisine Zipcode": (labeled_extra_cuisine_zipcode, LABELS, "ecz", submit_extra_cuisine_zipcode),
    "Count": (labeled_count, LABELS, "c", submit_count),
    "Count + Extra All": (labeled_count_extra_all, LABELS, "c_ea", submit_count_extra_all),
    "Count + Extra Cuisine+Zipcode": (
        labeled_count_extra_cuisine_zipcode, LABELS, "c_ecz", submit_count_extra_cuisine_zipcode,
    ),
    "TFIDF": (labeled_tfidf, LABELS, "tfidf", submit_tfidf),
    "TFIDF + Extra All": (labeled_tfidf_extra_all, LABELS, "tfidf_ea", submit_tfidf_extra_all),
    "TFIDF + Extra Cuisine+Zipcode": (
        labeled_tfidf_extra_cuisine_zipcode, LABELS, "tfidf_ecz", submit_tfidf_extra_cuisine_zipcode,
    ),
    "TFIDF NGram": (labeled_tfidf_ngram, LABELS, "tfidf_ng", submit_tfidf_ngram),
    "TFIDF NGram + Extra All": (
        labeled_tfidf_ngram_extra_all, LABELS, "tfidf_ng_ea", submit_tfidf_ngram_extra_all,
    ),
    "TFIDF NGram + Extra Cuisine+ZipCode": (
        labeled_tfidf_ngram_extra_cuisine_zipcode, LABELS, "tfidf_ng_ecz",
        submit_tfidf_ngram_extra_cuisine_zipcode,
    ),
    "LDA": (labeled_lda, LABELS, "lda", submit_lda),
    "LDA + Extra All": (labeled_lda_extra_all, LABELS, "lda_ea", submit_lda_extra_all),
    "LDA + Extra Cuisine+Zipcode": (
        labeled_lda_extra_cuisine_zipcode, LABELS, "lda_ecz", submit_lda_extra_cuisine_zipcode,
    ),
    "TT": (labeled_TT, LABELS, "tt", submit_TT),
    "TT + Extra All": (labeled_TT_extra_all, LABELS, "tt_ea", submit_TT_extra_all),
    "TT + Extra Cuisine+Zipcode": (
        labeled_TT_extra_cuisine_zipcode, LABELS, "tt_ecz", submit_TT_extra_cuisine_zipcode,
    ),
}

# Plain 5-fold cross-validation of a default MultinomialNB (no parameter search).
nb_eval = eval_model(nb_config, MultinomialNB(), "nb")

0.6059380201315685 	 None 	 Extra All
0.6272072595274806 	 None 	 Extra Cuisine Zipcode
0.6622324661669784 	 None 	 Count
0.6618479894648867 	 None 	 Count + Extra All
0.6560218848796275 	 None 	 Count + Extra Cuisine+Zipcode
0.6787404920510662 	 None 	 TFIDF
0.7017638638450687 	 None 	 TFIDF + Extra All
0.6737271806539652 	 None 	 TFIDF + Extra Cuisine+Zipcode
0.6490372665443611 	 None 	 TFIDF NGram
0.6911237307980287 	 None 	 TFIDF NGram + Extra All
0.6657373148454245 	 None 	 TFIDF NGram + Extra Cuisine+ZipCode
0.6220672611064836 	 None 	 LDA
0.6162458763299167 	 None 	 LDA + Extra All
0.6401722644784845 	 None 	 LDA + Extra Cuisine+Zipcode
0.6167809836400291 	 None 	 TT
0.6139526462930719 	 None 	 TT + Extra All
0.6459488103761558 	 None 	 TT + Extra Cuisine+Zipcode
Best Model:[ TFIDF + Extra All ] Test Score: 0.702


##  Linear Classifier

In [None]:
%%time
from sklearn.linear_model import LogisticRegression

# linear_config = {
#     "Linear Count": (LogisticRegression(C= 0.1, penalty='l1', tol=0.1, solver='saga'), train_count, test_count)
#     , "Linear TFIDF": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), train_tfidf, test_tfidf)
#     , "Linear TFIDF NGram": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), train_tfidf_ngram, test_tfidf_ngram)
#     , "Linear TFIDF NGram Chars": (LogisticRegression(C= 0.1, penalty='l2', tol=0.1, solver='saga'), train_tfidf_ngram_chars, test_tfidf_ngram_chars)
# #     , "Linear FastText Embedding": (LogisticRegression(solver='lbfgs', max_iter=int(1e6)), train_fasttext_embedding, test_fasttext_embedding)
#     , "Linear LDA": (LogisticRegression(solver='lbfgs'), train_lda, test_lda)
#     , "Linear Doc2Vec": (LogisticRegression(solver='lbfgs'), train_doc2vec, test_doc2vec)
# }

# run_model(linear_config)

# Candidate hyper-parameters for logistic regression: both penalties and four
# values of C, always with the 'saga' solver.
linear_parameters = dict(
    penalty=('l1', 'l2'),
    C=[10, 1, 0.1, 0.01],
    solver=['saga'],
)

# Evaluate LogisticRegression on every entry of model_config under the
# "linear" tag, searching over linear_parameters (do_grid_search=True
# presumably means an exhaustive grid -- confirm in eval_model).
# NOTE(review): the recorded output of this cell contains warning fragments;
# if they include convergence warnings, a larger max_iter may be needed.
linear_eval = eval_model(
    model_config,
    LogisticRegression(),
    "linear",
    param_distributions=linear_parameters,
    do_grid_search=True,
)





0.5658076185680123 	 {'C': 0.01, 'penalty': 'l2', 'solver': 'saga'} 	 Extra All


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.642433672505511 	 {'C': 1, 'penalty': 'l2', 'solver': 'saga'} 	 Extra Cuisine Zipcode




0.6504902335245407 	 {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'} 	 Count


## SVM

In [None]:
%%time

from sklearn.svm import SVC

# svm_config = {
#     "SVM Count": (SVC(C = 1, gamma = 0.1, kernel='rbf'), train_count, test_count),
#     "SVM TFIDF": (SVC(C = 10, gamma = 0.01, kernel='rbf'), train_tfidf, test_tfidf),
#     "SVM TFIDF NGram": (SVC(C = 1, gamma = 0.01, kernel='linear'), train_tfidf_ngram, test_tfidf_ngram),
#     "SVM TFIDF NGram Chars": (SVC(C = 10, gamma = 0.01, kernel='rbf'), train_tfidf_ngram_chars, test_tfidf_ngram_chars),
# #     "SVM FastText Embedding": (SVC(C = 1, gamma = 0.1, kernel='rbf'), train_fasttext_embedding, test_fasttext_embedding),
#     "SVM LDA": (SVC(C = 1, gamma = 0.1, kernel='rbf'), train_lda, test_lda),
#     "SVM Doc2Vec": (SVC(C = 1, gamma = 0.1, kernel='rbf'), train_doc2vec, test_doc2vec),
# }
    
# run_model(svm_config)

# Candidate hyper-parameters for the SVM: two kernels, four values of C and
# five log-spaced gamma values from 1e-2 to 1e2.
# NOTE(review): gamma has no effect for kernel='linear', so an exhaustive grid
# repeats every linear fit once per gamma value -- consider splitting the grid
# per kernel if eval_model accepts a list of grids.
svm_parameters = dict(
    kernel=('linear', 'rbf'),
    C=[0.1, 1, 10, 100],
    gamma=np.logspace(-2, 2, 5),
)

# Evaluate SVC on every entry of model_config under the "svm" tag, searching
# over svm_parameters.
svm_eval = eval_model(
    model_config,
    SVC(),
    "svm",
    param_distributions=svm_parameters,
    do_grid_search=True,
)

## Random Forest

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier

# Candidate hyper-parameters for the random forest.
rf_parameters = dict(
    n_estimators=[100, 200, 500],
    max_features=[None, 0.25, 0.5, 0.75],
    max_depth=[None, 5, 10],
    min_samples_leaf=[0.0005, 0.01, 0.05, 0.1],
    min_samples_split=[2, 5, 10],
)

# Alternative kept from the original cell (no parameter search):
# rf_eval = eval_model(model_config, RandomForestClassifier())

# Evaluate RandomForestClassifier on every entry of model_config under the
# "rf" tag. do_grid_search=False presumably switches eval_model from an
# exhaustive grid to sampling from rf_parameters -- confirm in eval_model.
rf_eval = eval_model(
    model_config,
    RandomForestClassifier(),
    "rf",
    param_distributions=rf_parameters,
    do_grid_search=False,
)

## Boosting

In [None]:
%%time

from xgboost import XGBClassifier

# xgboost_config = {
#     "XGBoost Count": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_count, test_count)
#     , "XGBoost TFIDF": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_tfidf, test_tfidf)
#     , "XGBoost TFIDF NGram": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_tfidf_ngram.tocsc(), test_tfidf_ngram.tocsc())
#     , "XGBoost TFIDF NGram Chars": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_tfidf_ngram_chars, test_tfidf_ngram_chars)
#     , "XGBoost LDA": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_lda, test_lda)
#     , "XGBoost Doc2Vec": (XGBClassifier(colsample_bytree = 0.8, gamma= 2, max_depth = 4, min_child_weight = 1, subsample =1.0), train_doc2vec, test_doc2vec)
# }


# Candidate hyper-parameters for XGBoost.
xgb_parameters = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

# Alternative kept from the original cell (no parameter search):
# xgb_eval = eval_model(model_config, XGBClassifier())

# Evaluate XGBClassifier on every entry of model_config under the "xgb" tag.
# do_grid_search=False presumably switches eval_model from an exhaustive grid
# to sampling from xgb_parameters -- confirm in eval_model.
xgb_eval = eval_model(
    model_config,
    XGBClassifier(),
    "xgb",
    param_distributions=xgb_parameters,
    do_grid_search=False,
)

In [None]:
def show_model_score(scores, config, show_names=False):
    """Print one score per entry of ``config``, formatted to three decimals.

    Args:
        scores: Indexable collection of numbers; ``scores[i]`` is printed for
            the i-th entry produced by iterating ``config``.
        config: Iterable whose entries name the evaluated data sets
            (iterating a dict yields its keys).
        show_names: When true, print the labelled form
            ``Test Score: <score> Data:[ <name> ]`` instead of the bare
            score. Defaults to False, which prints only the bare score.

    Raises:
        IndexError: If a sequence ``scores`` has fewer entries than
            ``config``.
    """
    for i, _name in enumerate(config):
        formatted = "%0.3f" % (scores[i])
        if show_names:
            # Labelled variant, previously available only by un-commenting code.
            print('Test Score:', formatted, 'Data:[', _name, ']')
        else:
            print(formatted)

In [None]:
# Print the XGBoost test scores, one line per entry of model_config.
show_model_score(xgb_eval['test_scores'], model_config)

In [None]:
# Print the random-forest test scores, one line per entry of model_config.
show_model_score(rf_eval['test_scores'], model_config)

In [None]:
# Print the SVM test scores, one line per entry of model_config.
show_model_score(svm_eval['test_scores'], model_config)

In [None]:
# Print the logistic-regression test scores, one line per entry of model_config.
show_model_score(linear_eval['test_scores'], model_config)

In [None]:
# Print the Naive Bayes test scores, one line per entry of nb_config (the
# Naive Bayes run was evaluated on nb_config rather than model_config).
show_model_score(nb_eval['test_scores'], nb_config)

In [None]:
# Build features for the documents that follow the first len(LABELS) entries
# (presumably the unlabeled submission set -- confirm against the data loader).
test_tfidf = tfidf_vect.transform(" ".join(_text) for _text in ALL_STEMMED_TEXTS[len(LABELS):])
test_extra_all = ALL_EXTRA_FEATURE[len(LABELS):]
test_extra_cuisine_zipcode = COUSINE_ZIPCODE_EXTRA_FEATURE[len(LABELS):]

# Densify once, as a plain ndarray: .todense() returns np.matrix, which recent
# scikit-learn versions refuse as estimator input, and calling it twice builds
# the same dense copy twice.
_test_tfidf_dense = test_tfidf.toarray()
test_tfidf_extra_all = np.hstack([_test_tfidf_dense, test_extra_all])
test_tfidf_extra_cuisine_zipcode = np.hstack([_test_tfidf_dense, test_extra_cuisine_zipcode])
del _test_tfidf_dense  # free the intermediate dense copy

In [None]:
# Display the entry at index 4 of the SVM run's 'best_parameters'
# (presumably the best hyper-parameters for the fifth entry of model_config
# -- confirm against eval_model).
svm_eval['best_parameters'][4]

In [None]:
# Refit Multinomial Naive Bayes on the labeled "TFIDF + Extra All" features
# and predict the held-out rows.
nb_best_model = MultinomialNB().fit(labeled_tfidf_extra_all, LABELS)
nb_predictions = nb_best_model.predict(test_tfidf_extra_all)

# Write the predictions, one per line, after the "xj9" header line
# (presumably the submitter id expected by the grader -- confirm).
with open('nb_results.txt', 'w') as f:
    print("xj9", file=f)
    for _value in nb_predictions:
        print(_value, file=f)

In [None]:
# Refit an RBF SVM on the labeled "TFIDF + Extra Cuisine+Zipcode" features and
# predict the held-out rows. The hyper-parameters are hard-coded (presumably
# copied from svm_eval['best_parameters'] -- confirm).
svm_best_model = SVC(C=0.1, gamma=10, kernel='rbf').fit(labeled_tfidf_extra_cuisine_zipcode, LABELS)
svm_predictions = svm_best_model.predict(test_tfidf_extra_cuisine_zipcode)

# Write the predictions, one per line, after the "xj9" header line
# (presumably the submitter id expected by the grader -- confirm).
with open('svm_results.txt', 'w') as f:
    print("xj9", file=f)
    for _value in svm_predictions:
        print(_value, file=f)