Packages:
* [NLTK](http://www.nltk.org/howto/classify.html)
* [SpaCy](https://spacy.io/)
* [AllenNLP](https://allennlp.org/tutorials)

Articles:
* [A Comprehensive Guide to Understand and Implement Text Classification in Python](https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/)
* [Machine Learning, NLP: Text Classification using scikit-learn, python and NLTK](https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a)
* [State-of-the-Art Text Classification using BERT model: “Predict the Happiness” Challenge](https://appliedmachinelearning.blog/2019/03/04/state-of-the-art-text-classification-using-bert-model-predict-the-happiness-hackerearth-challenge/)

In [43]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from tqdm import tqdm
import numpy as np
import pandas as pd
import string
import time


stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with a WordNet
    lemmatizer, and the resulting lemma is then stemmed with the
    module-level English Snowball ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize ``text`` and return its stemmed, stop-word-free tokens.

    Tokens are produced by ``simple_preprocess`` with ``min_len = 4``.
    Every token that appears in ``STOPWORDS`` is dropped; the remaining
    tokens are passed through ``lemmatize_stemming`` and returned as a list
    in their original order.
    """
    return [
        lemmatize_stemming(word)
        for word in simple_preprocess(text, min_len = 4)
        if word not in STOPWORDS
    ]

# Labels: keep only the lines that start with a digit and parse them as ints;
# every other line is skipped.
with open("./data/raw/Hygiene/hygiene.dat.labels") as labels_file:
    LABELS = [int(row) for row in labels_file.readlines() if row[0].isdigit()]

# Documents: read all of them, then keep the first len(LABELS) as the labeled subset.
# NOTE(review): this pairs label k with document k, which presumably relies on the
# labeled documents coming first in hygiene.dat -- confirm against the data set.
ALL_TEXTS = []
LABELED_TEXTS = []
with open("./data/raw/Hygiene/hygiene.dat") as texts_file:
    ALL_TEXTS = texts_file.readlines()
    LABELED_TEXTS.extend(ALL_TEXTS[idx] for idx in range(len(LABELS)))

# Token lists per document, and the same documents re-joined into
# single space-separated strings.
LABELED_STEMMED_TEXTS = [preprocess(document) for document in tqdm(LABELED_TEXTS)]
LABELED_RAW_STEMMED_TEXTS = list(map(" ".join, LABELED_STEMMED_TEXTS))

# Extra per-document columns; the file is read without a header row.
FEATURE_MORE = pd.read_csv(
    "./data/raw/Hygiene/hygiene.dat.additional",
    header=None,
)

100%|██████████| 546/546 [00:09<00:00, 56.98it/s]


In [44]:
# create a dataframe using texts and labels
trainDF = pd.DataFrame({'text': LABELED_RAW_STEMMED_TEXTS, 'label': LABELS})

# split the dataset into training and validation datasets
# A fixed seed makes the split (and everything derived from it) reproducible
# across kernel restarts.
SPLIT_SEED = 42
train_text, test_text, train_label, test_label = model_selection.train_test_split(
    trainDF['text'],
    trainDF['label'],
    test_size = 0.2,
    random_state = SPLIT_SEED,
)

# label encode the target variable
# The encoder is fitted on the training labels only and that same mapping is
# re-used for the validation labels. Re-fitting on the validation labels could
# assign different integer codes if the two splits do not contain the same set
# of classes.
encoder = preprocessing.LabelEncoder()
train_label = encoder.fit_transform(train_label)
test_label = encoder.transform(test_label)

In [45]:
# dictionary = corpora.Dictionary(processed_docs)
# print("Before prunn:%d"%(len(dictionary)))
# dictionary.filter_extremes(no_below = 2, no_above = 0.5)
# print("After prunn:%d"%(len(dictionary)))
# corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Count Vectors as features

In [46]:
# create a count vectorizer object
# Tokens are runs of one or more word characters; terms that occur in more
# than half of the fitted documents are dropped (max_df = 0.5).
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_df = 0.5)

# Learn the vocabulary (and the max_df filter) from the training split only,
# so that nothing about the validation documents leaks into the features,
# then encode both splits with that fitted vectorizer.
train_count = count_vect.fit_transform(train_text)
test_count = count_vect.transform(test_text)

# TF-IDF Vectors as features

In [47]:
%%time
# Each vectorizer below is fitted on the training split only, so that the
# selected vocabulary (max_features) and the IDF weights are not influenced by
# the validation documents; the validation split is only transformed.

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=5000)
train_tfidf = tfidf_vect.fit_transform(train_text)
test_tfidf = tfidf_vect.transform(test_text)

# ngram level tf-idf (word 2-grams and 3-grams)
tfidf_vect_ngram = TfidfVectorizer(analyzer='word',ngram_range=(2,3), max_features=5000)
train_tfidf_ngram = tfidf_vect_ngram.fit_transform(train_text)
test_tfidf_ngram = tfidf_vect_ngram.transform(test_text)

# characters level tf-idf (character 2-grams and 3-grams)
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)
train_tfidf_ngram_chars = tfidf_vect_ngram_chars.fit_transform(train_text)
test_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(test_text)

CPU times: user 9 s, sys: 170 ms, total: 9.17 s
Wall time: 8.73 s


# Word Embeddings

In [49]:
# Peek at one preprocessed document: its stemmed tokens re-joined with single spaces.
LABELED_RAW_STEMMED_TEXTS[3]

'access googl street view like medina yarrow point weird come pictur exterior yelper stop take snapshot good restaur tire mazatlan puerto vallarta azteca squar footag smaller guess peopl assum tabl right away realiti probabl wait come complimentari chip salsa want guacamol mind complimentari burrita littl spici kick couldn huge like alki think wait till summertim decid place neighborhood better time like alki locat know love servic outstand navajo torta bland good textur butternut squash enchilada mmmmmm vegetarian option best good flavor sauc time wrong sauc didnt tast delici like char asparagus zing interest tasti light good fruiti ice sweet like fruiti ice tea burn crave great madison park cactus favorit cactus locat locat charm quaint madison park area nice vibe summer night pretti perfect restaur pack cactus usual spot tabl area open window fantast breez nice view madison park happen quick agre good get love start drink friend order margarita baja jalapeno fruiti cocktail guacamol

In [None]:
%%time
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

# load the pre-trained word-embedding vectors
EMBEDDING_DIM = 300        # width of each embedding row built below
MAX_SEQUENCE_LENGTH = 70   # every document is padded / truncated to this many token ids

embeddings_index = {}
# The context manager guarantees the file is closed once it has been read;
# the encoding is given explicitly so the result does not depend on the
# platform's default locale.
with open('data/model/wiki-news-300d-1M.vec', encoding='utf-8') as vec_file:
    for line in vec_file:
        values = line.split()
        # Keep only well-formed "<word> <EMBEDDING_DIM floats>" rows. This skips
        # the "<count> <dim>" header that fastText .vec files start with, as well
        # as any row whose token was split apart by embedded whitespace.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# create a tokenizer
token = text.Tokenizer()
token.fit_on_texts(trainDF['text'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors
text_train_seq = sequence.pad_sequences(token.texts_to_sequences(train_text), maxlen=MAX_SEQUENCE_LENGTH)
text_test_seq = sequence.pad_sequences(token.texts_to_sequences(test_text), maxlen=MAX_SEQUENCE_LENGTH)

# create token-embedding mapping
# Rows are addressed directly by the tokenizer's word ids (hence the +1);
# words without a pre-trained vector keep an all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [50]:
%%time
from gensim.models.fasttext import FastText

# Train FastText embeddings from scratch on the tokenized, stemmed labeled documents.
fasttext_params = dict(size = 100, window = 5, min_count = 5)
model = FastText(**fasttext_params)

# The vocabulary is built first, then the model is trained for 10 epochs
# over the same token lists.
model.build_vocab(sentences = LABELED_STEMMED_TEXTS)
model.train(
    sentences = LABELED_STEMMED_TEXTS,
    total_examples = len(LABELED_STEMMED_TEXTS),
    epochs=10,
)

CPU times: user 34.6 s, sys: 763 ms, total: 35.3 s
Wall time: 16.2 s


# Text / NLP based features

In [None]:
%%time
# Length statistics of the preprocessed text stored in trainDF['text'].
trainDF['char_count'] = trainDF['text'].apply(len)
trainDF['word_count'] = trainDF['text'].apply(lambda x: len(x.split()))
# +1 in the denominator avoids a division by zero for empty documents.
trainDF['word_density'] = trainDF['char_count'] / (trainDF['word_count']+1)

# Punctuation and capitalisation only survive in the raw documents:
# trainDF['text'] was produced by gensim's simple_preprocess, which lowercases
# and keeps alphabetic tokens only, so counting there would always give 0.
# LABELED_TEXTS holds the raw documents in the same order as the rows of
# trainDF; building the Series on trainDF.index fails loudly if the lengths
# ever disagree.
raw_text = pd.Series(LABELED_TEXTS, index=trainDF.index)
trainDF['punctuation_count'] = raw_text.apply(lambda doc: sum(ch in string.punctuation for ch in doc))
trainDF['title_word_count'] = raw_text.apply(lambda doc: sum(wrd.istitle() for wrd in doc.split()))
trainDF['upper_case_word_count'] = raw_text.apply(lambda doc: sum(wrd.isupper() for wrd in doc.split()))

In [None]:
%%time

import textblob

# Penn Treebank style tag names grouped into the coarse families counted below.
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part-of-speech tag count of the words in a given sentence
def check_pos_tag(x, flag):
    """Count the words of ``x`` whose POS tag belongs to the ``flag`` family.

    Parameters
    ----------
    x : str
        Text to tag with ``textblob.TextBlob``.
    flag : str
        Key of ``pos_family`` ('noun', 'pron', 'verb', 'adj' or 'adv').

    Returns
    -------
    int
        Number of tagged words whose tag is listed under ``pos_family[flag]``.
        If tagging fails, a warning is emitted and 0 is returned so that one
        bad document does not abort a whole ``apply``.

    Raises
    ------
    KeyError
        If ``flag`` is not a key of ``pos_family``.
    """
    import warnings

    # Resolve the family outside the try block: a mistyped flag is a
    # programming error and must not be silently turned into a zero count.
    wanted_tags = pos_family[flag]
    try:
        tagged_words = textblob.TextBlob(x).tags
    except Exception as err:
        # Best effort, but no longer silent, and KeyboardInterrupt / SystemExit
        # are not swallowed.
        warnings.warn("POS tagging failed (%s: %s); counting 0" % (type(err).__name__, err))
        return 0
    return sum(1 for tagged in tagged_words if list(tagged)[1] in wanted_tags)

# Add one "<family>_count" column per POS family; columns are created in
# noun, verb, adj, adv, pron order. The family name is forwarded to
# check_pos_tag as its second positional argument.
for _family in ('noun', 'verb', 'adj', 'adv', 'pron'):
    trainDF[_family + '_count'] = trainDF['text'].apply(check_pos_tag, args=(_family,))

# Topic Models as features

In [None]:
# train a LDA Model on the count vectors of the training split
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
# `train_count` is the matrix produced by count_vect for the training texts.
X_topics = lda_model.fit_transform(train_count)
topic_word = lda_model.components_
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
if hasattr(count_vect, 'get_feature_names_out'):
    vocab = count_vect.get_feature_names_out()
else:
    vocab = count_vect.get_feature_names()

# view the topic models
n_top_words = 10
vocab_array = np.array(vocab)  # built once instead of once per topic
topic_summaries = []
for topic_dist in topic_word:
    # argsort is ascending, so the reversed tail slice yields the n_top_words
    # highest-weighted terms, heaviest first.
    topic_words = vocab_array[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))