# Load data

In [None]:
import pandas as pd
from google.colab import drive
import os
# Mount Google Drive inside the Colab runtime.
drive.mount('/content/drive/')
# Work from the dataset folder so the relative './...' paths used below resolve against it.
os.chdir('/content/drive/My Drive/CA683_Assignment/YelpDataset/20210411')

Mounted at /content/drive/


In [None]:
# Relative path: resolved against the folder selected with os.chdir above.
path = './20210411_final_data_265062.csv'
# Only the star rating and the review text are read from the CSV.
review_col_list2 = ["stars","text"]
df = pd.read_csv(path, usecols=review_col_list2)

In [None]:
#df =pd.read_pickle('./pickle_review_df_preprocessed_104756.txt') #/content/drive/MyDrive/CA683_Assignment/YelpDataset/20210411/20210411_final_data_265062.csv

In [None]:
# Preview the first five reviews.
df.head(5)

Unnamed: 0,stars,text
0,1.0,10pm on a super bowl Sunday and they're alread...
1,5.0,Holy heck this place is amazing. I love their ...
2,4.0,Amazing shrimp taco. The others were good but...
3,3.0,the chips may well be the only thing worth goi...
4,4.0,Great food and fun atmosphere. Nothing bad to...


In [None]:
# Keep only the rating and text columns (the same two columns requested via usecols when loading).
df = df[['stars','text']]

In [None]:
import spacy


# spaCy English pipeline; DocPreprocess calls it to obtain POS tags and lemmas.
# NOTE(review): the 'en' shortcut name is presumably only resolved by older spaCy releases — confirm the installed version.
nlp = spacy.load('en')
# Stop-word collection later handed to DocPreprocess to filter tokens.
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [None]:
# Display the stop-word collection for inspection.
stop_words

In [None]:
from string import punctuation

## Declare helper functions and classes

In [None]:
import numpy as np
import re
import glob
from smart_open import smart_open
import os
import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn import utils
from collections import namedtuple, defaultdict
import logging
from sklearn.feature_extraction.text import TfidfVectorizer


# Convert text to lower-case and strip punctuation/symbols from words
def normalize_text(text):
	"""
	Lower-case a string, replace line breaks with spaces and strip punctuation.

	Ref: https://stackoverflow.com/questions/20802056/python-regular-expression-1
	:param text: string
	:return:
		clean string
	"""
	# Imported locally: the notebook only does `from string import punctuation`,
	# so the bare module name `string` used below would otherwise be undefined.
	import string

	norm_text = text.lower()
	# Replace HTML and plain line breaks with spaces
	norm_text = norm_text.replace('<br />', ' ').replace('\n', ' ')
	# Delete every ASCII punctuation character
	return norm_text.translate(str.maketrans('', '', string.punctuation))


def concat_files(dirname, folders):
	"""
	Merge the ``*.txt`` files of each folder into one normalized text file.

	For every folder name a file named after the folder (with '/' replaced
	by '-') plus '.txt' is written into ``dirname``; it receives one
	normalized document per line.
	:param dirname: string of directory
	:param folders: list of folder names
	:return
		files: list of the output file names, in the order of ``folders``
	"""
	merged_names = []

	for folder in folders:
		merged_name = folder.replace('/', '-') + '.txt'
		sources = glob.glob(os.path.join(dirname, folder, '*.txt'))
		print('{} records in {}...'.format(len(sources), merged_name))
		merged_names.append(merged_name)

		with smart_open(os.path.join(dirname, merged_name), 'wb') as sink:
			for source in sources:
				with smart_open(source, 'rb') as handle:
					raw = handle.read().decode('utf-8')  # from binary to string
				# lower-case / strip punctuation, then back to binary + newline
				sink.write(normalize_text(raw).encode('utf-8') + b'\n')

	return merged_names


def select_imdb(select_num, dirname, files, file_splits, file_sentiments):
	"""
	Read up to ``select_num`` lines from each file and label them.

	Every selected line becomes a ``sent_doc`` namedtuple carrying its
	whitespace-split tokens, a running document id, and the split and
	sentiment labels found at the file's position in ``file_splits`` and
	``file_sentiments``. The last entry of ``files`` is not read (``files[:-1]``).
	:param select_num: num of rows to select
	:param dirname: directory of txt files
	:param files: list of string name of files
	:param file_splits: list of string on train/test split
	:param file_sentiments: list of string on pos/neg sentiment label
	:return:
		list of namedtuple
	"""
	sent_doc = namedtuple('sent_doc', ['words', 'tags', 'split', 'sentiment'])
	selected = []
	next_id = 0
	for file_idx, file_name in enumerate(files[:-1]):
		split_label = file_splits[file_idx]
		sentiment_label = file_sentiments[file_idx]

		with smart_open(os.path.join(dirname, file_name), 'rb', encoding='utf-8') as texts:
			for line_no, line in enumerate(texts):
				if line_no >= select_num:
					break
				# words and tags must both be lists for doc2vec
				words = gensim.utils.to_unicode(line).split()
				selected.append(sent_doc(words, [next_id], split_label, sentiment_label))
				next_id += 1

	return selected


class DocPreprocess(object):
	"""
	Tokenize, optionally bigram-merge, lemmatize and tag a corpus of documents.

	After construction the instance exposes:
	  * ``simple_doc_tokens`` - ``gensim.utils.simple_preprocess`` tokens per doc
	  * ``new_docs``          - those tokens joined back into one string per doc
	  * ``doc_words``         - lemmas kept by :meth:`lemmatize`, one list per doc
	  * ``tagdocs``           - one ``TaggedDocument`` per doc, tagged with its position
	  * ``labels``, ``doc_ids`` - the labels as passed in and ``np.arange(len(docs))``
	"""

	def __init__(self,
				 nlp,
				 stop_words,
				 docs,
				 labels,
				 build_bi=False,
				 min_count=5,
				 threshold=10,
				 allowed_postags=('ADV', 'VERB', 'ADJ', 'NOUN', 'PROPN', 'NUM')):
		"""
		Run the whole preprocessing pipeline eagerly.
		:param nlp: spacy nlp object
		:param stop_words: collection of stop words, e.g. spacy.lang.en.stop_words.STOP_WORDS
		:param docs: list, numpy array or series of raw text docs
		:param labels: list, numpy array or series of labels (stored as given)
		:param build_bi: when True, detect bigrams and merge them before lemmatizing
		:param min_count: forwarded to gensim ``Phrases`` (only used when ``build_bi``)
		:param threshold: forwarded to gensim ``Phrases`` (only used when ``build_bi``)
		:param allowed_postags: POS tags whose tokens are kept; any container
			supporting ``in`` works (the default is an immutable tuple so it
			cannot be modified by accident between instances)
		"""
		self.nlp = nlp  # spacy nlp object
		self.stop_words = stop_words  # spacy.lang.en.stop_words.STOP_WORDS
		self.docs = docs  # docs must be either list or numpy array or series of docs
		self.labels = labels  # labels must be list or numpy array or series of labels
		self.doc_ids = np.arange(len(docs))
		self.simple_doc_tokens = [gensim.utils.simple_preprocess(doc, deacc=True) for doc in self.docs]

		if build_bi:
			self.bi_detector = self.build_bi_detect(self.simple_doc_tokens, min_count=min_count, threshold=threshold)
			self.new_docs = self.make_bigram_doc(self.bi_detector, self.simple_doc_tokens)
		else:
			self.new_docs = self.make_simple_doc(self.simple_doc_tokens)
		self.doc_words = [self.lemmatize(doc, allowed_postags=allowed_postags) for doc in self.new_docs]
		self.tagdocs = [TaggedDocument(words=words, tags=[tag]) for words, tag in zip(self.doc_words, self.doc_ids)]


	def build_bi_detect(self, simple_doc_tokens, min_count, threshold):
		"""
		Fit a gensim bigram detector on the tokenized docs.
		:param simple_doc_tokens: list of token lists
		:param min_count: forwarded to ``Phrases``
		:param threshold: forwarded to ``Phrases``
		:return:
			``Phraser`` wrapping the fitted ``Phrases`` model
		"""
		bi_ = gensim.models.phrases.Phrases(simple_doc_tokens, min_count=min_count, threshold=threshold)
		return gensim.models.phrases.Phraser(bi_)  # wrapper enhance efficiency


	def make_bigram_doc(self, bi_detector, simple_doc_tokens):
		"""
		Apply the bigram detector to every token list and join the result
		back into one space-separated string per doc.
		:param bi_detector: object indexable by a token list (``bi_detector[tokens]``)
		:param simple_doc_tokens: list of token lists
		:return:
			list of strings
		"""
		return [" ".join(bi_detector[doc_tokens]) for doc_tokens in simple_doc_tokens]


	def make_simple_doc(self, simple_doc_tokens):
		"""
		Join every token list back into one space-separated string.
		:param simple_doc_tokens: list of token lists
		:return:
			list of strings
		"""
		return [" ".join(doc_tokens) for doc_tokens in simple_doc_tokens]


	def lemmatize(self, doc, allowed_postags):
		"""
		Lemmatize words and remove stop_words.
		:param doc: text
		:param allowed_postags: container of pos tags to keep
		:return:
			list of tokens
		"""
		parsed = self.nlp(doc)
		# The stop-word test is made on the surface form (token.text), while the
		# value that is kept is the lemma.
		return [token.lemma_ for token in parsed
				if token.pos_ in allowed_postags and token.text not in self.stop_words]



class DocModel(object):
	"""Holds a gensim ``Doc2Vec`` model together with the corpus it is trained on."""

	def __init__(self, docs, **kwargs):
		"""
		Create the Doc2Vec model from ``kwargs`` and build its vocabulary from ``docs``.
		:param docs: list of TaggedDocument
		:param kwargs: dictionary of (key,value) for Doc2Vec arguments
		"""
		self.docs = docs
		self.model = Doc2Vec(**kwargs)
		self.model.build_vocab(list(self.docs))

	def custom_train(self, fixed_lr=False, fixed_lr_epochs=None):
		"""
		Train the Doc2Vec model in one of two ways.

		Default: a single ``train`` call running ``self.model.epochs`` epochs.
		With ``fixed_lr``: ``fixed_lr_epochs`` separate one-epoch ``train`` calls
		on a shuffled copy of the corpus; after each call ``alpha`` is lowered
		by 0.002 and ``min_alpha`` is set equal to it.
		:param fixed_lr: boolean
		:param fixed_lr_epochs: num of epochs for fixed lr training
		"""
		n_docs = len(self.docs)
		if fixed_lr:
			for _ in range(fixed_lr_epochs):
				self.model.train(utils.shuffle(list(self.docs)),
								 total_examples=n_docs,
								 epochs=1)
				self.model.alpha -= 0.002
				self.model.min_alpha = self.model.alpha  # fixed learning rate
			return

		self.model.train(list(self.docs),
						 total_examples=n_docs,
						 epochs=self.model.epochs)


	def test_orig_doc_infer(self):
		"""
		Sanity check: pick a random document position, infer a vector from that
		document's words and print what ``docvecs.most_similar`` returns for it,
		so one can see whether the original doc id ranks as the most similar.
		"""
		idx = np.random.randint(len(self.docs))
		print('idx: ' + str(idx))
		matches = [tagged for tagged in self.docs if tagged.tags[0] == idx]
		inferred_vec = self.model.infer_vector(matches[0].words)
		print(self.model.docvecs.most_similar([inferred_vec]))  # wrap vec in a list


class MeanEmbeddingVectorizer(object):
	"""
	Turn tokenized documents into fixed-size vectors by averaging the
	embeddings of their in-vocabulary words.
	"""

	def __init__(self, word_model):
		"""
		:param word_model: trained model exposing ``wv.vocab``,
			``wv.get_vector`` and ``wv.vector_size``
		"""
		self.word_model = word_model
		self.vector_size = word_model.wv.vector_size

	def fit(self, docs=None, y=None):  # comply with scikit-learn transformer requirement
		"""
		No-op, nothing is learned. ``docs`` and ``y`` are accepted and ignored
		so the object can be fitted as ``fit(X, y)``; a bare ``fit()`` call
		keeps working.
		:return:
			self
		"""
		return self

	def transform(self, docs):  # comply with scikit-learn transformer requirement
		"""
		:param docs: list of sentence in list of separated tokens
		:return:
			array of shape (len(docs), vector_size)
		"""
		return self.word_average_list(docs)

	def word_average(self, sent):
		"""
		Compute average word vector for a single doc/sentence.
		:param sent: list of sentence tokens
		:return:
			array of length vector_size: the mean of the vectors of all tokens
			found in the model vocabulary, or zeros when none is found
		"""
		keyed_vectors = self.word_model.wv
		vectors = [keyed_vectors.get_vector(word) for word in sent if word in keyed_vectors.vocab]

		if not vectors:  # empty words
			# If a text is empty, return a vector of zeros.
			logging.warning("cannot compute average owing to no vector for {}".format(sent))
			return np.zeros(self.vector_size)
		return np.array(vectors).mean(axis=0)


	def word_average_list(self, docs):
		"""
		Compute average word vector for multiple docs, where docs had been tokenized.
		:param docs: list of sentence in list of separated tokens
		:return:
			array of average word vectors in shape (len(docs), vector_size)
		"""
		rows = [self.word_average(sent) for sent in docs]
		if not rows:
			# np.vstack raises ValueError on an empty sequence; return an
			# empty matrix with the right number of columns instead.
			return np.empty((0, self.vector_size))
		return np.vstack(rows)


class TfidfEmbeddingVectorizer(object):
	"""
	Turn tokenized documents into fixed-size vectors by averaging the
	idf-weighted embeddings of their in-vocabulary words.
	"""

	def __init__(self, word_model):
		"""
		:param word_model: trained model exposing ``wv.vocab``,
			``wv.get_vector`` and ``wv.vector_size``
		"""
		self.word_model = word_model
		self.word_idf_weight = None  # filled by fit(): word -> idf, with max idf as fallback
		self.vector_size = word_model.wv.vector_size

	def fit(self, docs, y=None):  # comply with scikit-learn transformer requirement
		"""
		Fit in a list of docs, which had been preprocessed and tokenized,
		such as word bi-grammed, stop-words removed, lemmatized, part of speech filtered.
		Then build up a tfidf model to compute each word's idf as its weight.
		Noted that tf weight is already involved when constructing average word vectors, and thus omitted.
		:param docs: list of docs, which are tokenized
		:param y: ignored; accepted so the object can be fitted as ``fit(X, y)``
		:return:
			self
		"""
		text_docs = [" ".join(doc) for doc in docs]

		# NOTE: TfidfVectorizer() analyses the joined text with its own default
		# settings, so any token it does not keep in vocabulary_ receives the
		# max_idf fallback defined below.
		tfidf = TfidfVectorizer()
		tfidf.fit(text_docs)  # must be list of text string

		# if a word was never seen - it must be at least as infrequent
		# as any of the known words - so the default idf is the max of
		# known idf's
		max_idf = max(tfidf.idf_)  # used as default value for defaultdict
		known_idf = {word: tfidf.idf_[i] for word, i in tfidf.vocabulary_.items()}
		self.word_idf_weight = defaultdict(lambda: max_idf, known_idf)
		return self


	def transform(self, docs):  # comply with scikit-learn transformer requirement
		"""
		:param docs: list of sentence in list of separated tokens
		:return:
			array of shape (len(docs), vector_size)
		"""
		return self.word_average_list(docs)


	def word_average(self, sent):
		"""
		Compute the idf-weighted average word vector for a single doc/sentence.
		Each in-vocabulary word vector is multiplied by the word's idf weight
		and the plain mean of those products is returned.
		:param sent: list of sentence tokens
		:return:
			array of length vector_size, or zeros when no token is in the
			model vocabulary
		"""
		keyed_vectors = self.word_model.wv
		weighted = [keyed_vectors.get_vector(word) * self.word_idf_weight[word]  # idf weighted
					for word in sent if word in keyed_vectors.vocab]

		if not weighted:  # empty words
			# If a text is empty, return a vector of zeros.
			logging.warning("cannot compute average owing to no vector for {}".format(sent))
			return np.zeros(self.vector_size)
		return np.array(weighted).mean(axis=0)


	def word_average_list(self, docs):
		"""
		Compute average word vector for multiple docs, where docs had been tokenized.
		:param docs: list of sentence in list of separated tokens
		:return:
			array of average word vectors in shape (len(docs), vector_size)
		"""
		rows = [self.word_average(sent) for sent in docs]
		if not rows:
			# np.vstack raises ValueError on an empty sequence; return an
			# empty matrix with the right number of columns instead.
			return np.empty((0, self.vector_size))
		return np.vstack(rows)

## declare data frame

In [None]:
import numpy as np
import gensim


In [None]:
# Tokenize, lemmatize and tag every review text; the star ratings are stored as labels.
all_docs = DocPreprocess(nlp, stop_words, df['text'], df['stars']) #around 1hour to complete

In [None]:
# Show the first ten kept lemmas of the document at position 5.
print('Demo of doc words...')
all_docs.doc_words[5][:10]

Demo of doc words...


['price',
 'high',
 'food',
 'good',
 'service',
 'awesome',
 'gamble',
 'hard',
 'rock',
 'eat']

In [None]:
# Star rating stored for the document at position 4.
# NOTE(review): the preceding cell previews document 5 — confirm which index was intended.
all_docs.labels.iloc[4]

4.0

In [None]:
import multiprocessing
import sys
from gensim.models.word2vec import Word2Vec

# One training worker per CPU core reported by the runtime.
workers = multiprocessing.cpu_count()
print('number of cpu: {}'.format(workers))
# NOTE(review): presumably guards against gensim's slow non-compiled training path (FAST_VERSION == -1) — confirm.
assert gensim.models.doc2vec.FAST_VERSION > -1

number of cpu: 2


**Word2Vec – the `size` parameter:**
Using a higher dimensionality than the vocabulary size would more or less guarantee overfitting. Training could drift toward an idiosyncratic vector for each word – essentially a one-hot encoding – which would perform better than any other encoding, because no cross-word interference is forced by representing a larger number of words in a smaller number of dimensions.
Source: https://stackoverflow.com/questions/45444964/python-what-is-the-size-parameter-in-gensim-word2vec-model-class

In [None]:
# Train 200-dimensional word vectors on the lemmatized reviews.
# NOTE(review): `size` and `iter` are presumably the gensim 3.x spellings of these
# parameters — confirm against the installed gensim. No seed is passed here.
word_model = Word2Vec(all_docs.doc_words,
                      min_count=5,
                      size=200,
                      window=5,
                      workers=workers,
                      iter=100)

In [None]:
# Alternative 100-dimensional model, kept for reference only (disabled).
# Every line of the call is commented out: with only the first line commented,
# the indented continuation lines made this cell a syntax error.
#word_model1 = Word2Vec(all_docs.doc_words,
#                      min_count=5,
#                      size=100,
#                      window=5,
#                      workers=workers,
#                      iter=100)

In [None]:
# Persist the trained Word2Vec model in the current working directory.
word_model.save("word2vec.model")

In [None]:
# Reload the model from the file written by the previous cell.
word_model = Word2Vec.load("word2vec.model")

In [None]:
# Export the word vectors in word2vec text format (binary=False).
# NOTE(review): this is an absolute Drive path, unlike the relative './' paths used by the other save cells.
word_model.wv.save_word2vec_format('/content/drive/MyDrive/CA683_Assignment/YelpDataset/Data final new/word2vec200_new.txt', binary=False)

In [None]:
# Disabled: `_save_word2vec_format` is not defined or imported in any visible cell,
# so this bare call raises NameError on a fresh run. The vectors are already
# exported by the `word_model.wv.save_word2vec_format(...)` cell above.
# _save_word2vec_format()

In [None]:
class SaveEmbeddingVectorizer(object):
	"""
	Collect the raw word vectors of tokenized documents.

	Unlike the mean vectorizer, nothing is averaged here: every in-vocabulary
	word contributes its own row to the result.
	"""

	def __init__(self, word_model):
		"""
		:param word_model: trained model exposing ``wv.vocab``,
			``wv.get_vector`` and ``wv.vector_size``
		"""
		self.vector_size = word_model.wv.vector_size
		self.word_model = word_model

	def fit(self):  # comply with scikit-learn transformer requirement
		"""No-op; returns self."""
		return self

	def transform(self, docs):  # comply with scikit-learn transformer requirement
		"""Return the stacked word vectors of ``docs`` (see word_average_list)."""
		return self.word_average_list(docs)

	def word_average(self, sent):
		"""
		Gather the word vectors of a single doc/sentence (no averaging is
		performed, despite the method name).
		:param sent: list of sentence tokens
		:return:
			array with one row per token found in the model vocabulary, or a
			single zero vector of length vector_size when none is found
		"""
		found = [self.word_model.wv.get_vector(token)
				 for token in sent
				 if token in self.word_model.wv.vocab]

		if found:
			return np.array(found)

		# If a text is empty, return a vector of zeros.
		logging.warning("cannot compute average owing to no vector for {}".format(sent))
		return np.zeros(self.vector_size)


	def word_average_list(self, docs):
		"""
		Stack the per-document word-vector arrays of multiple tokenized docs.
		:param docs: list of sentence in list of separated tokens
		:return:
			array with one row per in-vocabulary word over all docs (plus one
			zero row for each doc without any known word)
		"""
		per_doc = [self.word_average(sent) for sent in docs]
		return np.vstack(per_doc)


In [None]:
# Stack the raw word vectors of all documents. SaveEmbeddingVectorizer does not
# average, so word_vec has one row per in-vocabulary word rather than one per document.
vec_tr = SaveEmbeddingVectorizer(word_model)
word_vec = vec_tr.transform(all_docs.doc_words)

print('Demo of word averaging doc vector...')
# Row 4 of the stacked array (a single word vector, see note above).
display(word_vec[4])



Demo of word averaging doc vector...


array([-3.92165232, -3.0878787 , -0.16704041, -1.13683629,  1.21828532,
        0.27376157, -0.42476043, -0.91182113,  0.70386249, -0.06748584,
        0.99777162, -0.36395034,  1.15195608, -2.74257421, -0.975555  ,
       -0.26089469,  0.78469825, -1.74833596, -1.54873776, -1.7906965 ,
        1.6555928 ,  1.39265466, -1.64504075,  2.32766128, -0.68572265,
        2.00495934,  1.78312433,  0.21682189, -1.6861254 , -1.00973892,
       -0.46868184,  0.577124  ,  3.30001402, -0.14830118, -0.61378556,
       -1.00067425, -0.50749475, -0.43173692, -0.10008602,  0.33132648,
        0.44923231, -0.25669596,  1.34209383,  0.64705753,  0.27424324,
        0.22284591,  1.48210073,  3.11118817, -1.9441458 ,  1.71400893,
        2.55682683,  1.53637421,  0.41754565,  4.61667681,  0.72137368,
        2.38815975,  2.11989117,  0.00892776, -1.68494523, -1.07636797,
       -1.44884014, -0.01833257, -2.73183584, -0.6105895 ,  1.96454191,
        1.51123595, -1.8901608 ,  1.20990896,  0.77628559,  0.58

In [None]:
# Write the stacked word vectors to word_vec.csv in the current working directory.
np.savetxt(os.path.join('./','word_vec.csv'), word_vec, delimiter=',')

In [None]:
# Shape of the stacked word-vector array.
word_vec.shape

(3100281, 200)

In [None]:
class MeanEmbeddingVectorizer(object):
	"""
	Turn tokenized documents into fixed-size vectors by averaging the
	embeddings of their in-vocabulary words.
	"""

	def __init__(self, word_model):
		"""
		:param word_model: trained model exposing ``wv.vocab``,
			``wv.get_vector`` and ``wv.vector_size``
		"""
		self.word_model = word_model
		self.vector_size = word_model.wv.vector_size

	def fit(self, docs=None, y=None):  # comply with scikit-learn transformer requirement
		"""
		No-op, nothing is learned. ``docs`` and ``y`` are accepted and ignored
		so the object can be fitted as ``fit(X, y)``; a bare ``fit()`` call
		keeps working.
		:return:
			self
		"""
		return self

	def transform(self, docs):  # comply with scikit-learn transformer requirement
		"""
		:param docs: list of sentence in list of separated tokens
		:return:
			array of shape (len(docs), vector_size)
		"""
		return self.word_average_list(docs)

	def word_average(self, sent):
		"""
		Compute average word vector for a single doc/sentence.
		:param sent: list of sentence tokens
		:return:
			array of length vector_size: the mean of the vectors of all tokens
			found in the model vocabulary, or zeros when none is found
		"""
		keyed_vectors = self.word_model.wv
		vectors = [keyed_vectors.get_vector(word) for word in sent if word in keyed_vectors.vocab]

		if not vectors:  # empty words
			# If a text is empty, return a vector of zeros.
			logging.warning("cannot compute average owing to no vector for {}".format(sent))
			return np.zeros(self.vector_size)
		return np.array(vectors).mean(axis=0)


	def word_average_list(self, docs):
		"""
		Compute average word vector for multiple docs, where docs had been tokenized.
		:param docs: list of sentence in list of separated tokens
		:return:
			array of average word vectors in shape (len(docs), vector_size)
		"""
		rows = [self.word_average(sent) for sent in docs]
		if not rows:
			# np.vstack raises ValueError on an empty sequence; return an
			# empty matrix with the right number of columns instead.
			return np.empty((0, self.vector_size))
		return np.vstack(rows)


In [None]:
# Average the Word2Vec vectors of each document's words: one row per document.
mean_vec_tr = MeanEmbeddingVectorizer(word_model)
doc_vec = mean_vec_tr.transform(all_docs.doc_words)

print('Demo of word averaging doc vector...')
# Averaged vector of the document at position 4.
display(doc_vec[4])



Demo of word averaging doc vector...


array([-2.69792736e-01, -5.35807014e-01,  8.15641344e-01,  6.40296340e-02,
       -4.68887165e-02,  3.65254968e-01, -1.95329517e-01,  8.62961709e-02,
       -9.88713354e-02,  1.11988783e+00, -3.48342478e-01,  6.58046842e-01,
        2.04589397e-01, -1.01225346e-01,  5.12566715e-02,  5.09131849e-01,
        5.54112941e-02,  5.03945053e-01,  1.66125402e-01,  4.04879265e-02,
        5.94993293e-01, -5.72470129e-01,  5.56224346e-01, -2.81331480e-01,
       -2.65876830e-01, -8.82139921e-01,  5.20005465e-01, -1.10059834e+00,
        1.32963523e-01, -7.86401555e-02, -1.63804367e-01,  3.41727942e-01,
        3.63860071e-01,  5.61114550e-01, -7.90810704e-01,  3.52625847e-01,
        1.23786531e-01,  1.17522955e+00,  1.31062880e-01,  4.76933211e-01,
        3.61703187e-01,  5.73030591e-01, -1.81359984e-02,  6.62546698e-03,
       -2.84063697e-01,  1.18781197e+00, -1.81620538e-01,  2.24330395e-01,
        3.75050157e-01, -5.87293327e-01,  4.44372632e-02,  5.25909483e-01,
        2.50879556e-01,  

In [None]:
# Check the matrix dimensions before saving.
print('Shape of word-mean doc2vec...')
display(doc_vec.shape)
# Disabled save (references `dir_path`); the next cell saves to './' instead.
#print('Save word-mean doc2vec as csv file...')
#np.savetxt(os.path.join(dir_path,'doc_vec.csv'), doc_vec, delimiter=',')

Shape of word-mean doc2vec...


(275197, 200)

In [None]:
# Write the averaged document vectors to doc_vec.csv in the current working directory.
np.savetxt(os.path.join('./','doc_vec.csv'), doc_vec, delimiter=',')

In [None]:
class TfidfEmbeddingVectorizer(object):
	"""
	Turn tokenized documents into fixed-size vectors by averaging the
	idf-weighted embeddings of their in-vocabulary words.
	"""

	def __init__(self, word_model):
		"""
		:param word_model: trained model exposing ``wv.vocab``,
			``wv.get_vector`` and ``wv.vector_size``
		"""
		self.word_model = word_model
		self.word_idf_weight = None  # filled by fit(): word -> idf, with max idf as fallback
		self.vector_size = word_model.wv.vector_size

	def fit(self, docs, y=None):  # comply with scikit-learn transformer requirement
		"""
		Fit in a list of docs, which had been preprocessed and tokenized,
		such as word bi-grammed, stop-words removed, lemmatized, part of speech filtered.
		Then build up a tfidf model to compute each word's idf as its weight.
		Noted that tf weight is already involved when constructing average word vectors, and thus omitted.
		:param docs: list of docs, which are tokenized
		:param y: ignored; accepted so the object can be fitted as ``fit(X, y)``
		:return:
			self
		"""
		text_docs = [" ".join(doc) for doc in docs]

		# NOTE: TfidfVectorizer() analyses the joined text with its own default
		# settings, so any token it does not keep in vocabulary_ receives the
		# max_idf fallback defined below.
		tfidf = TfidfVectorizer()
		tfidf.fit(text_docs)  # must be list of text string

		# if a word was never seen - it must be at least as infrequent
		# as any of the known words - so the default idf is the max of
		# known idf's
		max_idf = max(tfidf.idf_)  # used as default value for defaultdict
		known_idf = {word: tfidf.idf_[i] for word, i in tfidf.vocabulary_.items()}
		self.word_idf_weight = defaultdict(lambda: max_idf, known_idf)
		return self


	def transform(self, docs):  # comply with scikit-learn transformer requirement
		"""
		:param docs: list of sentence in list of separated tokens
		:return:
			array of shape (len(docs), vector_size)
		"""
		return self.word_average_list(docs)


	def word_average(self, sent):
		"""
		Compute the idf-weighted average word vector for a single doc/sentence.
		Each in-vocabulary word vector is multiplied by the word's idf weight
		and the plain mean of those products is returned.
		:param sent: list of sentence tokens
		:return:
			array of length vector_size, or zeros when no token is in the
			model vocabulary
		"""
		keyed_vectors = self.word_model.wv
		weighted = [keyed_vectors.get_vector(word) * self.word_idf_weight[word]  # idf weighted
					for word in sent if word in keyed_vectors.vocab]

		if not weighted:  # empty words
			# If a text is empty, return a vector of zeros.
			logging.warning("cannot compute average owing to no vector for {}".format(sent))
			return np.zeros(self.vector_size)
		return np.array(weighted).mean(axis=0)


	def word_average_list(self, docs):
		"""
		Compute average word vector for multiple docs, where docs had been tokenized.
		:param docs: list of sentence in list of separated tokens
		:return:
			array of average word vectors in shape (len(docs), vector_size)
		"""
		rows = [self.word_average(sent) for sent in docs]
		if not rows:
			# np.vstack raises ValueError on an empty sequence; return an
			# empty matrix with the right number of columns instead.
			return np.empty((0, self.vector_size))
		return np.vstack(rows)

In [None]:
# Vectorizer that multiplies each word vector by the word's idf weight before averaging.
tfidf_vec_tr = TfidfEmbeddingVectorizer(word_model)

In [None]:

# The idf weights are learned from the same documents that are then transformed.
tfidf_vec_tr.fit(all_docs.doc_words)  # fit tfidf model first
tfidf_doc_vec = tfidf_vec_tr.transform(all_docs.doc_words)



In [None]:
# Shape of the idf-weighted document matrix.
tfidf_doc_vec.shape

(275197, 200)

In [None]:
# Save the idf-weighted averaged document vectors to tfidf_doc_vec.csv
# in the current working directory.
print('Shape of tfidf-word-mean doc2vec...')
display(tfidf_doc_vec.shape)
print('Save tfidf-word-mean doc2vec as csv file...')
np.savetxt(os.path.join('./', 'tfidf_doc_vec.csv'), tfidf_doc_vec, delimiter=',')

Shape of tfidf-word-mean doc2vec...


(275197, 200)

Save tfidf-word-mean doc2vec as csv file...


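# GloVe
CLOSED – no GloVe vectors pretrained on the Yelp dataset are available, and we could not find a tutorial for training GloVe on our own corpus. The cells below therefore use the pretrained Twitter GloVe vectors (`glove.twitter.27B.200d`) instead.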

In [None]:
from gensim.test.utils import get_tmpfile, datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec



# Load in GloVe vector.
# The file lives on Drive, not in gensim's bundled test data, so its absolute
# path is used directly instead of being wrapped in `datapath` (a helper meant
# for gensim's own test fixtures).
glove_vec_fi = '/content/drive/MyDrive/CA683_Assignment/YelpDataset/102442_related/glove.twitter.27B.200d.txt'
tmp_word2vec_fi = get_tmpfile('tmp_glove2word2vec.txt')

# Convert the GloVe text file into word2vec text format in a temporary file.
glove2word2vec(glove_vec_fi, tmp_word2vec_fi)

# Load the converted vectors; the result is a KeyedVectors object.
glove_word_model = KeyedVectors.load_word2vec_format(tmp_word2vec_fi)

In [None]:
class MeanEmbeddingVectorizerGlove(object):
	"""
	Turn tokenized documents into fixed-size vectors by averaging the
	pretrained (GloVe) embeddings of their in-vocabulary words.
	"""

	def __init__(self, glove_word_model):
		"""
		:param glove_word_model: either a KeyedVectors-like object (exposing
			``vocab``, ``get_vector`` and ``vector_size`` itself) or a model
			that exposes such an object as ``.wv``
		"""
		self.glove_word_model = glove_word_model
		self.vector_size = self._keyed_vectors().vector_size

	def _keyed_vectors(self):
		"""
		Return the object that actually holds the vectors.
		The GloVe vectors are loaded with ``KeyedVectors.load_word2vec_format``
		and so already are the vector store; going through ``.wv`` is only
		needed for full models. NOTE(review): on a gensim 3.x KeyedVectors
		``.wv`` is presumably just a deprecated alias of the object itself — confirm.
		"""
		model = self.glove_word_model
		return model if hasattr(model, 'get_vector') else model.wv

	def fit(self, docs=None, y=None):  # comply with scikit-learn transformer requirement
		"""
		No-op, nothing is learned. ``docs`` and ``y`` are accepted and ignored
		so the object can be fitted as ``fit(X, y)``; a bare ``fit()`` call
		keeps working.
		:return:
			self
		"""
		return self

	def transform(self, docs):  # comply with scikit-learn transformer requirement
		"""
		:param docs: list of sentence in list of separated tokens
		:return:
			array of shape (len(docs), vector_size)
		"""
		return self.word_average_list(docs)

	def word_average(self, sent):
		"""
		Compute average word vector for a single doc/sentence.
		:param sent: list of sentence tokens
		:return:
			array of length vector_size: the mean of the vectors of all tokens
			found in the vocabulary, or zeros when none is found
		"""
		keyed_vectors = self._keyed_vectors()
		vectors = [keyed_vectors.get_vector(word) for word in sent if word in keyed_vectors.vocab]

		if not vectors:  # empty words
			# If a text is empty, return a vector of zeros.
			logging.warning("cannot compute average owing to no vector for {}".format(sent))
			return np.zeros(self.vector_size)
		return np.array(vectors).mean(axis=0)


	def word_average_list(self, docs):
		"""
		Compute average word vector for multiple docs, where docs had been tokenized.
		:param docs: list of sentence in list of separated tokens
		:return:
			array of average word vectors in shape (len(docs), vector_size)
		"""
		rows = [self.word_average(sent) for sent in docs]
		if not rows:
			# np.vstack raises ValueError on an empty sequence; return an
			# empty matrix with the right number of columns instead.
			return np.empty((0, self.vector_size))
		return np.vstack(rows)

In [None]:
# Average the pretrained GloVe vectors of each document's words: one row per document.
mean_vec_tr_Glove = MeanEmbeddingVectorizerGlove(glove_word_model)
doc_vec_Glove = mean_vec_tr_Glove.transform(all_docs.doc_words)

print('Demo of word averaging doc vector...')
# Averaged GloVe vector of the document at position 4.
display(doc_vec_Glove[4])

  


Demo of word averaging doc vector...


array([-1.94482982e-01,  2.21669991e-02,  2.77861133e-02,  1.20053448e-01,
        1.25420883e-01,  2.23221093e-01,  5.07838905e-01, -1.93701163e-01,
       -8.37843269e-02, -1.39627561e-01, -1.81261107e-01, -6.15924411e-02,
       -5.46755672e-01, -1.59380227e-01,  2.30633900e-01,  1.12949997e-01,
        6.64856657e-02, -2.52187461e-01, -2.41074562e-01,  2.20824480e-02,
        5.12551144e-02, -5.53069972e-02, -2.17195198e-01, -1.19284779e-01,
       -1.26033992e-01,  1.07790232e+00,  9.74177718e-02,  1.46911889e-01,
        1.60915449e-01, -2.91877866e-01, -1.73660547e-01, -1.51009873e-01,
       -2.66779006e-01, -1.25371993e-01, -6.45881072e-02,  8.63382295e-02,
        9.70205516e-02, -3.60637046e-02, -7.54388841e-03,  6.72093779e-02,
        3.04515988e-01,  1.69251338e-01,  1.72246993e-01, -2.68236697e-01,
        1.07599467e-01,  4.92706820e-02,  9.19498503e-02, -4.99255471e-02,
       -2.28127107e-01,  2.91612893e-01,  1.14784561e-01,  3.41268927e-02,
       -2.85834640e-01,  

In [None]:
# Check the matrix dimensions before saving.
print('Shape of word-mean doc2vec Glove...')
display(doc_vec_Glove.shape)
# Disabled save; NOTE(review): it names doc_vec / 'doc_vec.csv', not the GloVe matrix.
# The next cell saves doc_vec_Glove instead.
#print('Save word-mean doc2vec Glove as csv file...')
#np.savetxt(os.path.join(dir_path,'doc_vec.csv'), doc_vec, delimiter=',')

Shape of word-mean doc2vec Glove...


(275197, 200)

In [None]:
# Write the averaged GloVe document vectors to doc_vec_Glove.csv in the current working directory.
np.savetxt(os.path.join('./','doc_vec_Glove.csv'), doc_vec_Glove, delimiter=',')

In [None]:
class TfidfEmbeddingVectorizerGlove(object):
	"""
	Turn tokenized documents into fixed-size vectors by averaging the
	idf-weighted pretrained (GloVe) embeddings of their in-vocabulary words.
	"""

	def __init__(self, glove_word_model):
		"""
		:param glove_word_model: either a KeyedVectors-like object (exposing
			``vocab``, ``get_vector`` and ``vector_size`` itself) or a model
			that exposes such an object as ``.wv``
		"""
		self.glove_word_model = glove_word_model
		self.word_idf_weight = None  # filled by fit(): word -> idf, with max idf as fallback
		self.vector_size = self._keyed_vectors().vector_size

	def _keyed_vectors(self):
		"""
		Return the object that actually holds the vectors.
		The GloVe vectors are loaded with ``KeyedVectors.load_word2vec_format``
		and so already are the vector store; going through ``.wv`` is only
		needed for full models. NOTE(review): on a gensim 3.x KeyedVectors
		``.wv`` is presumably just a deprecated alias of the object itself — confirm.
		"""
		model = self.glove_word_model
		return model if hasattr(model, 'get_vector') else model.wv

	def fit(self, docs, y=None):  # comply with scikit-learn transformer requirement
		"""
		Fit in a list of docs, which had been preprocessed and tokenized,
		such as word bi-grammed, stop-words removed, lemmatized, part of speech filtered.
		Then build up a tfidf model to compute each word's idf as its weight.
		Noted that tf weight is already involved when constructing average word vectors, and thus omitted.
		:param docs: list of docs, which are tokenized
		:param y: ignored; accepted so the object can be fitted as ``fit(X, y)``
		:return:
			self
		"""
		text_docs = [" ".join(doc) for doc in docs]

		# NOTE: TfidfVectorizer() analyses the joined text with its own default
		# settings, so any token it does not keep in vocabulary_ receives the
		# max_idf fallback defined below.
		tfidf = TfidfVectorizer()
		tfidf.fit(text_docs)  # must be list of text string

		# if a word was never seen - it must be at least as infrequent
		# as any of the known words - so the default idf is the max of
		# known idf's
		max_idf = max(tfidf.idf_)  # used as default value for defaultdict
		known_idf = {word: tfidf.idf_[i] for word, i in tfidf.vocabulary_.items()}
		self.word_idf_weight = defaultdict(lambda: max_idf, known_idf)
		return self


	def transform(self, docs):  # comply with scikit-learn transformer requirement
		"""
		:param docs: list of sentence in list of separated tokens
		:return:
			array of shape (len(docs), vector_size)
		"""
		return self.word_average_list(docs)


	def word_average(self, sent):
		"""
		Compute the idf-weighted average word vector for a single doc/sentence.
		Each in-vocabulary word vector is multiplied by the word's idf weight
		and the plain mean of those products is returned.
		:param sent: list of sentence tokens
		:return:
			array of length vector_size, or zeros when no token is in the
			vocabulary
		"""
		keyed_vectors = self._keyed_vectors()
		weighted = [keyed_vectors.get_vector(word) * self.word_idf_weight[word]  # idf weighted
					for word in sent if word in keyed_vectors.vocab]

		if not weighted:  # empty words
			# If a text is empty, return a vector of zeros.
			logging.warning("cannot compute average owing to no vector for {}".format(sent))
			return np.zeros(self.vector_size)
		return np.array(weighted).mean(axis=0)


	def word_average_list(self, docs):
		"""
		Compute average word vector for multiple docs, where docs had been tokenized.
		:param docs: list of sentence in list of separated tokens
		:return:
			array of average word vectors in shape (len(docs), vector_size)
		"""
		rows = [self.word_average(sent) for sent in docs]
		if not rows:
			# np.vstack raises ValueError on an empty sequence; return an
			# empty matrix with the right number of columns instead.
			return np.empty((0, self.vector_size))
		return np.vstack(rows)

In [None]:
# Vectorizer that multiplies each GloVe word vector by the word's idf weight before averaging.
tfidf_vec_tr_Glove = TfidfEmbeddingVectorizerGlove(glove_word_model)

  import sys


In [None]:
# The idf weights are learned from the same documents that are then transformed.
tfidf_vec_tr_Glove.fit(all_docs.doc_words)  # fit tfidf model first
tfidf_doc_vec_Glove = tfidf_vec_tr_Glove.transform(all_docs.doc_words)



In [None]:
# Shape of the idf-weighted GloVe document matrix.
tfidf_doc_vec_Glove.shape

(275197, 200)

In [None]:
# Save the idf-weighted averaged GloVe document vectors to tfidf_doc_vec_Glove.csv
# in the current working directory.
print('Shape of tfidf-word-mean doc2vec...')
display(tfidf_doc_vec_Glove.shape)
print('Save tfidf-word-mean doc2vec as csv file...')
np.savetxt(os.path.join('./', 'tfidf_doc_vec_Glove.csv'), tfidf_doc_vec_Glove, delimiter=',')

Shape of tfidf-word-mean doc2vec...


(275197, 200)

Save tfidf-word-mean doc2vec as csv file...


# Docvec

In [None]:
class DocModel(object):
	"""Holds a gensim ``Doc2Vec`` model together with the corpus it is trained on."""

	def __init__(self, docs, **kwargs):
		"""
		Create the Doc2Vec model from ``kwargs`` and build its vocabulary from ``docs``.
		:param docs: list of TaggedDocument
		:param kwargs: dictionary of (key,value) for Doc2Vec arguments
		"""
		self.docs = docs
		self.model = Doc2Vec(**kwargs)
		self.model.build_vocab(list(self.docs))

	def custom_train(self, fixed_lr=False, fixed_lr_epochs=None):
		"""
		Train the Doc2Vec model in one of two ways.

		Default: a single ``train`` call running ``self.model.epochs`` epochs.
		With ``fixed_lr``: ``fixed_lr_epochs`` separate one-epoch ``train`` calls
		on a shuffled copy of the corpus; after each call ``alpha`` is lowered
		by 0.002 and ``min_alpha`` is set equal to it.
		:param fixed_lr: boolean
		:param fixed_lr_epochs: num of epochs for fixed lr training
		"""
		n_docs = len(self.docs)
		if fixed_lr:
			for _ in range(fixed_lr_epochs):
				self.model.train(utils.shuffle(list(self.docs)),
								 total_examples=n_docs,
								 epochs=1)
				self.model.alpha -= 0.002
				self.model.min_alpha = self.model.alpha  # fixed learning rate
			return

		self.model.train(list(self.docs),
						 total_examples=n_docs,
						 epochs=self.model.epochs)


	def test_orig_doc_infer(self):
		"""
		Sanity check: pick a random document position, infer a vector from that
		document's words and print what ``docvecs.most_similar`` returns for it,
		so one can see whether the original doc id ranks as the most similar.
		"""
		idx = np.random.randint(len(self.docs))
		print('idx: ' + str(idx))
		matches = [tagged for tagged in self.docs if tagged.tags[0] == idx]
		inferred_vec = self.model.infer_vector(matches[0].words)
		print(self.model.docvecs.most_similar([inferred_vec]))  # wrap vec in a list

Doc2Vec parameters used below:

- `dm` ({1,0}, optional) – Defines the training algorithm. If dm=1, ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.
- `negative` (int, optional) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
- `hs` ({1,0}, optional) – If 1, hierarchical softmax will be used for model training. If set to 0, and negative is non-zero, negative sampling will be used.
- `sample` (float, optional) – The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).
- `alpha` (float, optional) – The initial learning rate.
- `min_alpha` (float, optional) – Learning rate will linearly drop to min_alpha as training progresses.
- `epochs` (int, optional) – Number of iterations (epochs) over the corpus. Defaults to 10 for Doc2Vec.

Source: https://radimrehurek.com/gensim/models/doc2vec.html

In [None]:
# Keyword arguments forwarded to Doc2Vec through DocModel(**dm_args).
dm_args = {
    'dm': 1,  # PV-DM ('distributed memory'), per the parameter notes above
    'dm_mean': 1,
    'vector_size': 100,
    'window': 5,
    'negative': 5,  # negative sampling with 5 noise words
    'hs': 0,  # no hierarchical softmax
    'min_count': 5,
    'sample': 0,  # NOTE(review): presumably disables frequent-word downsampling — confirm
    'workers': workers,
    'alpha': 0.025,
    'min_alpha': 0.025,  # same value as alpha, so the learning rate has no room to decay
    'epochs': 100,
    'comment': 'alpha=0.025'
}

In [None]:
# Build the Doc2Vec model and its vocabulary from the tagged documents.
dm = DocModel(docs=all_docs.tagdocs, **dm_args)

In [None]:
# Default training path: a single train() call over dm.model.epochs epochs.
dm.custom_train()

In [None]:
# Save doc2vec as feature dataframe.
# One learned vector per document tag, collected in tag order (a comprehension
# replaces the manual append loop and leaves no loop index behind in the kernel).
dm_doc_vec_ls = [dm.model.docvecs[doc_idx] for doc_idx in range(len(dm.model.docvecs))]


dm_doc_vec = pd.DataFrame(dm_doc_vec_ls)
print('Shape of dm doc2vec...')
display(dm_doc_vec.shape)

# Written without index or header so the file holds only the numeric matrix.
print('Save dm doc2vec as csv file...')
dm_doc_vec.to_csv(os.path.join('./', 'dm_doc_vec.csv'), index=False, header=False)

Shape of dm doc2vec...


(275197, 100)

Save dm doc2vec as csv file...


In [None]:

# The labels are the star ratings passed to DocPreprocess, one per document.
print('Shape of target labels...')
display(all_docs.labels.shape)
target_labels = all_docs.labels

# Saved with a header row (header=True), unlike the header-less feature matrices.
print('Save target labels...')
target_labels.to_csv(os.path.join('./', 'target_labels.csv'), index=False, header=True)

Shape of target labels...


(275197,)

Save target labels...


# Classification Models
SGDClassifier or Logistic Regression applied to:

- Tf-Idf Weighted Averaging Word Vector
- PV-DM Doc2vec
- Tf-Idf and Doc2vec Concatenated Feature

## Prepare

In [None]:

import os

# Read in saved files.


# Feature matrices are read with header=None (every row is data); they are
# loaded in a fixed order and unpacked into their notebook-level names.
doc_vec, tfidf_doc_vec, doc_vec_Glove, tfidf_doc_vec_Glove, dm_doc_vec = (
    pd.read_csv(os.path.join('./', csv_name), header=None)
    for csv_name in (
        'doc_vec.csv',
        'tfidf_doc_vec.csv',
        'doc_vec_Glove.csv',
        'tfidf_doc_vec_Glove.csv',
        'dm_doc_vec.csv',
    )
)
# The label file is read with its first row as the header.
target_labels = pd.read_csv(os.path.join('./', 'target_labels.csv'), header=0)

In [None]:
from sklearn.linear_model import LogisticRegression

# Multinomial logistic regression fitted with the SAGA solver; the seed is
# fixed so repeated fits are comparable.
logistic = LogisticRegression(
    solver='saga',
    multi_class='multinomial',
    random_state=1,
)

In [None]:
from sklearn.linear_model import SGDClassifier

# (Optional) Linear classifier trained by stochastic gradient descent with
# hinge loss. verbose=1 prints the per-epoch log lines; the learning rate
# starts from eta0 and follows the 'invscaling' schedule.
sgd = SGDClassifier(
    loss='hinge',
    learning_rate='invscaling',
    eta0=1,
    random_state=1,
    verbose=1,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:

# Default experiment inputs in the form main() expects; each section below
# re-assigns them before calling main().
model = sgd  # or choose logistic.
df = tfidf_doc_vec
concate = False  # False: main() trains on df alone and ignores concat_df
concat_df = dm_doc_vec

In [None]:

import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.utils import shuffle
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
import math
import seaborn as sns

def split_size(df, train=0.8, valid=0.):
    """Convert train/valid fractions into absolute row counts.

    Parameters
    ----------
    df : any sized object (only ``len(df)`` is used).
    train : float, fraction of rows for training (default 0.8).
    valid : float, fraction of rows for validation (default 0.0).

    Returns
    -------
    tuple of int
        ``(train_size, valid_size, test_size)``. Train and valid counts are
        floored; the test split receives every remaining row, so the three
        counts always add up to ``len(df)``.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1,
        which would otherwise yield a negative test size.
    """
    # Small tolerance so float sums such as 0.58 + 0.42 are not rejected.
    if train < 0 or valid < 0 or train + valid > 1 + 1e-9:
        raise ValueError(
            'train and valid must be non-negative fractions summing to at most 1, '
            'got train={!r}, valid={!r}'.format(train, valid)
        )

    total = len(df)
    train_size = math.floor(total * train)
    valid_size = math.floor(total * valid)
    test_size = total - train_size - valid_size
    return train_size, valid_size, test_size

In [None]:
from sklearn.model_selection import train_test_split




def main(model, df, concate, concat_df, labels=None):
    """Split the features, fit ``model`` on the training part and return the pieces.

    Parameters
    ----------
    model : estimator with a ``fit(X, y)`` method. It is fitted in place and the
        same object is returned, so fitting it again later overwrites this fit.
    df : pandas.DataFrame
        Base feature matrix, one row per document.
    concate : bool
        When true, the columns of ``concat_df`` are appended to ``df``.
    concat_df : pandas.DataFrame
        Extra feature matrix; only used when ``concate`` is true.
    labels : optional
        Targets aligned row-by-row with ``df``. Defaults to the notebook-level
        ``target_labels`` so existing calls keep working.

    Returns
    -------
    tuple
        ``(model, train_X, valid_X, test_X, train_y, valid_y, test_y)``;
        ``valid_X`` and ``valid_y`` are ``None`` when no validation split is made.
    """
    if labels is None:
        labels = target_labels

    features = df
    if concate:
        features = pd.concat([df, concat_df], axis=1, ignore_index=True)

    # 80/20 train/test split; the validation share is zero, so the validation
    # branch below is currently inactive.
    _, valid_size, test_size = split_size(features, train=0.8, valid=0.)

    # Hold out the test rows, stratified by label, with a fixed seed.
    train_X, test_X, train_y, test_y = train_test_split(features,
                                                        labels,
                                                        test_size=test_size,
                                                        random_state=1,
                                                        stratify=labels)

    # Optionally carve a validation set out of the remaining training rows.
    valid_X = valid_y = None
    if valid_size != 0:
        train_X, valid_X, train_y, valid_y = train_test_split(train_X,
                                                              train_y,
                                                              test_size=valid_size,
                                                              random_state=1,
                                                              stratify=train_y)

    print('Shape of train_X: {}'.format(train_X.shape))
    print('Shape of valid_X: {}'.format(valid_X.shape if valid_X is not None else (0, 0)))
    print('Shape of test_X: {}'.format(test_X.shape))

    # Labels loaded from CSV arrive as a single-column frame; estimators expect
    # a 1-D target, so flatten for fitting only. The returned train_y keeps its
    # original type.
    model.fit(train_X, np.ravel(train_y))

    return model, train_X, valid_X, test_X, train_y, valid_y, test_y

## Simple Averaging Word Vector

In [None]:

# Inputs for main(): SGD classifier on the simple averaged word-vector features.
model = sgd  # or choose logistic.
df = doc_vec
concate = False  # False: main() trains on df alone and ignores concat_df
concat_df = dm_doc_vec

In [None]:

# __main__
clf, train_X, valid_X, test_X, train_y, valid_y, test_y = main(model, 
                                                               df, 
                                                               concate=concate, 
                                                               concat_df=concat_df)

Shape of train_X: (220157, 200)
Shape of valid_X: (0, 0)
Shape of text_X: (55040, 200)


  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


-- Epoch 1
Norm: 1.91, NNZs: 200, Bias: -1.531377, T: 220157, Avg. loss: 0.119428
Total training time: 0.20 seconds.
-- Epoch 2
Norm: 1.83, NNZs: 200, Bias: -1.529842, T: 440314, Avg. loss: 0.110772
Total training time: 0.38 seconds.
-- Epoch 3
Norm: 1.82, NNZs: 200, Bias: -1.513425, T: 660471, Avg. loss: 0.110093
Total training time: 0.56 seconds.
-- Epoch 4
Norm: 1.79, NNZs: 200, Bias: -1.482759, T: 880628, Avg. loss: 0.109890
Total training time: 0.76 seconds.
-- Epoch 5
Norm: 1.80, NNZs: 200, Bias: -1.484349, T: 1100785, Avg. loss: 0.109451
Total training time: 0.93 seconds.
-- Epoch 6
Norm: 1.78, NNZs: 200, Bias: -1.481391, T: 1320942, Avg. loss: 0.109392
Total training time: 1.12 seconds.
-- Epoch 7
Norm: 1.77, NNZs: 200, Bias: -1.475521, T: 1541099, Avg. loss: 0.109278
Total training time: 1.30 seconds.
Convergence after 7 epochs took 1.30 seconds
-- Epoch 1
Norm: 0.42, NNZs: 200, Bias: -0.997838, T: 220157, Avg. loss: 0.107977
Total training time: 0.18 seconds.
-- Epoch 2
Norm:

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.3s finished


In [None]:
def sk_evaluate(model, feature, label, label_names):
    """Print classification metrics for a fitted model on one dataset.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    feature : feature matrix passed to ``model.predict``.
    label : true targets; converted with ``np.array``.
    label_names : forwarded to ``classification_report`` as ``target_names``
        (``None`` lets the report use the label values).

    Returns
    -------
    tuple
        ``(pred, true)``: the predictions and the true labels as an array.
    """
    pred = model.predict(feature)
    true = np.array(label)

    print('Score on dataset...\n')

    matrix = confusion_matrix(true, pred)
    print('Confusion Matrix:\n', matrix)

    report = classification_report(true, pred, target_names=label_names)
    print('\nClassification Report:\n', report)

    accuracy = accuracy_score(true, pred)
    print('\naccuracy: {:.3f}'.format(accuracy))

    # F1 is averaged over classes, weighted by each class's support.
    weighted_f1 = f1_score(true, pred, average='weighted')
    print('f1 score: {:.3f}'.format(weighted_f1))

    return pred, true

In [None]:
print('Performance of Mean Word Vector on training dataset...')
_, _ = sk_evaluate(clf, train_X, train_y, label_names=None)

Performance of Mean Word Vector on training dataset...
Score on dataset...

Confusion Matrix:
 [[ 13904    388    173    472   1996]
 [  4852    956    328   1327   2124]
 [  2796   1326    627   4372   7253]
 [  1061    626    357   4566  40165]
 [   916    273    175   1372 127752]]

Classification Report:
               precision    recall  f1-score   support

         1.0       0.59      0.82      0.69     16933
         2.0       0.27      0.10      0.15      9587
         3.0       0.38      0.04      0.07     16374
         4.0       0.38      0.10      0.16     46775
         5.0       0.71      0.98      0.82    130488

    accuracy                           0.67    220157
   macro avg       0.47      0.41      0.38    220157
weighted avg       0.59      0.67      0.59    220157


accuracy: 0.671
f1 score: 0.586


## Tf-Idf Weighted Averaging Word Vector

In [None]:
# Inputs for main(): SGD classifier on the Tf-Idf weighted word-vector features.
model = sgd  # or choose logistic.
df = tfidf_doc_vec
concate = False  # False: main() trains on df alone and ignores concat_df
concat_df = dm_doc_vec

In [None]:
clf, train_X, valid_X, test_X, train_y, valid_y, test_y = main(model, 
                                                               df, 
                                                               concate=concate, 
                                                               concat_df=concat_df)

Shape of train_X: (220157, 200)
Shape of valid_X: (0, 0)
Shape of text_X: (55040, 200)


  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


-- Epoch 1
Norm: 3.37, NNZs: 200, Bias: -2.151490, T: 220157, Avg. loss: 0.787789
Total training time: 0.18 seconds.
-- Epoch 2
Norm: 1.76, NNZs: 200, Bias: -1.706451, T: 440314, Avg. loss: 0.234644
Total training time: 0.36 seconds.
-- Epoch 3
Norm: 1.45, NNZs: 200, Bias: -1.714692, T: 660471, Avg. loss: 0.200969
Total training time: 0.54 seconds.
-- Epoch 4
Norm: 1.27, NNZs: 200, Bias: -1.733585, T: 880628, Avg. loss: 0.187489
Total training time: 0.72 seconds.
-- Epoch 5
Norm: 1.15, NNZs: 200, Bias: -1.750487, T: 1100785, Avg. loss: 0.177228
Total training time: 0.90 seconds.
-- Epoch 6
Norm: 1.11, NNZs: 200, Bias: -1.792036, T: 1320942, Avg. loss: 0.171553
Total training time: 1.08 seconds.
-- Epoch 7
Norm: 0.98, NNZs: 200, Bias: -1.809486, T: 1541099, Avg. loss: 0.166453
Total training time: 1.25 seconds.
-- Epoch 8
Norm: 0.93, NNZs: 200, Bias: -1.832152, T: 1761256, Avg. loss: 0.162684
Total training time: 1.43 seconds.
-- Epoch 9
Norm: 0.89, NNZs: 200, Bias: -1.852896, T: 198141

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   39.2s finished


In [None]:
print('Performance of Tf-Idf Mean Word Vector on training dataset...')
_, _ = sk_evaluate(clf, train_X, train_y, label_names=None)

Performance of Tf-Idf Mean Word Vector on training dataset...
Score on dataset...

Confusion Matrix:
 [[ 12893    852    640    582   1966]
 [  4027   1318   1269    922   2051]
 [  2645   1333   2519   3147   6730]
 [  1602    633   1505   5757  37278]
 [  1779    433    636   3534 124106]]

Classification Report:
               precision    recall  f1-score   support

         1.0       0.56      0.76      0.65     16933
         2.0       0.29      0.14      0.19      9587
         3.0       0.38      0.15      0.22     16374
         4.0       0.41      0.12      0.19     46775
         5.0       0.72      0.95      0.82    130488

    accuracy                           0.67    220157
   macro avg       0.47      0.43      0.41    220157
weighted avg       0.60      0.67      0.60    220157


accuracy: 0.666
f1 score: 0.601


In [None]:
#test on testing data
print('Performance of Tf-Idf Mean Word Vector on testing dataset...')
_, _ = sk_evaluate(clf, test_X, test_y, label_names=None)

Performance of Tf-Idf Mean Word Vector on testing dataset...
Score on dataset...

Confusion Matrix:
 [[ 3193   209   152   155   525]
 [ 1019   315   342   229   492]
 [  670   323   626   788  1686]
 [  406   167   335  1415  9371]
 [  430    81   145   825 31141]]

Classification Report:
               precision    recall  f1-score   support

         1.0       0.56      0.75      0.64      4234
         2.0       0.29      0.13      0.18      2397
         3.0       0.39      0.15      0.22      4093
         4.0       0.41      0.12      0.19     11694
         5.0       0.72      0.95      0.82     32622

    accuracy                           0.67     55040
   macro avg       0.47      0.42      0.41     55040
weighted avg       0.60      0.67      0.60     55040


accuracy: 0.667
f1 score: 0.600


## PV-DM Doc2vec

In [None]:
# Inputs for main(): SGD classifier on the PV-DM Doc2vec document vectors.
model = sgd  # or choose logistic.
df = dm_doc_vec
concate = False  # False: main() trains on df alone and ignores concat_df
concat_df = dm_doc_vec

In [None]:
clf, train_X, valid_X, test_X, train_y, valid_y, test_y = main(model, 
                                                               df, 
                                                               concate=concate, 
                                                               concat_df=concat_df)

Shape of train_X: (220157, 100)
Shape of valid_X: (0, 0)
Shape of text_X: (55040, 100)


  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


-- Epoch 1
Norm: 1.63, NNZs: 100, Bias: -1.036919, T: 220157, Avg. loss: 0.188057
Total training time: 0.14 seconds.
-- Epoch 2
Norm: 1.48, NNZs: 100, Bias: -1.112054, T: 440314, Avg. loss: 0.153576
Total training time: 0.28 seconds.
-- Epoch 3
Norm: 1.49, NNZs: 100, Bias: -1.160924, T: 660471, Avg. loss: 0.151255
Total training time: 0.42 seconds.
-- Epoch 4
Norm: 1.40, NNZs: 100, Bias: -1.186569, T: 880628, Avg. loss: 0.149689
Total training time: 0.56 seconds.
-- Epoch 5
Norm: 1.37, NNZs: 100, Bias: -1.193784, T: 1100785, Avg. loss: 0.148869
Total training time: 0.70 seconds.
-- Epoch 6
Norm: 1.40, NNZs: 100, Bias: -1.221610, T: 1320942, Avg. loss: 0.148353
Total training time: 0.84 seconds.
-- Epoch 7
Norm: 1.36, NNZs: 100, Bias: -1.220718, T: 1541099, Avg. loss: 0.148038
Total training time: 0.98 seconds.
-- Epoch 8
Norm: 1.33, NNZs: 100, Bias: -1.229708, T: 1761256, Avg. loss: 0.147718
Total training time: 1.13 seconds.
-- Epoch 9
Norm: 1.34, NNZs: 100, Bias: -1.244879, T: 198141

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.7s finished


In [None]:
print('Performance of Doc2vec on training dataset...')
_, _ = sk_evaluate(clf, train_X, train_y, label_names=None)

Performance of Doc2vec on training dataset...
Score on dataset...

Confusion Matrix:
 [[  9974    542    433    396   5588]
 [  3578    951    841    676   3541]
 [  2405   1287   2245   2077   8360]
 [  1261    790   1970   2810  39944]
 [  1212    447    928   1336 126565]]

Classification Report:
               precision    recall  f1-score   support

         1.0       0.54      0.59      0.56     16933
         2.0       0.24      0.10      0.14      9587
         3.0       0.35      0.14      0.20     16374
         4.0       0.39      0.06      0.10     46775
         5.0       0.69      0.97      0.80    130488

    accuracy                           0.65    220157
   macro avg       0.44      0.37      0.36    220157
weighted avg       0.57      0.65      0.56    220157


accuracy: 0.647
f1 score: 0.563


## Tf-Idf and Doc2vec Concatenated Feature

### logistic

In [None]:
model = logistic  # or choose sgd.
df = tfidf_doc_vec
concate = True  # set to True.
concat_df = dm_doc_vec

In [None]:
clf, train_X, valid_X, test_X, train_y, valid_y, test_y = main(model, 
                                                               df, 
                                                               concate=concate, 
                                                               concat_df=concat_df)

Shape of train_X: (220157, 300)
Shape of valid_X: (0, 0)
Shape of text_X: (55040, 300)


  y = column_or_1d(y, warn=True)


In [None]:
print('Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on training dataset...')
_, _ = sk_evaluate(clf, train_X, train_y, label_names=None)

Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on training dataset...
Score on dataset...

Confusion Matrix:
 [[ 12949   1075    934    426   1549]
 [  3551   1647   2173    826   1390]
 [  1797    984   5008   4044   4541]
 [   796    237   2449  11037  32256]
 [   862    139    835   6796 121856]]

Classification Report:
               precision    recall  f1-score   support

         1.0       0.65      0.76      0.70     16933
         2.0       0.40      0.17      0.24      9587
         3.0       0.44      0.31      0.36     16374
         4.0       0.48      0.24      0.32     46775
         5.0       0.75      0.93      0.83    130488

    accuracy                           0.69    220157
   macro avg       0.54      0.48      0.49    220157
weighted avg       0.65      0.69      0.65    220157


accuracy: 0.693
f1 score: 0.653


In [None]:
#test on testing data
print('Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on testing dataset using logistic ...')
_, _ = sk_evaluate(clf, test_X, test_y, label_names=None)

Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on testing dataset using logistic ...
Score on dataset...

Confusion Matrix:
 [[ 3189   263   228   127   427]
 [  934   393   552   213   305]
 [  491   247  1198   994  1163]
 [  206    44   570  2729  8145]
 [  206    27   209  1637 30543]]

Classification Report:
               precision    recall  f1-score   support

         1.0       0.63      0.75      0.69      4234
         2.0       0.40      0.16      0.23      2397
         3.0       0.43      0.29      0.35      4093
         4.0       0.48      0.23      0.31     11694
         5.0       0.75      0.94      0.83     32622

    accuracy                           0.69     55040
   macro avg       0.54      0.48      0.48     55040
weighted avg       0.65      0.69      0.65     55040


accuracy: 0.691
f1 score: 0.650


### decision tree

In [None]:
# Inputs for main(): decision tree on Tf-Idf features with the Doc2vec columns appended.
model = DecisionTreeClassifier()  # scikit-learn defaults; no random_state is set
df = tfidf_doc_vec
concate = True  # True: main() appends the concat_df columns to df
concat_df = dm_doc_vec

In [None]:
clf, train_X, valid_X, test_X, train_y, valid_y, test_y = main(model, 
                                                               df, 
                                                               concate=concate, 
                                                               concat_df=concat_df)

Shape of train_X: (220157, 300)
Shape of valid_X: (0, 0)
Shape of text_X: (55040, 300)


In [None]:
print('Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on training dataset using decision tree...')
_, _ = sk_evaluate(clf, train_X, train_y, label_names=None)

Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on training dataset using decision tree...
Score on dataset...

Confusion Matrix:
 [[ 16933      0      0      0      0]
 [     0   9587      0      0      0]
 [     0      0  16374      0      0]
 [     0      0      0  46775      0]
 [     0      0      0      0 130488]]

Classification Report:
               precision    recall  f1-score   support

         1.0       1.00      1.00      1.00     16933
         2.0       1.00      1.00      1.00      9587
         3.0       1.00      1.00      1.00     16374
         4.0       1.00      1.00      1.00     46775
         5.0       1.00      1.00      1.00    130488

    accuracy                           1.00    220157
   macro avg       1.00      1.00      1.00    220157
weighted avg       1.00      1.00      1.00    220157


accuracy: 1.000
f1 score: 1.000


In [None]:
#test on testing data
print('Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on testing dataset using decision tree...')
_, _ = sk_evaluate(clf, test_X, test_y, label_names=None)

Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on testing dataset using decision tree...
Score on dataset...

Confusion Matrix:
 [[ 1904   609   557   494   670]
 [  571   421   463   422   520]
 [  517   435   756  1056  1329]
 [  470   399  1059  3292  6474]
 [  685   556  1471  6742 23168]]

Classification Report:
               precision    recall  f1-score   support

         1.0       0.46      0.45      0.45      4234
         2.0       0.17      0.18      0.17      2397
         3.0       0.18      0.18      0.18      4093
         4.0       0.27      0.28      0.28     11694
         5.0       0.72      0.71      0.72     32622

    accuracy                           0.54     55040
   macro avg       0.36      0.36      0.36     55040
weighted avg       0.54      0.54      0.54     55040


accuracy: 0.537
f1 score: 0.539


### random forest

In [None]:
# Inputs for main(): random forest on Tf-Idf features with the Doc2vec columns appended.
model = RandomForestClassifier()  # scikit-learn defaults; no random_state is set
df = tfidf_doc_vec
concate = True  # True: main() appends the concat_df columns to df
concat_df = dm_doc_vec

In [None]:
clf, train_X, valid_X, test_X, train_y, valid_y, test_y = main(model, 
                                                               df, 
                                                               concate=concate, 
                                                               concat_df=concat_df)

Shape of train_X: (220157, 300)
Shape of valid_X: (0, 0)
Shape of text_X: (55040, 300)




In [None]:
print('Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on training dataset using RandomForestClassifier ...')
_, _ = sk_evaluate(clf, train_X, train_y, label_names=None)

Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on training dataset using RandomForestClassifier ...
Score on dataset...

Confusion Matrix:
 [[ 16933      0      0      0      0]
 [     0   9587      0      0      0]
 [     0      0  16374      0      0]
 [     0      0      0  46774      1]
 [     0      0      0      0 130488]]

Classification Report:
               precision    recall  f1-score   support

         1.0       1.00      1.00      1.00     16933
         2.0       1.00      1.00      1.00      9587
         3.0       1.00      1.00      1.00     16374
         4.0       1.00      1.00      1.00     46775
         5.0       1.00      1.00      1.00    130488

    accuracy                           1.00    220157
   macro avg       1.00      1.00      1.00    220157
weighted avg       1.00      1.00      1.00    220157


accuracy: 1.000
f1 score: 1.000


In [None]:
#test on testing data
print('Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on testing dataset using RandomForestClassifier ...')
_, _ = sk_evaluate(clf, test_X, test_y, label_names=None)

Performance of Tf-Idf Mean Word Vector and Doc2vec Combined on testing dataset using RandomForestClassifier ...
Score on dataset...

Confusion Matrix:
 [[ 2930    43   130   163   968]
 [  947    78   264   290   818]
 [  533    68   421   807  2264]
 [  218    14   141  1210 10111]
 [  198     4    32   602 31786]]

Classification Report:
               precision    recall  f1-score   support

         1.0       0.61      0.69      0.65      4234
         2.0       0.38      0.03      0.06      2397
         3.0       0.43      0.10      0.17      4093
         4.0       0.39      0.10      0.16     11694
         5.0       0.69      0.97      0.81     32622

    accuracy                           0.66     55040
   macro avg       0.50      0.38      0.37     55040
weighted avg       0.59      0.66      0.58     55040


accuracy: 0.662
f1 score: 0.579


# DAO_TEST

In [None]:
import time
import datetime

#import cPickle as pickle
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pylab
import re
import scipy as sp
import seaborn

from gensim import corpora, models
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
#from sklearn.lda import LDA
#from sklearn.qda import QDA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

plt.rc('figure', figsize=(10,6))
seaborn.set()
colors = seaborn.color_palette()

In [None]:
label_keys =[1, 2, 3, 4, 5]

In [None]:
df = tfidf_doc_vec
concate = True  # set to True.
concat_df = dm_doc_vec

In [None]:
        # Append the concat_df columns only when the `concate` flag asks for it;
        # without the guard the flag set in the configuration cell has no effect.
        # Not idempotent: re-running this cell without resetting `df` appends the
        # columns a second time.
        if concate:
            df = pd.concat([df, concat_df], axis=1, ignore_index=True)
 

In [None]:
   # Row counts for an 80/20 train/test split; the validation share is zero here.
(train_size, valid_size, test_size) = split_size(df, train=0.8, valid=0.)
# Stratified hold-out split with a fixed seed, so every run sees the same rows.
(train_X, test_X, train_y, test_y) = train_test_split(
    df,
    target_labels,
    stratify=target_labels,
    random_state=1,
    test_size=test_size,
)

In [None]:
# DecisionTreeClassifier is not part of this section's import cell; import it
# here so the cell also runs when started from this section.
from sklearn.tree import DecisionTreeClassifier

# Classifiers to compare, paired by position with their display names.
clfs = [RandomForestClassifier(), LogisticRegression(), DecisionTreeClassifier()]
clf_names = ['Random Forest', 'Logistic Regression', 'Decision Tree']

# Labels read back from CSV are single-column frames; scikit-learn expects 1-D
# targets, so flatten once up front.
fit_y = np.ravel(train_y)
eval_y = np.ravel(test_y)

NBResults = {}
for clf_name, clf_ in zip(clf_names, clfs):
    clf = clf_.fit(train_X, fit_y)
    preds = clf.predict(test_X)

    # Support-weighted averages: with average='micro' on a single-label
    # multi-class task, precision, recall and F1 all collapse to accuracy.
    # (Weighted recall still equals accuracy by definition.)
    precision = metrics.precision_score(eval_y, preds, average='weighted')
    recall = metrics.recall_score(eval_y, preds, average='weighted')
    f1 = metrics.f1_score(eval_y, preds, average='weighted')
    accuracy = accuracy_score(eval_y, preds)
    report = classification_report(eval_y, preds)
    matrix = metrics.confusion_matrix(eval_y, preds, labels=label_keys)

    NBResults[clf_name] = {'precision': precision,
                           'recall': recall,
                           'f1_score': f1,
                           'accuracy': accuracy,
                           'clf_report': report,
                           'clf_matrix': matrix,
                           'y_predicted': preds}

# Scalar scores only: one column per classifier, one row per metric.
cols = ['precision', 'recall', 'f1_score', 'accuracy']
pd.DataFrame(NBResults).T[cols].T

  
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,Random Forest,Logistic Regression,Decision Tree
precision,0.662028,0.691043,0.532086
recall,0.662028,0.691043,0.532086
f1_score,0.662028,0.691043,0.532086
accuracy,0.662028,0.691043,0.532086


In [None]:
#concat
for model, val in NBResults.items():
    print ('-------'+'-'*len(model))
    print ('MODEL:', model)
    print ('-------'+'-'*len(model))
    print ('The precision for this classifier is ' + str(val['precision']))
    print ('The recall for this classifier is    ' + str(val['recall']))
    print ('The f1 for this classifier is        ' + str(val['f1_score']))
    print ('The accuracy for this classifier is  ' + str(val['accuracy']))
    print ('Here is the classification report:')
    print (val['clf_report'])

In [None]:
#pv-dm
for model, val in NBResults.items():
    print ('-------'+'-'*len(model))
    print ('MODEL:', model)
    print ('-------'+'-'*len(model))
    print ('The precision for this classifier is ' + str(val['precision']))
    print ('The recall for this classifier is    ' + str(val['recall']))
    print ('The f1 for this classifier is        ' + str(val['f1_score']))
    print ('The accuracy for this classifier is  ' + str(val['accuracy']))
    print ('Here is the classification report:')
    print (val['clf_report'])

--------------------
MODEL: Random Forest
--------------------
The precision for this classifier is 0.5970748546511628
The recall for this classifier is    0.5970748546511628
The f1 for this classifier is        0.5970748546511628
The accuracy for this classifier is  0.5970748546511628
Here is the classification report:
              precision    recall  f1-score   support

         1.0       0.77      0.03      0.06      4234
         2.0       0.00      0.00      0.00      2397
         3.0       0.46      0.00      0.01      4093
         4.0       0.32      0.03      0.05     11694
         5.0       0.60      0.99      0.75     32622

    accuracy                           0.60     55040
   macro avg       0.43      0.21      0.17     55040
weighted avg       0.52      0.60      0.46     55040

--------------------------
MODEL: Logistic Regression
--------------------------
The precision for this classifier is 0.6634629360465116
The recall for this classifier is    0.6634629360465

In [None]:
#tf-idf
for model, val in NBResults.items():
    print ('-------'+'-'*len(model))
    print ('MODEL:', model)
    print ('-------'+'-'*len(model))
    print ('The precision for this classifier is ' + str(val['precision']))
    print ('The recall for this classifier is    ' + str(val['recall']))
    print ('The f1 for this classifier is        ' + str(val['f1_score']))
    print ('The accuracy for this classifier is  ' + str(val['accuracy']))
    print ('Here is the classification report:')
    print (val['clf_report'])

--------------------
MODEL: Random Forest
--------------------
The precision for this classifier is 0.663953488372093
The recall for this classifier is    0.663953488372093
The f1 for this classifier is        0.663953488372093
The accuracy for this classifier is  0.663953488372093
Here is the classification report:
              precision    recall  f1-score   support

         1.0       0.60      0.70      0.64      4234
         2.0       0.42      0.05      0.08      2397
         3.0       0.44      0.12      0.19      4093
         4.0       0.40      0.12      0.18     11694
         5.0       0.70      0.97      0.81     32622

    accuracy                           0.66     55040
   macro avg       0.51      0.39      0.38     55040
weighted avg       0.60      0.66      0.59     55040

--------------------------
MODEL: Logistic Regression
--------------------------
The precision for this classifier is 0.6855922965116279
The recall for this classifier is    0.6855922965116279


In [None]:
#doc_vec
for model, val in NBResults.items():
    print ('-------'+'-'*len(model))
    print ('MODEL:', model)
    print ('-------'+'-'*len(model))
    print ('The precision for this classifier is ' + str(val['precision']))
    print ('The recall for this classifier is    ' + str(val['recall']))
    print ('The f1 for this classifier is        ' + str(val['f1_score']))
    print ('The accuracy for this classifier is  ' + str(val['accuracy']))
    print ('Here is the classification report:')
    print (val['clf_report'])

--------------------
MODEL: Random Forest
--------------------
The precision for this classifier is 0.6684411337209303
The recall for this classifier is    0.6684411337209303
The f1 for this classifier is        0.6684411337209303
The accuracy for this classifier is  0.6684411337209303
Here is the classification report:
              precision    recall  f1-score   support

         1.0       0.61      0.72      0.66      4234
         2.0       0.34      0.05      0.08      2397
         3.0       0.42      0.13      0.20      4093
         4.0       0.41      0.14      0.20     11694
         5.0       0.71      0.96      0.82     32622

    accuracy                           0.67     55040
   macro avg       0.50      0.40      0.39     55040
weighted avg       0.60      0.67      0.60     55040

--------------------------
MODEL: Logistic Regression
--------------------------
The precision for this classifier is 0.6940406976744186
The recall for this classifier is    0.6940406976744

In [None]:
# Feature selection for this run: Tf-Idf weighted word vectors as the base
# matrix, with the PV-DM Doc2vec vectors as the optional extra columns.
df = tfidf_doc_vec
concate = False  # set to True to request concatenation with concat_df
concat_df = dm_doc_vec

In [None]:
    # Append the concat_df columns only when the `concate` flag asks for it;
    # without the guard the flag set in the configuration cell (False here)
    # has no effect and the Doc2vec columns are appended anyway.
    # Not idempotent: re-running this cell without resetting `df` appends the
    # columns a second time.
    if concate:
        df = pd.concat([df, concat_df], axis=1, ignore_index=True)

In [None]:
  # Row counts for an 80/20 train/test split; the validation share is zero here.
(train_size, valid_size, test_size) = split_size(df, train=0.8, valid=0.)
# Stratified hold-out split with a fixed seed, so every run sees the same rows.
(train_X, test_X, train_y, test_y) = train_test_split(
    df,
    target_labels,
    stratify=target_labels,
    random_state=1,
    test_size=test_size,
)

In [None]:
# DecisionTreeClassifier is not part of this section's import cell; import it
# here so the cell also runs when started from this section.
from sklearn.tree import DecisionTreeClassifier

# Classifiers to compare, paired by position with their display names.
clfs = [RandomForestClassifier(), LogisticRegression(), DecisionTreeClassifier()]
clf_names = ['Random Forest', 'Logistic Regression', 'Decision Tree']

# Labels read back from CSV are single-column frames; scikit-learn expects 1-D
# targets, so flatten once up front.
fit_y = np.ravel(train_y)
eval_y = np.ravel(test_y)

NBResults = {}
for clf_name, clf_ in zip(clf_names, clfs):
    clf = clf_.fit(train_X, fit_y)
    preds = clf.predict(test_X)

    # Support-weighted averages: with average='micro' on a single-label
    # multi-class task, precision, recall and F1 all collapse to accuracy.
    # (Weighted recall still equals accuracy by definition.)
    precision = metrics.precision_score(eval_y, preds, average='weighted')
    recall = metrics.recall_score(eval_y, preds, average='weighted')
    f1 = metrics.f1_score(eval_y, preds, average='weighted')
    accuracy = accuracy_score(eval_y, preds)
    report = classification_report(eval_y, preds)
    matrix = metrics.confusion_matrix(eval_y, preds, labels=label_keys)

    NBResults[clf_name] = {'precision': precision,
                           'recall': recall,
                           'f1_score': f1,
                           'accuracy': accuracy,
                           'clf_report': report,
                           'clf_matrix': matrix,
                           'y_predicted': preds}

# Scalar scores only: one column per classifier, one row per metric.
cols = ['precision', 'recall', 'f1_score', 'accuracy']
pd.DataFrame(NBResults).T[cols].T

  


NameError: ignored

In [None]:
# Print the stored scores and classification report of every classifier in
# NBResults. The loop variable is `clf_name`, not `model`, so the notebook-level
# `model` estimator is not overwritten with a string.
for clf_name, scores in NBResults.items():
    banner = '-------' + '-' * len(clf_name)
    print(banner)
    print('MODEL:', clf_name)
    print(banner)
    print('The precision for this classifier is ' + str(scores['precision']))
    print('The recall for this classifier is    ' + str(scores['recall']))
    print('The f1 for this classifier is        ' + str(scores['f1_score']))
    print('The accuracy for this classifier is  ' + str(scores['accuracy']))
    print('Here is the classification report:')
    print(scores['clf_report'])

--------------------
MODEL: Random Forest
--------------------
The precision for this classifier is 0.6639898255813953
The recall for this classifier is    0.6639898255813953
The f1 for this classifier is        0.6639898255813953
The accuracy for this classifier is  0.6639898255813953
Here is the classification report:
              precision    recall  f1-score   support

         1.0       0.61      0.70      0.65      4234
         2.0       0.42      0.05      0.09      2397
         3.0       0.43      0.12      0.19      4093
         4.0       0.39      0.12      0.19     11694
         5.0       0.70      0.97      0.81     32622

    accuracy                           0.66     55040
   macro avg       0.51      0.39      0.38     55040
weighted avg       0.59      0.66      0.59     55040

--------------------------
MODEL: Logistic Regression
--------------------------
The precision for this classifier is 0.6855922965116279
The recall for this classifier is    0.6855922965116

In [None]:
# For doc_vec
# Print each classifier's summary scores followed by its per-class report.
# The loop targets are deliberately not named `model`: a top-level loop target
# stays bound after the loop, and `model` is a notebook-global used by other cells.
for clf_name, clf_scores in NBResults.items():
    banner = '-' * (7 + len(clf_name))  # seven dashes plus one per character of the name
    print(banner)
    print('MODEL:', clf_name)
    print(banner)
    print('The precision for this classifier is ' + str(clf_scores['precision']))
    print('The recall for this classifier is    ' + str(clf_scores['recall']))
    print('The f1 for this classifier is        ' + str(clf_scores['f1_score']))
    print('The accuracy for this classifier is  ' + str(clf_scores['accuracy']))
    print('Here is the classification report:')
    print(clf_scores['clf_report'])

# DAO_TEST_GLOVE

In [None]:
# Inputs for this experiment (all three objects are built by earlier cells):
#   concat_df - second set of document vectors
#   concate   - concatenation flag
#   df        - base feature matrix (presumably TF-IDF-weighted GloVe vectors; verify)
concat_df = dm_doc_vec
concate = True  # set to True.
df = tfidf_doc_vec_Glove

In [None]:
   # Append the extra document vectors column-wise only when the `concate` switch is on;
   # ignore_index=True renumbers the resulting columns 0..n-1.
   # NOTE(review): `df` is rebuilt from itself, so running this cell twice appends
   # `concat_df` twice -- re-run the cell that assigns `df` before re-running this one.
   if concate:
       df = pd.concat([df, concat_df], axis=1, ignore_index=True)

In [None]:
 # Specify train/valid/test size.
# split_size is defined elsewhere in the notebook; only its third return value is used below.
train_size, valid_size, test_size = split_size(df, train=0.8, valid=0.)  # no need to use valid dataset here
# Seeded, label-stratified hold-out split of the feature matrix and the targets.
train_X, test_X, train_y, test_y = train_test_split(
    df,
    target_labels,
    test_size=test_size,
    random_state=1,
    stratify=target_labels,
)

In [None]:
# Candidate classifiers, all fitted and scored on the same train/test split.
# - random_state pins the tree-based models so re-running the cell reproduces the table.
# - max_iter is raised because the default budget stopped before convergence
#   ("STOP: TOTAL NO. of ITERATIONS REACHED LIMIT").
clf_names = ['Random Forest', 'Logistic Regression', 'Decision Tree']
clfs = [RandomForestClassifier(random_state=1),
        LogisticRegression(max_iter=1000),
        DecisionTreeClassifier(random_state=1)]

# Estimators expect a 1-d target; flattening avoids the column-vector
# DataConversionWarning ("y = column_or_1d(y, warn=True)").
train_y_flat = np.ravel(train_y)

NBResults = {}
for clf_name, clf_ in zip(clf_names, clfs):
    clf = clf_.fit(train_X, train_y_flat)
    preds = clf.predict(test_X)

    # Macro averaging weights every class equally. The previous micro averages are
    # mathematically identical to accuracy for single-label multiclass targets,
    # which made the four rows of the summary table carry the same number.
    precision = metrics.precision_score(test_y, preds, average='macro')
    recall = metrics.recall_score(test_y, preds, average='macro')
    f1 = metrics.f1_score(test_y, preds, average='macro')
    accuracy = accuracy_score(test_y, preds)
    report = classification_report(test_y, preds)
    matrix = metrics.confusion_matrix(test_y, preds, labels=label_keys)

    data = {'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'accuracy': accuracy,
            'clf_report': report,
            'clf_matrix': matrix,
            'y_predicted': preds}

    NBResults[clf_name] = data

# Summary table: one column per classifier, one row per scalar score.
cols = ['precision', 'recall', 'f1_score', 'accuracy']
pd.DataFrame(NBResults).loc[cols]

  
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,Random Forest,Logistic Regression,Decision Tree
precision,0.628797,0.680378,0.494113
recall,0.628797,0.680378,0.494113
f1_score,0.628797,0.680378,0.494113
accuracy,0.628797,0.680378,0.494113


In [None]:
# Print each classifier's summary scores followed by its per-class report.
# The loop targets are deliberately not named `model`: a top-level loop target
# stays bound after the loop, and `model` is a notebook-global used by other cells.
for clf_name, clf_scores in NBResults.items():
    banner = '-' * (7 + len(clf_name))  # seven dashes plus one per character of the name
    print(banner)
    print('MODEL:', clf_name)
    print(banner)
    print('The precision for this classifier is ' + str(clf_scores['precision']))
    print('The recall for this classifier is    ' + str(clf_scores['recall']))
    print('The f1 for this classifier is        ' + str(clf_scores['f1_score']))
    print('The accuracy for this classifier is  ' + str(clf_scores['accuracy']))
    print('Here is the classification report:')
    print(clf_scores['clf_report'])

--------------------
MODEL: Random Forest
--------------------
The precision for this classifier is 0.628797238372093
The recall for this classifier is    0.628797238372093
The f1 for this classifier is        0.628797238372093
The accuracy for this classifier is  0.628797238372093
Here is the classification report:
              precision    recall  f1-score   support

         1.0       0.66      0.45      0.54      4234
         2.0       0.61      0.00      0.01      2397
         3.0       0.54      0.01      0.03      4093
         4.0       0.31      0.03      0.06     11694
         5.0       0.64      0.99      0.77     32622

    accuracy                           0.63     55040
   macro avg       0.55      0.30      0.28     55040
weighted avg       0.56      0.63      0.52     55040

--------------------------
MODEL: Logistic Regression
--------------------------
The precision for this classifier is 0.6803779069767442
The recall for this classifier is    0.6803779069767442


In [None]:
# Feature matrix for this run: tfidf_doc_vec_Glove on its own (no concatenation).
df = tfidf_doc_vec_Glove

# split_size is defined elsewhere in the notebook; only its third return value is used below.
train_size, valid_size, test_size = split_size(df, train=0.8, valid=0.)  # no need to use valid dataset here
# Seeded, label-stratified hold-out split of the feature matrix and the targets.
train_X, test_X, train_y, test_y = train_test_split(df,
                                                    target_labels,
                                                    test_size=test_size,
                                                    random_state=1,
                                                    stratify=target_labels)

# Candidate classifiers, all fitted and scored on the same train/test split.
# - random_state pins the tree-based models so re-running the cell reproduces the table.
# - max_iter is raised because the default budget stopped before convergence
#   ("STOP: TOTAL NO. of ITERATIONS REACHED LIMIT").
clf_names = ['Random Forest', 'Logistic Regression', 'Decision Tree']
clfs = [RandomForestClassifier(random_state=1),
        LogisticRegression(max_iter=1000),
        DecisionTreeClassifier(random_state=1)]

# Estimators expect a 1-d target; flattening avoids the column-vector
# DataConversionWarning ("y = column_or_1d(y, warn=True)").
train_y_flat = np.ravel(train_y)

NBResults = {}
for clf_name, clf_ in zip(clf_names, clfs):
    clf = clf_.fit(train_X, train_y_flat)
    preds = clf.predict(test_X)

    # Macro averaging weights every class equally. The previous micro averages are
    # mathematically identical to accuracy for single-label multiclass targets,
    # which made the four rows of the summary table carry the same number.
    precision = metrics.precision_score(test_y, preds, average='macro')
    recall = metrics.recall_score(test_y, preds, average='macro')
    f1 = metrics.f1_score(test_y, preds, average='macro')
    accuracy = accuracy_score(test_y, preds)
    report = classification_report(test_y, preds)
    matrix = metrics.confusion_matrix(test_y, preds, labels=label_keys)

    data = {'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'accuracy': accuracy,
            'clf_report': report,
            'clf_matrix': matrix,
            'y_predicted': preds}

    NBResults[clf_name] = data

# Summary table: one column per classifier, one row per scalar score.
cols = ['precision', 'recall', 'f1_score', 'accuracy']
pd.DataFrame(NBResults).loc[cols]


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,Random Forest,Logistic Regression,Decision Tree
precision,0.630051,0.659502,0.49068
recall,0.630051,0.659502,0.49068
f1_score,0.630051,0.659502,0.49068
accuracy,0.630051,0.659502,0.49068


In [None]:
# Print each classifier's summary scores followed by its per-class report.
# The loop targets are deliberately not named `model`: a top-level loop target
# stays bound after the loop, and `model` is a notebook-global used by other cells.
for clf_name, clf_scores in NBResults.items():
    banner = '-' * (7 + len(clf_name))  # seven dashes plus one per character of the name
    print(banner)
    print('MODEL:', clf_name)
    print(banner)
    print('The precision for this classifier is ' + str(clf_scores['precision']))
    print('The recall for this classifier is    ' + str(clf_scores['recall']))
    print('The f1 for this classifier is        ' + str(clf_scores['f1_score']))
    print('The accuracy for this classifier is  ' + str(clf_scores['accuracy']))
    print('Here is the classification report:')
    print(clf_scores['clf_report'])

--------------------
MODEL: Random Forest
--------------------
The precision for this classifier is 0.6300508720930232
The recall for this classifier is    0.6300508720930232
The f1 for this classifier is        0.6300508720930232
The accuracy for this classifier is  0.6300508720930232
Here is the classification report:
              precision    recall  f1-score   support

         1.0       0.63      0.48      0.55      4234
         2.0       0.33      0.00      0.01      2397
         3.0       0.47      0.02      0.03      4093
         4.0       0.32      0.04      0.08     11694
         5.0       0.64      0.98      0.78     32622

    accuracy                           0.63     55040
   macro avg       0.48      0.31      0.29     55040
weighted avg       0.55      0.63      0.52     55040

--------------------------
MODEL: Logistic Regression
--------------------------
The precision for this classifier is 0.6595021802325581
The recall for this classifier is    0.6595021802325

In [None]:
# Feature matrix for this run: doc_vec_Glove on its own (no concatenation).
df = doc_vec_Glove

# split_size is defined elsewhere in the notebook; only its third return value is used below.
train_size, valid_size, test_size = split_size(df, train=0.8, valid=0.)  # no need to use valid dataset here
# Seeded, label-stratified hold-out split of the feature matrix and the targets.
train_X, test_X, train_y, test_y = train_test_split(df,
                                                    target_labels,
                                                    test_size=test_size,
                                                    random_state=1,
                                                    stratify=target_labels)

# Candidate classifiers, all fitted and scored on the same train/test split.
# - random_state pins the tree-based models so re-running the cell reproduces the table.
# - max_iter is raised because the default budget stopped before convergence
#   ("STOP: TOTAL NO. of ITERATIONS REACHED LIMIT").
clf_names = ['Random Forest', 'Logistic Regression', 'Decision Tree']
clfs = [RandomForestClassifier(random_state=1),
        LogisticRegression(max_iter=1000),
        DecisionTreeClassifier(random_state=1)]

# Estimators expect a 1-d target; flattening avoids the column-vector
# DataConversionWarning ("y = column_or_1d(y, warn=True)").
train_y_flat = np.ravel(train_y)

NBResults = {}
for clf_name, clf_ in zip(clf_names, clfs):
    clf = clf_.fit(train_X, train_y_flat)
    preds = clf.predict(test_X)

    # Macro averaging weights every class equally. The previous micro averages are
    # mathematically identical to accuracy for single-label multiclass targets,
    # which made the four rows of the summary table carry the same number.
    precision = metrics.precision_score(test_y, preds, average='macro')
    recall = metrics.recall_score(test_y, preds, average='macro')
    f1 = metrics.f1_score(test_y, preds, average='macro')
    accuracy = accuracy_score(test_y, preds)
    report = classification_report(test_y, preds)
    matrix = metrics.confusion_matrix(test_y, preds, labels=label_keys)

    data = {'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'accuracy': accuracy,
            'clf_report': report,
            'clf_matrix': matrix,
            'y_predicted': preds}

    NBResults[clf_name] = data

# Summary table: one column per classifier, one row per scalar score.
cols = ['precision', 'recall', 'f1_score', 'accuracy']
pd.DataFrame(NBResults).loc[cols]

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,Random Forest,Logistic Regression,Decision Tree
precision,0.641588,0.671185,0.507376
recall,0.641588,0.671185,0.507376
f1_score,0.641588,0.671185,0.507376
accuracy,0.641588,0.671185,0.507376


In [None]:
# Print each classifier's summary scores followed by its per-class report.
# The loop targets are deliberately not named `model`: a top-level loop target
# stays bound after the loop, and `model` is a notebook-global used by other cells.
for clf_name, clf_scores in NBResults.items():
    banner = '-' * (7 + len(clf_name))  # seven dashes plus one per character of the name
    print(banner)
    print('MODEL:', clf_name)
    print(banner)
    print('The precision for this classifier is ' + str(clf_scores['precision']))
    print('The recall for this classifier is    ' + str(clf_scores['recall']))
    print('The f1 for this classifier is        ' + str(clf_scores['f1_score']))
    print('The accuracy for this classifier is  ' + str(clf_scores['accuracy']))
    print('Here is the classification report:')
    print(clf_scores['clf_report'])

--------------------
MODEL: Random Forest
--------------------
The precision for this classifier is 0.6415879360465117
The recall for this classifier is    0.6415879360465117
The f1 for this classifier is        0.6415879360465117
The accuracy for this classifier is  0.6415879360465117
Here is the classification report:
              precision    recall  f1-score   support

         1.0       0.63      0.58      0.60      4234
         2.0       0.33      0.01      0.01      2397
         3.0       0.50      0.03      0.05      4093
         4.0       0.36      0.06      0.10     11694
         5.0       0.65      0.98      0.79     32622

    accuracy                           0.64     55040
   macro avg       0.50      0.33      0.31     55040
weighted avg       0.57      0.64      0.54     55040

--------------------------
MODEL: Logistic Regression
--------------------------
The precision for this classifier is 0.6711845930232558
The recall for this classifier is    0.6711845930232

# RNN

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding

# LSTM classifier: masking -> LSTM -> dense -> dropout -> 5-way softmax.
model = Sequential()

# NOTE(review): no Embedding layer is added here, so the network presumably receives
# sequences that are already embedded -- confirm against the shape of the training input.

# Masking layer for pre-trained embeddings: time steps whose values all equal 0.0 are skipped
model.add(Masking(mask_value=0.0))

# Recurrent layer; return_sequences=False keeps only the final state
model.add(LSTM(64, return_sequences=False,
               dropout=0.1, recurrent_dropout=0.1))

# Fully connected layer
model.add(Dense(64, activation='relu'))

# Dropout for regularization
model.add(Dropout(0.5))

# Output layer: five units (presumably one per star rating -- confirm against the label encoding).
# softmax replaces the previous sigmoid so that the outputs form a probability
# distribution over mutually exclusive classes, as categorical_crossentropy expects.
model.add(Dense(5, activation='softmax'))

# Compile the model
model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Create callbacks
# Stop once val_loss has failed to improve for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
# Keep only the best checkpoint; save_weights_only=False stores the full model,
# not just the weights, despite the file name.
best_checkpoint = ModelCheckpoint(filepath='yelp_lstm_gru_weights.hdf5',
                                  save_best_only=True,
                                  save_weights_only=False)
callbacks = [early_stopping, best_checkpoint]

In [None]:
# Fit the network; the validation pair is what the val_loss-based callbacks monitor.
# X_train / y_train / X_valid / y_valid must already exist from earlier cells.
fit_options = dict(batch_size=2048,
                   epochs=150,
                   callbacks=callbacks,
                   validation_data=(X_valid, y_valid))
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Structural summary (index, column count, dtypes, memory) printed first,
# then the first five rows as the cell's displayed value.
tfidf_doc_vec.info()
tfidf_doc_vec.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102442 entries, 0 to 102441
Columns: 200 entries, 0 to 199
dtypes: float64(200)
memory usage: 156.3 MB


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,4.241534,-0.346359,-3.430429,1.740263,-0.881717,0.311229,-1.979204,-0.181652,-0.644655,-1.254866,-0.764523,-1.593858,-0.710255,5.791879,6.53138,-0.094709,4.021795,3.892041,-2.560342,0.974002,-3.033901,-5.886605,-1.866182,-4.86161,-0.216919,-1.368133,-5.397421,-1.443106,1.039358,2.114127,-5.281143,4.021247,3.425811,0.292956,0.945736,0.002539,0.709782,-2.012698,2.450704,-3.295984,...,2.739785,-1.350479,4.276579,-5.200751,-1.74818,-2.766015,-1.576548,-2.170029,-2.850688,1.789694,-0.243569,0.094091,-2.139257,3.867438,-1.34551,-1.114932,4.069699,1.102934,-3.819638,-1.040834,-1.044949,-1.570153,-1.82744,-0.271423,3.345941,1.119662,-2.546671,2.289669,-1.249396,4.977643,-5.24302,5.681765,0.641808,5.069246,-0.921466,-1.215854,-3.658603,-0.201084,2.376911,-2.882553
1,-4.256276,7.571361,6.65878,-4.826914,2.247503,-8.215537,2.946768,6.347083,0.474824,-5.031849,4.728989,-1.718058,4.690131,2.545589,0.81382,-0.709955,1.880499,0.800798,-2.287667,-1.95516,0.674691,-5.252521,0.695431,4.10746,-4.718019,1.965088,5.1048,3.402157,-0.837587,2.0111,2.745435,-3.664337,-3.892045,-7.367734,-3.790738,-2.515052,-5.052354,4.241956,-0.099118,2.694211,...,-1.071032,4.440019,-0.080174,0.850033,5.1721,6.572573,1.007506,-1.624678,-1.139039,4.209938,6.290622,-3.402455,-3.605804,-6.764954,2.966686,-3.687046,-1.158803,1.585174,1.999677,-1.591179,2.93365,-1.906195,3.296059,-2.718283,-1.080722,-2.002822,-0.129199,-1.778279,-3.614841,-6.437204,1.577998,4.989604,3.143239,0.496878,5.884332,-2.10894,1.566612,-3.506336,-5.213051,-2.940182
2,-1.655971,2.690683,-0.239536,-1.089188,1.03021,1.825364,1.06476,2.333301,0.073958,-1.068034,1.397268,0.406684,-1.991326,1.66988,0.834133,-0.736294,-3.435335,-0.99051,-0.073842,-0.899898,-3.570755,0.048199,-1.467797,1.780982,-1.141539,-2.411931,1.42555,0.820652,0.43051,1.073295,0.629742,1.864826,1.177647,-1.807969,-0.79446,2.09776,2.439232,2.420574,3.230696,0.632595,...,1.843291,2.876377,-1.803974,-1.793558,-0.390911,-1.739682,-3.015334,0.399557,-2.420814,2.205678,-2.252358,0.618523,2.63501,2.740447,2.677685,-6.293159,-1.923197,1.295902,-0.751943,-4.281597,-2.643601,-0.853006,5.780793,0.947092,0.206086,-1.130668,0.023397,1.084631,2.764695,-1.054006,-2.653031,1.76226,0.672126,1.644587,2.107002,3.911186,2.789545,-4.561905,-0.893687,-3.496453
3,-2.03675,-4.824581,4.206439,-2.848737,1.616262,-3.578638,-1.332959,-1.482494,1.21781,-0.974078,-1.421062,-2.796275,-1.916879,1.660348,-1.376542,-2.248796,-1.713045,0.813905,-0.667315,-2.356043,0.97927,-0.081409,-1.11676,1.621463,-1.32463,-0.408252,4.920693,-0.52533,0.363341,-0.31972,-3.637743,-0.82032,3.643006,2.674333,-3.114033,-3.036408,2.789954,-2.692105,1.084654,3.1675,...,4.865672,-2.302689,4.509201,2.358053,-1.162396,1.977106,0.350616,0.651786,2.509765,3.054227,2.250558,1.893524,-0.308948,0.329153,-1.069556,7.370651,0.400089,2.070981,0.203354,1.388923,0.570043,-0.057124,4.142462,0.414542,-0.128299,-1.227175,0.095076,0.577437,-3.428213,-0.73198,2.572657,3.157101,-3.1243,-2.856102,-2.432476,1.572566,1.261692,-5.538431,-1.663208,5.69881
4,-3.407166,-1.889094,3.155638,-0.763353,0.410485,-1.845323,-4.484404,-3.674747,0.105887,2.348912,3.278773,-1.409052,-2.402828,0.514554,0.610011,-1.077408,-1.824826,-0.865337,-0.888725,-2.125015,-1.113216,-1.629092,-1.249926,-2.004465,-0.209894,2.297168,3.067438,-3.503512,1.606484,-0.844252,1.13046,2.512684,0.436818,2.080024,-6.421183,-4.124078,1.182407,0.216152,2.032705,4.664596,...,7.615785,-0.463298,2.431035,2.073502,3.424167,3.15025,2.360209,1.179626,-0.517552,-0.493751,3.996435,1.174502,-2.608114,1.326632,-0.256042,3.152467,-2.050098,-3.396377,2.757633,1.831535,-0.138438,-0.902062,-0.626832,0.1756,1.505481,-1.170212,0.986665,-0.46688,5.122849,4.613226,-0.041371,3.270757,-5.278025,-1.001384,2.151574,-0.15919,-1.968791,-0.348048,1.632455,4.105619


In [None]:
# Arguments for the main(...) call in the next cell.
# NOTE(review): the self-assignment below is a no-op and only succeeds if `model` is
# already bound by an earlier cell. The most recent binding visible above is the Keras
# Sequential network -- confirm that this is what main() expects (the trailing comment
# hints at a choice such as "sgd").
model = model  # or choose sgd.
# Base feature matrix handed to main().
df = tfidf_doc_vec
# Forwarded to main() as its `concate` / `concat_df` keyword arguments.
concate = True  # set to True.
concat_df = dm_doc_vec

In [None]:
# Run the notebook's main() helper (defined in an earlier cell) and unpack the
# seven values it returns: the fitted classifier plus the train/valid/test splits.
main_outputs = main(model, df, concate=concate, concat_df=concat_df)
clf, train_X, valid_X, test_X, train_y, valid_y, test_y = main_outputs

Shape of train_X: (81953, 300)
Shape of valid_X: (0, 0)
Shape of text_X: (20489, 300)


ValueError: ignored