# Text classification task

In [0]:
#----- mount colab env (Necessary for Colab use)
# from google.colab import drive
# drive.mount('/content/drive')

## Packages

In [0]:
#%tensorflow_version 1.x

#---- magic trio + special guest
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#---- utils
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import keras.backend as K
import gensim
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from tqdm.autonotebook import tqdm
from collections import defaultdict
import logging
from keras.models import clone_model
from sklearn import utils

# if you want to try the concat version of doc2vec you need to install this dependency
#!pip install -q testfixtures
#from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

import time

#---- dl
from keras.layers import Input, Dense, Dropout, LeakyReLU
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import ReLU
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.regularizers import l1_l2, l1, l2
from keras.optimizers import RMSprop, Nadam, SGD

#!pip install -q livelossplot

from livelossplot.keras import PlotLossesCallback

In [0]:
# Load the pre-processed dataset from Google Drive (the path only resolves once
# Drive is mounted, see the commented mount cell at the top of the notebook).
# NOTE(review): pickle.load can execute arbitrary code stored in the file -
# only load pickles that you produced yourself.
with open("/content/drive/My Drive/Colab Notebooks/TM&S/df_preprocessed_eng.pckle", "rb") as infile:
  data = pickle.load(infile)

In [0]:
#----- remapping categories

# Fine-grained categories folded into a broader parent category.
# Any category that has no entry here keeps its original name.
category_remap_dict = {
    # -> socializing
    "LGBT": "socializing",
    "singles": "socializing",
    "movies/film": "socializing",
    # -> health/wellbeing
    "fashion/beauty": "health/wellbeing",
    "support": "health/wellbeing",
    # -> single-source merges
    "book clubs": "education/learning",
    "sci-fi/fantasy": "games",
    "cars/motorcycles": "outdoors/adventure",
}

# dict.get(label, label) returns the parent category when one is defined and
# the untouched label otherwise.
data["remap_category"] = data.category.map(
    lambda label: category_remap_dict.get(label, label)
)


## Inspection

In [0]:
# Preview the first rows, including the freshly added `remap_category` column.
data.head()

In [0]:
# Horizontal bar chart with the number of documents per remapped category.
# The figure is tall (15x20) because the categories are laid out on the y axis.
plt.figure(figsize = (15,20))
# Style is set before countplot draws - presumably so the axes it creates pick up the grid style.
sns.set_style("whitegrid")
sns.countplot(y = data.remap_category)

## Functions

In [0]:
#----- k-folds cross validation
def load_data_kfold(k, X_train, y_train):
    """Build the stratified k-fold index pairs for a dataset.

    @params
    k = int, number of folds
    X_train = array-like, samples to split
    y_train = array-like, labels used to stratify the folds

    @returns
    list of (train_indices, test_indices) tuples, one per fold
    """
    # Shuffling with a fixed random_state means the same folds come back on every call.
    splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    return [fold for fold in splitter.split(X_train, y_train)]


In [0]:
#----- top-k accuracy score
def top_k_acc(model, X_test, y_true, k_top = 3, ml = False):
  """Top-k accuracy: share of samples whose true class is among the k best-scored classes.

  @params
  model = fitted classifier; must expose predict_proba (ml = True) or a
          predict method that returns per-class scores (ml = False)
  X_test = samples to score
  y_true = 1-D array-like of integer class indices; each value is compared
           against the column indices of the score matrix
  k_top = int, how many of the best-scored classes count as a hit
  ml = bool, True to call model.predict_proba, False to call model.predict

  @returns
  float in [0, 1] (nan when there are no samples)
  """
  if ml:
    probs = model.predict_proba(X_test)
  else:
    probs = model.predict(X_test)

  # Columns sorted from best to worst score, then cut to the first k_top.
  # Slicing the reversed order (instead of [:, -k_top:]) keeps k_top = 0
  # meaning "no class selected" rather than "every class selected".
  best_first = np.argsort(probs, axis = 1)[:, ::-1]
  top_classes = best_first[:, :k_top]

  # Positional comparison: np.asarray drops any pandas index on y_true.
  labels = np.asarray(y_true).reshape(-1, 1)
  hits = (top_classes == labels).any(axis = 1)
  return np.mean(hits)

In [0]:
#----- averaging for features (word2vec pre-trained)
def word_averaging(wv, words):
    """Average the word vectors of one document into a single unit-length vector.

    @params
    wv = keyed-vectors object exposing `vocab`, `syn0norm` and `vector_size`
         (the gensim < 4 attribute names - TODO confirm `syn0norm` has been
         initialised before calling, e.g. via `init_sims`)
    words = iterable of tokens; items that are already numpy arrays are used
            as vectors directly, tokens missing from `wv.vocab` are skipped

    @returns
    1-D float32 array with the unit-normalised mean vector, or an all-zero
    vector of length `wv.vector_size` when no token could be resolved
    """
    vectors = []

    for word in words:
        if isinstance(word, np.ndarray):
            # Caller passed a ready-made vector instead of a token.
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.syn0norm[wv.vocab[word].index])

    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    averaged = np.array(vectors).mean(axis=0)
    return gensim.matutils.unitvec(averaged).astype(np.float32)

In [0]:
#---- list averaging (word2vec pre-trained)
def word_averaging_list(wv, text_list):
    """Stack the averaged vector of every document into one 2-D array.

    @params
    wv = keyed-vectors object, forwarded to word_averaging
    text_list = iterable of documents, each an iterable of tokens

    @returns
    array with one row per document
    """
    rows = []
    for post in text_list:
        rows.append(word_averaging(wv, post))
    return np.vstack(rows)

In [0]:
#----- Embedding vectorizer (Tf-idf)
class TfidfEmbeddingVectorizer(object):
    """Document vectors built as the IDF-weighted mean of their word vectors.

    Each token contributes `word2vec[token] * idf(token)`; the contributions
    of all tokens of a document (repetitions included) are averaged.

    @params
    word2vec = dict mapping token -> 1-D vector; must not be empty because
               the vector dimension is read from its first value
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Filled by fit(); maps token -> idf weight.
        self.word2weight = None
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """Learn the IDF weight of every token in X.

        @params
        X = iterable of documents, each already split into a list of tokens
        y = ignored; accepted (and optional) only to match the usual
            fit(X, y) estimator signature

        @returns
        self
        """
        # Documents are already tokenised, so the analyzer is the identity.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Turn every document of X into its weighted mean vector.

        Tokens without a word vector are skipped; a document with no usable
        token becomes a zero vector of length `self.dim`.

        @returns
        2-D array with one row per document
        """
        return np.array([
            np.mean([self.word2vec[w] * self.word2weight[w]
                     for w in words if w in self.word2vec] or
                    [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [0]:
#----- Embedding vectorizer (mean)
class MeanEmbeddingVectorizer(object):
    """Document vectors built as the plain mean of their word vectors.

    @params
    word2vec = dict mapping token -> 1-D vector; must not be empty because
               the vector dimension is read from its first value
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """No-op kept for estimator-style usage; nothing is learned.

        @params
        X = ignored
        y = ignored; optional so that fit(X) and fit(X, y) both work

        @returns
        self
        """
        return self

    def transform(self, X):
        """Turn every document of X into the mean of its known word vectors.

        Tokens without a word vector are skipped; a document with no usable
        token becomes a zero vector of length `self.dim`.

        @returns
        2-D array with one row per document
        """
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])


# Text representation

## Pre-trained word2vec model (deprecated: performance was similar to the model trained from scratch)

In [0]:
# wv = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/My Drive/Colab Notebooks/TM&S/GoogleNews-vectors-negative300.bin.gz", binary=True)
# wv.init_sims(replace=True)

In [0]:
#w2v = dict(zip(wv.wv.index2word, wv.wv.syn0))

In [0]:
#vect = TfidfEmbeddingVectorizer(w2v)

In [0]:
#vect.fit(X, data.remap_category)

In [0]:
#train_vect = vect.transform(X)

In [0]:
#X_word_average = word_averaging_list(wv, data.desc_lemm_no_badwords)

## Text-representation function

In [0]:
#----- text-representation function
def _word2vec_lookup(X, size):
  '''Train Word2Vec on the tokenised corpus X and return a {token: vector} dict.'''
  # `size` / `iter` / `index2word` are the gensim < 4 names used throughout this notebook.
  model = gensim.models.Word2Vec(X, size = size, workers = 4, iter = 10)
  return dict(zip(model.wv.index2word, model.wv.vectors))


def text_represent(method, X, y, df_token = None, size = 300):
  '''
  Build the document representation selected by `method`.

  @params
  method = str, one of ['bow-tfidf', 'bow-count', 'w2v-tfidf', 'w2v-mean']
         bow-tfidf: bag-of-words representation using Tf-Idf
         bow-count: bag-of-words representation using token count
         w2v-tfidf: word-embedding representation using word2vec and Tf-Idf
         w2v-mean:  word-embedding representation using word2vec and mean
  X = list of lists, list containing a list of tokens for each document
      (used by the w2v methods)
  y = pd.Series, a pandas series containing the labels of every document
      (only forwarded to the embedding vectorizers' fit)
  df_token = pd.Series, a pandas series containing the token string of every
      document (required by the bow methods, unused otherwise)
  size = int, number of features: vocabulary cap for the bow methods,
      embedding dimension for the w2v methods

  @returns
  the document-feature matrix: the vectorizer's fit_transform output for the
  bow methods, a 2-D numpy array for the w2v methods

  @raises
  ValueError if `method` is not one of the supported names, or if a bow
  method is requested without `df_token`
  '''
  # Fail early with a readable message instead of an error from inside the vectorizer.
  if method in ('bow-tfidf', 'bow-count') and df_token is None:
    raise ValueError("df_token is required for the '{}' method".format(method))

  #----- bag-of-words tf-idf
  if method == 'bow-tfidf':
    print("Creating bag-of-words tf-idf representation...")
    tfidf = TfidfVectorizer(max_features = size, analyzer = 'word', ngram_range = (1,1), sublinear_tf = True)
    train_vect = tfidf.fit_transform(df_token)

  #----- bag-of-words count vector
  elif method == 'bow-count':
    print("Creating bag-of-words count representation...")
    count_vect = CountVectorizer(max_features = size, analyzer = 'word', ngram_range = (1,1))
    train_vect = count_vect.fit_transform(df_token)

  #----- word embedding word2vec tf-idf
  elif method == 'w2v-tfidf':
    print("Creating word embedding representation using word2vec & tf-idf...")
    print("It may took a while")
    vect = TfidfEmbeddingVectorizer(_word2vec_lookup(X, size))
    vect.fit(X, y)
    train_vect = vect.transform(X)

  #----- word embedding word2vec mean
  elif method == 'w2v-mean':
    print("Creating word embedding representation using word2vec and mean...")
    print("It may took a while")
    vect = MeanEmbeddingVectorizer(_word2vec_lookup(X, size))
    vect.fit(X, y)
    train_vect = vect.transform(X)

  #----- raise Error
  else:
    raise ValueError(
        "No valid text-representation method selected: {!r} "
        "(expected one of 'bow-tfidf', 'bow-count', 'w2v-tfidf', 'w2v-mean')".format(method))

  print("done")
  return train_vect


## Create dataframes for evaluations and scoring

In [0]:
#----- evaluations dataframe
# columns = pd.MultiIndex.from_product([['Stemming', 'Stemming+Badwords', 'Lemmatization', 'Lemmatization+Badwords'], ['Acc', 'Macro F-Measure', 'Weighted F-Measure']],
#                                      names = ['Processing method', 'Feature Extraction Method'])
# #columns = ['model', 'representation', 'f1-macro', 'f1-weighted', 'accuracy']
# evaluations = pd.DataFrame(columns = columns)
# evaluations = evaluations.append(pd.DataFrame.from_dict({'Stemming':{'Acc': 0.692, 'Weighted F-Measure':0.687, 'Macro F-Measure':0.659},
#                                                          'Stemming+Badwords':{'Acc': 0.692, 'Weighted F-Measure':0.687, 'Macro F-Measure':0.659},
#                                                          'Lemmatization':{'Acc': 0.692, 'Weighted F-Measure':0.687, 'Macro F-Measure':0.661},
#                                                          'Lemmatization+Badwords':{'Acc': 0.693, 'Weighted F-Measure':0.688, 'Macro F-Measure':0.662}}).unstack().rename('Count'))
# evaluations = evaluations.append(pd.DataFrame.from_dict({'Stemming':{'Acc': 0.689, 'Weighted F-Measure':0.685, 'Macro F-Measure':0.656},
#                                                          'Stemming+Badwords':{'Acc': 0.691, 'Weighted F-Measure':0.686, 'Macro F-Measure':0.660},
#                                                          'Lemmatization':{'Acc': 0.688, 'Weighted F-Measure':0.684, 'Macro F-Measure':0.656},
#                                                          'Lemmatization+Badwords':{'Acc': 0.688, 'Weighted F-Measure':0.684, 'Macro F-Measure':0.658}}).unstack().rename('Tf-idf'))
# evaluations

In [0]:
#----- dump evaluations dataframe
# with open("/content/drive/My Drive/Colab Notebooks/TM&S/evaluations.pckl", "wb") as outfile:
#   pickle.dump(evaluations, outfile)

In [0]:
#----- create scoring dataframe
# columns1 = pd.MultiIndex.from_product([[""],['Model', 'Processing method','Feature Extraction Method']])
# columns2 = pd.MultiIndex.from_product([['Acc', 'Top-3 Acc', 'Macro F-Measure', 'Weighted F-Measure'],
#                                                                   ['value', 'std']])

# scores = pd.concat([pd.DataFrame(columns = columns1), pd.DataFrame(columns = columns2)], axis = 1)
#scores

In [0]:
#----- dump scoring dataframe
# with open("/content/drive/My Drive/Colab Notebooks/TM&S/text-representations&evaluations/scores.pckl", "wb") as outfile:
#   pickle.dump(scores, outfile)
#   outfile.close()

## Doc2Vec section (choose to use DBOW, DM or concatenate the two models)

In [0]:
#----- tag all the documents
# One TaggedDocument per row: the stemmed description split on single spaces
# as words, the remapped category as the only tag.
tagged = data.apply(
    lambda row: gensim.models.doc2vec.TaggedDocument(
        words=str(row.desc_stemm).split(" "),
        tags=[row.remap_category],
    ),
    axis=1,
)
cores = 4

#----- dbow model
# dm=0 selects the DBOW variant named in the section title.
model_dbow = gensim.models.Doc2Vec(
    dm=0,
    vector_size=300,
    negative=5,
    hs=0,
    min_count=2,
    sample=0,
    workers=cores,
)
model_dbow.build_vocab(list(tqdm(tagged.values)))

#----- dmm model
#model_dmm = gensim.models.Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
#model_dmm.build_vocab([x for x in tqdm(tagged.values)])

In [0]:
#---- train dbow model
%%time
for epoch in range(10):
    model_dbow.train(utils.shuffle([x for x in tqdm(tagged.values)]), total_examples=len(tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

In [0]:
#----- train dmm model
# %%time
# for epoch in range(10):
#     model_dmm.train(utils.shuffle([x for x in tqdm(tagged.values)]), total_examples=len(tagged.values), epochs=1)
#     model_dmm.alpha -= 0.002
#     model_dmm.min_alpha = model_dmm.alpha

In [0]:
#----- make the models light
# model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
# model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [0]:
#----- concatenate models
#new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [0]:
def vec_for_learning(model, tagged_docs):
    """Infer a vector for every tagged document.

    @params
    model = trained Doc2Vec model exposing infer_vector
    tagged_docs = pd.Series of tagged documents (objects with `.words` and `.tags`)

    @returns
    (targets, regressors): two tuples aligned by position - the first tag of
    each document and the vector inferred from its words (20 inference steps)
    """
    pairs = []
    for doc in tqdm(tagged_docs.values):
        label = doc.tags[0]
        vector = model.infer_vector(doc.words, steps=20)
        pairs.append((label, vector))

    # Transpose the list of (label, vector) pairs into two parallel tuples.
    targets, regressors = zip(*pairs)
    return targets, regressors

In [0]:
#----- feature extraction
# vec_for_learning returns (targets, regressors), hence the `y, X` unpacking order.
y, X = vec_for_learning(model_dbow, tagged)
# Doc2Vec feature matrix (one row per document) and the matching label array.
# NOTE(review): `X`, `y` and `train_vect` are reassigned by the cells of the
# "Arrange data" section below; running those replaces the Doc2Vec features.
train_vect = np.asarray(X)
y_ar = np.asarray(y)

In [0]:
#----- save doc2vec model
#np.savez_compressed("/content/drive/My Drive/Colab Notebooks/TM&S/text-representations&evaluations/d2v_stemm_concat.npz", X = train_vect, y = y_ar)

## Arrange data

In [0]:
#----- create list of lists token representation
# Whitespace-tokenise every document. Iterating the column directly avoids the
# per-row Series that iterrows() builds, and lets tqdm show a total.
X = [document.split() for document in tqdm(data.desc_lemm_no_badwords)]

In [0]:
#%%time
#----- text representation (choose representation and the preprocessing phase)
# `method` picks the representation (see text_represent's docstring); X feeds
# the w2v methods, df_token the bow methods.
# NOTE(review): this overwrites any `train_vect` produced by the Doc2Vec section.
train_vect = text_represent(method = 'w2v-mean', X = X, df_token = data.desc_lemm_no_badwords, y = data.remap_category)

In [0]:
#----- eventually save the representation
#np.savez_compressed("/content/drive/My Drive/Colab Notebooks/TM&S/w2v_mean_lemm_no_badwords.npz", X = train_vect)

In [0]:
#----- if saved, load the representation
# with np.load("/content/drive/My Drive/Colab Notebooks/TM&S/w2v_tfidf.npz") as infile:
#   train_vect = infile['X']
#   #category = infile['y']

In [0]:
#----- handling labels
# Category names -> integer codes -> one-hot rows.
# The one-hot width is fixed at 24, the size of the network's softmax layer.
lb = LabelEncoder()
y = to_categorical(lb.fit_transform(data.remap_category), num_classes = 24)
num_labels = y.shape[1]

# Models

## Neural Network

### K-folds cross validation scoring

In [0]:
#----- if saved, load
# with np.load("/content/drive/My Drive/Colab Notebooks/TM&S/text-representations&evaluations/d2v_lemm_no_badwords.npz") as infile:
#   train_vect = infile['X']
#   infile.close()

In [0]:
#----- nn model
K.clear_session()

# Feed-forward template: three Dense -> ReLU -> Dropout(0.5) stages
# (1024, 512, 128 units) followed by a 24-way softmax.
# The input width follows the current feature matrix `train_vect`.
model = Sequential([
    Dense(1024, input_dim = train_vect.shape[1]),
    ReLU(),
    Dropout(.5),
    Dense(512),
    ReLU(),
    Dropout(.5),
    Dense(128),
    ReLU(),
    Dropout(.5),
    Dense(24, activation = "softmax"),
])

model.summary()

In [0]:
#----- cross-validation cycle
# Stratified k-fold evaluation of the network defined in the previous cell.
# One score per fold is collected in the four lists below; the final print
# reports their mean and standard deviation.
k = 5
acc = []
top_3 = []
f1_macro = []
f1_weighted = []

# Encoder fitted on all labels so every fold shares the same class -> index mapping.
lb = LabelEncoder()
lb.fit(data.remap_category)


folds = load_data_kfold(k, train_vect, data.remap_category.values)
for j, (train_idx, test_idx) in enumerate(tqdm(folds)):
  # Fresh callbacks per fold so no patience counter carries over between folds.
  es = EarlyStopping(monitor = 'val_loss', patience = 10, restore_best_weights = True)
  rlrop = ReduceLROnPlateau(monitor='val_loss', patience = 5, factor = .5, min_lr = 1e-6)

  X_train_cv = train_vect[train_idx]
  y_train_cv = data.remap_category.values[train_idx]
  X_test_cv = train_vect[test_idx]
  y_test_cv = data.remap_category.values[test_idx]

  # String labels -> integer codes -> one-hot rows (24 columns, matching the softmax layer).
  y_train_cv = lb.transform(y_train_cv)
  y_test_cv = lb.transform(y_test_cv)
  y_train_cv = to_categorical(y_train_cv, num_classes = 24)
  y_test_cv = to_categorical(y_test_cv, num_classes = 24)


  #----- creation of validation set for EarlyStopping
  # 10% of the fold's training part is held out; the test part is never seen during fitting.
  X_train_cv, X_val, y_train_cv, y_val = train_test_split(X_train_cv, y_train_cv, test_size = .1,
                                                      random_state = 42,
                                                      stratify = y_train_cv)
  # A new model instance is cloned from the template `model` for every fold.
  kfold_model = clone_model(model)
  kfold_model.compile(optimizer = Adam(), loss = "categorical_crossentropy", metrics = ['acc'])

  kfold_model.fit(X_train_cv, y_train_cv,
                  batch_size = 2048,
                  epochs = 100,
                  verbose = 0,
                  validation_data = (X_val, y_val),
                  callbacks = [es, rlrop],
                  use_multiprocessing = True)

  #----- evaluation
  # Metrics are computed on integer class indices, so the one-hot test labels are decoded with argmax.
  y_pred = kfold_model.predict_classes(X_test_cv)
  y_true = np.argmax(y_test_cv, axis = 1)
  f1_macro.append(f1_score(y_true, y_pred, average = 'macro'))
  f1_weighted.append(f1_score(y_true, y_pred, average = 'weighted'))
  acc.append(accuracy_score(y_true, y_pred))
  top_3.append(top_k_acc(kfold_model, X_test_cv, y_true, k_top = 3))


# Mean (+/- standard deviation) of each metric across the k folds.
print("Acc: {} (+/- {})\nTop-3: {} (+/- {})\nF1-Macro: {} (+/- {})\nF1-Weighted: {} (+/- {})".format(round(np.mean(acc), 3), round(np.std(acc), 3),
                              round(np.mean(top_3), 3), round(np.std(top_3), 3),
                              round(np.mean(f1_macro), 3), round(np.std(f1_macro), 3),
                              round(np.mean(f1_weighted), 3), round(np.std(f1_weighted), 3)))

In [0]:
#----- save results

# with open("/content/drive/My Drive/Colab Notebooks/TM&S/text-representations&evaluations/scores.pckl", "rb") as infile:
#   scores = pickle.load(infile)
#   infile.close()

# #scores
# #['stem+badwords', 'stem+badwords', 'lemm+badwords']
# processing_method = "Lemmatization+Badwords removal"
# #['w2v tf-idf', 'w2v mean', 'doc2vec']
# representation = 'Doc2Vec'
# #[0,1,2]
# row = 2

# scores.loc[row, ("", "Model")] = "NN"
# scores.loc[row, ("", "Processing method")] = processing_method
# scores.loc[row, ("", "Feature Extraction Method")] = representation
# scores.loc[row, ("Acc", "value")] = round(np.mean(acc), 3)
# scores.loc[row, ("Acc", "std")] = round(np.std(acc), 3)
# scores.loc[row, ("Top-3 Acc", "value")] = round(np.mean(top_3), 3)
# scores.loc[row, ("Top-3 Acc", "std")] = round(np.std(top_3), 3)
# scores.loc[row, ("Macro F-Measure", "value")] = round(np.mean(f1_macro), 3)
# scores.loc[row, ("Macro F-Measure", "std")] = round(np.std(f1_macro), 3)
# scores.loc[row, ("Weighted F-Measure", "value")] = round(np.mean(f1_weighted), 3)
# scores.loc[row, ("Weighted F-Measure", "std")] = round(np.std(f1_weighted), 3)

# with open("/content/drive/My Drive/Colab Notebooks/TM&S/text-representations&evaluations/scores.pckl", "wb") as outfile:
#   pickle.dump(scores, outfile)
#   outfile.close()


#scores

### Hold-out evaluation (single train/test split)

In [0]:
#----- punctual evaluations

#----- nn model
K.clear_session()

# Same architecture as the cross-validation template: three
# Dense -> ReLU -> Dropout(0.5) stages and a 24-way softmax output.
model = Sequential()
model.add(Dense(1024, input_dim = train_vect.shape[1]))
model.add(ReLU())
model.add(Dropout(.5))
model.add(Dense(512))
model.add(ReLU())
model.add(Dropout(.5))
model.add(Dense(128))
model.add(ReLU())
model.add(Dropout(.5))
model.add(Dense(24, activation="softmax"))

# model.summary()

#----- train_test_split
# 90/10 stratified split; `y` is the one-hot label matrix built in "handling labels".
X_train, X_test, y_train, y_test = train_test_split(train_vect, y, test_size = .1, random_state = 1, stratify = y)
num_labels = y.shape[1]

es = EarlyStopping(monitor = 'val_loss', patience = 10, restore_best_weights=True)
rlrop = ReduceLROnPlateau(monitor='val_loss', patience = 5, factor = .5, min_lr = 1e-6)
# validation_split holds out 10% of X_train for the val_loss that both callbacks monitor.
model.fit(X_train, y_train,
          batch_size = 2048,
          epochs = 100,
          verbose = 0,
          validation_split = .1,
          callbacks = [PlotLossesCallback(), es, rlrop],
          use_multiprocessing = True)

# Metrics work on integer class indices, so the one-hot test labels are decoded with argmax.
y_pred = model.predict_classes(X_test)
y_true = np.argmax(y_test, axis = 1)
# Fixed: the original wrote `f1-score(...)`, which Python reads as `f1 - score(...)`
# and fails with a NameError.
f1_macro = f1_score(y_true, y_pred, average = 'macro')
f1_weighted = f1_score(y_true, y_pred, average = 'weighted')
acc = accuracy_score(y_true, y_pred)

acc, f1_macro, f1_weighted

# model = 'W2V Tf-idf'
# representation = 'Stemming'

# with open("/content/drive/My Drive/Colab Notebooks/TM&S/evaluations.pckl", "rb") as infile:
#   evaluations = pickle.load(infile)

# evaluations.loc[model, (representation, 'Acc')] = acc
# evaluations.loc[model, (representation, 'Macro F-Measure')] = f1_macro
# evaluations.loc[model, (representation, 'Weighted F-Measure')] = f1_weighted

# with open("/content/drive/My Drive/Colab Notebooks/TM&S/evaluations.pckl", "wb") as outfile:
#   pickle.dump(evaluations, outfile)

## Random Forest

### K-folds cross validation scoring

In [0]:
#----- load if saved
#train_vect = np.load("/content/drive/My Drive/Colab Notebooks/TM&S/text-representations&evaluations/d2v_lemm_no_badwords.npz")['X']

In [0]:
#----- cross-validation cycle
# Stratified k-fold evaluation of a default Random Forest on `train_vect`.
# One score per fold is collected in the four lists below; the final print
# reports their mean and standard deviation.
k = 5
acc = []
top_3 = []
f1_macro = []
f1_weighted = []
# Encoder fitted on all labels; used only to turn the string labels into
# class indices for the top-3 accuracy.
lb = LabelEncoder()
lb.fit(data.remap_category)

folds = load_data_kfold(k, train_vect, data.remap_category.values)
for j, (train_idx, test_idx) in enumerate(tqdm(folds)):

  X_train_cv = train_vect[train_idx]
  y_train_cv = data.remap_category.values[train_idx]
  X_test_cv = train_vect[test_idx]
  y_test_cv = data.remap_category.values[test_idx]

  # Seeded like the folds themselves (random_state = 42) so the reported
  # scores can be reproduced from run to run.
  rf = RandomForestClassifier(n_jobs=-1, random_state=42)
  rf = rf.fit(X_train_cv, y_train_cv)

  #----- evaluation
  y_pred = rf.predict(X_test_cv)
  #y_true = np.argmax(y_test_cv, axis = 1)
  f1_macro.append(f1_score(y_test_cv, y_pred, average = 'macro'))
  f1_weighted.append(f1_score(y_test_cv, y_pred, average = 'weighted'))
  acc.append(accuracy_score(y_test_cv, y_pred))
  # NOTE(review): assumes the predict_proba columns follow the same class order
  # as `lb` (every class present in each training fold) - TODO confirm.
  top_3.append(top_k_acc(rf, X_test_cv, lb.transform(y_test_cv), k_top = 3, ml = True))


# Mean (+/- standard deviation) of each metric across the k folds.
print("Acc: {} (+/- {})\nTop-3: {} (+/- {})\nF1-Macro: {} (+/- {})\nF1-Weighted: {} (+/- {})".format(round(np.mean(acc), 3), round(np.std(acc), 3),
                              round(np.mean(top_3), 3), round(np.std(top_3), 3),
                              round(np.mean(f1_macro), 3), round(np.std(f1_macro), 3),
                              round(np.mean(f1_weighted), 3), round(np.std(f1_weighted), 3)))


In [0]:
#----- save results
# with open("/content/drive/My Drive/Colab Notebooks/TM&S/text-representations&evaluations/scores.pckl", "rb") as infile:
#   scores = pickle.load(infile)
#   infile.close()

# #scores
# #['Stemming+Badwords removal', 'Stemming+Badwords removal', 'Lemmatization+Badwords removal']
# processing_method = "Lemmatization+Badwords removal"
# #['W2V Tf-idf', 'W2V Mean', 'Doc2Vec']
# representation = 'Doc2Vec'
# #[3,4,5]
# row = 5

# scores.loc[row, ("", "Model")] = "RF"
# scores.loc[row, ("", "Processing method")] = processing_method
# scores.loc[row, ("", "Feature Extraction Method")] = representation
# scores.loc[row, ("Acc", "value")] = round(np.mean(acc), 3)
# scores.loc[row, ("Acc", "std")] = round(np.std(acc), 3)
# scores.loc[row, ("Top-3 Acc", "value")] = round(np.mean(top_3), 3)
# scores.loc[row, ("Top-3 Acc", "std")] = round(np.std(top_3), 3)
# scores.loc[row, ("Macro F-Measure", "value")] = round(np.mean(f1_macro), 3)
# scores.loc[row, ("Macro F-Measure", "std")] = round(np.std(f1_macro), 3)
# scores.loc[row, ("Weighted F-Measure", "value")] = round(np.mean(f1_weighted), 3)
# scores.loc[row, ("Weighted F-Measure", "std")] = round(np.std(f1_weighted), 3)

# # with open("/content/drive/My Drive/Colab Notebooks/TM&S/text-representations&evaluations/scores.pckl", "wb") as outfile:
# #   pickle.dump(scores, outfile)
# #   outfile.close()


# scores

### Hold-out evaluation (single train/test split)

In [0]:
#----- if saved, load
# with np.load("/content/drive/My Drive/Colab Notebooks/TM&S/text-representations&evaluations/d2v_stemm_concat.npz") as infile:
#   train_vect = infile['X']
#   y = infile['y']
#   infile.close()

In [0]:
#----- train_test_split
# 80/20 stratified hold-out split.
# NOTE(review): `y` is whatever the last executed cell left behind - the one-hot
# matrix from "handling labels", or the array loaded by the commented loader above.
X_train, X_test, y_train, y_test = train_test_split(train_vect, y, test_size = .2, random_state = 42, stratify = y)
# Display the four shapes as a quick sanity check of the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [0]:
# Default-hyperparameter forest using every core. Seeded (like the split above)
# so the hold-out scores can be reproduced from run to run.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

In [0]:
# Fit on the training part of the hold-out split (the cell output is the fitted estimator).
rf.fit(X_train, y_train)

In [0]:
# Score the fitted forest on the held-out 20%.
y_pred = rf.predict(X_test)

# Macro F1 weights every class equally; weighted F1 weights classes by their support.
f1_macro = f1_score(y_test, y_pred, average = 'macro')
f1_weighted = f1_score(y_test, y_pred, average = 'weighted')
acc = accuracy_score(y_test, y_pred)

# Display the three scores as the cell output.
acc, f1_macro, f1_weighted

# # ['W2V Tf-idf', 'W2V mean', 'Doc2Vec']
# model = 'Doc2Vec (concat)'
# # ['Stemming', 'Stemming+Badwords', 'Lemmatization', 'Lemmatization+Badwords']
# representation = 'Stemming'

# time.sleep(2)
# with open("/content/drive/My Drive/Colab Notebooks/TM&S/text-representations&evaluations/evaluations.pckl", "rb") as infile:
#   evaluations = pickle.load(infile)
#   infile.close()

# time.sleep(2)
# evaluations.loc[model, (representation, 'Acc')] = round(acc, 3)
# evaluations.loc[model, (representation, 'Macro F-Measure')] = round(f1_macro, 3)
# evaluations.loc[model, (representation, 'Weighted F-Measure')] = round(f1_weighted, 3)

# time.sleep(2)
# #with open("/content/drive/My Drive/Colab Notebooks/TM&S/text-representations&evaluations/evaluations.pckl", "wb") as outfile:
#  # pickle.dump(evaluations, outfile)
#   #outfile.close()

# evaluations

## Export evaluations & scores to Excel

In [0]:
# with open("/content/drive/My Drive/Colab Notebooks/TM&S/text-representations&evaluations/evaluations.pckl", "rb") as infile:
#   evaluations = pickle.load(infile)
#   infile.close()

# evaluations

In [0]:
#evaluations.to_excel("/content/drive/My Drive/Colab Notebooks/TM&S/text-representations&evaluations/evaluations.xlsx")

In [0]:
# with open("/content/drive/My Drive/Colab Notebooks/TM&S/text-representations&evaluations/scores.pckl", "rb") as infile:
#   scores = pickle.load(infile)
#   infile.close()

# scores

In [0]:
#scores.to_excel("/content/drive/My Drive/Colab Notebooks/TM&S/text-representations&evaluations/scores.xlsx")