<font color="#6E6E6E"><h2 align="left">Text Analytics - Autoencoder</h2></font>

 Analysis of a subset of a complaints dataset using 2 classes.

In [None]:
COLAB = True

## Import main libraries

In [None]:
import nltk # !pip install nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator

%matplotlib inline

lemmatizer = WordNetLemmatizer().lemmatize

## Load dataset ("corpus")

In [None]:
# The dataset is available at:
# https://drive.google.com/file/d/1LFW1GSVkZXyXKFUdKNZA8alKK64d5J9P

# When COLAB is set, fetch the CSV from Google Drive into the working directory.
if COLAB:
    from google_drive_downloader import GoogleDriveDownloader as gdd
    gdd.download_file_from_google_drive(file_id='1LFW1GSVkZXyXKFUdKNZA8alKK64d5J9P',
                                        dest_path='./reporting_complaints_2classes.csv')

In [None]:
# Read the input
df = pd.read_csv("./reporting_complaints_2classes.csv") # the dataset is loaded into a Pandas DataFrame
print(df.shape)

In [None]:
df.sample(10)

In [None]:
df['product'].value_counts()

In [None]:
narratives = list(df['consumer_complaint_narrative'])
labels = np.array(df['product'])
print("Number of narratives: {}".format(len(narratives)))

In [None]:
case = 5
print("Type of product:", labels[case])
narratives[case]

## Preprocessing

## Training-test split

In [None]:
# Number of examples taken from the tail of the train+validation split for validation.
Nval = 1000

from sklearn.model_selection import train_test_split

# Hold out 30% of the corpus as the test set; random_state fixes the shuffle.
narratives_trval, narratives_te, y_trval, y_te = train_test_split(narratives, labels,
                                                                  test_size=0.3, random_state=1)

# Everything except the last Nval items of the remaining data is used for training...
narratives_tr = narratives_trval[:-Nval]
y_tr = y_trval[:-Nval]

# ...and the last Nval items form the validation set.
narratives_val = narratives_trval[-Nval:]
y_val = y_trval[-Nval:]

print("Number of training examples:", len(y_tr))
print("Number of validation examples:", len(y_val))
print("Number of test examples:", len(y_te))

### Creation / load of stopwords list

In [None]:
nltk.download('stopwords')

In [None]:
stop_words = stopwords.words('english')

# Also treat runs of 1 to 20 'x' characters ('x', 'xx', ..., 20 x's) as stop words.
# NOTE(review): presumably these are redaction masks in the narratives - confirm against the data.
stop_words.extend(run_length * 'x' for run_length in range(1, 21))
print(stop_words) # NLTK's list followed by the 'x' runs (the list is not sorted here)

### Punctuation marks

In [None]:
from string import punctuation
punctuation_marks = list(punctuation)
print(punctuation_marks)

In [None]:
stop_words_and_punctuation = sorted(list(set(stop_words + punctuation_marks)))
print(stop_words_and_punctuation)

In [None]:
case = 2
text = narratives_tr[case]
text

## Bag-of-words (BOW) representation

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Raw term-count vectorizer: at most 1000 terms, each present in at least 5% of
# the documents (min_df=0.05), ignoring stop words and punctuation tokens.
TF_vectorizer = CountVectorizer(max_features=1000, # alternative settings: max_df=0.5, min_df=20
                                min_df=0.05,
                                stop_words=stop_words_and_punctuation)

# Display the configured (not yet fitted) vectorizer.
TF_vectorizer

In [None]:
TF_vectorizer.fit(narratives_tr)
tf_tr = TF_vectorizer.transform(narratives_tr)
TF_vocabulary = TF_vectorizer.get_feature_names_out()

In [None]:
print(len(TF_vocabulary))
print(TF_vocabulary)

In [None]:
# Number of most frequent terms shown per class.
n=50
unique_labels = np.unique(labels)
# One figure per class: total counts of the n most frequent vocabulary terms
# in the training narratives of that class.
for label in unique_labels:
    # Sum term counts over the documents of this class -> one total per vocabulary term.
    freqs = np.array(tf_tr[y_tr==label].sum(axis=0))[0]
    plt.figure(figsize=(15,5))
    # Term indices ordered from most to least frequent.
    inds = np.argsort(freqs)[::-1]
    plt.plot(freqs[inds[:n]])
    plt.xticks(range(n), np.array(TF_vocabulary)[inds[:n]], rotation=75)
    plt.title('word frequencies in complaints related to issue ' + '"'+label+'"', fontsize=16)
    plt.show()

## TF-IDF

In [None]:
# TF-IDF vectorizer with the same vocabulary limits as the count vectorizer;
# each document vector is L2-normalized.
TFIDF_vectorizer = TfidfVectorizer(
    max_features=1000,
    min_df=0.05,
    stop_words=stop_words_and_punctuation,
    norm='l2',
)
print(TFIDF_vectorizer)

# Fit on the training narratives only; validation and test reuse the fitted
# vocabulary and IDF weights. Matrices are densified and stored as float16.
X_tr  = TFIDF_vectorizer.fit_transform(narratives_tr).toarray().astype(np.float16)
X_val = TFIDF_vectorizer.transform(narratives_val).toarray().astype(np.float16)
X_te  = TFIDF_vectorizer.transform(narratives_te).toarray().astype(np.float16)

TFIDF_vocabulary = TFIDF_vectorizer.get_feature_names_out()

In [None]:
len(TFIDF_vocabulary)

In [None]:
print(TFIDF_vocabulary)

### Majority class (baseline)

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline that always predicts the most frequent training class.
clf = DummyClassifier(strategy='most_frequent')
clf.fit(X_tr, y_tr)
print("score en training :", clf.score(X_tr, y_tr).round(3))
print("score en test     :", clf.score(X_te, y_te).round(3))

## Autoencoder (the decoder weights mirror the encoder weights)

In [None]:
X_tr.shape, X_val.shape, X_te.shape

In [None]:
np.unique(y_tr)

In [None]:
y_tr_int = 1*(y_tr=="Credit card")
y_te_int = 1*(y_te=="Credit card")

In [None]:
# knn works better with normalized X_proy_tr, normalized X_proy_te

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.callbacks import ModelCheckpoint

In [None]:
def grafica_entrenamiento(tr_loss, val_loss):
    """Plot the training and validation loss curves on a single figure.

    Args:
        tr_loss: sequence of training-loss values, in epoch order.
        val_loss: sequence of validation-loss values, in epoch order.
    """
    fig, ax = plt.subplots(figsize=(10,4))
    # Epochs are numbered from 1 on the x axis.
    for curve in (tr_loss, val_loss):
        ax.plot(np.arange(1, len(curve) + 1), curve)
    ax.set_title('loss del modelo', fontsize=18)
    ax.set_xlabel('época', fontsize=18)
    ax.set_ylabel('mse', fontsize=18)
    ax.legend(['entrenamiento', 'validación'], loc='upper left')
    # Only integer tick positions make sense for an epoch axis.
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.show()

In [None]:
loss = "mse"

In [None]:
class DenseTranspose(keras.layers.Layer):
    """Dense-like layer whose kernel is the transpose of another Dense layer's kernel.

    The layer owns only a bias vector; the kernel is read from ``dense`` on every
    call, so both layers share (tie) the same weight matrix.

    Args:
        dense: the Dense layer whose kernel is reused, transposed.
        activation: activation name or callable resolved with
            ``keras.activations.get``; ``None`` leaves the output linear.
        **kwargs: forwarded to ``keras.layers.Layer``.
    """
    def __init__(self, dense, activation=None, **kwargs):
        # Initialize the base layer first so attribute tracking is set up
        # before the tied layer is attached.
        super().__init__(**kwargs)
        self.dense = dense
        self.activation = keras.activations.get(activation)
    def build(self, batch_input_shape):
        """Create the bias vector, sized to the input width of the tied layer."""
        if not self.dense.weights:
            raise ValueError("The tied Dense layer must be built (called once) "
                             "before DenseTranspose is built.")
        # The tied kernel has shape (input_dim, units); this layer outputs
        # input_dim values. Reading the size from the kernel avoids
        # `dense.input_shape`, which is not available in every Keras version.
        n_outputs = self.dense.weights[0].shape[0]
        self.biases = self.add_weight(name="bias",
                                      shape=[n_outputs],
                                      initializer="zeros")
        super().build(batch_input_shape)
    def call(self, inputs):
        """Return activation(inputs @ kernel^T + bias) using the tied kernel."""
        z = tf.matmul(inputs, self.dense.weights[0], transpose_b=True)
        return self.activation(z + self.biases)

In [None]:
# Single bias-free Dense layer with 2 units: the encoder output is a 2-D code.
dense_1 = keras.layers.Dense(2, use_bias=False)

# The decoder reuses dense_1 through DenseTranspose, so encoder and decoder
# share one weight matrix.
encoder = keras.models.Sequential([dense_1])
decoder = keras.models.Sequential([DenseTranspose(dense_1)])

# Full autoencoder: encoder followed by decoder.
ae = keras.models.Sequential([encoder, decoder])

# Alternative optimizer settings, currently disabled:
#ae.compile(loss=loss, optimizer=keras.optimizers.SGD(learning_rate=0.5))
#ae.compile(loss=loss, optimizer=keras.optimizers.SGD(learning_rate=1.5))
ae.compile(loss=loss, optimizer="adam")

In [None]:
# Checkpoint callback that would write the model to `modelpath` whenever the
# validation loss improves.
# NOTE(review): the training loop below has `callbacks=callbacks_list` commented
# out, so this callback is not used there.
modelpath="model_current_best.h5"
checkpoint = ModelCheckpoint(modelpath, monitor='val_loss',
                             verbose=2,
                             save_best_only=True,
                             mode='min') # saves only the models that improve on validation

callbacks_list = [checkpoint]

In [None]:
epochs = 100
batch_size = 64

acum_tr_loss = []
acum_val_loss = []

best_weights = None
best_val_loss = 1e20

In [None]:
from copy import deepcopy

In [None]:
# Train one epoch per fit() call so the best-on-validation weights can be
# tracked manually after every epoch.
for e in range(epochs):
    # Autoencoder: the input is also the reconstruction target.
    history = ae.fit(X_tr, X_tr,
                     batch_size=batch_size,
                     epochs=1,
                     #callbacks=callbacks_list,
                     verbose=0,
                     validation_data=(X_val, X_val))

    # Each fit() call ran a single epoch, so each history list has one entry.
    acum_tr_loss  += history.history['loss']
    acum_val_loss += history.history['val_loss']
    
    # Keep a copy of the weights with the lowest validation loss seen so far.
    if acum_val_loss[-1]<best_val_loss:
        best_val_loss = acum_val_loss[-1]
        best_weights = deepcopy(ae.get_weights())
    
    # Refresh the loss plot every 50 epochs.
    if (e+1)%50 == 0:
        grafica_entrenamiento(acum_tr_loss, acum_val_loss)

In [None]:
ae.set_weights(best_weights)

In [None]:
print("loss en training :", ((np.array(X_tr) - ae.predict(X_tr))**2).mean())
print("loss en test     :", ((np.array(X_te) - ae.predict(X_te))**2).mean())

## Visualization of the learned embedding

In [None]:
# Project every split into the latent space with the encoder.
doc_vecs_tr  = encoder.predict(X_tr)
doc_vecs_val = encoder.predict(X_val)
doc_vecs_te  = encoder.predict(X_te)
# NOTE(review): index 1 is presumably the tied Dense kernel (one row per
# vocabulary term), with index 0 being the DenseTranspose bias - confirm with
# the shapes printed in the next cell.
word_vecs = decoder.get_weights()[1]

In [None]:
doc_vecs_tr.shape, doc_vecs_val.shape, doc_vecs_te.shape, word_vecs.shape

In [None]:
W1 = ae.layers[0].get_weights()[0]
W2 = ae.layers[1].get_weights()[1]

In [None]:
W1.shape, W2.shape

In [None]:
W1.T.dot(W2).round(3)

In [None]:
# importing bokeh library for interactive dataviz

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool, LabelSet, ColumnDataSource, Range1d
from bokeh.plotting import figure, show, output_notebook

# Interactive scatter of the vocabulary terms in the 2-D latent space.
output_notebook()
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# removed in Bokeh 3.
p = bp.figure(width=700, height=600, title="words in latent dimensions",
              tools="pan,wheel_zoom,box_zoom,reset,hover,save",
              x_axis_label='lat 0',
              y_axis_label='lat 1',
              min_border=1)
p.title.text_font_size='16pt'
p.xaxis.axis_label_text_font_style='normal'
p.xaxis.axis_label_text_font_size='16pt'
p.yaxis.axis_label_text_font_style='normal'
p.yaxis.axis_label_text_font_size='16pt'

p.xgrid.visible = False
p.ygrid.visible = False

# One point per vocabulary term, positioned by its two latent coordinates.
dictf = {'x':word_vecs[:,0],
         'y':word_vecs[:,1],
         'words':TFIDF_vocabulary}
aa = ColumnDataSource(dictf)
p.scatter(x='x', y='y', source=aa)
# Text label next to every point. `render_mode` is no longer passed: it was
# removed in Bokeh 3 and 'canvas' was already the default before that.
labels_p = LabelSet(x='x', y='y', text='words',
                    level='glyph',
                    x_offset=5, y_offset=5, source=aa)
p.add_layout(labels_p)

# Show the term itself when hovering over a point.
hover = p.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(p)

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

index2word = np.array(TFIDF_vocabulary)
word2index = {w:i for i,w in enumerate(index2word)}

def most_similar_words(word,wordvecs,n=10):
    """Return the n vocabulary words whose vectors are closest to that of `word`.

    Distances are computed with `pairwise_distances` using its default metric.

    Args:
        word: query term; must be a key of `word2index`.
        wordvecs: 2-D array with one row per vocabulary term.
        n: number of neighbours to return.

    Returns:
        Array of up to n words taken from `index2word`, nearest first, with the
        query word itself excluded.
    """
    target = word2index[word]
    query = np.reshape(wordvecs[target,:], (1, np.shape(wordvecs)[1]))
    ranking = pairwise_distances(query, wordvecs).flatten().argsort().tolist()
    # Drop the query word from its own neighbour list.
    neighbours = [idx for idx in ranking if idx != target]
    return index2word[neighbours[:n]]

In [None]:
len(TFIDF_vocabulary)

In [None]:
most_similar_words('debt', word_vecs)

In [None]:
most_similar_words('card', word_vecs)

In [None]:
most_similar_words('loan', word_vecs)

In [None]:
most_similar_words('call', word_vecs)

In [None]:
most_similar_words('bank', word_vecs)

In [None]:
most_similar_words('home', word_vecs)

In [None]:
# One colour per class, in the order of unique_labels.
colors = ['blue', 'orange']

# Static scatter of the training documents in the 2-D latent space, coloured by class.
plt.figure(figsize=(8,8))
for i,label in enumerate(unique_labels):
    plt.scatter(doc_vecs_tr[y_tr==label,0],
                doc_vecs_tr[y_tr==label,1],
                s = 1, alpha = 1, c = colors[i], label=label,
                )
plt.legend()
plt.xlabel('lat 0', fontsize=16)
plt.ylabel('lat 1', fontsize=16)
plt.title('docs in latent dimensions', fontsize=16);

In [None]:
def most_similar_docs(doc, wordvecs, docvecs, n=10):
    """Return the row indices of the n vectors in `docvecs` closest to `doc`.

    Args:
        doc: latent vector of the query document.
        wordvecs: 2-D array; only its second dimension is read, to obtain the
            latent dimensionality used to reshape `doc`.
        docvecs: 2-D array of candidate document vectors, one per row.
        n: number of neighbours to return.

    Returns:
        List of up to n row indices into `docvecs`, nearest first.
    """
    latent_dim = np.shape(wordvecs)[1]
    query = np.reshape(doc, (1, latent_dim))
    order = pairwise_distances(query, docvecs).flatten().argsort()
    return order.tolist()[:n]

In [None]:
# Index of the test document used as the query.
caso = 0

# Indices of the training documents nearest to the query in latent space.
doc_inds = most_similar_docs(doc_vecs_te[caso], word_vecs, doc_vecs_tr)

# ANSI escape sequences to switch bold text on and off in the printed output.
start_bold = '\033[1m'
end_bold   = '\033[0m'

print(start_bold + "document {} in test:".format(caso) + end_bold)
print(narratives_te[caso])
print(start_bold + "Most similar documents in training:" + end_bold)
# `i` is the rank of the neighbour (0 = closest); `ind` is its index in narratives_tr.
for i,ind in enumerate(doc_inds):
    print(start_bold + "* doc {} in training:".format(i) + end_bold)
    print(narratives_tr[ind])

In [None]:
# Interactive scatter of the training documents in the 2-D latent space.
output_notebook()
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# removed in Bokeh 3.
p = bp.figure(width=700, height=600, title="docs in latent dimensions",
              tools="pan,wheel_zoom,box_zoom,reset,hover,save",
              x_axis_label='lat 0',
              y_axis_label='lat 1',
              min_border=1)
p.title.text_font_size = '16pt'
p.xaxis.axis_label_text_font_style='normal'
p.xaxis.axis_label_text_font_size='16pt'
p.yaxis.axis_label_text_font_style='normal'
p.yaxis.axis_label_text_font_size='16pt'

p.xgrid.visible = False
p.ygrid.visible = False

# One glyph renderer (and legend entry) per class.
for i,label in enumerate(unique_labels):
    inds = np.where(y_tr==label)[0]
    dictf = {'x':doc_vecs_tr[inds,0],
             'y':doc_vecs_tr[inds,1],
             'text':np.array(narratives_tr)[y_tr==label],
             'ind':inds}
    aa = ColumnDataSource(dictf)
    # `legend_label` replaces the `legend` keyword, which was removed in
    # Bokeh 3; it expects a plain string.
    p.scatter(x='x', y='y', source=aa,
              color=colors[i], legend_label=str(label))

# Show the narrative and its position in the training set when hovering.
hover = p.select(dict(type=HoverTool))
hover.tooltips={"text": "@text", "index in tr": "@ind"}
show(p)

## Construction of classification models

### Naïve Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes trained on the 2-D latent document vectors.
clf = GaussianNB()
clf.fit(doc_vecs_tr, y_tr)
print("score en training :", clf.score(doc_vecs_tr, y_tr).round(3))
print("score en test     :", clf.score(doc_vecs_te, y_te).round(3))


### Support Vector Machines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# Per-dimension variance of the latent vectors, printed before standardization.
print(doc_vecs_tr.var(axis=0))

# Standardize the latent features, then fit an RBF-kernel SVM.
clf = Pipeline(
    [("scaler", StandardScaler()),
     ("SVC", SVC(kernel='rbf'))]
)

clf.fit(doc_vecs_tr, y_tr)
print("score en training :", clf.score(doc_vecs_tr, y_tr).round(3))
print("score en test     :", clf.score(doc_vecs_te, y_te).round(3))


### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Standardize the latent features, then fit a logistic regression with default settings.
# Later cells (classification report, ROC curves) use this `clf`.
clf = Pipeline(
    [("scaler", StandardScaler()),
     ("LogReg", LogisticRegression())]
)

clf.fit(doc_vecs_tr, y_tr)
print("score en training :", clf.score(doc_vecs_tr, y_tr).round(3))
print("score en test     :", clf.score(doc_vecs_te, y_te).round(3))

In [None]:
from sklearn.metrics import classification_report, roc_curve, auc

# NOTE(review): col_clase_positiva is not read anywhere in the visible notebook.
col_clase_positiva = 1
# Class probabilities and hard predictions on the test set from the current `clf`.
y_pred_proba = clf.predict_proba(doc_vecs_te)
y_pred = clf.predict(doc_vecs_te)
print('')
print(classification_report(y_te, y_pred))

In [None]:
# One line style per class (supports up to five classes).
colors2 = ['r', 'g', 'm', 'c', 'y']
plt.figure(figsize=(7,5))
# The columns of predict_proba follow the order of clf.classes_, so iterate over
# that attribute to guarantee each column is paired with its own class label.
for i,l in enumerate(clf.classes_):
    # One-vs-rest ROC curve, treating class `l` as the positive class.
    fpr, tpr, thresholds = roc_curve(y_te, y_pred_proba[:,i], pos_label=l)
    plt.plot(fpr, tpr, colors2[i]+'-.', label=l+' (%2.2f)' % auc(fpr, tpr))
# Chance-level diagonal, drawn from fixed endpoints rather than from the last
# curve's fpr values.
plt.plot([0, 1], [0, 1], 'b-', label = 'Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend();

# Clustering

In [None]:
scaler = StandardScaler()
X_km   = scaler.fit_transform(doc_vecs_tr)

# NOTE(review): this assignment discards the standardized matrix computed just
# above, so the clustering below runs on the raw latent vectors. Confirm which
# of the two inputs is intended and remove the other.
X_km = doc_vecs_tr

## k-means

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score as qmetric

# Number of k-means initializations per value of k (passed as n_init).
Nrepetitions = 10

# Parallel lists, one entry per tested k (in increasing order of k).
qualities = []
inertias = []
models = []
# Range of cluster counts to evaluate (inclusive).
kini = 1
kfin = 10
for k in range(kini,kfin+1):
    print("Evaluando k=%d" % k)
    km = KMeans(n_clusters=k,
                init='k-means++', n_init=Nrepetitions,
                max_iter=500, random_state=2)    
    km.fit(X_km)
    models.append(km)
    inertias.append(km.inertia_)
    # The quality metric is only evaluated for k > 1; for k = 1 a placeholder 0
    # is stored so the list stays aligned with `models` and `inertias`.
    if k >1:
        qualities.append(qmetric(X_km, km.labels_))
    else:
        qualities.append(0)

In [None]:
fig = plt.figure(figsize=(14,3))

# Left panel: inertia for each tested number of clusters.
ax = plt.subplot(1,2,1)
plt.plot(range(kini,kfin+1), inertias, marker='o')
plt.xlabel('number of clusters')
plt.title('clustering inertia')

# Right panel: quality metric for each tested number of clusters.
ax = plt.subplot(1,2,2)
plt.plot(range(kini,kfin+1), qualities, marker='o')
plt.xlabel('number of clusters')
plt.title('clustering quality')
plt.show()

# Position of the highest quality score; `models` was filled in the same order,
# so the same position selects the corresponding fitted model.
best = pd.Series(qualities).idxmax() # get index for the best model
km = models[best]
n_clusters = km.get_params()['n_clusters']
clusters = km.labels_
# Display the selected number of clusters.
n_clusters

In [None]:
def clustering_reporting(cluster_labels, clases):
    """Print the relative size and class composition of every cluster.

    For each distinct value in `cluster_labels` (in sorted order) one header
    line is printed with the share of all samples that fall in the cluster,
    followed by one line per distinct class giving the share of the cluster's
    samples that belong to that class, and then a blank line.

    Args:
        cluster_labels: cluster id of each sample (array-like).
        clases: class label of each sample (array-like of strings), aligned
            with `cluster_labels`.
    """
    # Accept plain lists as well as arrays: boolean masking needs ndarrays.
    cluster_labels = np.asarray(cluster_labels)
    clases = np.asarray(clases)
    unique_clases = np.unique(clases)

    # The header shows the running position `i`, not the raw cluster id.
    for i, cluster_id in enumerate(np.unique(cluster_labels)):
        members = clases[cluster_labels == cluster_id]
        print("cluster %d (%.2f%%):" % (i,100*len(members)/len(clases)))
        for clase in unique_clases:
            n_hits = int(np.count_nonzero(members == clase))
            print('   '+clase+": %.2f%%" % (100*n_hits/len(members)))
        print()

In [None]:
clustering_reporting(km.labels_, y_tr)