# Classifiers

To run this notebook, the required packages must be installed first; see [requirements](requirements.txt).  
That file was produced by running `conda list --export > requirements.txt`.


Reference: 
* [Multi-Label Text Classification by Zuzanna Deutschman](https://towardsdatascience.com/multi-label-text-classification-5c505fdedca8).
* [Multi-label Classification Examples](https://skml.readthedocs.io/en/latest/auto_examples/index.html)

## Setting up libraries

### Importing py modules

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
# import nltk
# nltk.download('stopwords')

# from nltk.corpus import stopwords
# from nltk.stem.snowball import SnowballStemmer
# from bs4 import BeautifulSoup
# import lxml

import re
# import csv
# from tqdm import tqdm
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import BernoulliNB
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
# from sklearn.preprocessing import MultiLabelBinarizer
# import pickle


## Loading the data

In [None]:
# Read the labelled tweets from the CSV file located next to this notebook.
tweets_new = pd.read_csv(filepath_or_buffer='tweets.csv')
# tweets_new

In [None]:
# Work on an independent copy so the frame loaded from disk stays untouched.
tweets = tweets_new.copy(deep=True)
tweets.head(n=10)

In [None]:
# Column names are resolved by position: column 0 holds the tweet text and
# columns 1-7 hold one label each.
column_names = tweets.columns.values

TEXT_DESCRIPTION_COL = column_names[0]
CIRCUMSTANTIAL_DESCRIPTION_COL = column_names[1]
ELECTRICITY_DESCRIPTION_COL = column_names[2]
GAS_DESCRIPTION_COL = column_names[3]
GASOLINE_DESCRIPTION_COL = column_names[4]
SOCIAL_DESCRIPTION_COL = column_names[5]
ACCOUNT_DESCRIPTION_COL = column_names[6]
WATER_DESCRIPTION_COL = column_names[7]

# Echo the resolved names, one per line, in positional order.
print('Column names:')
for resolved_name in (
    TEXT_DESCRIPTION_COL,
    CIRCUMSTANTIAL_DESCRIPTION_COL,
    ELECTRICITY_DESCRIPTION_COL,
    GAS_DESCRIPTION_COL,
    GASOLINE_DESCRIPTION_COL,
    SOCIAL_DESCRIPTION_COL,
    ACCOUNT_DESCRIPTION_COL,
    WATER_DESCRIPTION_COL,
):
    print('\t', resolved_name)

In [None]:
tc = tweets.copy()

# Names of the seven label columns (everything that is not the tweet text).
label_column_names = [
    CIRCUMSTANTIAL_DESCRIPTION_COL,
    ELECTRICITY_DESCRIPTION_COL,
    GAS_DESCRIPTION_COL,
    GASOLINE_DESCRIPTION_COL,
    SOCIAL_DESCRIPTION_COL,
    ACCOUNT_DESCRIPTION_COL,
    WATER_DESCRIPTION_COL,
]

# Separate the labels from the text.
tc_labels = tc.drop(columns=[TEXT_DESCRIPTION_COL])
tc_text = tc.drop(columns=label_column_names)

# Binarize the labels: every value that is not exactly 0 becomes 1, because
# only the presence or absence of a label on a tweet matters here.
tc_labels = tc_labels.mask(tc_labels != 0, 1)
tc_labels.head(10)

In [None]:
# Re-attach the binarized labels to the text. Merging on an index passed as an
# array makes pandas add a helper column named 'key_0', which is dropped again.
text_with_labels = tc_text.merge(tc_labels, on=tc_labels.index)
binarized_tweet_labels = text_with_labels.drop(columns=['key_0'])
binarized_tweet_labels.head(10)

## Vectorizing

In [None]:
# Split the data into train (70%) and test (30%) partitions.
train, test = train_test_split(binarized_tweet_labels, random_state=42, test_size=0.30, shuffle=True)
train_text = train[TEXT_DESCRIPTION_COL].values.astype('U')
test_text = test[TEXT_DESCRIPTION_COL].values.astype('U')


# Create the vectorizer, using uni-, bi- and tri-grams and keeping at most
# 10000 features.
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word',
                             ngram_range=(1,3), norm='l2', max_features = 10000)

# Fit on the TRAINING text only. Each call to fit() discards the previous
# vocabulary, so fitting on the test text afterwards would build the features
# from the test set alone and leak test information into the model.
# x_train is the vectorization of each training document - sparse matrix.
x_train = vectorizer.fit_transform(train_text)

# y_train holds the corresponding labels of each document - pandas.DataFrame.
y_train = train.drop(labels = [TEXT_DESCRIPTION_COL], axis=1)

# Same as above, but for the held-out test documents; they are only
# transformed with the vocabulary learned from the training text.
x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = [TEXT_DESCRIPTION_COL], axis=1)


In [None]:
# Display the repr of the training feature matrix built in the previous cell.
x_train

#### Explore . . .

In [None]:
# Features (uni-, bi- and tri-grams) selected by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()

# Preview a slice only: the full list can hold up to 10000 entries.
features[:50]

## Build a model, train and predict

Helper methods to evaluate the model

In [None]:
def report_measures(y_test, y_pred) -> None:
    """Print a fixed set of multi-label evaluation metrics.

    Accuracy and Hamming loss are printed first, followed by the F1 score,
    precision and recall, each with 'micro' and then 'macro' averaging.

    Args:
        y_test: true labels, passed unchanged to every metric function.
        y_pred: predicted labels, passed unchanged to every metric function.

    Returns:
        None. The metrics are only written to standard output.
    """
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("Hamming loss: ", hamming_loss(y_test, y_pred))

    # Metrics that take an `average` argument share the same report layout.
    averaged_metrics = (
        ("F1 score:", f1_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
    )
    for heading, metric in averaged_metrics:
        print(heading)
        for averaging in ('micro', 'macro'):
            print("\t" + averaging + ": ", metric(y_test, y_pred, average=averaging))

### Binary Relevance
1. with BernoulliNB

In [None]:
# Binary Relevance wrapping a Bernoulli Naive Bayes base classifier.
br_classifier = BinaryRelevance(classifier=BernoulliNB())
br_classifier.fit(x_train, y_train)

# Predict the labels of the held-out tweets and report the metrics.
br_predictions = br_classifier.predict(x_test)
report_measures(y_test, br_predictions)

2. with LogisticRegression

In [None]:
# Binary Relevance wrapping a Logistic Regression base classifier.
br_classifier1 = BinaryRelevance(classifier=LogisticRegression())
br_classifier1.fit(x_train, y_train)

# Predict the labels of the held-out tweets and report the metrics.
br_predictions1 = br_classifier1.predict(x_test)
report_measures(y_test, br_predictions1)

# Handy while debugging: compare the predictions with the true labels.
# br_predictions.toarray()
# y_test

### Label Powerset


In [None]:
# Label Powerset wrapping a Logistic Regression base classifier.
lp_classifier = LabelPowerset(classifier=LogisticRegression())
lp_classifier.fit(x_train, y_train)

# Predict the labels of the held-out tweets and report the metrics.
lp_predictions = lp_classifier.predict(x_test)
report_measures(y_test, lp_predictions)

### MLkNN

In [None]:
# Multi-label kNN classifier created with k=4
# (presumably the number of neighbours - see the scikit-multilearn MLkNN docs).
ml_classifier = MLkNN(k=4)
# to prevent errors when handling sparse matrices.
# NOTE: these three assignments rebind x_train, y_train and x_test to dense
# NumPy arrays, so every cell executed after this one sees the dense versions
# rather than the sparse matrix / DataFrame built in the vectorizing cell.
x_train = lil_matrix(x_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(x_test).toarray()

# Train on the dense data and predict the labels of the held-out tweets.
ml_classifier.fit(x_train, y_train)
ml_predictions = ml_classifier.predict(x_test)

# y_test is not rebound above; it is passed as created in the vectorizing cell.
report_measures(y_test, ml_predictions)

### Classifier Chain Model

In [None]:
# NOTE: for this classifier, categories that are 0 for every training sample
# may need to be removed from y_train / y_test first. The original recipe,
# kept here for reference, was:
#   y_train = train.drop(labels = [TEXT_DESCRIPTION_COL], axis=1)
#   y_test = test.drop(labels = [TEXT_DESCRIPTION_COL], axis=1)
#   selected_labels = y_train.columns[y_train.sum(axis = 0, skipna = True) > 0].tolist()
#   y_train = y_train.filter(selected_labels, axis=1)
#   y_test = y_test.filter(selected_labels, axis=1)
#   x_train = vectorizer.transform(train_text)
#   x_test = vectorizer.transform(test_text)

# Classifier Chain wrapping a Logistic Regression base classifier.
cc_classifier = ClassifierChain(classifier=LogisticRegression())
cc_classifier.fit(x_train, y_train)

# Keep the per-label probabilities: the threshold plot further down uses them.
cc_predictions_proba = cc_classifier.predict_proba(x_test)

# Hard predictions, used only for the metric report.
cc_predictions = cc_classifier.predict(x_test)
report_measures(y_test, cc_predictions)

In [None]:
# for plotting metrics as a function of threshold
def plotting_metrics_function_of_threshold(model_name, cc_predictions_proba, y_test):
    """Print and plot accuracy, micro-F1 and Hamming loss for a range of thresholds.

    For every threshold from 0.05 to 0.89 (step 0.01) the probabilities are
    binarized with `>= threshold`, the three metrics are computed against
    `y_test`, printed on one line, and finally plotted against the threshold.

    Args:
        model_name: text used as the title of the figure.
        cc_predictions_proba: predicted per-label probabilities; must support
            `>=` against a scalar and `.astype(int)` on the result.
        y_test: true labels, passed unchanged to the metric functions.

    Returns:
        None. The function only prints and shows a figure.
    """
    thresholds = []
    f1_values = []
    hamming_values = []
    accuracy_values = []

    for t in range(5, 90):
        threshold = t / 100
        y_pred_new = (cc_predictions_proba >= threshold).astype(int)

        # Compute each metric once and reuse it for the printout and the plot.
        accuracy = accuracy_score(y_test, y_pred_new)
        f1_micro = f1_score(y_test, y_pred_new, average="micro")
        hamming = hamming_loss(y_test, y_pred_new)

        print("t =", threshold, '\t', "Accuracy = ", accuracy,
              '\t', "F1 = ", f1_micro, '\t',
              "Hamming loss = ", hamming)

        # Store the real threshold (0.05 .. 0.89), not the loop counter, so the
        # x axis matches its "threshold" label and the printed values.
        thresholds.append(threshold)
        accuracy_values.append(accuracy)
        f1_values.append(f1_micro)
        hamming_values.append(hamming)

    plt.rcParams["figure.figsize"] = (12,6)
    with plt.style.context('ggplot'):
        plt.plot(thresholds, f1_values)
        plt.plot(thresholds, hamming_values)
        plt.plot(thresholds, accuracy_values)
        plt.legend(['F1', 'Hamming loss', 'Accuracy'], loc='center left', fontsize = 14)
        plt.ylabel("metrics", fontsize = 14)
        plt.xlabel("threshold", fontsize = 14)
        plt.title(model_name, fontsize = 18)
    plt.show()

plotting_metrics_function_of_threshold('Chain Model Classifier', cc_predictions_proba, y_test)

### Multi-label classifier using neural networks

Two ways to do this:
* using TF-IDF Vectorizer
* using Word Embeddings

In [None]:
import tensorflow as tf
from tensorflow import keras

#### using TF-IDF vectorizer
For this model we will use the TF-IDF vectorizer fitted in [Section 1.3](#Vectorizing) to convert the sentences into vectors, which are then passed
into the neural network.

In [None]:
# work to be done. . .

# Layer sizes are derived from the data: one input per feature column and one
# output per label column (every column of `tweets` except the first).
n_features = x_train.shape[1]
n_labels = len(tweets.columns) - 1

# A single hidden layer of 16 ReLU units followed by one sigmoid unit per label.
model = keras.Sequential([
    keras.layers.Dense(16, input_dim=n_features, activation="relu"),
    # keras.layers.Dense(16, activation="relu"),
    keras.layers.Dense(n_labels, activation="sigmoid"),
])

model.summary()

In [None]:


# Training configuration.
batch_size = 20
epochs = 10

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the network; the test split is also passed as validation data.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)

# Evaluate on the test split, then keep the raw sigmoid outputs for later cells.
results = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)

nn_predictions = model.predict(x_test)

In [None]:
# print(history)
# `results` is the value returned by model.evaluate on the test split
# (presumably [loss, accuracy] given the compile() settings - TODO confirm).
print(results)
# nn_predictions
# plotting_metrics_function_of_threshold('ANN Classifier', nn_predictions, y_test)


In [None]:
# Wrap the raw network outputs in a DataFrame with one column per label.
pd_pred_prob = pd.DataFrame(nn_predictions, columns = tweets.columns.values[1:])

# Binarize with a 0.5 cut-off: values >= 0.5 become 1, everything else 0.
# The built-in `int` is used because the `np.int` alias was removed in
# NumPy 1.24 and raises AttributeError on current releases.
pd_pred = (pd_pred_prob >= 0.5).astype(int)

report_measures(y_test, pd_pred.to_numpy())

In [None]:
# pd_pred.insert(0, 'tweet', test_text)
# pd_pred

In [None]:
# Classify a single, hand-written tweet with the trained network.
tw_doc = 'La cosa esta dificil con lo de la luz y todo lo demas'
tw_doc_vector = vectorizer.transform([tw_doc])

# vectorizer.transform returns a SciPy sparse matrix. Convert it to a dense
# array of shape (1, n_features) before predicting, so the input does not
# depend on how the installed Keras version treats a list-wrapped sparse matrix.
tw_pred = model.predict(tw_doc_vector.toarray())
print('Prediction: \n', tw_doc)
pd.DataFrame(tw_pred, columns = tweets.columns.values[1:])

#### using Word Embeddings

TBD

In [None]:
# binarized_tweet_labels

#  these are all the labels, for now I will select only one of them
#  and we will test the classification on a single label.

# print( TEXT_DESCRIPTION_COL )
# print( CIRCUMSTANTIAL_DESCRIPTION_COL )
# print( ELECTRICITY_DESCRIPTION_COL )
# print( GAS_DESCRIPTION_COL )
# print( GASOLINE_DESCRIPTION_COL )
# print( SOCIAL_DESCRIPTION_COL )
# print( ACCOUNT_DESCRIPTION_COL )
# print( WATER_DESCRIPTION_COL )

