# 0 Imports

In [None]:
import os, warnings, re, shutil
from datetime import datetime
from ast import literal_eval

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import multilabel_confusion_matrix, classification_report, ConfusionMatrixDisplay, accuracy_score, hamming_loss, precision_score, recall_score, f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from skmultilearn.model_selection import iterative_train_test_split

from bs4 import BeautifulSoup as bs
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer
# nltk.download()
from wordcloud import WordCloud

import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow_addons as tfa
from tensorflow.python.profiler import trace
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization
from tensorflow import keras
from tensorflow.keras.layers import Input, Dropout, Dense, Embedding, AveragePooling1D, Flatten
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy, AUC
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import bert
from transformers import TFBertModel, BertConfig, BertTokenizerFast
from tokenizers import BertWordPieceTokenizer

# tf.compat.v1.disable_eager_execution()
tf.compat.v1.enable_eager_execution()

import scripts

def set_seed(seed=31415):
    """Seed every random number generator the notebook can draw from.

    Seeds Python's ``random`` module, NumPy's global generator and
    TensorFlow's global generator, and exports ``PYTHONHASHSEED``.

    :param seed: integer seed shared by all generators.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this only affects
    # Python processes launched after this call (e.g. worker subprocesses).
    os.environ['PYTHONHASHSEED'] = str(seed)
set_seed()

# Global matplotlib defaults applied to every figure of the notebook.
plt.rc('figure', autolayout=True)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=18, titlepad=10)
plt.rc('image', cmap='magma')
# NOTE: silences every warning for the rest of the session, including
# deprecation and pandas chained-assignment warnings.
warnings.filterwarnings("ignore")

# Report whether TensorFlow sees a GPU (an empty device name means none).
print("-----------------------------------------")
if tf.test.gpu_device_name():
    print(f"GPU used: {tf.test.gpu_device_name()}")
else:
    print(f"GPU not used")
print("-----------------------------------------")

from tensorflow.python.client import device_lib 
print(device_lib.list_local_devices())

***
# 1 Config

In [None]:
# Switches gating the optional stages of the notebook; each flag is read by
# the `if config[...]` guard at the top of the corresponding cells.
config = {
    "raw_preprocessing": False,  # sections 2.1-2.6: parse raw bodies, write data/raw_data.csv
    "preprocessing": False,  # sections 2.7-2.14: tokenize/lemmatize/stem, write data/data_cleaned.csv
    "baseline": False,  # section 4: LDA + logistic-regression baseline
    "bert_base": False,  # section 6: bert-base-uncased model
    "bert_se": True  # section 7: BERT_SE embedding model
}

***
# 2 Preprocessing

## 2.0 Utils

In [None]:
def preproc_raw_data_body(cell):
    """Turn one HTML body into plain text.

    ``<script>`` and ``<code>`` elements are removed from the parsed tree,
    the remaining text is extracted, and every comma is replaced by a space.

    :param cell: HTML markup of a single post body.
    :return: the cleaned text.
    """
    soup = bs(cell, "html.parser")

    # Detach all <script> elements first, then all <code> elements.
    for tag_name in ("script", "code"):
        for element in soup.find_all(tag_name):
            element.extract()

    return soup.get_text().replace(',', ' ')

In [None]:
def tags_to_list(cell):
    """Split a ``<tag1><tag2>`` string into the list of its tag names.

    The string is cut on every ``<`` and ``>``; empty fragments are dropped.
    """
    fragments = re.split(r'[<>]', cell)
    return list(filter(None, fragments))

In [None]:
def lower(cell):
    """Return ``cell`` converted to lower case via its ``lower()`` method."""
    lowered = cell.lower()
    return lowered

In [None]:
def tokenize(*texts):
    """Tokenize one or more texts into a single flat list of tokens.

    A token is an optional leading dot followed by one or more characters
    from ``a-z`` or ``#``. Any character repeated four or more times in a row
    inside a token is collapsed to a single occurrence.

    :param texts: the strings to tokenize.
    :return: the tokens of all texts, in order.
    """
    # Both objects are loop-invariant, so build them once per call.
    # https://regex101.com/
    word_tokenizer = nltk.RegexpTokenizer(r'\.?[a-z#]+')
    repeated_char = re.compile("(.)\\1{3,}")

    tokens = []
    # `document` rather than `text`: the notebook imports a module named `text`.
    for document in texts:
        tokens.extend(
            repeated_char.sub("\\1", token)
            for token in word_tokenizer.tokenize(document)
        )
    return tokens

In [None]:
def remove_stop_words(cell, words=None):
    """Drop stop words from a list of tokens.

    :param cell: iterable of tokens.
    :param words: optional collection of words to remove. When omitted, the
        notebook-level ``stop_words`` variable is used, as before.
    :return: list of the tokens that are not stop words, order preserved.
    """
    if words is None:
        # Backward-compatible fallback on the global defined in section 2.9.
        words = stop_words
    # Set membership is constant-time, unlike scanning a list for every token.
    excluded = set(words)
    return [word for word in cell if word not in excluded]

In [None]:
def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    The mapping is driven by the first letter of the tag (J, V, N, R); any
    other tag falls back to ``'n'``.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(treebank_tag[:1], 'n')

In [None]:
def tag_pos(cell):
    """Tag tokens with ``pos_tag`` and return ``(token, wordnet_pos)`` pairs."""
    return [
        (tagged[0], get_wordnet_pos(tagged[1]))
        for tagged in pos_tag(cell)
    ]

In [None]:
def lemmatize(cell, with_pos=False):
    """Lemmatize tokens with NLTK's ``WordNetLemmatizer``.

    :param cell: list of tokens, or list of ``(token, pos)`` pairs when
        ``with_pos`` is true.
    :param with_pos: pass the second element of each pair as the ``pos``
        argument of the lemmatizer.
    :return: list of lemmatized tokens.
    """
    wnl = WordNetLemmatizer()
    if with_pos:
        return [wnl.lemmatize(pair[0], pos=pair[1]) for pair in cell]
    return [wnl.lemmatize(token) for token in cell]

In [None]:
def stemmize(cell):
    """Stem every token of ``cell`` with the English Snowball stemmer."""
    stem = SnowballStemmer("english").stem
    return list(map(stem, cell))

## 2.1 Loading raw data

In [None]:
if config["raw_preprocessing"]:

    # Read the raw post bodies; the next section parses their "Body" column as HTML.
    raw_data_body = pd.read_csv("data/raw_data_body.csv")
    # NOTE: an expression nested inside `if` is not echoed by the notebook,
    # so this shape is evaluated but never displayed.
    raw_data_body.shape

## 2.2 Parse html

In [None]:
if config["raw_preprocessing"]:

    # Clean every body in place: strip markup, scripts and code snippets.
    raw_data_body["Body"] = raw_data_body["Body"].apply(preproc_raw_data_body)

## 2.3 Merge body with rest

In [None]:
# NOTE: rebinding `display` hides IPython's built-in `display()` helper for
# the rest of the session.
display = None

if config["raw_preprocessing"]:

    raw_data_rest = pd.read_csv("data/raw_data_id_title_tags.csv")
    # `join` aligns the two frames on their index, so this presumably relies on
    # both CSV files listing the posts in the same order — TODO confirm.
    raw_data = raw_data_rest.join(raw_data_body)
    # Index by post id and keep only the three columns used downstream.
    raw_data = raw_data.set_index("Id").reindex(["Title", "Body", "Tags"], axis="columns")
    # Title and cleaned body concatenated into the single text fed to the models.
    raw_data["Sentence_Pristine"] = raw_data.apply(lambda row: row["Title"] + " " + row["Body"], axis="columns")
    display = raw_data.head()

display

## 2.4 Tags to list

In [None]:
display = None

if config["raw_preprocessing"]:

    # "<a><b>" strings become ["a", "b"] lists, one list per post.
    raw_data["Tags"] = raw_data["Tags"].apply(tags_to_list)
    display = raw_data[["Tags"]].head()

display

## 2.5 Lowering

In [None]:
display = None

if config["raw_preprocessing"]:

    # Lower-case the full text of every post.
    raw_data["Sentence_Pristine"] = raw_data["Sentence_Pristine"].apply(lower)
    display = raw_data[["Sentence_Pristine"]].head()

display

## 2.6 Save raw preprocessed data

In [None]:
if config["raw_preprocessing"]:

    # Title and Body are already merged into Sentence_Pristine; persist the rest.
    raw_data = raw_data.drop(columns=["Title", "Body"])
    raw_data.to_csv("data/raw_data.csv")

## 2.7 Load raw preprocessed data

In [None]:
if config["preprocessing"]:
    
    data = pd.read_csv("data/raw_data.csv", index_col="Id")

## 2.8 Tokenize

In [None]:
display = None

if config["preprocessing"]:

    # One token list per post, computed from the lower-cased text.
    data["Tokens"] = data["Sentence_Pristine"].apply(tokenize)
    display = data[["Sentence_Pristine", "Tokens"]].head()

display

## 2.9 StopWords deletion

In [None]:
display = None

if config["preprocessing"]:

    # `stop_words` is the global read by remove_stop_words().
    stop_words = stopwords.words("english")
    data["Tokens"] = data["Tokens"].apply(remove_stop_words)
    display = data[["Sentence_Pristine", "Tokens"]].head()

display

## 2.10 POS - Part-Of-Speech

In [None]:
display = None

if config["preprocessing"]:

    # (token, wordnet_pos) pairs for every post.
    data["POS"] = data["Tokens"].apply(tag_pos)
    display = data[["Tokens", "POS"]].head()

display

## 2.11 Lemmatize

In [None]:
display = None

if config["preprocessing"]:

    # Lemmatize each token using the POS computed in the previous section.
    data["Lemmatized"] = data["POS"].apply(lambda pairs: lemmatize(pairs, with_pos=True))
    display = data[["Tokens", "Lemmatized"]].head()

display

## 2.12 Stemmize

In [None]:
display = None

if config["preprocessing"]:

    # Stem the lemmatized tokens: the column is named "LemmaAndStem", but the
    # previous code stemmed the raw "Tokens" and ignored the lemmatization step.
    data["LemmaAndStem"] = data["Lemmatized"].apply(stemmize)
    display = data[["Tokens", "LemmaAndStem"]].head()

display

## 2.13 Generating sentence

In [None]:
display = None

if config["preprocessing"]:

    # Re-assemble the processed tokens into one space-separated string per post.
    data["Sentence"] = data["LemmaAndStem"].apply(lambda stems: " ".join(map(str, stems)))
    display = data[["LemmaAndStem", "Sentence"]].head()

display

## 2.14 Saving data

In [None]:
display = None

if config["preprocessing"]:

    data.to_csv("data/data_cleaned.csv", index_label="Id")
    display = data.head()

display

***
# 3 Data preparation

## 3.0 Utils

In [None]:
def bow(dataset, max_features=None, min_df=0.0, max_df=1.0):
    """Build a bag-of-words table from a collection of documents.

    :param dataset: iterable of documents (strings).
    :param max_features: forwarded to ``CountVectorizer``.
    :param min_df: forwarded to ``CountVectorizer``.
    :param max_df: forwarded to ``CountVectorizer``.
    :return: ``(bag, vectorizer)`` where ``bag`` is a DataFrame with one row
        per document and one column per vocabulary term, and ``vectorizer``
        is the fitted ``CountVectorizer``.
    """
    vectorizer = CountVectorizer(
        analyzer="word", tokenizer=None, preprocessor=None, stop_words=None,
        max_features=max_features, max_df=max_df, min_df=min_df,
    )
    matrix = vectorizer.fit_transform(dataset)

    # Share of non-zero cells, computed on the sparse matrix so that no dense
    # copy is created just for this diagnostic. The value is a density (the
    # complement of sparsity), hence the label.
    n_cells = matrix.shape[0] * matrix.shape[1]
    density = (matrix > 0).sum() / n_cells * 100
    print(f"Density (non-zero cells): {density:.4}%")

    vocab = vectorizer.get_feature_names_out()

    # Single densification, needed to build the returned DataFrame.
    bag = pd.DataFrame(data=matrix.toarray(), columns=vocab)
    return bag, vectorizer

In [None]:
def tfidf(dataset, max_features=None, min_df=0.0, max_df=1.0):
    """Build a TF-IDF table from a collection of documents.

    :return: ``(frame, vectorizer)`` — a DataFrame with one column per
        vocabulary term, and the fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(tokenizer=None, stop_words=None, max_features=max_features, min_df=min_df, max_df=max_df)
    weights = vectorizer.fit_transform(dataset)
    frame = pd.DataFrame(
        data=weights.toarray(),
        columns=vectorizer.get_feature_names_out(),
    )
    return frame, vectorizer

## 3.1 Loading

In [None]:
# Load the preprocessed posts; the "Id" index is turned back into a column
# and then dropped by position, leaving a default integer index.
data = pd.read_csv("data/data_cleaned.csv", index_col="Id")
data = data.reset_index()
data = data.iloc[:, 1:]

# The tag lists were written to CSV as text. literal_eval only parses Python
# literals, whereas eval would execute any expression found in the file.
data["Tags"] = data["Tags"].apply(literal_eval)

data.head()

In [None]:
data = data[["Sentence_Pristine", "Sentence", "Tags"]]
data.head()

In [None]:
data.shape

***
## 3.2 Dataset constraints

BERT_SE was fine-tuned on a dataset of 23,313 rows, so our 50k-row dataset must be restricted to the same number of rows.

In [None]:
# Keep the first 23 313 rows (see the note above). `.copy()` makes the result
# an independent frame, so the column added later ("Tags_Reduced") is not
# written to a slice of the original DataFrame.
data = data.iloc[:23313].copy()

In [None]:
data.shape

In [None]:
data.head()

***
## 3.3 Tags

In [None]:
data[["Tags"]].head()

In [None]:
data.Tags.values

In [None]:
# Flatten the per-post tag lists, then count the occurrences of every tag.
tags = [tag for row_tags in data.Tags.values for tag in row_tags]
tags_df = pd.DataFrame(data=tags, columns=["Tag"]).value_counts().reset_index()
tags_df.columns = ["Tag", "Count"]

In [None]:
tags_df.info()

In [None]:
tags_df.head()

In [None]:
# Bar chart of the first 20 rows of tags_df (tag name vs. number of posts).
plt.figure(figsize=(15, 4))

sns.barplot(data=tags_df.iloc[:20], x="Tag", y="Count")

plt.title("Tag count", size=20)
plt.xlabel("Tag", size=16)
plt.ylabel("Count", size=16)
plt.xticks(rotation=45, size=16, ha="right")
plt.yticks(size=16)
plt.show()

In [None]:
# Empirical cumulative distribution of the per-tag counts, on a log x axis.
plt.figure(figsize=(15, 3))

ax = sns.ecdfplot(data=tags_df, x="Count", log_scale=True)

# Red dashed reference lines at proportion 0.98 and at a count of 30.
plt.axhline(0.98, linestyle="--", linewidth=1, color="r")
plt.axvline(30, linestyle="--", linewidth=1, color="r")

plt.title("Cummulative coverage percentage", size=20)
plt.xlabel("Number of post", size=16)
plt.ylabel("Proportion", size=16)
plt.xticks(rotation=45, size=16, ha="right")
plt.yticks(size=16)
plt.show()

In [None]:
# Keep the first 30 rows of tags_df (value_counts orders them by decreasing count).
tags_df = tags_df[:30]
# Number of retained tags; used later as the size of the networks' output layer.
TAGS_NB = tags_df.shape[0]
tags_df.info()

In [None]:
word_frequencies = dict(zip(tags_df.Tag, tags_df.Count))
wordcloud = WordCloud(background_color="black", width=1600, height=800).generate_from_frequencies(word_frequencies)

fig = plt.figure(figsize=(15, 5))
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
tags = tags_df.Tag.tolist()

In [None]:
tags[:20]

In [None]:
def find_or_remove(cell, word_list):
    """Return the elements of ``cell`` that also appear in ``word_list``.

    Order and duplicates of ``cell`` are preserved.
    """
    kept = []
    for word in cell:
        if word in word_list:
            kept.append(word)
    return kept

In [None]:
# Restrict every post's tags to the retained most frequent tags.
data["Tags_Reduced"] = data["Tags"].apply(lambda row_tags: find_or_remove(row_tags, tags))

In [None]:
data["Tags_Reduced"].isna().any()

In [None]:
data["Tags_Reduced"].head()

In [None]:
# data["Tags_Reduced"] = data.apply(lambda row: at_least_two(row), axis=1)
# data.dropna(subset=["Tags_Reduced"], inplace=True)
# data.shape

***
# 4 Baseline

## 4.0 Utils

In [None]:
# https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#9buildldamodelwithsklearn
#
def latent_dirichlet_allocation_tuning(dataset: pd.DataFrame, param_grid: dict):
    """Grid-search an LDA model on the bag-of-words of ``dataset``.

    :param dataset: documents passed to ``bow`` (with ``min_df=.005``).
    :param param_grid: grid handed to ``GridSearchCV``.
    :return: ``(gs, feature_names, data_bow, vectorizer, lda_output_dataframe)``:
        the fitted grid search, the vocabulary (columns of the bag-of-words),
        the bag-of-words itself, the fitted vectorizer, and the document/topic
        matrix of the best model rounded to two decimals.
    """
    data_bow, vectorizer = bow(dataset, min_df=.005)

    gs = GridSearchCV(LatentDirichletAllocation(), param_grid)
    gs.fit(data_bow)

    best_lda = gs.best_estimator_
    doc_topic = np.round(best_lda.transform(data_bow), 2)
    topic_names = [f"Topic{i}" for i in range(best_lda.n_components)]
    lda_output_dataframe = pd.DataFrame(doc_topic, columns=topic_names)

    return gs, data_bow.columns, data_bow, vectorizer, lda_output_dataframe

In [None]:
def get_dominant_topic(lda_model, data_bow, dataset_row_nb):
    """Build the document/topic table and flag each document's dominant topic.

    :param lda_model: fitted model exposing ``transform`` and ``n_components``.
    :param data_bow: bag-of-words passed to ``lda_model.transform``.
    :param dataset_row_nb: number of documents, used to label the rows.
    :return: DataFrame indexed ``Doc0..`` with columns ``Topic0..`` (weights
        rounded to two decimals) plus ``Dominant_Topic`` (index of the
        largest rounded weight).
    """
    doc_topic = np.round(lda_model.transform(data_bow), 2)
    columns = [f"Topic{i}" for i in range(lda_model.n_components)]
    index = [f"Doc{i}" for i in range(dataset_row_nb)]

    df_document_topic = pd.DataFrame(doc_topic, columns=columns, index=index)
    df_document_topic["Dominant_Topic"] = df_document_topic.values.argmax(axis=1)
    return df_document_topic

In [None]:
def topic_distribution(dominant_topic_df):
    """Plot and return the number of documents per dominant topic.

    :param dominant_topic_df: DataFrame with a ``Dominant_Topic`` column.
    :return: DataFrame with columns ``Dominant_Topic`` and ``Count``.
    """
    distribution = dominant_topic_df["Dominant_Topic"].value_counts().reset_index()
    distribution.columns = ["Dominant_Topic", "Count"]

    plt.figure(figsize=(15, 4))

    sns.countplot(data=dominant_topic_df, x="Dominant_Topic")

    # The chart shows topics, not tags: the labels were copied from the tag plot.
    plt.title("Dominant topic count", size=20)
    plt.xlabel("Dominant topic", size=16)
    plt.ylabel("Count", size=16)
    plt.xticks(rotation=45, size=16, ha="right")
    plt.yticks(size=16)
    plt.show()

    return distribution

In [None]:
def topic_words(lda_model, feature_names, n_words=20):
    """List the highest-weighted words of every topic.

    :param lda_model: fitted model exposing ``components_`` (one row of word
        weights per topic).
    :param feature_names: vocabulary aligned with the columns of
        ``components_``; any sequence (list, array, pandas Index) is accepted.
    :param n_words: number of words kept per topic.
    :return: DataFrame indexed ``Topic0..`` with columns ``Word0..``, words
        ordered by decreasing weight.
    """
    # Index through an array so that plain lists work too (they have no `.take`).
    keywords = np.asarray(feature_names)
    topic_keywords = [
        keywords[(-topic_weight).argsort()[:n_words]]
        for topic_weight in lda_model.components_
    ]

    topic_keywords_df = pd.DataFrame(data=topic_keywords)
    topic_keywords_df.columns = ["Word"+str(i) for i in range(topic_keywords_df.shape[1])]
    topic_keywords_df.index = ["Topic"+str(i) for i in range(topic_keywords_df.shape[0])]
    return topic_keywords_df

In [None]:
def make_prediction(lda_model, sentence, vectorizer, topic_keywords_dataset):
    """Predict the dominant topic of a raw sentence.

    The sentence is preprocessed with ``scripts.preprocess_sentence``,
    vectorized, and scored by the LDA model; the row of
    ``topic_keywords_dataset`` at the best-scoring position is returned.

    :return: ``(topic_name, topic_words, topic_probability_score)``.
    """
    cleaned = scripts.preprocess_sentence(sentence)
    topic_probability_score = lda_model.transform(vectorizer.transform([cleaned]))
    best_topic = topic_keywords_dataset.iloc[np.argmax(topic_probability_score), :]
    return best_topic.name, best_topic.values.tolist(), topic_probability_score

In [None]:
def classifier_tuning_post_lda(dataset_X: pd.DataFrame, dataset_y: pd.DataFrame, meta_model, model, param_grid: dict, scoring: str = "f1_micro"):
    """Grid-search a multi-label classifier on LDA topic features.

    :param dataset_X: feature table (one row per document).
    :param dataset_y: iterable of label collections, binarized with
        ``MultiLabelBinarizer``.
    :param meta_model: multi-label estimator tuned by the grid search.
    :param model: base estimator; kept for interface compatibility, not used here.
    :param param_grid: grid handed to ``GridSearchCV``.
    :param scoring: scoring name handed to ``GridSearchCV``.
    :return: ``(gs, classes, y_test, y_pred)`` — the fitted grid search, the
        label names, and the true/predicted indicator matrices of the test split.
    """
    start = datetime.now()

    # target multi label binarizer
    multi_label_binarizer = MultiLabelBinarizer()
    y = multi_label_binarizer.fit_transform(dataset_y)
    classes = multi_label_binarizer.classes_

    X_train, X_test, y_train, y_test = train_test_split(dataset_X, y, test_size = 0.33, random_state = 1)

    # Scaling statistics come from the training split only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # gridsearch tuning/fitting. GridSearchCV fits its own clones of
    # `meta_model` and refits the best one, so the former stand-alone
    # `meta_model.fit(...)` call only doubled the training time.
    gs = GridSearchCV(meta_model, param_grid, scoring=scoring, refit=True)
    gs.fit(X_train, y_train)

    # advanced evaluation
    y_pred = gs.best_estimator_.predict(X_test)

    print(f"classifier_tuning > Time taken to run this cell : {datetime.now() - start} \n")

    return gs, classes, y_test, y_pred

In [None]:
def evaluate(gs, classes, y_test, y_pred):
    """Print multi-label classification metrics for a set of predictions.

    Prints subset accuracy, Hamming loss, micro- and macro-averaged
    precision/recall/F1 and the per-class classification report.

    :param gs: fitted grid search; kept for interface compatibility, not used here.
    :param classes: label names, used as ``target_names`` of the report.
    :param y_test: true label indicator matrix.
    :param y_pred: predicted label indicator matrix.
    """
    # Bind scikit-learn's implementation locally: a later cell of this
    # notebook defines a Keras function also called `hamming_loss`, which
    # replaces the global name once that cell has been executed.
    from sklearn.metrics import hamming_loss as sk_hamming_loss

    start = datetime.now()

    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Hamming loss ", sk_hamming_loss(y_test, y_pred))

    for average in ("micro", "macro"):
        precision = precision_score(y_test, y_pred, average=average)
        recall = recall_score(y_test, y_pred, average=average)
        f1 = f1_score(y_test, y_pred, average=average)

        print(f"{average.capitalize()}-average quality numbers")
        print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

    print(classification_report(y_test, y_pred, target_names=classes, zero_division=0))

    print(f"evaluate > Time taken to run this cell : {datetime.now() - start}\n")

***
## 4.1 Latent Dirichlet Allocation

In [None]:
if config["baseline"]:

    # Every entry holds a single value, so the grid search evaluates exactly
    # one LDA configuration.
    param_grid = {
        "n_components": [10],
        "learning_decay": [.7],
        "random_state": [0],
        "n_jobs": [10]
    }

    # Fit on the preprocessed "Sentence" column; the returned objects are
    # reused by the following baseline cells.
    gs, feature_names, data_bow, vectorizer, lda_output_dataframe = latent_dirichlet_allocation_tuning(data["Sentence"], param_grid)

In [None]:
if config["baseline"]:

    print(f"Log likelihood: {gs.best_estimator_.score(data_bow)}")
    print(f"Perplexity: {gs.best_estimator_.perplexity(data_bow)}")
    print(f"Best params: {gs.best_params_}")

In [None]:
if config["baseline"]:

    dominant_topic_df = get_dominant_topic(gs.best_estimator_, data_bow, data.shape[0])
    dominant_topic_df.head()

In [None]:
if config["baseline"]:

    topic_distribution(dominant_topic_df)

In [None]:
if config["baseline"]:

    topic_keywords_df = pd.DataFrame(data=gs.best_estimator_.components_, columns=feature_names, index=dominant_topic_df.columns[:-1])
    topic_keywords_df.info()    

In [None]:
display = None

if config["baseline"]:

    display = topic_keywords_df.iloc[:, :20]

display

In [None]:
display = None

if config["baseline"]:

    topic_keywords_dataset = topic_words(lda_model=gs.best_estimator_, feature_names=feature_names, n_words=20)
    display = topic_keywords_dataset

display

In [None]:
if config["baseline"]:

    param_grid = {
        "estimator__solver": ["liblinear"],
        "estimator__penalty": ["l1"],
        "estimator__random_state": [0],
    }

    model = LogisticRegression()
    meta_model = OneVsRestClassifier(model)
    # Stored under its own name: reusing `gs` would overwrite the LDA grid
    # search that the previous cells read, and break them when re-run.
    clf_gs, classes, y_test, y_pred = classifier_tuning_post_lda(lda_output_dataframe, data["Tags_Reduced"], meta_model, model, param_grid)
    evaluate(clf_gs, classes, y_test, y_pred)

***
## 4.2 Results

<img src="records/lda_results_metrics.png" style="background-color:white">

<img src="records/lda_results_topic_distrib_plot.png" style="background-color:white">

<img src="records/lda_results_topic_words.png" style="background-color:white">

<img src="records/logistic_classifier_results_01.png" style="background-color:white">

<img src="records/logistic_classifier_results_02.png" style="background-color:white">

<img src="records/logistic_classifier_results_03.png" style="background-color:white">

***
# 5 NN preparation

## 5.0 Utils

In [None]:
def make_pseudo_list(row):
    """Render ``row["Tags_Reduced"]`` as a list-like string.

    Every element is wrapped in single quotes and followed by a comma, e.g.
    ``['a','b',]``; an empty list gives ``[]``.
    """
    quoted = "".join("'" + elt + "'," for elt in row["Tags_Reduced"])
    return "[" + quoted + "]"

In [None]:
def at_least_two(row):
    """Return ``row["Tags_Reduced"]`` when it holds more than one tag, else ``None``."""
    reduced = row["Tags_Reduced"]
    if len(reduced) <= 1:
        return None
    return row["Tags_Reduced"]

In [None]:
# about multilabelbinarizer, reverse action
def to_class(mlb, vec):
    """Turn a binarized label vector back into a space-separated label string.

    :param mlb: fitted binarizer exposing ``classes_``.
    :param vec: indicator vector aligned with ``mlb.classes_``.
    :return: the names of the classes whose entry in ``vec`` is non-zero,
        joined by single spaces (empty string when none is set).
    """
    # Materialize the class list once instead of once per selected index.
    class_names = list(mlb.classes_)
    return ' '.join(class_names[i] for i, flag in enumerate(vec) if flag != 0)

In [None]:
def multi_label_accuracy(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor:
    """Exact-match accuracy for multi-label classification.

    For multi-label classification, one has to define a custom accuracy
    function because neither tf.keras.metrics.Accuracy nor
    tf.keras.metrics.CategoricalAccuracy evaluate the number of
    exact matches.

    Predictions are rounded to 0/1; a sample counts as correct only when
    every label matches.

    :Example:
    >>> from tensorflow.keras import metrics
    >>> y_true = tf.convert_to_tensor([[1., 1.]])
    >>> y_pred = tf.convert_to_tensor([[1., 0.]])
    >>> metrics.Accuracy()(y_true, y_pred).numpy()
    0.5
    >>> metrics.CategoricalAccuracy()(y_true, y_pred).numpy()
    1.0
    >>> multi_label_accuracy(y_true, y_pred).numpy()
    0.0
    """
    y_pred = tf.math.round(y_pred)
    # Align dtypes so that integer labels can be compared with float predictions.
    y_true = tf.cast(y_true, y_pred.dtype)
    # Explicit element-wise comparison, independent of how `==` is configured
    # for tensors.
    exact_matches = tf.math.reduce_all(tf.math.equal(y_pred, y_true), axis=1)
    exact_matches = tf.cast(exact_matches, tf.float32)
    return tf.math.reduce_mean(exact_matches)

In [None]:
def hamming_loss(y_true, y_pred):
    """Soft Hamming loss computed with Keras backend operations.

    Averages ``y_true*(1-y_pred) + (1-y_true)*y_pred`` over all entries.

    NOTE: this definition rebinds the name ``hamming_loss`` that the import
    cell takes from ``sklearn.metrics``.
    """
    missed = y_true*(1-y_pred)
    spurious = (1-y_true)*y_pred
    return K.mean(missed+spurious)

In [None]:
def Custom_Hamming_Loss1(y_true, y_pred):
    """Fraction of entries whose absolute error exceeds 0.5."""
    is_wrong = K.greater(K.abs(y_true-y_pred), 0.5)
    return K.mean(K.cast(is_wrong, dtype=float))

In [None]:
def f1_metric(y_true, y_pred):
    """F1 score computed over all entries of the batch.

    Values are clipped to [0, 1] and rounded before counting; ``K.epsilon()``
    guards every division against a zero denominator.
    """
    def _count(values):
        # Number of entries that round to 1 once clipped to [0, 1].
        return K.sum(K.round(K.clip(values, 0, 1)))

    true_positives = _count(y_true * y_pred)
    possible_positives = _count(y_true)
    predicted_positives = _count(y_pred)

    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    return 2*(precision*recall)/(precision+recall+K.epsilon())

In [None]:
def jaccard_distance(y_true, y_pred, smooth=100):
    """Smoothed Jaccard distance along the last axis.

    Computes ``(1 - (|A∩B| + smooth) / (|A| + |B| - |A∩B| + smooth)) * smooth``
    where the set sizes are sums of absolute values.
    """
    overlap = K.sum(K.abs(y_true * y_pred), axis=-1)
    total = K.sum(K.abs(y_true) + K.abs(y_pred), axis=-1)
    similarity = (overlap + smooth) / (total - overlap + smooth)
    return (1 - similarity) * smooth

In [None]:
def visualize_history(history, figsize=(20, 10), metrics: str = "categorical_accuracy"):
    """Plot training and validation curves from a Keras history table.

    The upper panel shows ``loss`` / ``val_loss``; the lower panel shows the
    requested metric and its ``val_`` counterpart.

    :param history: DataFrame built from ``History.history`` (one row per epoch).
    :param figsize: size of the figure.
    :param metrics: name of the metric column plotted in the lower panel.
    """
    # Draw on the axes returned by `subplots` instead of re-selecting them
    # through pyplot's implicit "current axes" state.
    fig, (loss_ax, metric_ax) = plt.subplots(2, 1, figsize=figsize, sharex=True)

    panels = (
        (loss_ax, "Loss", "loss"),
        # Titled after the plotted metric: the panel is also used for AUC,
        # not only for accuracies.
        (metric_ax, metrics, metrics),
    )
    for ax, title, column in panels:
        ax.set_title(title)
        sns.lineplot(data=history, x=history.index, y=column, label=column, ax=ax)
        sns.lineplot(data=history, x=history.index, y="val_" + column, label="val_" + column, ax=ax)
        ax.set_xlabel("epochs")
        ax.tick_params(labelright=True)
        ax.legend()
        ax.grid()

***
## 5.1 Data split

In [None]:
tags = data[["Tags_Reduced"]]
tags.head()

In [None]:
X = data[["Sentence_Pristine"]]

In [None]:
# One indicator column per retained tag, aligned on the index of `data`.
binarizer = MultiLabelBinarizer()
y_binarized = binarizer.fit_transform(data["Tags_Reduced"])
y_binarized_df = pd.DataFrame(y_binarized, columns=binarizer.classes_, index=data.index)
# NOTE: `data` is rebound to "text + indicator columns" and no longer has a
# "Tags_Reduced" column, so this cell cannot be re-run without re-running
# the data preparation section first.
data = X.join(y_binarized_df)
data.head()

In [None]:
y = data.iloc[:, 1:]
y.head()

In [None]:
X = X["Sentence_Pristine"]
X.head()

In [None]:
data["Sentence_Pristine"].head()

In [None]:
# Fixed random_state: the split no longer depends on how many random draws
# happened earlier in the session, so BERT_base and BERT_SE are always
# trained and evaluated on the same partition.
X_train_, X_test_, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=31415)

In [None]:
text_lenghts = [len(t.split()) for t in X]
ax = sns.histplot(data=text_lenghts, kde=True, stat="density")
ax.set_title("Texts length distribution (number of words):")

In [None]:
max_length = 100

***
# 6 BERT_base

## 6.1 Loading BERT

In [None]:
if config["bert_base"]:

    model_name = "bert-base-uncased"

In [None]:
if config["bert_base"]:
    
    bert_config = BertConfig.from_pretrained(model_name)
    bert_config.output_hidden_states = False

In [None]:
if config["bert_base"]:

    tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path=model_name, config=bert_config)

***
## 6.2 BERT Tokenizer

In [None]:
if config["bert_base"]:

    # Shared tokenizer options. padding="max_length" pads every sample to
    # exactly `max_length` tokens; padding=True only pads to the longest
    # sample of each call, which can be shorter than the fixed input length
    # declared by the network.
    tokenizer_kwargs = dict(
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors='tf',
        return_token_type_ids=True,
        return_attention_mask=True,
        verbose=True
    )

    X_train = tokenizer(text=X_train_.to_list(), **tokenizer_kwargs)
    X_test = tokenizer(text=X_test_.to_list(), **tokenizer_kwargs)

In [None]:
if config["bert_base"]:

    X_train.keys()

In [None]:
if config["bert_base"]:

    X_train["input_ids"][0]

In [None]:
if config["bert_base"]:

    X_train["token_type_ids"][0]

In [None]:
if config["bert_base"]:

    X_train["attention_mask"][0]

***
## 6.3 Tensorflow dataset creation

In [None]:
if config["bert_base"]:
    
    ds_train = tf.data.Dataset.from_tensor_slices((dict(X_train), y_train))
    ds_test = tf.data.Dataset.from_tensor_slices((dict(X_test), y_test))

***
## 6.4 Network

In [None]:
if config["bert_base"]:

    bert_model_name = "bert-base-uncased"
    max_seq_len = 100
    tags_nb = TAGS_NB

In [None]:
if config["bert_base"]:

    # One integer input per tokenizer output; the layer names match the keys
    # of the dictionaries stored in the tf.data datasets.
    input_ids = keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32, name='input_ids')
    input_type = keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32, name='token_type_ids')
    input_mask = keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32, name='attention_mask')
    inputs = [input_ids, input_type, input_mask]

    # NOTE: rebinding `bert` hides the `bert` module imported in the first cell.
    bert = TFBertModel.from_pretrained(bert_model_name)
    bert_outputs = bert(inputs)
    last_hidden_states = bert_outputs.last_hidden_state

    # Average the token representations, then one sigmoid unit per tag
    # (independent probabilities, as required for multi-label output).
    avg = keras.layers.GlobalAveragePooling1D()(last_hidden_states)
    output = keras.layers.Dense(tags_nb, activation="sigmoid")(avg)

    model = keras.Model(inputs=inputs, outputs=output)

    model.summary()

In [None]:
if config["bert_base"]:

    max_epochs = 4
    batch_size = 4

    opt = tfa.optimizers.RectifiedAdam(learning_rate=3e-5)
    loss = keras.losses.BinaryCrossentropy()
    best_weights_file = "bert_base_weights.h5"

    # Explicit metric name: an unnamed AUC is called "auc", "auc_1", ...
    # depending on how many were created before in the session, so the
    # monitored key "val_auc_1" (and the "auc_1" column read by the results
    # cell) would not exist on a fresh kernel.
    auc_metric = keras.metrics.AUC(multi_label=True, curve="ROC", name="auc_1")

    m_ckpt = ModelCheckpoint(
        best_weights_file,
        monitor='val_auc_1',
        mode='max',
        verbose=2,
        save_weights_only=True,
        save_best_only=True
    )

    model.compile(
        loss=loss,
        optimizer=opt,
        metrics=[
            auc_metric,
            keras.metrics.BinaryAccuracy()
        ]
    )

    # The datasets are already batched; `batch_size` must not be passed to
    # `fit` together with a tf.data.Dataset.
    history = model.fit(
        ds_train.shuffle(1000).batch(batch_size),
        validation_data=ds_test.batch(batch_size),
        epochs=max_epochs,
        callbacks=[m_ckpt],
    )

***
## 6.5 Results

In [None]:
if config["bert_base"]:

    history_bert = pd.DataFrame(data=history.history)
    for metric in ["auc_1", "binary_accuracy"]:
        visualize_history(history_bert, metrics=metric)

***
### 6.5.1 History

<img src="records/bert_base_results_auc.png" style="background-color:white">

<img src="records/bert_base_results_binaccuracy.png" style="background-color:white">

***
# 7 BERT_SE

## 7.1 Loading BERT_SE

In [None]:
if config["bert_se"]:

    # BERT_SE matrix read as a header-less CSV (one row per line of the file).
    pret_model = pd.read_csv('data/bert_se/BERT_SE.csv', delimiter= ',', header=None)
    # NOTE(review): MAX_LEN is the number of rows of `data` (documents), yet it
    # is used below as the number of embedding rows kept and as the Embedding
    # input dimension — confirm it covers every token id the tokenizer can emit.
    MAX_LEN = data.shape[0]
    print(f"MAX_LEN: {MAX_LEN}")

In [None]:
if config["bert_se"]:

    # Keep the first MAX_LEN rows of the pretrained matrix.
    embedding_matrix = pret_model.iloc[0:MAX_LEN,:]
    dfEmbedding_mat = pd.DataFrame(embedding_matrix)
    # Fill gaps with the number 0: the string '0' would put text into a
    # numeric matrix that is later loaded as layer weights.
    embedding_mat = dfEmbedding_mat.fillna(0)

***
## 7.2 BERT_SE Tokenizer

In [None]:
if config["bert_se"]:

    model_name = "bert-base-uncased"
    bert_config = BertConfig.from_pretrained(model_name)
    bert_config.output_hidden_states = False
    tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path=model_name, config=bert_config)

In [None]:
if config["bert_se"]:

    # Shared tokenizer options. padding="max_length" pads every sample to
    # exactly `max_length` tokens; padding=True only pads to the longest
    # sample of each call, which can be shorter than the fixed input length
    # declared by the network.
    tokenizer_kwargs = dict(
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors='tf',
        return_token_type_ids=True,
        return_attention_mask=True,
        verbose=True
    )

    X_train = tokenizer(text=X_train_.to_list(), **tokenizer_kwargs)
    X_test = tokenizer(text=X_test_.to_list(), **tokenizer_kwargs)

In [None]:
if config["bert_se"]:

    X_train.keys()

In [None]:
if config["bert_se"]:

    X_train["input_ids"][0]

In [None]:
if config["bert_se"]:

    X_train["token_type_ids"][0]

In [None]:
if config["bert_se"]:

    X_train["attention_mask"][0]

***
## 7.3 Tensorflow dataset creation

In [None]:
if config["bert_se"]:

    ds_train = tf.data.Dataset.from_tensor_slices((dict(X_train), y_train))
    ds_test = tf.data.Dataset.from_tensor_slices((dict(X_test), y_test))

***
## 7.4 Network

https://keras.io/examples/nlp/multi_label_classification/

In [None]:
if config["bert_se"]:

    bert_model_name = "bert-base-uncased"
    max_seq_len = 100
    tags_nb = TAGS_NB

In [None]:
if config["bert_se"]:

    # One integer input per tokenizer output; the layer names match the keys
    # of the dictionaries stored in the tf.data datasets.
    input_ids = keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32, name='input_ids')
    input_type = keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32, name='token_type_ids')
    input_mask = keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32, name='attention_mask')
    inputs = [input_ids, input_type, input_mask]

    # Embedding layer initialised with the pretrained BERT_SE matrix.
    bert = Embedding(MAX_LEN, 768, input_length = 100, name='embedding', trainable=True)
    bert.build(input_shape=(1,))
    bert.set_weights([embedding_mat])
    # Look up the token ids only. Summing them with the attention mask and the
    # token type ids shifted every non-padding id by one, i.e. each token was
    # mapped to the embedding row of a different token. The two other inputs
    # stay declared so the model still accepts the tokenizer's dictionaries.
    bert_outputs = bert(input_ids)

    avg = keras.layers.GlobalAveragePooling1D()(bert_outputs)

    output = keras.layers.Dense(tags_nb, activation="sigmoid")(avg)
    model = keras.Model(inputs=inputs, outputs=output)

    model.summary()

In [None]:
if config["bert_se"]:

    max_epochs = 4
    batch_size = 4

    # opt = tfa.optimizers.RectifiedAdam(learning_rate=3e-5)
    # `learning_rate` replaces the deprecated `lr` alias.
    opt = Adam(learning_rate = 0.001, beta_1 = 0.99, beta_2 = 0.999, epsilon = None, decay = 0.01, amsgrad = False)
    loss = keras.losses.BinaryCrossentropy()
    best_weights_file = "bert_se_weights.h5"

    # Explicit metric name so that the checkpoint monitor ("val_auc") and the
    # results cell (which reads the "auc" column) refer to the same key,
    # whatever metrics were created earlier in the session.
    auc_metric = keras.metrics.AUC(multi_label=True, curve="ROC", name="auc")

    m_ckpt = ModelCheckpoint(
        best_weights_file,
        monitor='val_auc',
        mode='max',
        verbose=2,
        save_weights_only=True,
        save_best_only=True
    )

    model.compile(
        loss=loss,
        optimizer=opt,
        metrics=[
            auc_metric,
            keras.metrics.BinaryAccuracy()
        ]
    )

    # The datasets are already batched; `batch_size` must not be passed to
    # `fit` together with a tf.data.Dataset.
    history = model.fit(
        ds_train.shuffle(1000).batch(batch_size),
        validation_data=ds_test.batch(batch_size),
        epochs=max_epochs,
        callbacks=[m_ckpt],
    )

***
## 7.5 Results

In [None]:
if config["bert_se"]:

    history_bert = pd.DataFrame(data=history.history)
    for metric in ["auc", "binary_accuracy"]:
        visualize_history(history_bert, metrics=metric)

***
# 8 Conclusion