# 0 Imports

In [None]:
import os, warnings, re
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import multilabel_confusion_matrix, classification_report, ConfusionMatrixDisplay, accuracy_score, hamming_loss, precision_score, recall_score, f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

from bs4 import BeautifulSoup as bs
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer
# nltk.download()
from wordcloud import WordCloud

import tensorflow as tf
from tensorflow import keras

import scripts

def set_seed(seed=31415):
    """Seed the random sources used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both NumPy's
    global generator and TensorFlow's global generator with ``seed``.

    NOTE(review): changing ``PYTHONHASHSEED`` at runtime presumably only
    affects interpreters started afterwards, not this one -- confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_backend in (np.random.seed, tf.random.set_seed):
        seed_backend(seed)
set_seed()

# Global matplotlib defaults applied to every figure drawn below:
# automatic layout, bold/large axis labels and titles, 'magma' image colormap.
plt.rc('figure', autolayout=True)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=18, titlepad=10)
plt.rc('image', cmap='magma')
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Report whether TensorFlow sees a GPU: gpu_device_name() returns a falsy
# value when none is available, which selects the "not used" branch.
print("-----------------------------------------")
if tf.test.gpu_device_name():
    print(f"GPU used: {tf.test.gpu_device_name()}")
else:
    print(f"GPU not used")
print("-----------------------------------------")

# Also list every local device TensorFlow can see.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

***
# 1 Config

In [None]:
# Stage switches: the cells below test these flags before doing any work.
# With every flag False, only the unconditional cells (section 3) run.
config = {
    "raw_preprocessing": False,  # sections 2.1-2.6: parse raw CSVs, write data/raw_data.csv
    "preprocessing": False,  # sections 2.7-2.14: tokenize/clean, write data/data_cleaned.csv
    "baseline": False,  # section 4.1: LDA + logistic-regression baseline
    "bert_base": False,  # not read by any cell visible here
    "bert_se": False  # not read by any cell visible here
}

***
# 2 Preprocessing

## 2.0 Utils

In [None]:
def preproc_raw_data_body(cell):
    """Return the visible text of an HTML fragment, without scripts or code.

    The fragment is parsed with BeautifulSoup's ``html.parser``; every
    ``<script>`` element and then every ``<code>`` element is detached from
    the tree before the text is extracted. Commas in the extracted text are
    replaced by spaces.
    """
    soup = bs(cell, "html.parser")

    # Same removal order as before: all <script> nodes first, then <code>.
    for tag_name in ("script", "code"):
        for node in soup.find_all(tag_name):
            node.extract()

    return soup.get_text().replace(',', ' ')

In [None]:
def tags_to_list(cell):
    """Split a ``<tag1><tag2>`` string into a list of tag names.

    The string is split on every ``<`` and ``>``; the empty pieces produced
    between adjacent brackets are dropped.
    """
    pieces = re.split(r'[<>]', cell)
    return list(filter(None, pieces))

In [None]:
def lower(cell):
    """Return ``cell`` converted to lower case via its ``lower()`` method."""
    lowered = cell.lower()
    return lowered

In [None]:
# Both patterns are compiled once here instead of being rebuilt for every
# text of every row.
# https://regex101.com/
# A token is an optional leading dot followed by lowercase letters and/or '#'
# (keeps things like ".net" and "c#"; digits and upper case never match).
_TOKEN_PATTERN = re.compile(r'\.?[a-z#]+')
# A character followed by three or more repeats of itself (4+ in a row).
_REPEATED_CHAR_PATTERN = re.compile(r'(.)\1{3,}')


def tokenize(*texts):
    """Tokenize one or more already-lowercased texts into a single flat list.

    Each text is scanned for tokens matching ``\\.?[a-z#]+``; inside every
    token, a run of four or more identical characters is collapsed to one
    character. Tokens are returned in input order, text after text.

    ``re.findall`` is used directly: for a group-free pattern it returns the
    same matches as ``nltk.RegexpTokenizer(pattern).tokenize``.

    Args:
        *texts: Strings to tokenize.

    Returns:
        list[str]: All tokens; empty when no text is given or nothing matches.
    """
    tokens = []
    for text in texts:
        tokens.extend(
            _REPEATED_CHAR_PATTERN.sub(r'\1', token)
            for token in _TOKEN_PATTERN.findall(text)
        )
    return tokens

In [None]:
def remove_stop_words(cell, excluded=None):
    """Return the tokens of ``cell`` that are not stop words.

    Args:
        cell: Iterable of (hashable) tokens.
        excluded: Optional iterable of words to drop. When omitted, the
            notebook-level ``stop_words`` variable is used, so that variable
            must exist before calling the function without this argument.

    Returns:
        list: Tokens of ``cell`` in their original order, stop words removed.
    """
    if excluded is None:
        excluded = stop_words
    # Hash lookup instead of scanning the whole stop-word list for each token.
    excluded = frozenset(excluded)
    return [word for word in cell if word not in excluded]

In [None]:
def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Tags starting with J, V, N or R map to ``wordnet.ADJ``, ``wordnet.VERB``,
    ``wordnet.NOUN`` and ``wordnet.ADV`` respectively; any other tag falls
    back to the literal ``'n'``.
    """
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if treebank_tag.startswith(prefix):
            # Resolved only on a match, as in the original branches.
            return getattr(wordnet, attribute)
    return 'n'

In [None]:
def tag_pos(cell):
    """Tag tokens with their part of speech, expressed as WordNet POS.

    ``cell`` is passed to ``nltk.pos_tag``; each resulting (token, tag) pair
    is returned as ``(token, get_wordnet_pos(tag))``.
    """
    return [
        (token, get_wordnet_pos(treebank_tag))
        for token, treebank_tag in pos_tag(cell)
    ]

In [None]:
def lemmatize(cell, with_pos=False):
    """Lemmatize tokens with NLTK's ``WordNetLemmatizer``.

    Args:
        cell: Tokens when ``with_pos`` is false; otherwise indexable entries
            whose item 0 is the token and item 1 its WordNet POS.
        with_pos: Whether each entry carries a POS to pass to the lemmatizer.

    Returns:
        list: One lemma per input entry, in order.
    """
    lemmatizer = WordNetLemmatizer()
    if with_pos:
        return [lemmatizer.lemmatize(entry[0], pos=entry[1]) for entry in cell]
    return [lemmatizer.lemmatize(word) for word in cell]

In [None]:
def stemmize(cell):
    """Stem every token of ``cell`` with the English Snowball stemmer.

    A new ``SnowballStemmer("english")`` is created on each call; the stems
    are returned as a list in input order.
    """
    stem = SnowballStemmer("english").stem
    return list(map(stem, cell))

## 2.1 Loading raw data

In [None]:
# Load the raw question bodies (only when the raw preprocessing stage is on).
if config["raw_preprocessing"]:

    raw_data_body = pd.read_csv("data/raw_data_body.csv")
    # NOTE: nested inside the `if`, so this bare expression is evaluated
    # but presumably not echoed as the cell's output.
    raw_data_body.shape

## 2.2 Parse html

In [None]:
# Replace each HTML body with its plain text (scripts and code stripped),
# row by row, using preproc_raw_data_body.
if config["raw_preprocessing"]:

    raw_data_body["Body"] = raw_data_body.apply(lambda row: preproc_raw_data_body(row.Body), axis=1)

## 2.3 Merge body with rest

In [None]:
# `display` holds the preview echoed by the last line of the cell; it stays
# None when the stage is disabled.
# NOTE(review): this name presumably shadows IPython's `display` helper in
# the notebook namespace -- confirm nothing relies on it.
display = None

if config["raw_preprocessing"]:

    raw_data_rest = pd.read_csv("data/raw_data_id_title_tags.csv")
    # Index-based join: assumes both CSVs list the questions in the same
    # row order -- TODO confirm.
    raw_data = raw_data_rest.join(raw_data_body)
    # Index by question Id and keep only Title, Body and Tags, in that order.
    raw_data = raw_data.set_index("Id").reindex(["Title", "Body", "Tags"], axis="columns")
    display = raw_data.head()

display

## 2.4 Tags to list

In [None]:
# Preview placeholder, echoed by the last line (None when the stage is off).
display = None

if config["raw_preprocessing"]:

    # Turn each "<tag1><tag2>" string into a Python list of tag names.
    raw_data["Tags"] = raw_data.apply(lambda row: tags_to_list(row["Tags"]), axis="columns")
    display = raw_data[["Tags"]].head()

display

## 2.5 Lowering

In [None]:
# Preview placeholder, echoed by the last line (None when the stage is off).
display = None

if config["raw_preprocessing"]:

    # Lower-case titles and bodies; the tokenizer regex only matches [a-z#.].
    raw_data["Title"] = raw_data.apply(lambda row: lower(row["Title"]), axis="columns")
    raw_data["Body"] = raw_data.apply(lambda row: lower(row["Body"]), axis="columns")
    display = raw_data[["Title", "Body"]].head()

display

## 2.6 Save raw preprocessed data

In [None]:
# Persist the raw-preprocessed frame; the "Id" index is written as a column
# and read back as the index in section 2.7.
if config["raw_preprocessing"]:
    
    raw_data.to_csv("data/raw_data.csv")

## 2.7 Load raw preprocessed data

In [None]:
# Reload the output of the raw stage, indexed by question Id.
if config["preprocessing"]:
    
    data = pd.read_csv("data/raw_data.csv", index_col="Id")

## 2.8 Tokenize

In [None]:
# Preview placeholder, echoed by the last line (None when the stage is off).
display = None

if config["preprocessing"]:

    # One token list per question: body tokens first, then title tokens.
    data["Tokens"] = data.apply(lambda row: tokenize(row["Body"], row["Title"]), axis="columns")
    display = data[["Title", "Body", "Tokens"]].head()

display

## 2.9 StopWords deletion

In [None]:
# Preview placeholder, echoed by the last line (None when the stage is off).
display = None

if config["preprocessing"]:

    # `stop_words` is a notebook-level variable read by remove_stop_words().
    stop_words = stopwords.words("english")
    data["Tokens"] = data.apply(lambda row: remove_stop_words(row["Tokens"]), axis="columns")
    display = data[["Title", "Body", "Tokens"]].head()

display

## 2.10 POS - Part-Of-Speech

In [None]:
# Preview placeholder, echoed by the last line (None when the stage is off).
display = None

if config["preprocessing"]:

    # Pair each token with its WordNet part of speech (see tag_pos).
    data["POS"] = data.apply(lambda row: tag_pos(row["Tokens"]), axis="columns")
    display = data[["Tokens", "POS"]].head()

display

## 2.11 Lemmatize

In [None]:
# Preview placeholder, echoed by the last line (None when the stage is off).
display = None

if config["preprocessing"]:

    # POS-aware lemmatization of the (token, pos) pairs built in section 2.10.
    data["Lemmatized"] = data.apply(lambda row: lemmatize(row["POS"], with_pos=True), axis="columns")
    display = data[["Tokens", "Lemmatized"]].head()

display

## 2.12 Stemmize

In [None]:
# Preview placeholder, echoed by the last line (None when the stage is off).
display = None

if config["preprocessing"]:

    # NOTE(review): despite the column name "LemmaAndStem", the stemmer is fed
    # the raw "Tokens" column, not "Lemmatized"; the lemmas computed in 2.11
    # are not used by any later cell visible here -- confirm this is intended.
    data["LemmaAndStem"] = data.apply(lambda row: stemmize(row["Tokens"]), axis="columns")
    display = data[["Tokens", "LemmaAndStem"]].head()

display

## 2.13 Generating sentence

In [None]:
# Preview placeholder, echoed by the last line (None when the stage is off).
display = None

if config["preprocessing"]:

    # Re-join the stems into one space-separated string per question; this
    # "Sentence" column is what the LDA baseline vectorizes in section 4.1.
    data["Sentence"] = data.apply(lambda row: " ".join([str(item) for item in row["LemmaAndStem"]]), axis="columns")
    display = data[["LemmaAndStem", "Sentence"]].head()

display

## 2.14 Saving data

In [None]:
# Preview placeholder, echoed by the last line (None when the stage is off).
display = None

if config["preprocessing"]:

    # Write the fully preprocessed frame; section 3.1 reloads this file.
    data.to_csv("data/data_cleaned.csv", index_label="Id")
    display = data.head()

display

***
# 3 Data preparation

## 3.0 Utils

In [None]:
def bow(dataset, max_features=None, min_df=0.0, max_df=1.0):
    """Build a bag-of-words count table from an iterable of documents.

    Fits a word-level ``CountVectorizer`` on ``dataset``, prints the
    percentage of non-zero cells of the document-term matrix, and returns
    the counts as a dense DataFrame.

    Args:
        dataset: Iterable of raw text documents.
        max_features: Forwarded to ``CountVectorizer``.
        min_df: Forwarded to ``CountVectorizer``.
        max_df: Forwarded to ``CountVectorizer``.

    Returns:
        tuple: ``(bag, vectorizer)`` where ``bag`` has one row per document
        and one column per vocabulary term, and ``vectorizer`` is the fitted
        ``CountVectorizer``.
    """
    vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, max_features=max_features, max_df=max_df, min_df=min_df)
    matrix = vectorizer.fit_transform(dataset)

    # Count non-zero cells on the sparse matrix itself; the previous version
    # built a full dense copy only for this statistic, then a second one below.
    n_rows, n_cols = matrix.shape
    non_zero_ratio = matrix.count_nonzero() / (n_rows * n_cols)
    print(f"Sparcity: {non_zero_ratio*100:.4}%")

    vocab = vectorizer.get_feature_names_out()

    # Single dense conversion, needed for the returned DataFrame.
    bag = pd.DataFrame(data=matrix.toarray(), columns=vocab)
    return bag, vectorizer

In [None]:
def tfidf(dataset, max_features=None, min_df=0.0, max_df=1.0):
    """Build a dense TF-IDF table from an iterable of documents.

    Fits a ``TfidfVectorizer`` (``max_features``, ``min_df`` and ``max_df``
    are forwarded to it) and returns ``(frame, vectorizer)``: a DataFrame
    with one row per document and one column per vocabulary term, plus the
    fitted vectorizer.
    """
    vectorizer = TfidfVectorizer(tokenizer=None, stop_words=None, max_features=max_features, min_df=min_df, max_df=max_df)
    weights = vectorizer.fit_transform(dataset).toarray()
    frame = pd.DataFrame(
        data=weights,
        columns=vectorizer.get_feature_names_out(),
    )
    return frame, vectorizer

## 3.1 Loading

In [None]:
from ast import literal_eval

# Runs regardless of `config`: data/data_cleaned.csv must already exist.
data = pd.read_csv("data/data_cleaned.csv", index_col="Id")

# The CSV stores each tag list as its Python repr (e.g. "['a', 'b']").
# literal_eval parses such literals without executing arbitrary code,
# unlike eval, which would run whatever expression the file contains.
data["Tags"] = data["Tags"].apply(literal_eval)

## 3.2 Tags

In [None]:
# Preview the parsed tag lists.
data[["Tags"]].head()

In [None]:
# Raw array of per-question tag lists.
data.Tags.values

In [None]:
# Flatten the per-question tag lists into one list of tag occurrences,
# then count how many times each tag appears.
tags = [tag for question_tags in data.Tags.values for tag in question_tags]
tags_df = (
    pd.DataFrame(data=tags, columns=["Tag"])
    .value_counts()
    .reset_index()
)
tags_df.columns = ["Tag", "Count"]

In [None]:
# Summary of the tag-count table (one row per distinct tag).
tags_df.info()

In [None]:
# First rows of the tag-count table.
tags_df.head()

In [None]:
# Bar chart of the first 20 rows of tags_df (tag vs. number of occurrences).
plt.figure(figsize=(15, 4))

sns.barplot(data=tags_df.iloc[:20], x="Tag", y="Count")

plt.title("Tag count", size=20)
plt.xlabel("Tag", size=16)
plt.ylabel("Count", size=16)
plt.xticks(rotation=45, size=16, ha="right")
plt.yticks(size=16)
plt.show()

In [None]:
# Empirical CDF of the per-tag occurrence counts, on a log-scaled x axis.
plt.figure(figsize=(15, 3))

ax = sns.ecdfplot(data=tags_df, x="Count", log_scale=True)

# Guide lines at proportion 0.98 and at Count == 100.
# NOTE(review): the x axis is the occurrence count of a tag, so the vertical
# line marks "tags used 100 times", not "the 100 most frequent tags" kept by
# the next cell -- confirm which threshold is meant.
plt.axhline(0.98, linestyle="--", linewidth=1, color="r")
plt.axvline(100, linestyle="--", linewidth=1, color="r")

plt.title("Cummulative coverage percentage", size=20)
plt.xlabel("Number of post", size=16)
plt.ylabel("Proportion", size=16)
plt.xticks(rotation=45, size=16, ha="right")
plt.yticks(size=16)
plt.show()

In [None]:
# Keep only the first 100 rows of the tag-count table.
# Assumes value_counts() left the rows sorted by decreasing count, so these
# are the 100 most frequent tags -- TODO confirm.
tags_df = tags_df[:100]
tags_df.info()

In [None]:
# Word cloud of the retained tags, sized by their occurrence count.
word_frequencies = dict(zip(tags_df.Tag, tags_df.Count))
wordcloud = WordCloud(background_color="black", width=1600, height=800).generate_from_frequencies(word_frequencies)

fig = plt.figure(figsize=(15, 5))
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Rebind `tags` (previously every tag occurrence) to the retained tag names.
tags = tags_df.Tag.tolist()

In [None]:
# First 20 retained tag names.
tags[:20]

In [None]:
def find_or_remove(cell, word_list):
    """Return the items of ``cell`` that also appear in ``word_list``.

    Order and duplicates of ``cell`` are preserved; items absent from
    ``word_list`` are dropped.
    """
    return list(filter(lambda candidate: candidate in word_list, cell))

In [None]:
# Keep, for each question, only the tags that belong to the retained list;
# a question with none of them gets an empty list.
data["Tags_Reduced"] = data.apply(lambda row: find_or_remove(row["Tags"], tags),axis="columns")

In [None]:
# NOTE(review): find_or_remove always returns a list, so this NaN check
# presumably cannot flag questions left with an empty tag list -- confirm
# whether an emptiness check was intended.
data["Tags_Reduced"].isna().any()

***
# 4 Baseline

## 4.0 Utils

In [None]:
# https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#9buildldamodelwithsklearn
#
def latent_dirichlet_allocation_tuning(dataset: pd.DataFrame, param_grid: dict):
    """Grid-search an LDA topic model on a bag-of-words view of ``dataset``.

    The documents are vectorized with ``bow(dataset, min_df=.005)``, a
    ``GridSearchCV`` over ``LatentDirichletAllocation`` is fitted on the
    counts, and the best estimator is used to compute the document-topic
    weights (rounded to 2 decimals).

    Returns:
        tuple: ``(gs, feature_names, data_bow, vectorizer,
        lda_output_dataframe)`` -- the fitted search, the vocabulary (columns
        of ``data_bow``), the count table, the fitted vectorizer and the
        document-topic table with columns ``Topic0..TopicN``.
    """
    data_bow, vectorizer = bow(dataset, min_df=.005)

    gs = GridSearchCV(LatentDirichletAllocation(), param_grid)
    gs.fit(data_bow)

    best_lda = gs.best_estimator_
    doc_topic_weights = np.round(best_lda.transform(data_bow), 2)
    lda_output_dataframe = pd.DataFrame(
        doc_topic_weights,
        columns=["Topic"+str(k) for k in range(best_lda.n_components)],
    )

    return gs, data_bow.columns, data_bow, vectorizer, lda_output_dataframe

In [None]:
def get_dominant_topic(lda_model, data_bow, dataset_row_nb):
    """Return the document-topic table with each document's dominant topic.

    Args:
        lda_model: Fitted model exposing ``transform`` and ``n_components``.
        data_bow: Input passed to ``lda_model.transform``.
        dataset_row_nb: Number of documents, used to build the row labels.

    Returns:
        pd.DataFrame: Topic weights rounded to 2 decimals, columns
        ``Topic0..TopicN``, rows ``Doc0..DocM``, plus a ``Dominant_Topic``
        column holding the index of the largest rounded weight per row.
    """
    rounded_weights = np.round(lda_model.transform(data_bow), 2)

    row_labels = ["Doc"+str(d) for d in range(dataset_row_nb)]
    column_labels = ["Topic"+str(t) for t in range(lda_model.n_components)]
    table = pd.DataFrame(rounded_weights, columns=column_labels, index=row_labels)

    # argmax is taken before the extra column is appended.
    table["Dominant_Topic"] = np.argmax(table.values, axis=1)
    return table

In [None]:
def topic_distribution(dominant_topic_df):
    """Plot and return how many documents fall under each dominant topic.

    Args:
        dominant_topic_df: DataFrame with a ``Dominant_Topic`` column.

    Returns:
        pd.DataFrame: One row per dominant topic with columns
        ``Dominant_Topic`` and ``Count``.
    """
    distribution = dominant_topic_df["Dominant_Topic"].value_counts().reset_index()
    distribution.columns = ["Dominant_Topic", "Count"]

    plt.figure(figsize=(15, 4))

    sns.countplot(data=dominant_topic_df, x="Dominant_Topic")

    # The chart shows dominant topics, not tags: the title and x label were
    # copied from the tag-count plot and have been corrected.
    plt.title("Dominant topic count", size=20)
    plt.xlabel("Dominant topic", size=16)
    plt.ylabel("Count", size=16)
    plt.xticks(rotation=45, size=16, ha="right")
    plt.yticks(size=16)
    plt.show()

    return distribution

In [None]:
def topic_words(lda_model, feature_names, n_words=20):
    """Return the highest-weighted words of every topic.

    Args:
        lda_model: Fitted model exposing ``components_`` (one weight row per
            topic, one column per vocabulary term).
        feature_names: Vocabulary terms, aligned with the columns of
            ``components_``. Any sequence is accepted (list, array, Index).
        n_words: Number of words to keep per topic.

    Returns:
        pd.DataFrame: Rows ``Topic0..TopicN``, columns ``Word0..WordK``,
        words sorted by decreasing weight within each topic.
    """
    # Index through the ndarray copy: the previous version built it but then
    # called .take on the raw argument, which fails for a plain list.
    keywords = np.array(feature_names)
    topic_keywords = []
    for topic_weight in lda_model.components_:
        # Negate so argsort yields positions by decreasing weight.
        topic_keyword_locs = (-topic_weight).argsort()[:n_words]
        topic_keywords.append(keywords.take(topic_keyword_locs))

    topic_keywords_df = pd.DataFrame(data=topic_keywords)
    topic_keywords_df.columns = ["Word"+str(i) for i in range(topic_keywords_df.shape[1])]
    topic_keywords_df.index = ["Topic"+str(i) for i in range(topic_keywords_df.shape[0])]
    return topic_keywords_df

In [None]:
def make_prediction(lda_model, sentence, vectorizer, topic_keywords_dataset):
    """Predict the most probable topic of a raw sentence.

    The sentence is cleaned with ``scripts.preprocess_sentence``, vectorized
    with ``vectorizer`` and scored by ``lda_model``; the row of
    ``topic_keywords_dataset`` at the position of the highest score is
    selected.

    Returns:
        tuple: ``(topic_name, topic_words, topic_probability_score)`` -- the
        selected row's label, its values as a list, and the raw score array
        returned by ``lda_model.transform``.
    """
    cleaned_sentence = scripts.preprocess_sentence(sentence)
    topic_probability_score = lda_model.transform(vectorizer.transform([cleaned_sentence]))

    best_position = np.argmax(topic_probability_score)
    best_topic_row = topic_keywords_dataset.iloc[best_position, :]
    return best_topic_row.name, best_topic_row.values.tolist(), topic_probability_score

In [None]:
def classifier_tuning_post_lda(dataset_X: pd.DataFrame, dataset_y: pd.DataFrame, meta_model, model, param_grid: dict, scoring: str = "f1_micro"):
    """Tune a multi-label classifier on LDA topic features and predict.

    The label lists are binarized with ``MultiLabelBinarizer``, the data is
    split 67/33 (``random_state=1``), features are standardized with a
    scaler fitted on the training split only, and ``meta_model`` is tuned
    with ``GridSearchCV`` (``refit=True``).

    Args:
        dataset_X: Feature table (one row per document).
        dataset_y: Iterable of label collections, aligned with ``dataset_X``.
        meta_model: Multi-label estimator to tune (e.g. a one-vs-rest
            wrapper). It is no longer fitted in place: the grid search works
            on clones, so the former preliminary ``fit`` only doubled the
            training cost.
        model: Unused; kept for backward compatibility with existing callers.
        param_grid: Parameter grid forwarded to ``GridSearchCV``.
        scoring: Scoring name forwarded to ``GridSearchCV``.

    Returns:
        tuple: ``(gs, classes, y_test, y_pred)`` -- the fitted search, the
        binarizer's class labels, the binarized test targets and the best
        estimator's predictions on the test split.
    """
    start = datetime.now()

    # target multi label binarizer
    multi_label_binarizer = MultiLabelBinarizer()
    y = multi_label_binarizer.fit_transform(dataset_y)
    classes = multi_label_binarizer.classes_

    X_train, X_test, y_train, y_test = train_test_split(dataset_X, y, test_size = 0.33, random_state = 1)

    # Fit the scaler on the training split only, then reuse it on the test split.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # gridsearch tuning/fitting (refit=True retrains the best configuration
    # on the whole training split)
    gs = GridSearchCV(meta_model, param_grid, scoring=scoring, refit=True)
    gs.fit(X_train, y_train)

    # advanced evaluation
    best_model = gs.best_estimator_
    y_pred = best_model.predict(X_test)

    print(f"classifier_tuning > Time taken to run this cell : {datetime.now() - start} \n")

    return gs, classes, y_test, y_pred

In [None]:
def evaluate(gs, classes, y_test, y_pred):
    """Print multi-label evaluation metrics for a set of predictions.

    Prints accuracy, Hamming loss, micro- and macro-averaged
    precision/recall/F1, the per-class classification report and the time
    spent. ``gs`` is accepted but not used by the function.
    """
    started_at = datetime.now()

    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Hamming loss ", hamming_loss(y_test, y_pred))

    sections = (
        ("Micro-average quality numbers", 'micro'),
        ("Macro-average quality numbers", 'macro'),
    )
    for heading, averaging in sections:
        # Same metric order as the printed line: precision, recall, F1.
        scores = [
            metric(y_test, y_pred, average=averaging)
            for metric in (precision_score, recall_score, f1_score)
        ]
        print(heading)
        print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(*scores))

    print(classification_report(y_test, y_pred, target_names=classes, zero_division=0))

    print(f"evaluate > Time taken to run this cell : {datetime.now() - started_at}\n")

***
## 4.1 Latent Dirichlet Allocation

In [None]:
# Fit the LDA baseline on the "Sentence" column. Every grid entry holds a
# single value, so the search evaluates exactly one configuration.
if config["baseline"]:

    param_grid = {
        "n_components": [10],
        "learning_decay": [.7],
        "random_state": [0],
        "n_jobs": [10]
    }

    gs, feature_names, data_bow, vectorizer, lda_output_dataframe = latent_dirichlet_allocation_tuning(data["Sentence"], param_grid)

In [None]:
# Report the best LDA model's score and perplexity on the training counts,
# plus the selected parameters.
if config["baseline"]:

    print(f"Log likelihood: {gs.best_estimator_.score(data_bow)}")
    print(f"Perplexity: {gs.best_estimator_.perplexity(data_bow)}")
    print(f"Best params: {gs.best_params_}")

In [None]:
# Document-topic table with each document's dominant topic.
if config["baseline"]:

    dominant_topic_df = get_dominant_topic(gs.best_estimator_, data_bow, data.shape[0])
    # NOTE: nested inside the `if`, so this preview is presumably not echoed.
    dominant_topic_df.head()

In [None]:
# Plot the number of documents per dominant topic (return value discarded).
if config["baseline"]:

    topic_distribution(dominant_topic_df)

In [None]:
# Topic-by-term weight table: one row per topic, one column per vocabulary
# term. The row labels reuse dominant_topic_df's columns minus the last one
# ("Dominant_Topic"), i.e. the Topic0..TopicN names.
if config["baseline"]:

    topic_keywords_df = pd.DataFrame(data=gs.best_estimator_.components_, columns=feature_names, index=dominant_topic_df.columns[:-1])
    topic_keywords_df.info()    

In [None]:
# Preview placeholder, echoed by the last line (None when the stage is off).
display = None

if config["baseline"]:

    # Weights of the first 20 vocabulary columns for every topic.
    display = topic_keywords_df.iloc[:, :20]

display

In [None]:
# Preview placeholder, echoed by the last line (None when the stage is off).
display = None

if config["baseline"]:

    # Top 20 words of each topic, by decreasing weight.
    topic_keywords_dataset = topic_words(lda_model=gs.best_estimator_, feature_names=feature_names, n_words=20)
    display = topic_keywords_dataset

display

In [None]:
# Supervised baseline: one-vs-rest logistic regression predicting the
# reduced tag lists from the LDA document-topic weights.
if config["baseline"]:

    # "estimator__" prefixes address the LogisticRegression wrapped by
    # OneVsRestClassifier.
    param_grid = {
        "estimator__solver": ["liblinear"],
        "estimator__penalty": ["l1"],
        "estimator__random_state": [0],
    }

    model = LogisticRegression()
    meta_model = OneVsRestClassifier(model)
    # NOTE: rebinds `gs`, replacing the LDA grid search used by earlier cells.
    gs, classes, y_test, y_pred = classifier_tuning_post_lda(lda_output_dataframe, data["Tags_Reduced"], meta_model, model, param_grid)
    evaluate(gs, classes, y_test, y_pred)

***
## 4.2 Results

<img src="records/lda_results_metrics.png" style="background-color:white">

<img src="records/lda_results_topic_distrib_plot.png" style="background-color:white">

<img src="records/lda_results_topic_words.png" style="background-color:white">

<img src="records/logistic_classifier_results_01.png" style="background-color:white">

<img src="records/logistic_classifier_results_02.png" style="background-color:white">

<img src="records/logistic_classifier_results_03.png" style="background-color:white">

***
# 5 BERT_base

***
# 6 BERT_SE

***
# 7 Conclusion