# 0 Imports

In [None]:
from datetime import datetime
from enum import Enum
import pickle

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import multilabel_confusion_matrix, classification_report, ConfusionMatrixDisplay, accuracy_score, hamming_loss, precision_score, recall_score, f1_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain

import nltk

from wordcloud import WordCloud

import scripts

# Remove pandas' display truncation: None lifts the limit on the number of
# columns, the number of rows and the width of each cell.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

# Seed NumPy's global random generator.
np.random.seed(0)


***
# 1 Config

In [None]:
# Feature flags: each notebook section below only runs when its flag is True.
config = {
    "bag": False,  # bag-of-words preview (section 4)
    "tfidf": False,  # TF-IDF preview (section 5)
    "unsupervised": False,  # LDA cells (section 6)
    "supervised": {
        "knc": False, # must be False, too greedy
        "dtc": False, # should be False, long run
        "rfc": False, # must be False, too long
        "sgd": False, # should be True -- NOTE(review): value is False despite this note; confirm intent
        "lgc": True # should be True
    }
}

***
# 2 Data Loading

In [None]:
from ast import literal_eval

# Load the cleaned posts, indexed by their "Id" column.
data = pd.read_csv("data/data_cleaned.csv", index_col="Id")

# "Tags" is stored in the CSV as text and parsed back into Python objects.
# literal_eval only accepts Python literals, so a crafted cell cannot execute
# arbitrary code the way eval() would.
# NOTE(review): presumably every cell is the repr of a list of strings -- confirm.
data["Tags"] = data["Tags"].apply(literal_eval)
# data["Tokens"] = data["Tokens"].apply(literal_eval)
# data["POS"] = data["POS"].apply(literal_eval)
# data["Lemmatized"] = data["Lemmatized"].apply(literal_eval)
# data["LemmaAndStem"] = data["LemmaAndStem"].apply(literal_eval)

In [None]:
# Preview the first rows of the loaded dataset.
data.head()

***
# 3 Tags

In [None]:
# Preview the "Tags" column only (kept as a one-column DataFrame).
data[["Tags"]].head()

In [None]:
# Raw array of the per-post tag values.
data.Tags.values

In [None]:
# Flatten the per-post tag collections into one list, then count how many
# times each tag occurs (value_counts sorts by descending frequency).
tags = [tag for row in data.Tags.values for tag in row]
tag_counts = pd.DataFrame(data=tags, columns=["Tag"]).value_counts()
tags_df = tag_counts.reset_index()
tags_df.columns = ["Tag", "Count"]

In [None]:
# Summary (row count, dtypes) of the tag frequency table.
tags_df.info()

In [None]:
# First rows of the tag frequency table.
tags_df.head()

In [None]:
# Bar chart of the first 20 rows of the tag frequency table.
top_tags = tags_df.iloc[:20]

plt.figure(figsize=(15, 4))
sns.barplot(x="Tag", y="Count", data=top_tags)

plt.title("Tag count", size=20)
plt.xlabel("Tag", size=16)
plt.ylabel("Count", size=16)
plt.xticks(size=16, rotation=45, ha="right")
plt.yticks(size=16)
plt.show()

In [None]:
plt.figure(figsize=(15, 3))

# Empirical cumulative distribution of the per-tag counts, log-scaled x axis.
ax = sns.ecdfplot(data=tags_df, x="Count", log_scale=True)

# Red dashed guides at a proportion of 0.98 and at a count of 200.
plt.axhline(0.98, linestyle="--", linewidth=1, color="r")
plt.axvline(200, linestyle="--", linewidth=1, color="r")

# Spelling of the user-facing labels corrected ("Cumulative", "posts").
plt.title("Cumulative coverage percentage", size=20)
plt.xlabel("Number of posts", size=16)
plt.ylabel("Proportion", size=16)
plt.xticks(rotation=45, size=16, ha="right")
plt.yticks(size=16)
plt.show()

In [None]:
# Keep only the first 200 rows of the frequency table.
tags_df = tags_df.iloc[:200]
tags_df.info()

In [None]:
# Word cloud sized by tag frequency.
word_frequencies = dict(zip(tags_df.Tag, tags_df.Count))
cloud_builder = WordCloud(background_color="black", width=1600, height=800)
wordcloud = cloud_builder.generate_from_frequencies(word_frequencies)

fig = plt.figure(figsize=(15, 5))
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Plain Python list of the retained tag names.
tags = tags_df["Tag"].tolist()

In [None]:
# First 20 retained tag names.
tags[:20]

In [None]:
def find_or_remove(cell, word_list):
    """Return the items of ``cell`` that are also present in ``word_list``.

    Order and duplicates of ``cell`` are preserved; items absent from
    ``word_list`` are dropped. ``word_list`` may be any container supporting
    the ``in`` operator.
    """
    kept = [item for item in cell if item in word_list]
    return kept


In [None]:
# Restrict each post's tags to the retained ones. A set is built once so each
# membership test is O(1), and Series.apply avoids building a full row object
# per post as the row-wise DataFrame.apply did.
retained_tags = set(tags)
data["Tags_Reduced"] = data["Tags"].apply(lambda cell: find_or_remove(cell, retained_tags))

In [None]:
# Check whether any reduced tag value is missing (NaN).
# NOTE(review): find_or_remove always returns a list, so a post that lost all
# of its tags holds [] rather than NaN and is not flagged by this check --
# confirm whether empty lists should be detected instead.
data["Tags_Reduced"].isna().any()

***
# 4 Bag-Of-Words

In [None]:
def bow(dataset, max_features=None, min_df=0.0, max_df=1.0):
    """Build a bag-of-words count table from an iterable of documents.

    Args:
        dataset: iterable of raw text documents passed to ``CountVectorizer``.
        max_features: forwarded to ``CountVectorizer``.
        min_df: forwarded to ``CountVectorizer``.
        max_df: forwarded to ``CountVectorizer``.

    Returns:
        ``(bag, vectorizer)``: a dense DataFrame with one column per
        vocabulary term, and the fitted ``CountVectorizer``.
    """
    vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, max_features=max_features, max_df=max_df, min_df=min_df)
    matrix = vectorizer.fit_transform(dataset)

    # Share of non-zero cells, computed on the sparse matrix so that no dense
    # copy is materialised just for this diagnostic. The printed label is kept
    # as-is even though the figure is the filled (non-zero) share.
    n_cells = matrix.shape[0] * matrix.shape[1]
    filled_ratio = matrix.count_nonzero() / n_cells
    print(f"Sparcity: {filled_ratio*100:.4}%")

    vocab = vectorizer.get_feature_names_out()

    bag = pd.DataFrame(data=matrix.toarray(), columns=vocab)
    return bag, vectorizer

In [None]:
# Optional preview of the bag-of-words table (first 5 rows, first 20 terms).
# bow() expects the text column itself and returns (table, vectorizer); the
# preview variable is named so it does not shadow IPython's display().
bag_preview = None
if config["bag"]:
    bag, bag_vectorizer = bow(data["Sentence"])
    bag_preview = bag.iloc[:5, :20]
bag_preview

***
# 5 TF-IDF

In [None]:
def tfidf(dataset, max_features=None, min_df=0.0, max_df=1.0):
    """Fit a TF-IDF vectorizer on ``dataset`` and return the weights as a table.

    Returns:
        ``(table, vectorizer)``: a dense DataFrame with one column per
        vocabulary term, and the fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(
        tokenizer=None,
        stop_words=None,
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
    )
    weights = vectorizer.fit_transform(dataset).toarray()
    table = pd.DataFrame(data=weights, columns=vectorizer.get_feature_names_out())
    return table, vectorizer

In [None]:
# Optional preview of the TF-IDF table (first 5 rows, first 20 terms).
# The result is bound to new names: assigning it to `tfidf` would replace the
# tfidf() function that later cells still call.
tfidf_preview = None
if config["tfidf"]:
    tfidf_df, tfidf_vectorizer = tfidf(data["Sentence"])
    tfidf_preview = tfidf_df.iloc[:5, :20]
tfidf_preview

***
# 6 Unsupervised

## 6.0 Utils

In [None]:
# https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#9buildldamodelwithsklearn
#
def latent_dirichlet_allocation_tuning(dataset: pd.DataFrame, param_grid: dict):
    """Grid-search an LDA model on the bag-of-words of ``dataset``.

    Returns:
        ``(gs, feature_names, data_bow, vectorizer, lda_output_dataframe)``:
        the fitted ``GridSearchCV``, the vocabulary (columns of the
        bag-of-words table), the bag-of-words table, its fitted vectorizer,
        and the per-document topic weights of the best model rounded to two
        decimals.
    """
    data_bow, vectorizer = bow(dataset, min_df=.005)

    gs = GridSearchCV(LatentDirichletAllocation(), param_grid)
    gs.fit(data_bow)

    best_lda = gs.best_estimator_
    doc_topic = np.round(best_lda.transform(data_bow), 2)
    topic_names = [f"Topic{i}" for i in range(best_lda.n_components)]
    lda_output_dataframe = pd.DataFrame(doc_topic, columns=topic_names)

    return gs, data_bow.columns, data_bow, vectorizer, lda_output_dataframe

In [None]:
def get_dominant_topic(lda_model, data_bow, dataset_row_nb):
    """Tabulate per-document topic weights and the dominant topic of each document.

    Args:
        lda_model: fitted model exposing ``transform`` and ``n_components``.
        data_bow: document-term input passed to ``lda_model.transform``.
        dataset_row_nb: number of documents, used to build the ``Doc<i>`` index.

    Returns:
        DataFrame with one ``Topic<i>`` column per topic (weights rounded to
        two decimals) plus a ``Dominant_Topic`` column holding the index of
        the highest-weighted topic.
    """
    lda_output = np.asarray(lda_model.transform(data_bow))

    topic_names = [f"Topic{i}" for i in range(lda_model.n_components)]
    doc_names = [f"Doc{i}" for i in range(dataset_row_nb)]

    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=doc_names)

    # Pick the dominant topic from the unrounded weights: rounding first can
    # turn close weights into ties and make argmax select the wrong topic.
    df_document_topic["Dominant_Topic"] = np.argmax(lda_output, axis=1)

    return df_document_topic

In [None]:
def topic_distribution(dominant_topic_df):
    """Plot and return the number of documents per dominant topic.

    Args:
        dominant_topic_df: DataFrame with a ``Dominant_Topic`` column.

    Returns:
        DataFrame with columns ``Dominant_Topic`` and ``Count``.
    """
    distribution = dominant_topic_df["Dominant_Topic"].value_counts().reset_index()
    distribution.columns = ["Dominant_Topic", "Count"]

    plt.figure(figsize=(15, 4))

    sns.countplot(data=dominant_topic_df, x="Dominant_Topic")

    # The chart counts documents per dominant topic, so it is labelled as
    # such (the labels previously said "Tag").
    plt.title("Dominant topic count", size=20)
    plt.xlabel("Dominant topic", size=16)
    plt.ylabel("Count", size=16)
    plt.xticks(rotation=45, size=16, ha="right")
    plt.yticks(size=16)
    plt.show()

    return distribution

In [None]:
def topic_words(lda_model, feature_names, n_words=20):
    """Return the ``n_words`` highest-weighted terms of every topic.

    Args:
        lda_model: fitted model exposing ``components_`` (one weight row per
            topic, one column per term).
        feature_names: terms aligned with the columns of ``components_``; any
            array-like (list, NumPy array, pandas Index) is accepted.
        n_words: number of terms to keep per topic.

    Returns:
        DataFrame indexed ``Topic<i>`` with columns ``Word<j>``, terms ordered
        by decreasing weight.
    """
    # Convert once so positional lookup works whatever container was passed.
    keywords = np.asarray(feature_names)
    topic_keywords = [
        keywords[np.argsort(-topic_weight)[:n_words]]
        for topic_weight in lda_model.components_
    ]

    topic_keywords_df = pd.DataFrame(data=topic_keywords)
    topic_keywords_df.columns = [f"Word{i}" for i in range(topic_keywords_df.shape[1])]
    topic_keywords_df.index = [f"Topic{i}" for i in range(topic_keywords_df.shape[0])]
    return topic_keywords_df

In [None]:
def make_prediction(lda_model, sentence, vectorizer, topic_keywords_dataset):
    """Predict the most probable topic of a single sentence.

    The sentence goes through ``scripts.preprocess_sentence``, is vectorized
    with ``vectorizer`` and scored by ``lda_model``; the row of
    ``topic_keywords_dataset`` at the highest-scoring position is selected.

    Returns:
        ``(topic_name, topic_words, topic_probability_score)``: the selected
        row's label, its values as a list, and the raw model scores.
    """
    cleaned = scripts.preprocess_sentence(sentence)
    topic_probability_score = lda_model.transform(vectorizer.transform([cleaned]))
    best_row = topic_keywords_dataset.iloc[np.argmax(topic_probability_score), :]
    return best_row.name, best_row.values.tolist(), topic_probability_score

In [None]:
def classifier_tuning_post_lda(dataset_X: pd.DataFrame, dataset_y: pd.DataFrame, meta_model, model, param_grid: dict, scoring: str = "f1_micro"):
    """Grid-search a multi-label classifier on precomputed features.

    Args:
        dataset_X: feature table (one row per document).
        dataset_y: per-document collections of labels, binarized with
            ``MultiLabelBinarizer``.
        meta_model: multi-label estimator to tune, or a wrapper class
            (e.g. ``OneVsRestClassifier``) that is instantiated around ``model``.
        model: base estimator; only used when ``meta_model`` is a class.
        param_grid: grid passed to ``GridSearchCV``.
        scoring: scoring name passed to ``GridSearchCV``.

    Returns:
        ``(gs, classes, y_test, y_pred)``: the fitted search, the label names,
        and the true / predicted indicator matrices of the 33% hold-out split.
    """
    start = datetime.now()

    # Accept a wrapper class as well as a ready-made instance.
    if isinstance(meta_model, type):
        meta_model = meta_model(model)

    # target multi label binarizer
    multi_label_binarizer = MultiLabelBinarizer()
    y = multi_label_binarizer.fit_transform(dataset_y)
    classes = multi_label_binarizer.classes_

    X_train, X_test, y_train, y_test = train_test_split(dataset_X, y, test_size=0.33, random_state=1)

    # Scaling statistics are learnt on the training part only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # GridSearchCV clones and fits the estimator itself (and refits the best
    # candidate), so no separate fit of meta_model is needed beforehand.
    gs = GridSearchCV(meta_model, param_grid, scoring=scoring, refit=True)
    gs.fit(X_train, y_train)

    y_pred = gs.best_estimator_.predict(X_test)

    print(f"classifier_tuning > Time taken to run this cell : {datetime.now() - start} \n")

    return gs, classes, y_test, y_pred

In [None]:
def evaluate(gs, classes, y_test, y_pred):
    """Print multi-label quality metrics for a set of predictions.

    Prints subset accuracy, Hamming loss, micro- and macro-averaged
    precision / recall / F1, and the per-class classification report.

    Args:
        gs: fitted search object (not used by the metrics).
        classes: label names used as ``target_names`` in the report.
        y_test: true label indicator matrix.
        y_pred: predicted label indicator matrix.
    """
    start = datetime.now()

    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Hamming loss ", hamming_loss(y_test, y_pred))

    averages = (
        ("micro", "Micro-average quality numbers"),
        ("macro", "Macro-average quality numbers"),
    )
    for average, header in averages:
        # zero_division=0 matches the classification report below: undefined
        # scores still count as 0 but no longer emit a warning.
        precision = precision_score(y_test, y_pred, average=average, zero_division=0)
        recall = recall_score(y_test, y_pred, average=average, zero_division=0)
        f1 = f1_score(y_test, y_pred, average=average, zero_division=0)

        print(header)
        print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

    print(classification_report(y_test, y_pred, target_names=classes, zero_division=0))

    print(f"evaluate > Time taken to run this cell : {datetime.now() - start}\n")

***
## 6.1 Latent Dirichlet Allocation

In [None]:
if config["unsupervised"]:
    # Every parameter lists a single candidate, so the search evaluates one
    # configuration.
    param_grid = {
        "n_components": [10],
        "learning_decay": [.7],
        "random_state": [0],
        "n_jobs": [10],
    }

    lda_results = latent_dirichlet_allocation_tuning(data["Sentence"], param_grid)
    gs, feature_names, data_bow, vectorizer, lda_output_dataframe = lda_results

In [None]:
if config["unsupervised"]:
    # Report the fit of the best LDA model on the bag-of-words table.
    best_lda = gs.best_estimator_
    print(f"Log likelihood: {best_lda.score(data_bow)}")
    print(f"Perplexity: {best_lda.perplexity(data_bow)}")
    print(f"Best params: {gs.best_params_}")

In [None]:
# An expression nested inside `if` is not displayed by the notebook, so the
# preview is stored and left as the last top-level expression of the cell.
dominant_topic_preview = None
if config["unsupervised"]:
    dominant_topic_df = get_dominant_topic(gs.best_estimator_, data_bow, data.shape[0])
    dominant_topic_preview = dominant_topic_df.head()
dominant_topic_preview

In [None]:
if config["unsupervised"]:
    # Draws the count plot; the returned distribution table is neither stored
    # nor displayed here.
    topic_distribution(dominant_topic_df)

In [None]:
if config["unsupervised"]:
    # Topic-by-term weight table; row labels are every column of
    # dominant_topic_df except the last one.
    topic_labels = dominant_topic_df.columns[:-1]
    topic_keywords_df = pd.DataFrame(gs.best_estimator_.components_, index=topic_labels, columns=feature_names)
    topic_keywords_df.info()

In [None]:
# An expression nested inside `if` is not displayed by the notebook, so the
# preview is stored and left as the last top-level expression of the cell.
topic_keywords_preview = None
if config["unsupervised"]:
    topic_keywords_preview = topic_keywords_df.iloc[:, :20]
topic_keywords_preview

In [None]:
# Top 20 terms per topic. The table is left as the last top-level expression
# so the notebook displays it (an expression nested inside `if` is not shown).
topic_keywords_dataset = None
if config["unsupervised"]:
    topic_keywords_dataset = topic_words(lda_model=gs.best_estimator_, feature_names=feature_names, n_words=20)
topic_keywords_dataset

In [None]:
if config["unsupervised"]:
    # Sample HTML-formatted post used to try out the topic prediction.
    sentence = "<p>I want to create a sql script to<code>something</code> automatise the data seeding</p>"

In [None]:
if config["unsupervised"]:
    topic_name, topic_words, topic_probability_score = make_prediction(lda_model=gs.best_estimator_, sentence=sentence, vectorizer=vectorizer, topic_keywords_dataset=topic_keywords_dataset)
    topic_name

In [None]:
if config["unsupervised"]:
    topic_words

In [None]:
if config["unsupervised"]:
    # One-vs-rest logistic regression trained on the LDA topic weights.
    model = LogisticRegression()
    meta_model = OneVsRestClassifier(model)

    param_grid = {
        "estimator__solver": ["liblinear"],
        "estimator__penalty": ["l1"],
        "estimator__random_state": [0],
    }

    gs, classes, y_test, y_pred = classifier_tuning_post_lda(
        dataset_X=lda_output_dataframe,
        dataset_y=data["Tags_Reduced"],
        meta_model=meta_model,
        model=model,
        param_grid=param_grid,
    )
    evaluate(gs, classes, y_test, y_pred)

***
# 7 Supervised

## 7.0 Utils

In [None]:
# https://www.codementor.io/@agarrahul01/multiclass-classification-using-random-forest-on-scikit-learn-library-hkk4lwawu
# https://www.kaggle.com/patrickaudriaz/random-forests-for-multiclass-classification
# 
def classifier_tuning(dataset: pd.DataFrame, meta_model, model, param_grid: dict, scoring: str = "f1_micro"):
    """Grid-search a multi-label tag classifier on TF-IDF features.

    Args:
        dataset: table with a ``Sentence`` text column and a ``Tags_Reduced``
            column holding each row's collection of labels.
        meta_model: multi-label estimator to tune, or a wrapper class
            (e.g. ``MultiOutputClassifier``) that is instantiated around ``model``.
        model: base estimator; only used when ``meta_model`` is a class.
        param_grid: grid passed to ``GridSearchCV``.
        scoring: scoring name passed to ``GridSearchCV``.

    Returns:
        ``(gs, classes, y_test, y_pred, vectorizer)``: the fitted search, the
        label names, the true / predicted indicator matrices of the 33%
        hold-out split, and the TF-IDF vectorizer fitted on the training part.
    """
    start = datetime.now()

    # Accept a wrapper class as well as a ready-made instance.
    if isinstance(meta_model, type):
        meta_model = meta_model(model)

    # target multi label binarizer
    multi_label_binarizer = MultiLabelBinarizer()
    y = multi_label_binarizer.fit_transform(dataset["Tags_Reduced"])
    classes = multi_label_binarizer.classes_

    # Split the raw text first, then fit TF-IDF on the training part only:
    # fitting on all rows would leak test-set vocabulary and document
    # frequencies into the features.
    sentences_train, sentences_test, y_train, y_test = train_test_split(dataset["Sentence"], y, test_size=0.33, random_state=1)

    X_train, vectorizer = tfidf(sentences_train, min_df=.001, max_df=1.0)
    X_test = pd.DataFrame(data=vectorizer.transform(sentences_test).toarray(), columns=X_train.columns)

    # GridSearchCV clones and fits the estimator itself (and refits the best
    # candidate), so no separate fit of meta_model is needed beforehand.
    gs = GridSearchCV(meta_model, param_grid, scoring=scoring, refit=True)
    gs.fit(X_train, y_train)

    y_pred = gs.best_estimator_.predict(X_test)

    print(f"classifier_tuning > Time taken to run this cell : {datetime.now() - start} \n")

    return gs, classes, y_test, y_pred, vectorizer

In [None]:
def evaluate(gs, classes, y_test, y_pred):
    """Print multi-label quality metrics for a set of predictions.

    Prints subset accuracy, Hamming loss, micro- and macro-averaged
    precision / recall / F1, and the per-class classification report.

    Args:
        gs: fitted search object (not used by the metrics).
        classes: label names used as ``target_names`` in the report.
        y_test: true label indicator matrix.
        y_pred: predicted label indicator matrix.
    """
    start = datetime.now()

    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Hamming loss ", hamming_loss(y_test, y_pred))

    averages = (
        ("micro", "Micro-average quality numbers"),
        ("macro", "Macro-average quality numbers"),
    )
    for average, header in averages:
        # zero_division=0 matches the classification report below: undefined
        # scores still count as 0 but no longer emit a warning.
        precision = precision_score(y_test, y_pred, average=average, zero_division=0)
        recall = recall_score(y_test, y_pred, average=average, zero_division=0)
        f1 = f1_score(y_test, y_pred, average=average, zero_division=0)

        print(header)
        print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

    print(classification_report(y_test, y_pred, target_names=classes, zero_division=0))

    print(f"evaluate > Time taken to run this cell : {datetime.now() - start}\n")

***
## 7.1 K Neighbors Classifier with MultiOutput Classifier

too greedy...

In [None]:
if config["supervised"]["knc"]:
    param_grid = {
        "estimator__n_neighbors": [5],
        "estimator__n_jobs": [10],
        "n_jobs": [10]
    }

    model = KNeighborsClassifier()
    # The wrapper must be an instance built around the base estimator; passing
    # the MultiOutputClassifier class itself cannot be fitted or grid-searched.
    meta_model = MultiOutputClassifier(model)
    gs, classes, y_test, y_pred, vectorizer = classifier_tuning(data, meta_model, model, param_grid)
    evaluate(gs, classes, y_test, y_pred)

***
## 7.2 Decision Tree Classifier with MultiOutput Classifier

In [None]:
if config["supervised"]["dtc"]:
    param_grid = {
        "estimator__max_depth": [50],
        "estimator__criterion": ["entropy"],
        "estimator__random_state": [0],
        "n_jobs": [10]
    }

    model = DecisionTreeClassifier()
    # The wrapper must be an instance built around the base estimator; passing
    # the MultiOutputClassifier class itself cannot be fitted or grid-searched.
    meta_model = MultiOutputClassifier(model)
    gs, classes, y_test, y_pred, vectorizer = classifier_tuning(data, meta_model, model, param_grid)
    evaluate(gs, classes, y_test, y_pred)

***
## 7.3 Random Forest Classifier with MultiOutput Classifier

too long...

In [None]:
if config["supervised"]["rfc"]:
    param_grid = {
        "estimator__n_estimators": [100],
        "estimator__max_depth": [50],
        "estimator__criterion": ["entropy"],
        "estimator__n_jobs": [10],
        "estimator__random_state": [0],
        "n_jobs": [10]
    }

    model = RandomForestClassifier()
    # The wrapper must be an instance built around the base estimator; passing
    # the MultiOutputClassifier class itself cannot be fitted or grid-searched.
    meta_model = MultiOutputClassifier(model)
    gs, classes, y_test, y_pred, vectorizer = classifier_tuning(data, meta_model, model, param_grid)
    evaluate(gs, classes, y_test, y_pred)

***
## 7.4 SGD with OneVsRest Classifier

In [None]:
if config["supervised"]["sgd"]:
    # One-vs-rest linear classifier trained with SGD; single-candidate grid.
    model = SGDClassifier()
    meta_model = OneVsRestClassifier(model)

    # NOTE(review): recent scikit-learn releases name this loss "log_loss"
    # instead of "log" -- confirm against the installed version.
    param_grid = {
        "estimator__loss": ["log"],
        "estimator__alpha": [0.00001],
        "estimator__penalty": ["l1"],
        "estimator__n_jobs": [10],
        "estimator__random_state": [0],
        "n_jobs": [10],
    }

    gs, classes, y_test, y_pred, vectorizer = classifier_tuning(
        dataset=data,
        meta_model=meta_model,
        model=model,
        param_grid=param_grid,
    )
    evaluate(gs, classes, y_test, y_pred)

***
## 7.5 Logistic Regression with OneVsRest Classifier

In [None]:
# tfidf_data, vectorizer = tfidf(data["Sentence"], min_df=0.001, max_df=1.0)
# tfidf_data.info()

tfidf.shape = 2400 * 470000

In [None]:
if config["supervised"]["lgc"]:
    # One-vs-rest logistic regression with balanced class weights. The grid
    # only sets the wrapper's n_jobs; the other candidates are commented out.
    model = LogisticRegression(solver="liblinear", class_weight="balanced")
    meta_model = OneVsRestClassifier(model)

    param_grid = {
        # "estimator__max_iter": [1000],
        # "estimator__solver": ["liblinear"],
        # "estimator__penalty": ["l1"],
        # "estimator__random_state": [0],
        "n_jobs": [10],
    }

    gs, classes, y_test, y_pred, vectorizer = classifier_tuning(
        dataset=data,
        meta_model=meta_model,
        model=model,
        param_grid=param_grid,
    )
    evaluate(gs, classes, y_test, y_pred)

In [None]:
if config["supervised"]["lgc"]:
    # NOTE(review): this cell is gated by the "lgc" flag although it trains an
    # SGDClassifier, and it rebinds gs / classes / vectorizer produced by the
    # previous cell -- confirm this is intended.
    model = SGDClassifier(max_iter=1000, tol=1e-3, alpha=20, loss="modified_huber", class_weight="balanced")
    meta_model = OneVsRestClassifier(model)

    # Empty grid: every candidate is commented out.
    param_grid = {
        # "classifier__solver": ["liblinear"],
        # "classifier__penalty": ["l1"],
        # "classifier__random_state": [0],
    }

    gs, classes, y_test, y_pred, vectorizer = classifier_tuning(
        dataset=data,
        meta_model=meta_model,
        model=model,
        param_grid=param_grid,
    )
    evaluate(gs, classes, y_test, y_pred)

***
## 7.6 Serializing the model

In [None]:
# The trailing `and False` keeps this cell disabled whatever the config says.
if config["supervised"]["lgc"] and False:
    # Persist the tuned model, the fitted vectorizer and the label names.
    artifacts = (
        ("model.pkl", gs.best_estimator_),
        ("tfidf_vectorizer.pkl", vectorizer),
        ("labels.pkl", classes),
    )
    for path, payload in artifacts:
        with open(path, "wb") as handle:
            pickle.dump(payload, handle, pickle.HIGHEST_PROTOCOL)