# 0 Imports

In [None]:
from enum import Enum

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

import nltk

# Seed NumPy's global random generator with a fixed value.
np.random.seed(0)

# Lift pandas' display limits on columns, rows and cell width (None = no limit).
pd.set_option(
    "display.max_columns", None,
    "display.max_rows", None,
    "display.max_colwidth", None,
)


***
# 1 Config

In [None]:
# Switches read by the cells below to decide which steps run.
config = dict(
    bag=False,           # bag-of-words preview cell
    tfidf=False,         # TF-IDF preview cell
    unsupervised=True,   # LDA grid-search cell
    supervised=False,    # PCA / random-forest cells
)

***
# 2 Data Loading

In [None]:
# Load the cleaned dataset, using its "Id" column as the index.
data = pd.read_csv("data/data_cleaned.csv", index_col="Id")

# Each "Tags" cell is passed through eval to rebuild a Python object from its
# text form (presumably a list of tag names, since rows are concatenated as
# lists further down -- confirm against the CSV).
# NOTE(review): eval executes whatever expression the CSV contains; if the file
# is not fully trusted, ast.literal_eval is the safe way to parse literals.
data["Tags"] = data["Tags"].apply(eval)
# The remaining list-like columns are left unparsed for now:
# data["Tokens"] = data["Tokens"].apply(eval)
# data["POS"] = data["POS"].apply(eval)
# data["Lemmatized"] = data["Lemmatized"].apply(eval)
# data["LemmaAndStem"] = data["LemmaAndStem"].apply(eval)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

***
# 3 Tags

In [None]:
# Preview only the "Tags" column for the first rows.
data[["Tags"]].head()

In [None]:
# Raw array of the per-row "Tags" values.
data.Tags.values

In [None]:
# Collect every distinct tag that appears in any row's tag collection.
tags = list({tag for row in data.Tags.values for tag in row})
# Peek at the first ten entries.
tags[:10]

In [None]:
# Number of distinct tags collected above.
len(tags)

***
# 4 Bag-Of-Words

In [None]:
def bow(dataset, max_features=None, min_df=0.0, max_df=1.0):
    """Build a bag-of-words count table from a collection of documents.

    Args:
        dataset: Iterable of raw text documents passed to
            ``CountVectorizer.fit_transform``.
        max_features: Forwarded to ``CountVectorizer``.
        min_df: Forwarded to ``CountVectorizer``.
        max_df: Forwarded to ``CountVectorizer``.

    Returns:
        A DataFrame of term counts whose columns are the fitted vocabulary.
        The frame gets a fresh default index; the input's index is not kept.
    """
    vectorizer_options = dict(
        analyzer="word",
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        max_features=max_features,
        max_df=max_df,
        min_df=min_df,
    )
    counter = CountVectorizer(**vectorizer_options)
    counts = counter.fit_transform(dataset)
    # The sparse result is densified before being wrapped in a DataFrame.
    return pd.DataFrame(
        data=counts.toarray(),
        columns=counter.get_feature_names_out(),
    )

In [None]:
# Preview of the bag-of-words table; stays None when the "bag" step is off.
# NOTE(review): in a notebook session the name `display` presumably shadows
# IPython's display helper -- consider renaming.
display = None
if config["bag"]:
    # bow() expects the documents themselves as its first argument (its second
    # positional parameter is max_features), so pass the "Sentence" column.
    bag = bow(data["Sentence"])
    display = bag.iloc[:5, :20]
display

***
# 5 TF-IDF

In [None]:
def tfidf(dataset, feature, max_features=None):
    """Build a TF-IDF table from one text column of ``dataset``.

    Args:
        dataset: Object indexable by ``feature`` (e.g. a DataFrame) whose
            selected column holds the raw documents.
        feature: Key of the text column to vectorize.
        max_features: Forwarded to ``TfidfVectorizer``.

    Returns:
        A DataFrame of TF-IDF weights whose columns are the fitted vocabulary.
        The frame gets a fresh default index; the input's index is not kept.
    """
    weigher = TfidfVectorizer(
        tokenizer=None,
        stop_words=None,
        max_features=max_features,
    )
    documents = dataset[feature]
    # The sparse result is densified before being wrapped in a DataFrame.
    weights = weigher.fit_transform(documents).toarray()
    terms = weigher.get_feature_names_out()
    return pd.DataFrame(data=weights, columns=terms)

In [None]:
# Preview of the TF-IDF table; stays None when the "tfidf" step is off.
display = None
if config["tfidf"]:
    # Store the result under its own name: assigning it to `tfidf` would
    # replace the tfidf() function and break every later call to it.
    data_tfidf = tfidf(data, "Sentence")
    display = data_tfidf.iloc[:5, :20]
display

***
# 6 Unsupervised

## 6.0 Utils

In [None]:
def latent_dirichlet_allocation(dataset: pd.DataFrame, n_topics: int, max_iter=5, learning_offset=50, max_features=None):
    """Fit an online LDA topic model on a document-term frame.

    Args:
        dataset: Document-term DataFrame; its columns are returned as the
            feature names.
        n_topics: Number of topics (``n_components``).
        max_iter: Forwarded to ``LatentDirichletAllocation``.
        learning_offset: Forwarded to ``LatentDirichletAllocation``.
        max_features: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(fitted model, feature names)``.
    """
    feature_names = dataset.columns

    lda = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=max_iter,
        learning_method="online",
        learning_offset=learning_offset,
        random_state=0,
    )
    # Fit on the argument, not on the notebook-level `data` frame.
    lda.fit(dataset)
    return lda, feature_names

In [None]:
# https://blog.mlreview.com/topic-modeling-with-scikit-learn-e80d33668730
#
def display_topics(model, feature_names, no_top_words=10):
    """Print each topic's index followed by its highest-weighted terms.

    Args:
        model: Object exposing ``components_``, iterated one row per topic.
        feature_names: Sequence mapping a column position to its term.
        no_top_words: How many terms to print per topic, strongest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it to rank the strongest terms first.
        ranked = weights.argsort()[::-1]
        top_terms = [feature_names[i] for i in ranked[:no_top_words]]
        print(f"Topic {topic_idx}")
        print(" ".join(top_terms))

In [None]:
# https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#9buildldamodelwithsklearn
#
def latent_dirichlet_allocation_tuning(dataset: pd.DataFrame, param_grid: dict):
    """Grid-search LDA hyper-parameters on a bag-of-words view of ``dataset``.

    Args:
        dataset: Documents handed to ``bow`` (built with ``min_df=.005``).
        param_grid: Parameter grid handed to ``GridSearchCV``.

    Returns:
        Tuple ``(fitted GridSearchCV, bag-of-words column names)``.
    """
    counts = bow(dataset, min_df=.005)
    search = GridSearchCV(LatentDirichletAllocation(), param_grid)
    search.fit(counts)
    return search, counts.columns

***
## 6.1 Linear Discriminant Analysis

***
## 6.2 Latent Dirichlet Allocation

In [None]:
# Bag-of-words counts for the "Sentence" column, with min_df=.005 and max_df=1.0
# passed through to CountVectorizer. This cell runs regardless of the config flags.
data_bow = bow(data["Sentence"], min_df=.005, max_df=1.0)

In [None]:
# Print pandas' summary of the bag-of-words frame.
data_bow.info()

In [None]:
if config["unsupervised"]:
    # Single-point grid: one candidate value per LDA parameter.
    param_grid = {
        "n_components": [10],
        "learning_decay": [.7],
        "random_state": [0],
        "n_jobs": [10]
    }

    # Keep the fitted search and vocabulary; the call's result was previously
    # discarded, so the grid search could not be inspected afterwards.
    gs_lda, feature_names_lda = latent_dirichlet_allocation_tuning(data["Sentence"], param_grid)

In [None]:
# lda, feature_names = lda(data, "Sentence", FEATURE_EXTRACTION.TFIDF, 20, 1000)

In [None]:
# display_topics(lda, feature_names, 20)

***
# 7 Supervised

## 7.0 Utils

In [None]:
def scree_plot(dataset, figsize=(15, 5)):
    """Fit a full PCA on ``dataset`` and draw its scree plot.

    Bars show each component's explained-variance ratio; the line with
    markers shows the running (cumulative) total.

    Args:
        dataset: Data handed to ``PCA.fit``.
        figsize: Size of the matplotlib figure.

    Returns:
        The fitted ``PCA`` instance.
    """
    pca = PCA()
    pca.fit(dataset)

    plt.figure(figsize=figsize)
    explain_variance = pd.Series(pca.explained_variance_ratio_)
    explain_variance.plot(kind="bar", alpha=0.7)

    # Running total of the ratios, drawn over the bars.
    explain_variance.cumsum().plot(marker="o", alpha=0.7)

    plt.xlabel("Principal Components", fontsize="x-large")
    plt.ylabel("Percentage Variance Explained", fontsize="x-large")
    plt.title("Scree plot", fontsize="xx-large")
    plt.show()

    return pca

In [None]:
def apply_pca(dataset, n_components):
    """Project ``dataset`` with PCA and return the model, scores and loadings.

    Args:
        dataset: Frame to decompose; its ``columns`` label the loadings rows.
        n_components: Forwarded to ``PCA``.

    Returns:
        Tuple ``(pca, pca_data, loadings)``: the fitted PCA, a DataFrame of
        projected rows with columns ``PC1..PCk``, and a DataFrame of
        component loadings indexed by the original columns.
    """
    pca = PCA(n_components=n_components)
    scores = pca.fit_transform(dataset)

    n_kept = scores.shape[1]
    labels = ["PC" + str(position) for position in range(1, n_kept + 1)]

    pca_data = pd.DataFrame(data=scores, columns=labels)
    # components_ is transposed so each original column becomes one row.
    loadings = pd.DataFrame(data=pca.components_.T, columns=labels, index=dataset.columns)
    return pca, pca_data, loadings

In [None]:
# https://www.codementor.io/@agarrahul01/multiclass-classification-using-random-forest-on-scikit-learn-library-hkk4lwawu
# https://www.kaggle.com/patrickaudriaz/random-forests-for-multiclass-classification
# 
def classifier_tuning(dataset: pd.DataFrame, model, param_grid: dict, y=None):
    """Grid-search ``model`` on TF-IDF features of the "Sentence" column.

    Args:
        dataset: Frame with a "Sentence" text column, and a "Tags" column
            when ``y`` is not supplied.
        model: Estimator handed to ``GridSearchCV``.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        y: Optional targets aligned row-for-row with ``dataset``. When
            omitted, a multi-label indicator matrix is built from
            ``dataset["Tags"]``.

    Returns:
        Tuple ``(fitted GridSearchCV, TF-IDF column names)``. The search is
        fitted on an 80% training split; the held-out 20% is not returned.
    """
    X = tfidf(dataset, "Sentence")
    if y is None:
        # NOTE(review): the target was left undefined in the original
        # (`y = ?`). Defaulting to one indicator column per tag assumes each
        # "Tags" cell is an iterable of tag names -- confirm this is the
        # intended target before relying on the results.
        from sklearn.preprocessing import MultiLabelBinarizer

        y = MultiLabelBinarizer().fit_transform(dataset["Tags"])
    feature_names = X.columns

    # Only the training part of the split is used by the search.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.20, random_state=0)

    gs = GridSearchCV(model, param_grid)
    gs.fit(X_train, y_train)

    return gs, feature_names

***
## 7.1 PCA

In [None]:
if config["supervised"]:
    # TF-IDF features of the "Sentence" column.
    data_tfidf = tfidf(data, "Sentence")
    # Scree plot over the first 20 TF-IDF columns only; keeps the fitted PCA.
    pca = scree_plot(data_tfidf.iloc[:, :20])

***
## 7.2 Random Forest Classifier

In [None]:
if config["supervised"]:
    # Estimator whose parameters are searched below.
    model = RandomForestClassifier()

    # Single-point grid: one candidate value per parameter.
    param_grid = dict(
        n_estimators=[10],
        criterion=["entropy"],
        random_state=[0],
    )

    gs, feature_names = classifier_tuning(data, model, param_grid)