# 0 Imports

In [1]:
from enum import Enum

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import multilabel_confusion_matrix, classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier

import nltk

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

np.random.seed(0)


***
# 1 Config

In [2]:
config = {
    "bag": False,
    "tfidf": False,
    "unsupervised": False,
    "supervised": True
}

***
# 2 Data Loading

In [3]:
data = pd.read_csv("data/data_cleaned.csv", index_col="Id")

data["Tags"] = data["Tags"].apply(eval)
# data["Tokens"] = data["Tokens"].apply(eval)
# data["POS"] = data["POS"].apply(eval)
# data["Lemmatized"] = data["Lemmatized"].apply(eval)
# data["LemmaAndStem"] = data["LemmaAndStem"].apply(eval)

In [4]:
data.head()

Unnamed: 0_level_0,Title,Body,Tags,Tokens,POS,Lemmatized,LemmaAndStem,Sentence
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4,how to convert a decimal to a double in c#?,i want to use a to change a 's opacity.\nthis is my code:\n\nwhen i build the application it gives the following error:\n\n\n\ni have tried using and but then the doesn't work. this code worked fine in a past vb.net project.\n,"[c#, floating-point, type-conversion, double, decimal]","['want', 'use', 'change', 'opacity', 'code', 'build', 'application', 'gives', 'following', 'error', 'tried', 'using', 'work', 'code', 'worked', 'fine', 'past', 'vb', '.net', 'project', 'convert', 'decimal', 'double', 'c#']","[('want', 'n'), ('use', 'n'), ('change', 'n'), ('opacity', 'n'), ('code', 'n'), ('build', 'a'), ('application', 'n'), ('gives', 'v'), ('following', 'v'), ('error', 'n'), ('tried', 'v'), ('using', 'v'), ('work', 'n'), ('code', 'n'), ('worked', 'v'), ('fine', 'a'), ('past', 'n'), ('vb', 'a'), ('.net', 'n'), ('project', 'n'), ('convert', 'v'), ('decimal', 'a'), ('double', 'a'), ('c#', 'n')]","['want', 'use', 'change', 'opacity', 'code', 'build', 'application', 'give', 'follow', 'error', 'try', 'use', 'work', 'code', 'work', 'fine', 'past', 'vb', '.net', 'project', 'convert', 'decimal', 'double', 'c#']","['want', 'use', 'chang', 'opac', 'code', 'build', 'applic', 'give', 'follow', 'error', 'tri', 'use', 'work', 'code', 'work', 'fine', 'past', 'vb', '.net', 'project', 'convert', 'decim', 'doubl', 'c#']",want use chang opac code build applic give follow error tri use work code work fine past vb .net project convert decim doubl c#
6,why did the width collapse in the percentage width child element in an absolutely positioned parent on internet explorer 7?,i have an absolutely positioned containing several children one of which is a relatively positioned . when i use a on the child it collapses to on ie7 but not on firefox or safari.\nif i use it works. if the parent is relatively positioned the percentage width on the child works.\n\nis there something i'm missing here?\nis there an easy fix for this besides the on the child?\nis there an area of the css specification that covers this?\n\n,"[html, css, internet-explorer-7]","['absolutely', 'positioned', 'containing', 'several', 'children', 'one', 'relatively', 'positioned', 'use', 'child', 'collapses', 'ie', 'firefox', 'safari', 'use', 'works', 'parent', 'relatively', 'positioned', 'percentage', 'width', 'child', 'works', 'something', 'missing', 'easy', 'fix', 'besides', 'child', 'area', 'css', 'specification', 'covers', 'width', 'collapse', 'percentage', 'width', 'child', 'element', 'absolutely', 'positioned', 'parent', 'internet', 'explorer']","[('absolutely', 'r'), ('positioned', 'v'), ('containing', 'v'), ('several', 'a'), ('children', 'n'), ('one', 'n'), ('relatively', 'r'), ('positioned', 'v'), ('use', 'n'), ('child', 'n'), ('collapses', 'n'), ('ie', 'v'), ('firefox', 'n'), ('safari', 'n'), ('use', 'n'), ('works', 'v'), ('parent', 'n'), ('relatively', 'r'), ('positioned', 'v'), ('percentage', 'n'), ('width', 'n'), ('child', 'n'), ('works', 'v'), ('something', 'n'), ('missing', 'v'), ('easy', 'a'), ('fix', 'a'), ('besides', 'n'), ('child', 'a'), ('area', 'n'), ('css', 'n'), ('specification', 'n'), ('covers', 'v'), ('width', 'a'), ('collapse', 'n'), ('percentage', 'n'), ('width', 'n'), ('child', 'n'), ('element', 'n'), ('absolutely', 'r'), ('positioned', 'v'), ('parent', 'a'), ('internet', 'n'), ('explorer', 'n')]","['absolutely', 'position', 'contain', 'several', 'child', 'one', 'relatively', 'position', 'use', 'child', 'collapse', 'ie', 'firefox', 'safari', 'use', 'work', 'parent', 'relatively', 'position', 'percentage', 'width', 'child', 'work', 'something', 'miss', 'easy', 'fix', 'besides', 'child', 'area', 'cs', 'specification', 'cover', 'width', 'collapse', 'percentage', 'width', 'child', 'element', 'absolutely', 'position', 'parent', 'internet', 'explorer']","['absolut', 'posit', 'contain', 'sever', 'children', 'one', 'relat', 'posit', 'use', 'child', 'collaps', 'ie', 'firefox', 'safari', 'use', 'work', 'parent', 'relat', 'posit', 'percentag', 'width', 'child', 'work', 'someth', 'miss', 'easi', 'fix', 'besid', 'child', 'area', 'css', 'specif', 'cover', 'width', 'collaps', 'percentag', 'width', 'child', 'element', 'absolut', 'posit', 'parent', 'internet', 'explor']",absolut posit contain sever children one relat posit use child collaps ie firefox safari use work parent relat posit percentag width child work someth miss easi fix besid child area css specif cover width collaps percentag width child element absolut posit parent internet explor
9,how do i calculate someone's age based on a datetime type birthday?,given a representing a person's birthday how do i calculate their age in years?\n,"[c#, .net, datetime]","['given', 'representing', 'person', 'birthday', 'calculate', 'age', 'years', 'calculate', 'someone', 'age', 'based', 'datetime', 'type', 'birthday']","[('given', 'v'), ('representing', 'v'), ('person', 'n'), ('birthday', 'a'), ('calculate', 'a'), ('age', 'n'), ('years', 'n'), ('calculate', 'v'), ('someone', 'n'), ('age', 'n'), ('based', 'v'), ('datetime', 'a'), ('type', 'n'), ('birthday', 'n')]","['give', 'represent', 'person', 'birthday', 'calculate', 'age', 'year', 'calculate', 'someone', 'age', 'base', 'datetime', 'type', 'birthday']","['given', 'repres', 'person', 'birthday', 'calcul', 'age', 'year', 'calcul', 'someon', 'age', 'base', 'datetim', 'type', 'birthday']",given repres person birthday calcul age year calcul someon age base datetim type birthday
11,calculate relative time in c#,given a specific value how do i display relative time like:\n\n2 hours ago\n3 days ago\na month ago\n\n,"[c#, datetime, time, datediff, relative-time-span]","['given', 'specific', 'value', 'display', 'relative', 'time', 'like', 'hours', 'ago', 'days', 'ago', 'month', 'ago', 'calculate', 'relative', 'time', 'c#']","[('given', 'v'), ('specific', 'a'), ('value', 'n'), ('display', 'n'), ('relative', 'a'), ('time', 'n'), ('like', 'n'), ('hours', 'n'), ('ago', 'r'), ('days', 'n'), ('ago', 'r'), ('month', 'n'), ('ago', 'r'), ('calculate', 'v'), ('relative', 'a'), ('time', 'n'), ('c#', 'n')]","['give', 'specific', 'value', 'display', 'relative', 'time', 'like', 'hour', 'ago', 'day', 'ago', 'month', 'ago', 'calculate', 'relative', 'time', 'c#']","['given', 'specif', 'valu', 'display', 'relat', 'time', 'like', 'hour', 'ago', 'day', 'ago', 'month', 'ago', 'calcul', 'relat', 'time', 'c#']",given specif valu display relat time like hour ago day ago month ago calcul relat time c#
13,determine a user's timezone,is there a standard way for a web server to be able to determine a user's timezone within a web page? \nperhaps from an http header or part of the string?\n,"[html, browser, timezone, user-agent, timezone-offset]","['standard', 'way', 'web', 'server', 'able', 'determine', 'user', 'timezone', 'within', 'web', 'page', 'perhaps', 'http', 'header', 'part', 'string', 'determine', 'user', 'timezone']","[('standard', 'a'), ('way', 'n'), ('web', 'n'), ('server', 'n'), ('able', 'a'), ('determine', 'n'), ('user', 'n'), ('timezone', 'n'), ('within', 'n'), ('web', 'a'), ('page', 'n'), ('perhaps', 'r'), ('http', 'a'), ('header', 'a'), ('part', 'n'), ('string', 'v'), ('determine', 'a'), ('user', 'n'), ('timezone', 'n')]","['standard', 'way', 'web', 'server', 'able', 'determine', 'user', 'timezone', 'within', 'web', 'page', 'perhaps', 'http', 'header', 'part', 'string', 'determine', 'user', 'timezone']","['standard', 'way', 'web', 'server', 'abl', 'determin', 'user', 'timezon', 'within', 'web', 'page', 'perhap', 'http', 'header', 'part', 'string', 'determin', 'user', 'timezon']",standard way web server abl determin user timezon within web page perhap http header part string determin user timezon


***
# 3 Tags

In [5]:
data[["Tags"]].head()

Unnamed: 0_level_0,Tags
Id,Unnamed: 1_level_1
4,"[c#, floating-point, type-conversion, double, decimal]"
6,"[html, css, internet-explorer-7]"
9,"[c#, .net, datetime]"
11,"[c#, datetime, time, datediff, relative-time-span]"
13,"[html, browser, timezone, user-agent, timezone-offset]"


In [6]:
# data["Tags_Sentence"] = data.apply(lambda row: " ".join([str(item) for item in row["Tags"]]), axis="columns")

In [7]:
data.Tags.values

array([list(['c#', 'floating-point', 'type-conversion', 'double', 'decimal']),
       list(['html', 'css', 'internet-explorer-7']),
       list(['c#', '.net', 'datetime']), ...,
       list(['css', 'xhtml', 'refactoring']),
       list(['python', 'class', 'anonymous-class']),
       list(['c', 'linux', 'file', 'io', 'std'])], dtype=object)

In [8]:
tags = []
for row in data.Tags.values:
    tags += row
tags = list(set(tags))
tags[:10]

['externalizing',
 'xargs',
 'jvm-bytecode',
 'codewarrior',
 'prettify',
 'emoticons',
 'qt-designer',
 'ping',
 'dbunit',
 'after-save']

In [9]:
len(tags)

9252

In [10]:
data.head()

Unnamed: 0_level_0,Title,Body,Tags,Tokens,POS,Lemmatized,LemmaAndStem,Sentence
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4,how to convert a decimal to a double in c#?,i want to use a to change a 's opacity.\nthis is my code:\n\nwhen i build the application it gives the following error:\n\n\n\ni have tried using and but then the doesn't work. this code worked fine in a past vb.net project.\n,"[c#, floating-point, type-conversion, double, decimal]","['want', 'use', 'change', 'opacity', 'code', 'build', 'application', 'gives', 'following', 'error', 'tried', 'using', 'work', 'code', 'worked', 'fine', 'past', 'vb', '.net', 'project', 'convert', 'decimal', 'double', 'c#']","[('want', 'n'), ('use', 'n'), ('change', 'n'), ('opacity', 'n'), ('code', 'n'), ('build', 'a'), ('application', 'n'), ('gives', 'v'), ('following', 'v'), ('error', 'n'), ('tried', 'v'), ('using', 'v'), ('work', 'n'), ('code', 'n'), ('worked', 'v'), ('fine', 'a'), ('past', 'n'), ('vb', 'a'), ('.net', 'n'), ('project', 'n'), ('convert', 'v'), ('decimal', 'a'), ('double', 'a'), ('c#', 'n')]","['want', 'use', 'change', 'opacity', 'code', 'build', 'application', 'give', 'follow', 'error', 'try', 'use', 'work', 'code', 'work', 'fine', 'past', 'vb', '.net', 'project', 'convert', 'decimal', 'double', 'c#']","['want', 'use', 'chang', 'opac', 'code', 'build', 'applic', 'give', 'follow', 'error', 'tri', 'use', 'work', 'code', 'work', 'fine', 'past', 'vb', '.net', 'project', 'convert', 'decim', 'doubl', 'c#']",want use chang opac code build applic give follow error tri use work code work fine past vb .net project convert decim doubl c#
6,why did the width collapse in the percentage width child element in an absolutely positioned parent on internet explorer 7?,i have an absolutely positioned containing several children one of which is a relatively positioned . when i use a on the child it collapses to on ie7 but not on firefox or safari.\nif i use it works. if the parent is relatively positioned the percentage width on the child works.\n\nis there something i'm missing here?\nis there an easy fix for this besides the on the child?\nis there an area of the css specification that covers this?\n\n,"[html, css, internet-explorer-7]","['absolutely', 'positioned', 'containing', 'several', 'children', 'one', 'relatively', 'positioned', 'use', 'child', 'collapses', 'ie', 'firefox', 'safari', 'use', 'works', 'parent', 'relatively', 'positioned', 'percentage', 'width', 'child', 'works', 'something', 'missing', 'easy', 'fix', 'besides', 'child', 'area', 'css', 'specification', 'covers', 'width', 'collapse', 'percentage', 'width', 'child', 'element', 'absolutely', 'positioned', 'parent', 'internet', 'explorer']","[('absolutely', 'r'), ('positioned', 'v'), ('containing', 'v'), ('several', 'a'), ('children', 'n'), ('one', 'n'), ('relatively', 'r'), ('positioned', 'v'), ('use', 'n'), ('child', 'n'), ('collapses', 'n'), ('ie', 'v'), ('firefox', 'n'), ('safari', 'n'), ('use', 'n'), ('works', 'v'), ('parent', 'n'), ('relatively', 'r'), ('positioned', 'v'), ('percentage', 'n'), ('width', 'n'), ('child', 'n'), ('works', 'v'), ('something', 'n'), ('missing', 'v'), ('easy', 'a'), ('fix', 'a'), ('besides', 'n'), ('child', 'a'), ('area', 'n'), ('css', 'n'), ('specification', 'n'), ('covers', 'v'), ('width', 'a'), ('collapse', 'n'), ('percentage', 'n'), ('width', 'n'), ('child', 'n'), ('element', 'n'), ('absolutely', 'r'), ('positioned', 'v'), ('parent', 'a'), ('internet', 'n'), ('explorer', 'n')]","['absolutely', 'position', 'contain', 'several', 'child', 'one', 'relatively', 'position', 'use', 'child', 'collapse', 'ie', 'firefox', 'safari', 'use', 'work', 'parent', 'relatively', 'position', 'percentage', 'width', 'child', 'work', 'something', 'miss', 'easy', 'fix', 'besides', 'child', 'area', 'cs', 'specification', 'cover', 'width', 'collapse', 'percentage', 'width', 'child', 'element', 'absolutely', 'position', 'parent', 'internet', 'explorer']","['absolut', 'posit', 'contain', 'sever', 'children', 'one', 'relat', 'posit', 'use', 'child', 'collaps', 'ie', 'firefox', 'safari', 'use', 'work', 'parent', 'relat', 'posit', 'percentag', 'width', 'child', 'work', 'someth', 'miss', 'easi', 'fix', 'besid', 'child', 'area', 'css', 'specif', 'cover', 'width', 'collaps', 'percentag', 'width', 'child', 'element', 'absolut', 'posit', 'parent', 'internet', 'explor']",absolut posit contain sever children one relat posit use child collaps ie firefox safari use work parent relat posit percentag width child work someth miss easi fix besid child area css specif cover width collaps percentag width child element absolut posit parent internet explor
9,how do i calculate someone's age based on a datetime type birthday?,given a representing a person's birthday how do i calculate their age in years?\n,"[c#, .net, datetime]","['given', 'representing', 'person', 'birthday', 'calculate', 'age', 'years', 'calculate', 'someone', 'age', 'based', 'datetime', 'type', 'birthday']","[('given', 'v'), ('representing', 'v'), ('person', 'n'), ('birthday', 'a'), ('calculate', 'a'), ('age', 'n'), ('years', 'n'), ('calculate', 'v'), ('someone', 'n'), ('age', 'n'), ('based', 'v'), ('datetime', 'a'), ('type', 'n'), ('birthday', 'n')]","['give', 'represent', 'person', 'birthday', 'calculate', 'age', 'year', 'calculate', 'someone', 'age', 'base', 'datetime', 'type', 'birthday']","['given', 'repres', 'person', 'birthday', 'calcul', 'age', 'year', 'calcul', 'someon', 'age', 'base', 'datetim', 'type', 'birthday']",given repres person birthday calcul age year calcul someon age base datetim type birthday
11,calculate relative time in c#,given a specific value how do i display relative time like:\n\n2 hours ago\n3 days ago\na month ago\n\n,"[c#, datetime, time, datediff, relative-time-span]","['given', 'specific', 'value', 'display', 'relative', 'time', 'like', 'hours', 'ago', 'days', 'ago', 'month', 'ago', 'calculate', 'relative', 'time', 'c#']","[('given', 'v'), ('specific', 'a'), ('value', 'n'), ('display', 'n'), ('relative', 'a'), ('time', 'n'), ('like', 'n'), ('hours', 'n'), ('ago', 'r'), ('days', 'n'), ('ago', 'r'), ('month', 'n'), ('ago', 'r'), ('calculate', 'v'), ('relative', 'a'), ('time', 'n'), ('c#', 'n')]","['give', 'specific', 'value', 'display', 'relative', 'time', 'like', 'hour', 'ago', 'day', 'ago', 'month', 'ago', 'calculate', 'relative', 'time', 'c#']","['given', 'specif', 'valu', 'display', 'relat', 'time', 'like', 'hour', 'ago', 'day', 'ago', 'month', 'ago', 'calcul', 'relat', 'time', 'c#']",given specif valu display relat time like hour ago day ago month ago calcul relat time c#
13,determine a user's timezone,is there a standard way for a web server to be able to determine a user's timezone within a web page? \nperhaps from an http header or part of the string?\n,"[html, browser, timezone, user-agent, timezone-offset]","['standard', 'way', 'web', 'server', 'able', 'determine', 'user', 'timezone', 'within', 'web', 'page', 'perhaps', 'http', 'header', 'part', 'string', 'determine', 'user', 'timezone']","[('standard', 'a'), ('way', 'n'), ('web', 'n'), ('server', 'n'), ('able', 'a'), ('determine', 'n'), ('user', 'n'), ('timezone', 'n'), ('within', 'n'), ('web', 'a'), ('page', 'n'), ('perhaps', 'r'), ('http', 'a'), ('header', 'a'), ('part', 'n'), ('string', 'v'), ('determine', 'a'), ('user', 'n'), ('timezone', 'n')]","['standard', 'way', 'web', 'server', 'able', 'determine', 'user', 'timezone', 'within', 'web', 'page', 'perhaps', 'http', 'header', 'part', 'string', 'determine', 'user', 'timezone']","['standard', 'way', 'web', 'server', 'abl', 'determin', 'user', 'timezon', 'within', 'web', 'page', 'perhap', 'http', 'header', 'part', 'string', 'determin', 'user', 'timezon']",standard way web server abl determin user timezon within web page perhap http header part string determin user timezon


***
# 4 Bag-Of-Words

In [11]:
def bow(dataset, max_features=None, min_df=0.0, max_df=1.0):
    vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, max_features=max_features, max_df=max_df, min_df=min_df)
    matrix = vectorizer.fit_transform(dataset).toarray()
    vocab = vectorizer.get_feature_names_out()
    bag = pd.DataFrame(data=matrix, columns=vocab)
    return bag

In [12]:
display = None
if config["bag"]:
    bag = bow(data, "Sentence")
    display = bag.iloc[:5, :20]
display

***
# 5 TF-IDF

In [13]:
def tfidf(dataset, feature, max_features=None):
    vectorizer = TfidfVectorizer(tokenizer=None, stop_words=None, max_features=max_features)
    matrix = vectorizer.fit_transform(dataset[feature]).toarray()
    vocab = vectorizer.get_feature_names_out()
    tfidf = pd.DataFrame(data=matrix, columns=vocab)
    return tfidf

In [14]:
display = None
if config["tfidf"]:
    tfidf = tfidf(data, "Sentence")
    display = tfidf.iloc[:5, :20]
display

***
# 6 Unsupervised

## 6.0 Utils

In [15]:
def latent_dirichlet_allocation(dataset: pd.DataFrame, n_topics: int, max_iter=5, learning_offset=50, max_features=None):
    feature_names = dataset.columns

    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter, learning_method="online", learning_offset=learning_offset, random_state=0)
    lda.fit(data)
    return lda, feature_names

In [16]:
# https://blog.mlreview.com/topic-modeling-with-scikit-learn-e80d33668730
#
def display_topics(model, feature_names, no_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}")
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [17]:
# https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#9buildldamodelwithsklearn
#
def latent_dirichlet_allocation_tuning(dataset: pd.DataFrame, param_grid: dict):
    data_bow = bow(dataset, min_df=.005)
    feature_names = data_bow.columns

    lda = LatentDirichletAllocation()
    gs = GridSearchCV(lda, param_grid)
    gs.fit(data_bow)

    return gs, feature_names

***
## 6.1 Linear Discriminant Analysis

***
## 6.2 Latent Dirichlet Allocation

In [18]:
if config["unsupervised"]:
    data_bow = bow(data["Sentence"], min_df=.005, max_df=1.0)

In [19]:
if config["unsupervised"]:
    data_bow.info()

In [20]:
if config["unsupervised"]:
    param_grid = {
        "n_components": [10],
        "learning_decay": [.7],
        "random_state": [0],
        "n_jobs": [10]
    }

    gs, feature_names = latent_dirichlet_allocation_tuning(data["Sentence"], param_grid)

In [21]:
if config["unsupervised"]:
    display_topics(gs.best_estimator_, feature_names, 20)

***
# 7 Supervised

## 7.0 Utils

In [22]:
def scree_plot(dataset, figsize=(15, 5)):
    pca = PCA()
    pca.fit(dataset)

    plt.figure(figsize=figsize)
    explain_variance = pd.Series(pca.explained_variance_ratio_)
    explain_variance.plot(kind="bar", alpha=0.7)

    total = 0
    var_ls = []
    for x in explain_variance:
        total = total + x
        var_ls.append(total)

    pd.Series(var_ls).plot(marker="o", alpha=0.7)
    plt.xlabel("Principle Components", fontsize="x-large")
    plt.ylabel("Percentage Variance Explained", fontsize="x-large")
    plt.title("Scree plot", fontsize="xx-large")
    plt.show()

    return pca

In [23]:
def apply_pca(dataset, n_components):
    pca = PCA(n_components=n_components)
    pca_data = pca.fit_transform(dataset)
    components_name = [f"PC{i+1}" for i in range(pca_data.shape[1])]
    pca_data = pd.DataFrame(data=pca_data, columns=components_name)
    loadings = pd.DataFrame(
        data=pca.components_.T,
        columns=components_name,
        index=dataset.columns)
    return pca, pca_data, loadings

In [48]:
# https://www.codementor.io/@agarrahul01/multiclass-classification-using-random-forest-on-scikit-learn-library-hkk4lwawu
# https://www.kaggle.com/patrickaudriaz/random-forests-for-multiclass-classification
# 
def classifier_tuning(dataset: pd.DataFrame, model, param_grid: dict, scoring: str = "neg_root_mean_squared_error"):
    X = bow(dataset["Sentence"], min_df=.005, max_df=1.0)

    # target multi label binarizer
    multi_label_binarizer = MultiLabelBinarizer()
    y = multi_label_binarizer.fit_transform(dataset["Tags"])

    feature_names = X.columns
    classes = multi_label_binarizer.classes_

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

    # multioutput
    m_out = MultiOutputClassifier(model)
    m_out.fit(X_train, y_train)

    # gridsearch tuning/fitting
    gs = GridSearchCV(m_out, param_grid, scoring=scoring, refit=True)
    gs.fit(X_train, y_train)

    # evaluate
    print(f"Score: {gs.score(X_test, y_test):.4}")
    print(f"Best params: {gs.best_params_}")

    # advanced evaluation
    best_model = gs.best_estimator_
    y_pred = best_model.predict(X_test)

    conf_matrix = multilabel_confusion_matrix(y_test, y_pred)
    classif_report = classification_report(y_test, y_pred)

    return gs, feature_names, classes, conf_matrix, classif_report

***
## 7.1 PCA

In [25]:
# if config["supervised"]:
#     data_tfidf = tfidf(data, "Sentence")
#     pca = scree_plot(data_tfidf.iloc[:, :20])

***
## 7.2 Random Forest Classifier

In [49]:
if config["supervised"]:
    param_grid = {
        "estimator__n_estimators": [5],
        "estimator__criterion": ["entropy"],
        "estimator__random_state": [0],
        "n_jobs": [10]
    }

    model = RandomForestClassifier()
    # model = MultinomialNB()
    gs, feature_names, classes, conf_matrix, classif_report = classifier_tuning(data.iloc[:10], model, param_grid)

Score: -0.1597
Best params: {'estimator__criterion': 'entropy', 'estimator__n_estimators': 5, 'estimator__random_state': 0, 'n_jobs': 10}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
