# Train a model using AutoGluon

In [None]:
from autogluon.tabular import TabularDataset
from autogluon.text import TextPredictor
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

import pandas as pd

import os
import pandas as pd
import numpy as np

from autogluon.text import TextPredictor


In [None]:
import os
import pandas as pd
import numpy as np

from autogluon.text import TextPredictor

# Multi-label wrapper: one independent TextPredictor is kept per label column.
class MultiLabelTextPredictor:
    """Multi-label text classifier built from one ``TextPredictor`` per label.

    Each label in ``labels`` gets its own ``TextPredictor`` that is trained on
    the text column plus that single label column. Predictions of the
    individual predictors are stacked column-wise, in the order of ``labels``.
    """

    def __init__(
        self,
        labels: list,
        problem_type: str = None,
        eval_metric: str = None,
        path: str = None,
        verbosity: int = 3,
        warn_if_exist: bool = True,
        text_column: str = "comment_text",
    ):
        """Create one untrained ``TextPredictor`` per label.

        :param labels: names of the label columns; one predictor is built for each
        :param problem_type: forwarded unchanged to every ``TextPredictor``
        :param eval_metric: forwarded unchanged to every ``TextPredictor``
        :param path: root output directory; each predictor uses ``<path>/<label>``.
            When ``None``, ``None`` is forwarded so the library picks its own
            location (NOTE(review): confirm the library default is acceptable)
        :param verbosity: forwarded unchanged to every ``TextPredictor``
        :param warn_if_exist: forwarded unchanged to every ``TextPredictor``
        :param text_column: name of the column holding the input text
        """
        self.labels = labels
        self.text_predictors = dict()
        self.path = path
        self.verbosity = verbosity
        self.warn_if_exist = warn_if_exist
        self.text_column = text_column
        # Not read anywhere in this class; kept so existing attribute access
        # keeps working (it was intended for per-class subsampling).
        self.samples_per_class = 500

        for label in self.labels:
            # os.path.join(None, label) raises TypeError, so only derive a
            # per-label sub-directory when a root path was actually given.
            label_path = os.path.join(path, label) if path is not None else None
            self.text_predictors[label] = TextPredictor(
                label=label,
                problem_type=problem_type,
                eval_metric=eval_metric,
                path=label_path,
                verbosity=verbosity,
                warn_if_exist=warn_if_exist,
            )

    def fit(
        self,
        train_data: pd.DataFrame,
        tuning_data: pd.DataFrame = None,
        time_limit: int = None,
    ) -> None:
        """Fit every per-label predictor, one after another.

        :param train_data: frame containing the text column and all label columns
        :param tuning_data: optional validation frame with the same columns;
            when given, it is forwarded (restricted to text + current label)
            to each predictor instead of being silently ignored
        :param time_limit: forwarded to each predictor's ``fit`` call, so it
            applies per label rather than to the whole loop
        """
        total = len(self.labels)

        # start=1 so the progress message reads 1/N ... N/N.
        for position, label in enumerate(self.labels, start=1):
            print(f"Training a text classifier for class: {label} ({position}/{total})")

            # Each predictor only sees the text and its own target column.
            columns = [self.text_column, label]
            label_tuning_data = None if tuning_data is None else tuning_data[columns]

            self.text_predictors[label].fit(
                train_data=train_data[columns],
                tuning_data=label_tuning_data,
                time_limit=time_limit,
            )

    def predict(self, train_data: pd.DataFrame) -> np.ndarray:
        """Predict all labels for the given rows.

        :param train_data: frame to predict on (despite the name, any frame
            containing the text column works; only that column is used)
        :return: array of shape ``(len(train_data), len(self.labels))`` whose
            column ``i`` holds the output of the predictor for ``self.labels[i]``
        """
        y_pred: np.ndarray = np.zeros((train_data.shape[0], len(self.labels)))

        for i, label in enumerate(self.labels):
            y_pred[:, i] = self.text_predictors[label].predict(
                train_data[[self.text_column]]
            )

        return y_pred

    def load(self, path: str) -> None:
        """Replace every per-label predictor with one loaded from disk.

        :param path: root directory under which each predictor was stored as
            ``<path>/<label>``
        """
        for label in self.labels:
            self.text_predictors[label] = TextPredictor.load(os.path.join(path, label))


In [None]:
# Read the training data from the zip-compressed CSV under data/.
train_df = pd.read_csv("data/train.csv.zip", compression="zip")


In [None]:
# Display the first rows of the training frame as a quick sanity check.
train_df.head()


In [None]:
# Label columns to predict; one binary classifier is trained per entry.
class_labels = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
# Root directory handed to the predictor as its output path.
data_dir = "toxic-multilabel"


In [None]:
# The "id" column is not used for training, so remove it up front.
train_df = train_df.drop(columns=["id"])


In [None]:
# Hold out 20% of the rows for testing, then 10% of the remainder for
# validation. A fixed random_state makes both splits (and therefore every
# metric computed later) reproducible across notebook runs.
train_df, test_df = train_test_split(train_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)


In [None]:
# Wrap each split in a TabularDataset; the three names are rebound in place.
train_df, val_df, test_df = (
    TabularDataset(split) for split in (train_df, val_df, test_df)
)


In [None]:
# Remove the output directory left by previous runs (IPython shell escape).
# The directory name is the same literal that is assigned to `data_dir`.
!rm -rf toxic-multilabel

## Train a MultiLabelTextPredictor

### Init the model

In [None]:
# Build the multi-label wrapper: one text classifier per entry of
# class_labels, each scored with ROC AUC and stored under data_dir.
predictor = MultiLabelTextPredictor(
    path=data_dir,
    labels=class_labels,
    eval_metric="roc_auc",
    # problem_type='binary',  # left unset, so the wrapper forwards None
)


### Train the model

In [None]:
# Train every per-label classifier on the training split, handing the
# validation split to the wrapper as tuning data.
predictor.fit(train_data=train_df, tuning_data=val_df)


In [None]:
# Reload the per-label classifiers from disk. Use data_dir (the same path
# the predictor was constructed with) instead of repeating the literal, so
# the two cannot drift apart.
predictor.load(path=data_dir)


## Evaluate the model

In [None]:
# Predict on the held-out split; the result has one column per label,
# in the order the labels were given to the predictor.
y_test_pred = predictor.predict(test_df)


In [None]:
# NOTE(review): y_test_pred comes from `predict`, so ROC AUC is computed from
# predicted labels rather than scores/probabilities - confirm this is intended.
print(roc_auc_score(test_df[class_labels], y_test_pred))
# target_names labels each report row with its class name instead of 0..5.
print(
    classification_report(
        test_df[class_labels], y_test_pred, target_names=class_labels
    )
)


## Predict real test samples
(samples whose true labels we don't know)

### Load data

In [None]:
# Read the test data from the zip-compressed CSV under data/.
real_test_df = pd.read_csv("data/test.csv.zip", compression="zip")


In [None]:
# Predict every label for the test rows; yields one column per label.
predicted_toxic = predictor.predict(real_test_df)


In [None]:
# Turn the prediction array into a frame with one named column per label,
# then attach the ids of the rows the predictions were made for.
predicted_toxic_df = pd.DataFrame(predicted_toxic, columns=class_labels).assign(
    id=real_test_df["id"]
)


In [None]:
# Preview the output with "id" first, followed by the label columns. The
# column list is derived from class_labels rather than repeated as literals.
predicted_toxic_df[["id", *class_labels]].head()


In [None]:
# Write the predictions with "id" first, followed by the label columns.
# The column list is derived from class_labels rather than repeated as
# literals; the row index is not written.
predicted_toxic_df[["id", *class_labels]].to_csv(
    "toxic-challenge-autogluon.csv", index=False
)
