In [2]:
from autogluon.tabular import TabularDataset
from autogluon.text import TextPredictor
from sklearn.model_selection import train_test_split
import pandas as pd

import os
import pandas as pd
import numpy as np

from autogluon.text import TextPredictor


In [3]:
import os
import pandas as pd
import numpy as np

from autogluon.text import TextPredictor


class MultiLabelTextPredictor:
    """Multi-label text classifier built from one ``TextPredictor`` per label.

    Every entry of ``labels`` gets its own independent ``TextPredictor`` that is
    trained on the text column plus that single label column. Predictions are
    assembled column-wise, in the order of ``labels``.
    """

    def __init__(self,
                 labels: list,
                 problem_type: str = None,
                 eval_metric: str = None,
                 path: str = None,
                 verbosity: int = 3,
                 warn_if_exist: bool = True,
                 text_column: str = 'comment_text'):
        """Create one untrained ``TextPredictor`` per label.

        :param labels: names of the label columns; one predictor is built per name
        :param problem_type: forwarded unchanged to every ``TextPredictor``
        :param eval_metric: forwarded unchanged to every ``TextPredictor``
        :param path: root directory; each predictor is given ``<path>/<label>``.
            When ``None``, ``None`` is forwarded to every ``TextPredictor``.
        :param verbosity: forwarded unchanged to every ``TextPredictor``
        :param warn_if_exist: forwarded unchanged to every ``TextPredictor``
        :param text_column: name of the column that holds the input text
        """
        self.labels = labels
        self.text_predictors = dict()
        self.path = path
        self.verbosity = verbosity
        self.warn_if_exist = warn_if_exist
        self.text_column = text_column
        # Kept for backward compatibility; no method of this class reads it.
        self.samples_per_class = 500

        for label in self.labels:
            self.text_predictors[label] = TextPredictor(label=label,
                                                        problem_type=problem_type,
                                                        eval_metric=eval_metric,
                                                        path=self._label_path(path, label),
                                                        verbosity=verbosity,
                                                        warn_if_exist=warn_if_exist)

    @staticmethod
    def _label_path(path, label):
        """Return the per-label sub-directory of ``path``, or ``None`` if ``path`` is ``None``."""
        # os.path.join(None, label) raises TypeError, so the None default must be guarded.
        return None if path is None else os.path.join(path, label)

    def fit(self, train_data: pd.DataFrame,
            tuning_data: pd.DataFrame = None, time_limit: int = None) -> None:
        """Fit every per-label predictor, one after another.

        :param train_data: frame containing the text column and all label columns
        :param tuning_data: optional frame with the same columns; the text/label
            slice of it is forwarded to each predictor as its tuning data
        :param time_limit: passed unchanged to every per-label ``fit`` call
        """
        total = len(self.labels)
        for position, label in enumerate(self.labels, start=1):
            print(f'Training a text classifier for class: {label} ({position}/{total})')

            # Each predictor only sees the text and its own label column.
            columns = [self.text_column, label]
            label_tuning_data = None if tuning_data is None else tuning_data[columns]

            self.text_predictors[label].fit(train_data=train_data[columns],
                                            tuning_data=label_tuning_data,
                                            time_limit=time_limit)

    def predict(self, train_data: pd.DataFrame) -> np.ndarray:
        """Predict every label for each row of ``train_data``.

        :param train_data: frame containing the text column (the parameter name is
            historical; any frame with that column can be passed)
        :return: float array of shape ``(len(train_data), len(self.labels))`` whose
            column ``i`` holds the output of the predictor for ``self.labels[i]``
        """
        y_pred: np.ndarray = np.zeros((train_data.shape[0], len(self.labels)))

        for i, label in enumerate(self.labels):
            # Only the text column is handed to the underlying predictor.
            y_pred[:, i] = self.text_predictors[label].predict(train_data[[self.text_column]])

        return y_pred

    def load(self, path: str) -> None:
        """Replace every per-label predictor with one loaded from disk.

        :param path: root directory under which ``<path>/<label>`` holds the
            saved predictor for each label
        """
        for label in self.labels:
            self.text_predictors[label] = TextPredictor.load(os.path.join(path, label))


  and should_run_async(code)


In [104]:
# Load the training set from the zipped CSV under data/.
train_df = pd.read_csv('data/train.csv.zip', compression='zip')

In [105]:
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
# Label columns of the training frame; one classifier is built per entry.
class_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Directory passed to the predictor as `path`; per-label models live in sub-directories of it.
data_dir = 'toxic-multilabel'

In [107]:
# Drop the 'id' column before modelling. errors='ignore' keeps this cell
# re-runnable: train_df is reassigned here, so a second run would otherwise raise
# KeyError because 'id' is already gone.
train_df = train_df.drop(columns=['id'], errors='ignore')

In [108]:
# Hold out 20% for testing, then 10% of the remainder for validation.
# A fixed random_state makes the splits (and every metric computed from them)
# reproducible across kernel restarts.
train_df, test_df = train_test_split(train_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

In [109]:
# Wrap each split in a TabularDataset, keeping the same variable names.
train_df, val_df, test_df = (
    TabularDataset(split) for split in (train_df, val_df, test_df)
)

In [110]:
# WARNING: destructive. Removes the 'toxic-multilabel' directory (the value of
# data_dir), i.e. any models previously saved there, so training starts clean.
!rm -rf toxic-multilabel

In [111]:
train_df.columns

Index(['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [7]:
# One TextPredictor per entry in class_labels, each evaluated with ROC AUC.
# problem_type is left unset (the explicit 'binary' override stays disabled).
predictor = MultiLabelTextPredictor(
    labels=class_labels,
    # problem_type='binary',
    eval_metric='roc_auc',
    path=data_dir,
)



In [None]:
# Train one classifier per label; val_df is handed over as the tuning data.
predictor.fit(
    train_data=train_df,
    tuning_data=val_df,
)

Training a text classifier for class: toxic (0/6)


Problem Type="binary"
Column Types:
   - "comment_text": text
   - "toxic": categorical

The GluonNLP V0 backend is used. We will use 8 cpus and 1 gpus to train each trial.


All Logs will be saved to /home/jupyter/toxic-multilabel/toxic/task0/training.log


Fitting and transforming the train data...
Done! Preprocessor saved to /home/jupyter/toxic-multilabel/toxic/task0/preprocessor.pkl
Process dev set...
Done!
Max length for chunking text: 320, Stochastic chunk: Train-False/Test-False, Test #repeat: 1.
#Total Params/Fixed Params=108990466/0
Using gradient accumulation. Global batch size = 128
Local training results will be saved to /home/jupyter/toxic-multilabel/toxic/task0/results_local.jsonl.
[Iter 44/8790, Epoch 0] train loss=2.85e-01, gnorm=3.50e+00, lr=5.01e-06, #samples processed=5632, #sample per second=25.44. ETA=733.46min
[Iter 88/8790, Epoch 0] train loss=1.71e-01, gnorm=2.38e+00, lr=1.00e-05, #samples processed=5632, #sample per second=25.08. ETA=734.98min
[Iter 132/8790, Epoch 0] train loss=1.45e-01, gnorm=2.07e+00, lr=1.50e-05, #samples processed=5632, #sample per second=25.12. ETA=732.58min
[Iter 176/8790, Epoch 0] train loss=1.49e-01, gnorm=1.12e+00, lr=2.00e-05, #samples processed=5632, #sample per second=25.29. ETA=728.28

In [8]:
# Reload the saved per-label models from the same directory the predictor was
# constructed with, instead of repeating the literal path.
predictor.load(path=data_dir)

NumPy-shape semantics has been activated in your code. This is required for creating and manipulating scalar and zero-size tensors, which were not supported in MXNet before, as in the official NumPy library. Please DO NOT manually deactivate this semantics while using `mxnet.numpy` and `mxnet.numpy_extension` modules.


In [9]:
# Inspect the attributes of the reloaded 'severe_toxic' predictor.
vars(predictor.text_predictors['severe_toxic'])

{'verbosity': None,
 '_label': 'severe_toxic',
 '_problem_type': 'binary',
 '_eval_metric': 'roc_auc',
 '_path': 'toxic-multilabel/severe_toxic/',
 '_model': <autogluon.text.text_prediction.mx.models.MultiModalTextModel at 0x7f0ff8bd5910>,
 '_fit_called': False,
 '_backend': 'gluonnlp_v0'}

In [None]:
# Per-label predictions for the held-out split; columns follow the order of class_labels.
y_test_pred = predictor.predict(test_df)

In [None]:
y_test_pred

In [None]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

In [None]:
# Score the hold-out predictions against the true label columns.
# NOTE(review): y_test_pred comes from predictor.predict(); the array printed for
# the competition test set contains only 0./1. values, so this ROC AUC is
# presumably computed from hard labels rather than scores — confirm, and consider
# scoring probabilities instead.
print(roc_auc_score(test_df[class_labels], y_test_pred))
print(classification_report(test_df[class_labels], y_test_pred))

In [10]:
# Load the competition test set from the zipped CSV under data/.
real_test_df = pd.read_csv('data/test.csv.zip', compression='zip')

In [None]:
# Predict every label for the competition test set.
# Assumes real_test_df contains the predictor's text column ('comment_text' by default) — TODO confirm.
predicted_toxic = predictor.predict(real_test_df)

In [12]:
predicted_toxic

  and should_run_async(code)


array([[1., 0., 1., 0., 1., 1.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [None]:
# Label the prediction columns and attach the ids of the test rows.
predicted_toxic_df = pd.DataFrame(
    predicted_toxic,
    columns=class_labels,
).assign(id=real_test_df['id'])

In [None]:
predicted_toxic_df.columns

In [None]:
# Preview the submission layout: id first, then the labels in class_labels order
# (derived from class_labels instead of repeating the list literally).
predicted_toxic_df[['id'] + class_labels].head()

In [None]:
# Write the submission file: id first, then the labels in class_labels order
# (derived from class_labels instead of repeating the list literally).
predicted_toxic_df[['id'] + class_labels].to_csv('toxic-challenge-autogluon.csv', index=False)

In [21]:
ls -lh

total 836M
-rw-r--r-- 1 jupyter jupyter 1.9K Jul 30 06:57 LightGBM_compilation.log
-rw-r--r-- 1 jupyter jupyter 1.1M Jul 30 06:44 Untitled.ipynb
-rw-r--r-- 1 jupyter jupyter  37K Jul 30 07:49 Untitled1.ipynb
drwxr-xr-x 4 jupyter jupyter 4.0K Jul 29 13:30 [0m[01;34magModels-predictClass[0m/
-rw-r--r-- 1 jupyter jupyter  14K Aug  5 12:56 collect-gcp-vertex-automl-batch-predictions.ipynb
drwxr-xr-x 3 jupyter jupyter 4.0K Aug  5 17:43 [01;34mdata[0m/
-rw-r--r-- 1 jupyter jupyter  22K Aug 16 14:30 jigsaw-toxic-comments-challenge-autogluon-multilabel.ipynb
-rw-r--r-- 1 jupyter jupyter  25K Aug  5 19:37 jigsaw-toxic-comments-challenge-autogluon.ipynb
-rw-r--r-- 1 jupyter jupyter 2.6M Dec 11  2019 sample_submission.csv
-rw-r--r-- 1 jupyter jupyter 251M Jul 29 13:15 santander-customer-transaction-prediction.zip
drwxr-xr-x 4 jupyter jupyter 4.0K Jul 29 13:49 [01;34msantander-models[0m/
drwxr-xr-x 4 jupyter jupyter 4.0K Jul 30 07:18 [01;34msantander-models-2[0m/
drwxr-xr-x 4 jupyter jupy