In [None]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it can see (empty when none).
device_name = tf.test.gpu_device_name()

# Guard clause: anything other than the first GPU's canonical name aborts the run.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
import torch

# Decide once whether CUDA is usable, then derive the device from that flag.
gpu_present = torch.cuda.is_available()
device = torch.device("cuda" if gpu_present else "cpu")

if not gpu_present:
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs exist and the name of the one at index 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [None]:
%%writefile setup.sh
# Build and install NVIDIA apex (with its C++ and CUDA extensions) from source.
# Stop at the first failing command instead of continuing with a broken checkout.
set -e

# Re-running the notebook must not fail just because ./apex was already cloned.
[ -d apex ] || git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

Overwriting setup.sh


In [None]:
%%bash 
# Execute the apex build script that the previous cell wrote to setup.sh.
sh setup.sh

In [None]:
%%bash
# NOTE(review): fast-bert is installed unpinned, so a later re-run may pull a
# different release than the one that produced the recorded outputs - consider pinning.
pip install fast-bert

In [None]:
import torch
# Release cached, currently unused GPU memory held by PyTorch's CUDA allocator.
torch.cuda.empty_cache()

In [None]:
from transformers import BertTokenizer
from pathlib import Path
import torch

from box import Box
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
import apex
from sklearn.model_selection import train_test_split

import datetime

from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner
#from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc, F1

In [None]:
# Show full cell contents when displaying DataFrames. `None` is the supported
# spelling for "no limit"; the former `-1` is deprecated and rejected by newer pandas.
pd.set_option('display.max_colwidth', None)

# Timestamp identifying this run; used later to build the log file name.
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

  """Entry point for launching an IPython kernel.


In [None]:
# Input locations: CSV splits and the label definition files.
DATA_PATH = Path('./data/')
LABEL_PATH = Path('./labels/')

# Output locations: saved models and run logs.
MODEL_PATH=Path('./models/')
LOG_PATH=Path('./logs/')

# Pretrained BERT weights directory referenced by the run configuration below.
# It was never defined in this notebook (NameError on a fresh kernel); the value
# is the one recorded in the logged configuration of the original run.
BERT_PRETRAINED_PATH = Path('./bert_models/pretrained-weights/uncased_L-12_H-768_A-12')

# No pre-loaded state dict and no fine-tuned weights: training starts from the
# published pretrained checkpoint.
model_state_dict = None
FINETUNED_PATH = None

# Create the output directories up front so later cells can write into them.
MODEL_PATH.mkdir(exist_ok=True)
LOG_PATH.mkdir(exist_ok=True)

OUTPUT_PATH = MODEL_PATH/'output'
OUTPUT_PATH.mkdir(exist_ok=True)

In [None]:
# Run configuration. The original literal repeated the keys "no_cuda", "seed"
# and "task_name"; in a dict literal the last value wins, so the effective
# values (no_cuda=False, seed=42, task_name='intent') are kept here exactly once,
# at the position of their first occurrence.
# `BERT_PRETRAINED_PATH` must already be defined by an earlier cell.
args = Box({
    "run_text": "multilabel Data Practices classification ",
    "train_size": -1,
    "val_size": -1,
    "log_path": LOG_PATH,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": 'intent',
    "no_cuda": False,
    "bert_model": BERT_PRETRAINED_PATH,
    "output_dir": OUTPUT_PATH,
    "max_seq_length": 512,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 8,
    "eval_batch_size": 16,
    "learning_rate": 1e-3,
    "num_train_epochs": 5,
    "warmup_proportion": 0.0,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": True,
    "fp16_opt_level": "O1",
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "warmup_steps": 500,
    "logging_steps": 50,
    "eval_all_checkpoints": True,
    "overwrite_output_dir": True,
    "overwrite_cache": False,
    "loss_scale": 128,
    "model_name": 'xlnet-base-cased',
    "model_type": 'xlnet'
})

In [None]:
import logging

# One log file per run, named after the run timestamp and the run description.
logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

# Every record goes both to the run's log file and to the notebook output.
file_handler = logging.FileHandler(logfile)
console_handler = logging.StreamHandler(sys.stdout)

logging.basicConfig(
    handlers=[file_handler, console_handler],
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
)

# Root logger; passed to the fast_bert learner further down.
logger = logging.getLogger()

In [None]:
# Record the full run configuration through the root logger (log file and stdout).
logger.info(args)

07/05/2020 23:41:25 - INFO - root -   {'run_text': 'multilabel Data Practices classification ', 'train_size': -1, 'val_size': -1, 'log_path': PosixPath('logs'), 'full_data_dir': PosixPath('data'), 'data_dir': PosixPath('data'), 'task_name': 'intent', 'no_cuda': False, 'bert_model': PosixPath('bert_models/pretrained-weights/uncased_L-12_H-768_A-12'), 'output_dir': PosixPath('models/output'), 'max_seq_length': 512, 'do_train': True, 'do_eval': True, 'do_lower_case': True, 'train_batch_size': 8, 'eval_batch_size': 16, 'learning_rate': 0.001, 'num_train_epochs': 5, 'warmup_proportion': 0.0, 'local_rank': -1, 'seed': 42, 'gradient_accumulation_steps': 1, 'optimize_on_cpu': False, 'fp16': True, 'fp16_opt_level': 'O1', 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'max_steps': -1, 'warmup_steps': 500, 'logging_steps': 50, 'eval_all_checkpoints': True, 'overwrite_output_dir': True, 'overwrite_cache': False, 'loss_scale': 128, 'model_name': 'xlnet-base-cased', 'model_type': 

In [None]:
# Use the GPU when one is present, otherwise fall back to the CPU. The original
# forced 'cuda' here, which discarded the CPU fallback chosen earlier in the notebook.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Multi-GPU mode is enabled only when more than one CUDA device is visible.
args.multi_gpu = torch.cuda.device_count() > 1

In [None]:
# Names of the 12 label columns, in the order used for the multi-hot label vector.
label_cols = [
    "First Party Collection/Use",
    "Third Party Sharing/Collection",
    "User Access Edit and Deletion",
    "Data Retention",
    "Data Security",
    "International and Specific Audiences",
    "Do Not Track",
    "Policy Change",
    "User Choice/Control",
    "Introductory/Generic",
    "Practice not covered",
    "Privacy contact information",
]


In [None]:
# Build the tokenized train/validation data for multi-label classification.
# NOTE(review): fast_bert's `test_data` may expect an iterable of texts rather
# than a file name - confirm against the installed fast_bert version.
databunch = BertDataBunch(
    args.data_dir,
    LABEL_PATH,
    args.model_name,
    train_file='train.csv',
    val_file='val.csv',
    test_data='test.csv',
    text_col="text",
    label_col=label_cols,
    batch_size_per_gpu=args.train_batch_size,
    max_seq_length=args.max_seq_length,
    multi_gpu=args.multi_gpu,
    multi_label=True,
    model_type=args.model_type,
)

07/05/2020 23:41:29 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json from cache at /root/.cache/torch/transformers/c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.69e5e35e0b798cab5e473f253752f8bf4d280ee37682281a23eed80f6e2d09c6
07/05/2020 23:41:29 - INFO - transformers.configuration_utils -   Model config XLNetConfig {
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 4,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summa

In [None]:
# Element 3 of the first training item; in the recorded run this is the
# 12-element multi-hot label tensor for that example.
databunch.train_dl.dataset[0][3]

tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
from fast_bert.metrics import roc_auc, accuracy_thresh, fbeta , accuracy_multilabel
from sklearn.metrics import classification_report,hamming_loss, accuracy_score, roc_curve, auc, roc_auc_score, f1_score, multilabel_confusion_matrix
from torch import Tensor

# Default cut-off applied to scores when binarizing predictions. It is bound as
# the default `thresh` argument of the metric functions below at definition time.
threshold = 0.5


### Metrics functions:
def Hamming_loss(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold, sample_weight = None):
    """Hamming loss (sklearn `hamming_loss`) between binarized predictions and targets.

    `y_pred` is passed through a sigmoid when `sigmoid` is true, moved to the
    CPU and binarized with a strict `> thresh` comparison before scoring.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    hard_preds = (scores.cpu() > thresh).float()
    targets = y_true.cpu()
    return hamming_loss(targets, hard_preds, sample_weight=sample_weight)

def Exact_Match_Ratio(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold, normalize:bool = True, sample_weight = None):
    """Score binarized predictions against the targets with sklearn `accuracy_score`.

    `normalize` and `sample_weight` are forwarded to `accuracy_score` unchanged.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    hard_preds = (scores.cpu() > thresh).float()
    targets = y_true.cpu()
    return accuracy_score(targets, hard_preds, normalize=normalize, sample_weight=sample_weight)

def roc_auc_score_macro(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, average = 'macro', sample_weight = None):
    """ROC AUC (sklearn `roc_auc_score`) of the un-thresholded scores.

    `average` defaults to 'macro'; the micro and per-label variants in this
    notebook call this function with a different `average`.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    scores = scores.cpu()
    targets = y_true.cpu()
    return roc_auc_score(targets, scores, average=average, sample_weight=sample_weight)

def roc_auc_score_micro(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True):
    """Micro-averaged ROC AUC; delegates to `roc_auc_score_macro` with average='micro'."""
    # Tensors are moved to the CPU before delegating, so the sigmoid runs on the CPU copy.
    return roc_auc_score_macro(y_pred.cpu(), y_true.cpu(), sigmoid=sigmoid, average='micro')

def roc_auc_score_by_label(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True):
    """Per-label ROC AUC; delegates to `roc_auc_score_macro` with average=None."""
    # Tensors are moved to the CPU before delegating, so the sigmoid runs on the CPU copy.
    return roc_auc_score_macro(y_pred.cpu(), y_true.cpu(), sigmoid=sigmoid, average=None)

def ROC_AUC_by_label(y_pred: Tensor, y_true: Tensor, sigmoid:bool = True, labels:list = label_cols):
    """Compute the area under the ROC curve separately for each label.

    Args:
        y_pred: model outputs, indexed as [:, i] for label i.
        y_true: targets, indexed the same way.
        sigmoid: apply a sigmoid to `y_pred` first.
        labels: label names; column i of both tensors is reported under labels[i].

    Returns:
        dict mapping each name in `labels` to its AUC.
    """
    if sigmoid:
        y_pred = y_pred.sigmoid()
    y_pred = y_pred.cpu()
    y_true = y_true.cpu()

    auc_by_label = {}
    # Key by the `labels` argument (the original keyed by the global `label_cols`,
    # silently ignoring a caller-supplied list).
    for i, label in enumerate(labels):
        fpr, tpr, _ = roc_curve(y_true[:, i], y_pred[:, i])
        auc_by_label[label] = auc(fpr, tpr)
    return auc_by_label

def F1(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, threshold:float = threshold):
    """F1 score computed by fast_bert's `fbeta` with beta=1.

    Both tensors are moved to the CPU first; `sigmoid` and `threshold` are
    forwarded to `fbeta` (the latter as its `thresh` argument).
    """
    cpu_pred = y_pred.cpu()
    cpu_true = y_true.cpu()
    return fbeta(cpu_pred, cpu_true, sigmoid=sigmoid, thresh=threshold, beta=1)

def F1_macro(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold, average = 'macro', sample_weight = None):
    """F1 (sklearn `f1_score`) of predictions binarized with `> thresh`.

    `average` defaults to 'macro'; `F1_micro` calls this with average='micro'.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    hard_preds = (scores.cpu() > thresh).float()
    targets = y_true.cpu()
    return f1_score(targets, hard_preds, average=average, sample_weight=sample_weight)

def F1_micro(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True):
    """Micro-averaged F1; delegates to `F1_macro` with average='micro'."""
    # Tensors are moved to the CPU before delegating, so the sigmoid runs on the CPU copy.
    return F1_macro(y_pred.cpu(), y_true.cpu(), sigmoid=sigmoid, average='micro')

def F1_by_label(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold, sample_weight = None, labels:list = label_cols):
    """Per-label F1 scores (sklearn `f1_score` with average=None).

    Args:
        y_pred: model outputs; passed through a sigmoid when `sigmoid` is true.
        y_true: targets.
        thresh: scores strictly above this value count as predicted.
        sample_weight: forwarded to `f1_score` (the original accepted it but
            never passed it on).
        labels: kept for signature compatibility; not used by the computation.

    Returns:
        Whatever `f1_score(..., average=None)` returns: one score per label.
    """
    if sigmoid:
        y_pred = y_pred.sigmoid()
    y_pred = y_pred.cpu()
    y_true = y_true.cpu()
    y_pred = (y_pred > thresh).float()
    return f1_score(y_true, y_pred, average=None, sample_weight=sample_weight)

def cls_report(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold,labels:list = label_cols):
    """Text classification report (sklearn `classification_report`) per label.

    Args:
        y_pred: model outputs; passed through a sigmoid when `sigmoid` is true.
        y_true: targets.
        thresh: scores strictly above this value count as predicted.
        labels: display names for the report rows. The original ignored this
            argument and always used the global `label_cols`.
    """
    if sigmoid:
        y_pred = y_pred.sigmoid()
    y_pred = y_pred.cpu()
    y_true = y_true.cpu()
    y_pred = (y_pred > thresh).float()
    return classification_report(y_true, y_pred, target_names=labels)

def accuracy_by_label(y_pred: Tensor, y_true: Tensor, sigmoid:bool = True, thresh:float = threshold, normalize:bool = True, sample_weight = None, labels:list = label_cols):
    """Accuracy (sklearn `accuracy_score`) computed separately for each label column.

    Args:
        y_pred: model outputs, indexed as [:, i] for label i.
        y_true: targets, indexed the same way.
        sigmoid: apply a sigmoid to `y_pred` first.
        thresh: scores strictly above this value count as predicted.
        normalize, sample_weight: forwarded to `accuracy_score`.
        labels: label names; column i is reported under labels[i]. The original
            iterated over `labels` but keyed the result by the global `label_cols`.

    Returns:
        dict mapping each name in `labels` to its accuracy.
    """
    if sigmoid:
        y_pred = y_pred.sigmoid()
    y_pred = y_pred.cpu()
    y_true = y_true.cpu()
    y_pred = (y_pred > thresh).float()
    return {
        label: accuracy_score(y_true[:, i], y_pred[:, i], normalize=normalize, sample_weight=sample_weight)
        for i, label in enumerate(labels)
    }

def confusion_matrix_by_label(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold, sample_weight = None, samplewise = False, labels:list = label_cols):
    """Confusion matrices from sklearn `multilabel_confusion_matrix`.

    Predictions are binarized with `> thresh`; the label indices passed to
    sklearn are 0 .. len(labels) - 1.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    hard_preds = (scores.cpu() > thresh).float()
    targets = y_true.cpu()
    label_indices = list(range(len(labels)))
    return multilabel_confusion_matrix(
        targets,
        hard_preds,
        labels=label_indices,
        sample_weight=sample_weight,
        samplewise=samplewise,
    )

In [None]:
# Metrics handed to the learner, as {'name', 'function'} records in reporting order.
metrics = [
    {'name': 'F1_macro', 'function': F1_macro},
    {'name': 'F1_micro', 'function': F1_micro},
    {'name': 'roc_auc_score_macro', 'function': roc_auc_score_macro},
    {'name': 'roc_auc_score_micro', 'function': roc_auc_score_micro},
    {'name': 'F1_by_label', 'function': F1_by_label},
    {'name': 'cls_report', 'function': cls_report},
]

In [None]:
# Create the learner from the pretrained checkpoint named in the run configuration.
# NOTE(review): `logging_steps=0` is passed here although the configuration
# defines logging_steps=50 - confirm that this is intentional.
learner = BertLearner.from_pretrained_model(
    databunch,
    args.model_name,
    metrics=metrics,
    device=device,
    logger=logger,
    output_dir=args.output_dir,
    finetuned_wgts_path=FINETUNED_PATH,
    warmup_steps=args.warmup_steps,
    multi_gpu=args.multi_gpu,
    is_fp16=args.fp16,
    multi_label=True,
    logging_steps=0,
)

07/05/2020 23:41:40 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json from cache at /root/.cache/torch/transformers/c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.69e5e35e0b798cab5e473f253752f8bf4d280ee37682281a23eed80f6e2d09c6
07/05/2020 23:41:40 - INFO - transformers.configuration_utils -   Model config XLNetConfig {
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11"
  },
  "initializer_range": 0.0

In [None]:
# Train for the configured number of epochs at the configured learning rate,
# validating as part of the run (the recorded log reports metrics per epoch).
learner.fit(
    args.num_train_epochs,
    args.learning_rate,
    validate=True,
    schedule_type="warmup_cosine",
    optimizer_type="lamb",
)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
07/05/2020 22:04:46 - INFO - root -   ***** Running training *****
07/05/2020 22:04:46 - INFO - root -     Num examples = 2185
07/05/2020 22:04:46 - INFO - root -     Num Epochs = 5
07/05/2020 22:04:46 - INFO - root -     Total train batch size (w. parallel, distributed & accumulation) = 8
07/05/2020 22:04:46 - 

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


07/05/2020 22:09:20 - INFO - root -   Running evaluation
07/05/2020 22:09:20 - INFO - root -     Num examples = 550
07/05/2020 22:09:20 - INFO - root -     Batch size = 16


07/05/2020 22:09:50 - INFO - root -   eval_loss after epoch 1: 0.1701027369924954: 
07/05/2020 22:09:50 - INFO - root -   eval_F1_macro after epoch 1: 0.3787892329552817: 
07/05/2020 22:09:50 - INFO - root -   eval_F1_micro after epoch 1: 0.6666666666666667: 
07/05/2020 22:09:50 - INFO - root -   eval_roc_auc_score_macro after epoch 1: 0.8362272938537495: 
07/05/2020 22:09:50 - INFO - root -   eval_roc_auc_score_micro after epoch 1: 0.927212446779462: 
07/05/2020 22:09:50 - INFO - root -   eval_F1_by_label after epoch 1: [0.79268293 0.84242424 0.         0.         0.70833333 0.33333333
 0.         0.38709677 0.31578947 0.5106383  0.         0.65517241]: 
07/05/2020 22:09:50 - INFO - root -   eval_cls_report after epoch 1:                                       precision    recall  f1-score   support

          First Party Collection/Use       0.85      0.74      0.79       175
      Third Party Sharing/Collection       0.81      0.88      0.84       158
       User Access Edit and Dele

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


07/05/2020 22:14:26 - INFO - root -   Running evaluation
07/05/2020 22:14:26 - INFO - root -     Num examples = 550
07/05/2020 22:14:26 - INFO - root -     Batch size = 16


07/05/2020 22:14:55 - INFO - root -   eval_loss after epoch 2: 0.1095046137060438: 
07/05/2020 22:14:55 - INFO - root -   eval_F1_macro after epoch 2: 0.5963099845837297: 
07/05/2020 22:14:55 - INFO - root -   eval_F1_micro after epoch 2: 0.7928994082840237: 
07/05/2020 22:14:55 - INFO - root -   eval_roc_auc_score_macro after epoch 2: 0.9355298181204015: 
07/05/2020 22:14:55 - INFO - root -   eval_roc_auc_score_micro after epoch 2: 0.9670652474697746: 
07/05/2020 22:14:55 - INFO - root -   eval_F1_by_label after epoch 2: [0.81619938 0.88957055 0.61111111 0.         0.73469388 0.90909091
 0.         0.86363636 0.7816092  0.72222222 0.         0.82758621]: 
07/05/2020 22:14:55 - INFO - root -   eval_cls_report after epoch 2:                                       precision    recall  f1-score   support

          First Party Collection/Use       0.90      0.75      0.82       175
      Third Party Sharing/Collection       0.86      0.92      0.89       158
       User Access Edit and Del

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


07/05/2020 22:19:30 - INFO - root -   Running evaluation
07/05/2020 22:19:30 - INFO - root -     Num examples = 550
07/05/2020 22:19:30 - INFO - root -     Batch size = 16


07/05/2020 22:20:00 - INFO - root -   eval_loss after epoch 3: 0.09782147694911275: 
07/05/2020 22:20:00 - INFO - root -   eval_F1_macro after epoch 3: 0.715844448785024: 
07/05/2020 22:20:00 - INFO - root -   eval_F1_micro after epoch 3: 0.8225674570727719: 
07/05/2020 22:20:00 - INFO - root -   eval_roc_auc_score_macro after epoch 3: 0.9563245462151985: 
07/05/2020 22:20:00 - INFO - root -   eval_roc_auc_score_micro after epoch 3: 0.9753796025309714: 
07/05/2020 22:20:00 - INFO - root -   eval_F1_by_label after epoch 3: [0.86904762 0.84067797 0.88372093 0.44444444 0.88135593 0.94736842
 0.5        0.8627451  0.81188119 0.76923077 0.         0.77966102]: 
07/05/2020 22:20:00 - INFO - root -   eval_cls_report after epoch 3:                                       precision    recall  f1-score   support

          First Party Collection/Use       0.91      0.83      0.87       175
      Third Party Sharing/Collection       0.91      0.78      0.84       158
       User Access Edit and Del

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


07/05/2020 22:20:00 - INFO - root -   Running evaluation
07/05/2020 22:20:00 - INFO - root -     Num examples = 550
07/05/2020 22:20:00 - INFO - root -     Batch size = 16


07/05/2020 22:24:17 - INFO - root -   eval_loss after epoch 4: 0.08811815359762737: 
07/05/2020 22:24:17 - INFO - root -   eval_F1_macro after epoch 4: 0.8017567785434433: 
07/05/2020 22:24:17 - INFO - root -   eval_F1_micro after epoch 4: 0.8489208633093525: 
07/05/2020 22:24:17 - INFO - root -   eval_roc_auc_score_macro after epoch 4: 0.966414074111848: 
07/05/2020 22:24:17 - INFO - root -   eval_roc_auc_score_micro after epoch 4: 0.9801618914899237: 
07/05/2020 22:24:17 - INFO - root -   eval_F1_by_label after epoch 4: [0.88439306 0.86503067 0.81818182 0.6        0.87719298 0.93975904
 0.8        0.89361702 0.84444444 0.796875   0.44444444 0.85714286]: 
07/05/2020 22:24:17 - INFO - root -   eval_cls_report after epoch 4:                                       precision    recall  f1-score   support

          First Party Collection/Use       0.89      0.87      0.88       175
      Third Party Sharing/Collection       0.84      0.89      0.87       158
       User Access Edit and Del

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


07/05/2020 22:24:57 - INFO - root -   Running evaluation
07/05/2020 22:24:57 - INFO - root -     Num examples = 550
07/05/2020 22:24:57 - INFO - root -     Batch size = 16


07/05/2020 22:28:04 - INFO - root -   eval_loss after epoch 5: 0.08510403476123299: 
07/05/2020 22:28:04 - INFO - root -   eval_F1_macro after epoch 5: 0.8169094358821195: 
07/05/2020 22:28:04 - INFO - root -   eval_F1_micro after epoch 5: 0.8621236133122028: 
07/05/2020 22:28:04 - INFO - root -   eval_roc_auc_score_macro after epoch 5: 0.9686897098043082: 
07/05/2020 22:28:04 - INFO - root -   eval_roc_auc_score_micro after epoch 5: 0.9820359822280121: 
07/05/2020 22:28:04 - INFO - root -   eval_F1_by_label after epoch 5: [0.89918256 0.89808917 0.8        0.75       0.86666667 0.98765432
 0.8        0.89361702 0.81395349 0.78125    0.5        0.8125    ]: 
07/05/2020 22:28:04 - INFO - root -   eval_cls_report after epoch 5:                                       precision    recall  f1-score   support

          First Party Collection/Use       0.86      0.94      0.90       175
      Third Party Sharing/Collection       0.90      0.89      0.90       158
       User Access Edit and De

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


07/05/2020 22:28:37  - INFO - root -   Running evaluation
07/05/2020 22:28:37 - INFO - root -     Num examples = 550
07/05/2020 22:28:37 - INFO - root -     Batch size = 16


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(1370, 0.13800626009240421)

In [None]:
# Run evaluation with the trained model; the recorded output is a dict of
# metric name -> value that also includes the loss.
learner.validate()

{'F1_macro': 0.8223960148166968,
 'F1_micro': 0.8639618138424819,
 'loss': 0.08499825160418238,
 'roc_auc_score_macro': 0.9704352206753627,
 'roc_auc_score_micro': 0.9828776671791827}

In [None]:
# Persist the model; in the recorded run the config and weights were written
# under models/output/model_out/.
learner.save_model()

07/05/2020 23:43:36 - INFO - transformers.configuration_utils -   Configuration saved in models/output/model_out/config.json
07/05/2020 23:43:37 - INFO - transformers.modeling_utils -   Model weights saved in models/output/model_out/pytorch_model.bin


In [None]:
import textwrap

# Column limit used when pretty-printing long policy paragraphs.
WRAP_WIDTH = 100

# Reusable wrapper that folds text to WRAP_WIDTH characters per line.
wrapper = textwrap.TextWrapper(width=WRAP_WIDTH)

In [None]:
# Show one test paragraph and the model's prediction for that SAME paragraph.
# The original printed row 20 but predicted on row 21, so the displayed text
# and the displayed scores belonged to different samples; it also parsed the
# CSV twice.
sample_idx = 20
test_texts = list(pd.read_csv('./data/test.csv')['text'].values)
sample_text = test_texts[sample_idx]

print(wrapper.fill(sample_text))
print()
learner.predict_batch([sample_text])

What Are Your California Privacy Rights? Our Privacy Policy describes how we share information for
marketing purposes. The Policy and rights apply to all customers, including California residents.

07/05/2020 21:44:46 - INFO - root -   Writing example 0 of 1


[[('First Party Collection/Use', 0.9912109375),
  ('Third Party Sharing/Collection', 0.00867462158203125),
  ('Introductory/Generic', 0.00815582275390625),
  ('User Choice/Control', 0.004573822021484375),
  ('Practice not covered', 0.003482818603515625),
  ('Data Retention', 0.0021648406982421875),
  ('User Access Edit and Deletion', 0.0018463134765625),
  ('Data Security', 0.001628875732421875),
  ('Privacy contact information', 0.0014781951904296875),
  ('Policy Change', 0.0012998580932617188),
  ('International and Specific Audiences', 0.0011606216430664062),
  ('Do Not Track', 0.0008492469787597656)]]

In [None]:
from fast_bert.prediction import BertClassificationPredictor

# NOTE(review): `PATH` is not referenced by any cell visible in this notebook.
PATH = Path("./models/")
# Directory the trained model was saved to by the learner (see the save log above).
OUT_PATH = "./models/output/model_out/"

# Reload the saved model for inference, independently of the in-memory learner.
predictor = BertClassificationPredictor(
    OUT_PATH,
    LABEL_PATH,
    multi_label=True,
    model_type=args.model_type,
)

07/05/2020 23:37:27 - INFO - transformers.configuration_utils -   loading configuration file ./models/output/model_out/config.json
07/05/2020 23:37:27 - INFO - transformers.configuration_utils -   Model config XLNetConfig {
  "architectures": [
    "XLNetForMultiLabelSequenceClassification"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL

Examples from [Readdle Privacy Policy](https://readdle.com/privacy)

In [None]:
# Spot-check the reloaded predictor on a paragraph about how long personal data is stored.
predictor.predict_batch(["""Unless no shorter storage period is indicated in this privacy policy, we, 
                        in general, store Personal Data as long (i) as required for the provision of the Services to you, 
                        and/or (ii) as it is necessary with regard to the contractual relationship with you, thereafter only 
                        if and to the extent that we are obliged to do so by mandatory statutory retention obligations. 
                        If we no longer require the respective Personal Data for the purposes described above, such Personal 
                        Data will only be stored during the respective legal retention period and not processed for other purposes."""])

07/05/2020 23:37:33 - INFO - root -   Writing example 0 of 1


[[('Data Retention', 0.971166729927063),
  ('First Party Collection/Use', 0.098568394780159),
  ('International and Specific Audiences', 0.070105642080307),
  ('User Access Edit and Deletion', 0.065472811460495),
  ('Third Party Sharing/Collection', 0.057098206132650375),
  ('Practice not covered', 0.05584406852722168),
  ('Do Not Track', 0.0371263362467289),
  ('Data Security', 0.03128070756793022),
  ('Privacy contact information', 0.02200254611670971),
  ('User Choice/Control', 0.01853892020881176),
  ('Policy Change', 0.009368144907057285),
  ('Introductory/Generic', 0.004830758087337017)]]

In [None]:
# Spot-check the reloaded predictor on a paragraph about security precautions.
predictor.predict_batch(["""We have taken extensive technical and operational precautions to protect the Personal Data retained 
                          by us against unauthorized access and misuse.
                           Our security procedures are revised regularly and adapted to reflect technological progress."""])

07/05/2020 23:37:33 - INFO - root -   Writing example 0 of 1


[[('Data Security', 0.9835371375083923),
  ('Data Retention', 0.018327999860048294),
  ('First Party Collection/Use', 0.01700400561094284),
  ('Introductory/Generic', 0.012382513843476772),
  ('Third Party Sharing/Collection', 0.010146232321858406),
  ('User Choice/Control', 0.00648467754945159),
  ('Practice not covered', 0.006338837556540966),
  ('Privacy contact information', 0.00616452656686306),
  ('User Access Edit and Deletion', 0.005124018527567387),
  ('International and Specific Audiences', 0.004654853604733944),
  ('Policy Change', 0.0038991225883364677),
  ('Do Not Track', 0.0030028994660824537)]]

In [None]:
# Ground-truth labels for the test split: load only the 12 label columns.
data_true = pd.read_csv(
    './data/test.csv',
    usecols=[
        'First Party Collection/Use',
        'Third Party Sharing/Collection',
        'User Access Edit and Deletion',
        'Data Retention',
        'Data Security',
        'International and Specific Audiences',
        'Do Not Track',
        'Policy Change',
        'User Choice/Control',
        'Introductory/Generic',
        'Practice not covered',
        'Privacy contact information',
    ],
)

In [None]:
# Re-derive the label names from the columns actually present in the test frame.
label_cols = data_true.columns.tolist()
num_labels = len(label_cols)
print('Label columns: ', num_labels ," :", label_cols)

Label columns:  12  : ['First Party Collection/Use', 'Third Party Sharing/Collection', 'User Access Edit and Deletion', 'Data Retention', 'Data Security', 'International and Specific Audiences', 'Do Not Track', 'Policy Change', 'User Choice/Control', 'Introductory/Generic', 'Practice not covered', 'Privacy contact information']


In [None]:
# Pack each row's label columns into one array so a row carries its full label vector.
true_label_matrix = data_true[label_cols].to_numpy()
data_true['one_hot_labels'] = list(true_label_matrix)
data_true.head()

Unnamed: 0,First Party Collection/Use,Third Party Sharing/Collection,User Access Edit and Deletion,Data Retention,Data Security,International and Specific Audiences,Do Not Track,Policy Change,User Choice/Control,Introductory/Generic,Practice not covered,Privacy contact information,one_hot_labels
0,0,0,0,0,0,0,0,0,0,1,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
1,0,0,0,1,0,0,0,0,0,0,0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
2,0,0,0,0,0,0,0,0,0,1,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
3,0,0,0,0,0,0,0,0,0,1,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
4,0,0,0,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"


In [None]:
# Score every text in the test split with the reloaded predictor.
test_frame = pd.read_csv("./data/test.csv")
all_test_texts = list(test_frame['text'].values)
output = predictor.predict_batch(all_test_texts)

07/05/2020 23:37:33 - INFO - root -   Writing example 0 of 697


In [None]:
# Inspect the first prediction: in the recorded output it is a list of
# (label, score) pairs ordered from highest to lowest score.
output[0]

[('Policy Change', 0.9675731658935547),
 ('Introductory/Generic', 0.10871566087007523),
 ('International and Specific Audiences', 0.014690202660858631),
 ('First Party Collection/Use', 0.014292731881141663),
 ('Privacy contact information', 0.010994847863912582),
 ('User Access Edit and Deletion', 0.009102728217840195),
 ('Practice not covered', 0.008714914321899414),
 ('Third Party Sharing/Collection', 0.007862123660743237),
 ('User Choice/Control', 0.005560227204114199),
 ('Do Not Track', 0.005237790755927563),
 ('Data Retention', 0.0034489822573959827),
 ('Data Security', 0.0029655008111149073)]

In [None]:
# Persist the raw predictions to CSV (one row per element of `output`).
pd.DataFrame(output).to_csv('./data/output_xlnet.csv')

In [None]:
# Reload the predictions that were just written.
# NOTE(review): `results` is not used by any later cell visible in this notebook.
results = pd.read_csv('./data/output_xlnet.csv')

In [None]:
def apply_tresh(y_pred, thresh=0.5):
  """Binarize a single score.

  Args:
    y_pred: predicted score for one label.
    thresh: cut-off; defaults to 0.5, the value that was previously hard-coded.

  Returns:
    1.0 when `y_pred` is strictly greater than `thresh`, otherwise 0.0.
  """
  return float(y_pred > thresh)

In [None]:
# One row per test text: map each label name to its thresholded (0.0 / 1.0) prediction.
binarized_rows = [
    {label: apply_tresh(score) for label, score in pred}
    for pred in output
]
preds = pd.DataFrame(binarized_rows)

In [None]:
# Put the prediction columns in the same order as the ground-truth label columns.
preds = preds.loc[:, label_cols]
preds.head()

Unnamed: 0,First Party Collection/Use,Third Party Sharing/Collection,User Access Edit and Deletion,Data Retention,Data Security,International and Specific Audiences,Do Not Track,Policy Change,User Choice/Control,Introductory/Generic,Practice not covered,Privacy contact information
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Report the columns of the prediction frame itself. The original computed
# `pred_label_cols` but then counted and printed `label_cols`, so the message
# could not reveal a mismatch between the two.
pred_label_cols = list(preds.columns)
num_labels = len(pred_label_cols)
print('Label columns: ', num_labels ," :", pred_label_cols)

Label columns:  12  : ['First Party Collection/Use', 'Third Party Sharing/Collection', 'User Access Edit and Deletion', 'Data Retention', 'Data Security', 'International and Specific Audiences', 'Do Not Track', 'Policy Change', 'User Choice/Control', 'Introductory/Generic', 'Practice not covered', 'Privacy contact information']


In [None]:
# Pack each row's predicted label columns into one array, mirroring the ground-truth frame.
pred_label_matrix = preds[pred_label_cols].to_numpy()
preds['one_hot_labels'] = list(pred_label_matrix)
preds.head()

Unnamed: 0,First Party Collection/Use,Third Party Sharing/Collection,User Access Edit and Deletion,Data Retention,Data Security,International and Specific Audiences,Do Not Track,Policy Change,User Choice/Control,Introductory/Generic,Practice not covered,Privacy contact information,one_hot_labels
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]"


In [None]:
from sklearn.metrics import classification_report

# One label vector per test text, for the ground truth and for the predictions.
y_true = data_true['one_hot_labels'].tolist()
y_pred = preds['one_hot_labels'].tolist()

# Per-label precision / recall / F1 on the test split.
clf_report = classification_report(y_true, y_pred, target_names=label_cols)
print(clf_report)

                                      precision    recall  f1-score   support

          First Party Collection/Use       0.85      0.92      0.88       175
      Third Party Sharing/Collection       0.88      0.90      0.89       158
       User Access Edit and Deletion       0.79      0.90      0.84        21
                      Data Retention       0.69      0.79      0.73        14
                       Data Security       0.93      0.87      0.90        31
International and Specific Audiences       0.91      1.00      0.95        40
                        Do Not Track       1.00      0.67      0.80         6
                       Policy Change       0.87      0.80      0.83        25
                 User Choice/Control       0.88      0.75      0.81        48
                Introductory/Generic       0.88      0.70      0.78        70
                Practice not covered       0.62      0.48      0.54        21
         Privacy contact information       0.82      0.84      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
