In [None]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it can see.
device_name = tf.test.gpu_device_name()

# Stop right away unless the reported device is the first GPU.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
import torch

# Pick the compute device: the CPU is the fallback when CUDA is unavailable.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report which one was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [None]:
%%writefile setup.sh
# Clone NVIDIA apex and install it with its C++ and CUDA extensions.
# Abort on the first failing command so a failed clone never falls through
# to a pip install of a missing or half-written directory.
set -e

# Re-running the notebook must not fail just because ./apex already exists.
if [ ! -d apex ]; then
    git clone https://github.com/NVIDIA/apex
fi
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

Writing setup.sh


In [None]:
%%bash 
# Execute the setup script written by the %%writefile cell (clones and installs apex).
sh setup.sh

In [None]:
import torch
# Release cached GPU memory that PyTorch is holding but not currently using.
torch.cuda.empty_cache()

In [None]:
from transformers import BertTokenizer
from pathlib import Path
import torch

from box import Box
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
import apex
from sklearn.model_selection import train_test_split

import datetime

from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner
#from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc, F1

In [None]:
# Show full cell contents when displaying DataFrames. `None` means "no limit";
# the former sentinel -1 is deprecated since pandas 1.0 and is no longer
# accepted by newer releases.
pd.set_option('display.max_colwidth', None)

# Timestamp identifying this run; it becomes part of the log file name.
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

  """Entry point for launching an IPython kernel.


In [None]:
# Input locations: DATA_PATH is handed to the data bunch as the data directory,
# LABEL_PATH as the label directory.
DATA_PATH = Path('./data/')
LABEL_PATH = Path('./labels/')

# Output locations for model artefacts and log files.
MODEL_PATH = Path('./models/')
LOG_PATH = Path('./logs/')
OUTPUT_PATH = MODEL_PATH/'output'

# Referenced by the `args` configuration cell but never defined anywhere in the
# notebook, so a fresh "Restart & Run All" stopped there with a NameError.
# The value is the one recorded in the logged configuration of the original run.
BERT_PRETRAINED_PATH = Path('./bert_models/pretrained-weights/uncased_L-12_H-768_A-12/')

# Not referenced elsewhere in the visible notebook; kept for compatibility.
model_state_dict = None

# No fine-tuned weights are supplied (passed as `finetuned_wgts_path` to the learner).
FINETUNED_PATH = None

# Create every output directory up front. parents=True keeps this working even
# when an intermediate directory does not exist yet.
for _out_dir in (MODEL_PATH, LOG_PATH, OUTPUT_PATH):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Run configuration. Every key appears exactly once: the original literal
# repeated "no_cuda", "seed" and "task_name", and because the last duplicate in
# a dict literal silently wins, "task_name" was effectively 'intent' rather
# than the value written near the top. The effective values are kept unchanged.
args = Box({
    # --- run description and locations ---
    "run_text": "multilabel Data Practices classification",
    "train_size": -1,
    "val_size": -1,
    "log_path": LOG_PATH,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": 'intent',
    "no_cuda": False,
    "bert_model": BERT_PRETRAINED_PATH,
    "output_dir": OUTPUT_PATH,
    # --- data / batching ---
    "max_seq_length": 512,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 8,
    "eval_batch_size": 16,
    # --- optimisation ---
    "learning_rate": 3e-3,
    "num_train_epochs": 5,
    "warmup_proportion": 0.0,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": True,
    "fp16_opt_level": "O1",
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "warmup_steps": 500,
    # --- logging / checkpoints ---
    "logging_steps": 50,
    "eval_all_checkpoints": True,
    "overwrite_output_dir": True,
    "overwrite_cache": False,
    "loss_scale": 128,
    # --- model selection ---
    "model_name": 'xlnet-base-cased',
    "model_type": 'xlnet'
})

In [None]:
import logging

# One log file per run: its name combines the run timestamp and the run description.
logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

# Send every record both to the run's log file and to the notebook output.
log_handlers = [logging.FileHandler(logfile), logging.StreamHandler(sys.stdout)]
logging.basicConfig(
    handlers=log_handlers,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    level=logging.INFO,
)

# Root logger; it is passed to the learner so its messages use the same handlers.
logger = logging.getLogger()

In [None]:
# Record the complete run configuration in the log.
logger.info(args)

07/04/2020 22:53:26 - INFO - root -   {'run_text': 'multilabel Data Practices classification', 'train_size': -1, 'val_size': -1, 'log_path': PosixPath('logs'), 'full_data_dir': PosixPath('data'), 'data_dir': PosixPath('data'), 'task_name': 'intent', 'no_cuda': False, 'bert_model': PosixPath('bert_models/pretrained-weights/uncased_L-12_H-768_A-12'), 'output_dir': PosixPath('models/output'), 'max_seq_length': 512, 'do_train': True, 'do_eval': True, 'do_lower_case': True, 'train_batch_size': 8, 'eval_batch_size': 16, 'learning_rate': 0.003, 'num_train_epochs': 4, 'warmup_proportion': 0.0, 'local_rank': -1, 'seed': 42, 'gradient_accumulation_steps': 1, 'optimize_on_cpu': False, 'fp16': True, 'fp16_opt_level': 'O1', 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'max_steps': -1, 'warmup_steps': 500, 'logging_steps': 50, 'eval_all_checkpoints': True, 'overwrite_output_dir': True, 'overwrite_cache': False, 'loss_scale': 128, 'model_name': 'xlnet-base-cased', 'model_type': '

In [None]:
# Use the GPU when CUDA is available and fall back to the CPU otherwise.
# Requesting 'cuda' unconditionally discarded the CPU fallback chosen earlier
# in the notebook and only failed later, once tensors were moved to the device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Multi-GPU mode is enabled only when more than one CUDA device is visible.
args.multi_gpu = torch.cuda.device_count() > 1

In [None]:
# Names of the label columns, one entry per data-practice category.
label_cols = [
    'First Party Collection/Use',
    'Third Party Sharing/Collection',
    'User Access Edit and Deletion',
    'Data Retention',
    'Data Security',
    'International and Specific Audiences',
    'Do Not Track',
    'Policy Change',
    'User Choice/Control',
    'Introductory/Generic',
    'Practice not covered',
    'Privacy contact information',
]

In [None]:
# Build the fast_bert data bunch for multi-label training.
# NOTE(review): `test_data` receives a file name here; confirm that
# BertDataBunch expects a path for this argument rather than a list of texts.
databunch = BertDataBunch(
    args.data_dir,
    LABEL_PATH,
    args.model_name,
    train_file='train.csv',
    val_file='val.csv',
    test_data='test.csv',
    text_col="text",
    label_col=label_cols,
    batch_size_per_gpu=args.train_batch_size,
    max_seq_length=args.max_seq_length,
    multi_gpu=args.multi_gpu,
    multi_label=True,
    model_type=args.model_type,
)

07/04/2020 22:53:32 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json from cache at /root/.cache/torch/transformers/c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.69e5e35e0b798cab5e473f253752f8bf4d280ee37682281a23eed80f6e2d09c6
07/04/2020 22:53:32 - INFO - transformers.configuration_utils -   Model config XLNetConfig {
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summa

In [None]:
from fast_bert.metrics import roc_auc, accuracy_thresh, fbeta , accuracy_multilabel
from sklearn.metrics import classification_report,hamming_loss, accuracy_score, roc_curve, auc, roc_auc_score, f1_score, multilabel_confusion_matrix
from torch import Tensor

# Default decision threshold: it is bound as the default `thresh`/`threshold`
# argument of the metric functions defined below.
threshold = 0.5


### Metrics functions:
def Hamming_loss(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold, sample_weight = None):
    """Hamming loss between thresholded predictions and the true labels.

    Args:
        y_pred: Model outputs; passed through a sigmoid first when `sigmoid` is True.
        y_true: Ground-truth label tensor.
        sigmoid: Whether to apply `sigmoid()` to `y_pred` before thresholding.
        thresh: Values strictly greater than this become 1.0, all others 0.0.
        sample_weight: Forwarded unchanged to sklearn's `hamming_loss`.

    Returns:
        The value returned by `sklearn.metrics.hamming_loss`.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    # Move to the CPU before binarising so sklearn can consume the tensors.
    binarised = (scores.cpu() > thresh).float()
    return hamming_loss(y_true.cpu(), binarised, sample_weight = sample_weight)

def Exact_Match_Ratio(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold, normalize:bool = True, sample_weight = None):
    """Accuracy score computed on whole thresholded label vectors.

    Args:
        y_pred: Model outputs; passed through a sigmoid first when `sigmoid` is True.
        y_true: Ground-truth label tensor.
        sigmoid: Whether to apply `sigmoid()` to `y_pred` before thresholding.
        thresh: Values strictly greater than this become 1.0, all others 0.0.
        normalize: Forwarded unchanged to sklearn's `accuracy_score`.
        sample_weight: Forwarded unchanged to sklearn's `accuracy_score`.

    Returns:
        The value returned by `sklearn.metrics.accuracy_score`.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    binarised = (scores.cpu() > thresh).float()
    return accuracy_score(y_true.cpu(), binarised, normalize = normalize, sample_weight = sample_weight)

def roc_auc_score_macro(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, average = 'macro', sample_weight = None):
    """ROC AUC of the (optionally sigmoid-activated) scores; macro-averaged by default.

    Args:
        y_pred: Model outputs; passed through a sigmoid first when `sigmoid` is True.
        y_true: Ground-truth label tensor.
        sigmoid: Whether to apply `sigmoid()` to `y_pred`.
        average: Averaging mode forwarded to sklearn's `roc_auc_score`.
        sample_weight: Forwarded unchanged to sklearn's `roc_auc_score`.

    Returns:
        The value returned by `sklearn.metrics.roc_auc_score`.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    return roc_auc_score(y_true.cpu(), scores.cpu(), average = average, sample_weight = sample_weight)

def roc_auc_score_micro(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True):
    """Micro-averaged ROC AUC; delegates to `roc_auc_score_macro` with average='micro'."""
    return roc_auc_score_macro(y_pred.cpu(), y_true.cpu(), sigmoid = sigmoid, average = 'micro')

def roc_auc_score_by_label(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True):
    """Un-averaged ROC AUC; delegates to `roc_auc_score_macro` with average=None."""
    return roc_auc_score_macro(y_pred.cpu(), y_true.cpu(), sigmoid = sigmoid, average = None)

def ROC_AUC_by_label(y_pred: Tensor, y_true: Tensor, sigmoid:bool = True, labels:list = label_cols):
    """Compute the area under the ROC curve separately for each label.

    Args:
        y_pred: Model outputs, indexed as `[:, i]` for label `i`; passed through
            a sigmoid first when `sigmoid` is True.
        y_true: Ground-truth labels, indexed the same way as `y_pred`.
        sigmoid: Whether to apply `sigmoid()` to `y_pred`.
        labels: Label names; entry `i` names column `i` of both tensors.

    Returns:
        dict mapping each name in `labels` to its AUC.
    """
    scores = (y_pred.sigmoid() if sigmoid else y_pred).cpu()
    targets = y_true.cpu()

    auc_by_label = {}
    # Key the result by the `labels` argument. The original iterated over
    # `labels` but looked the names up in the global `label_cols`, so a
    # caller-supplied label list was silently ignored (or raised IndexError).
    for idx, label in enumerate(labels):
        fpr, tpr, _ = roc_curve(targets[:, idx], scores[:, idx])
        auc_by_label[label] = auc(fpr, tpr)
    return auc_by_label

def F1(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, threshold:float = threshold):
    """F1 score computed by fast_bert's `fbeta` with beta=1 on CPU copies of the tensors."""
    return fbeta(y_pred.cpu(), y_true.cpu(), sigmoid = sigmoid, thresh = threshold, beta = 1)

def F1_macro(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold, average = 'macro', sample_weight = None):
    """F1 score of the thresholded predictions; macro-averaged by default.

    Args:
        y_pred: Model outputs; passed through a sigmoid first when `sigmoid` is True.
        y_true: Ground-truth label tensor.
        sigmoid: Whether to apply `sigmoid()` to `y_pred` before thresholding.
        thresh: Values strictly greater than this become 1.0, all others 0.0.
        average: Averaging mode forwarded to sklearn's `f1_score`.
        sample_weight: Forwarded unchanged to sklearn's `f1_score`.

    Returns:
        The value returned by `sklearn.metrics.f1_score`.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    binarised = (scores.cpu() > thresh).float()
    return f1_score(y_true.cpu(), binarised, average = average, sample_weight = sample_weight)

def F1_micro(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True):
    """Micro-averaged F1; delegates to `F1_macro` with average='micro' and its default threshold."""
    return F1_macro(y_pred.cpu(), y_true.cpu(), sigmoid = sigmoid, average = 'micro')

def F1_by_label(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold, sample_weight = None, labels:list = label_cols):
    """F1 score of the thresholded predictions, reported per label (average=None).

    Args:
        y_pred: Model outputs; passed through a sigmoid first when `sigmoid` is True.
        y_true: Ground-truth label tensor.
        sigmoid: Whether to apply `sigmoid()` to `y_pred` before thresholding.
        thresh: Values strictly greater than this become 1.0, all others 0.0.
        sample_weight: Forwarded to sklearn's `f1_score`.
        labels: Accepted for signature compatibility; not used by the computation.

    Returns:
        The value returned by `sklearn.metrics.f1_score` with `average=None`.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    binarised = (scores.cpu() > thresh).float()
    # `sample_weight` used to be accepted but silently dropped; forward it so
    # callers that pass weights actually get a weighted score.
    return f1_score(y_true.cpu(), binarised, average = None, sample_weight = sample_weight)

def cls_report(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold,labels:list = label_cols):
    """Build sklearn's text classification report for the thresholded predictions.

    Args:
        y_pred: Model outputs; passed through a sigmoid first when `sigmoid` is True.
        y_true: Ground-truth label tensor.
        sigmoid: Whether to apply `sigmoid()` to `y_pred` before thresholding.
        thresh: Values strictly greater than this become 1.0, all others 0.0.
        labels: Display names passed to the report as `target_names`.

    Returns:
        The value returned by `sklearn.metrics.classification_report`.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    binarised = (scores.cpu() > thresh).float()
    # Honour the `labels` argument; the original always used the global
    # `label_cols`, so a caller-supplied list of names had no effect.
    return classification_report(y_true.cpu(), binarised, target_names=labels)

def accuracy_by_label(y_pred: Tensor, y_true: Tensor, sigmoid:bool = True, thresh:float = threshold, normalize:bool = True, sample_weight = None, labels:list = label_cols):
    """Compute the accuracy of the thresholded predictions separately per label.

    Args:
        y_pred: Model outputs, indexed as `[:, i]` for label `i`; passed through
            a sigmoid first when `sigmoid` is True.
        y_true: Ground-truth labels, indexed the same way as `y_pred`.
        sigmoid: Whether to apply `sigmoid()` to `y_pred` before thresholding.
        thresh: Values strictly greater than this become 1.0, all others 0.0.
        normalize: Forwarded unchanged to sklearn's `accuracy_score`.
        sample_weight: Forwarded unchanged to sklearn's `accuracy_score`.
        labels: Label names; entry `i` names column `i` of both tensors.

    Returns:
        dict mapping each name in `labels` to its accuracy score.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    binarised = (scores.cpu() > thresh).float()
    targets = y_true.cpu()

    # Key the result by the `labels` argument rather than the global
    # `label_cols`, which the original used even when `labels` was supplied.
    return {
        label: accuracy_score(targets[:, idx], binarised[:, idx], normalize = normalize, sample_weight = sample_weight)
        for idx, label in enumerate(labels)
    }

def confusion_matrix_by_label(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold, sample_weight = None, samplewise = False, labels:list = label_cols):
    """Confusion matrices of the thresholded predictions via sklearn's `multilabel_confusion_matrix`.

    Args:
        y_pred: Model outputs; passed through a sigmoid first when `sigmoid` is True.
        y_true: Ground-truth label tensor.
        sigmoid: Whether to apply `sigmoid()` to `y_pred` before thresholding.
        thresh: Values strictly greater than this become 1.0, all others 0.0.
        sample_weight: Forwarded unchanged to sklearn.
        samplewise: Forwarded unchanged to sklearn.
        labels: Only its length is used: indices 0..len(labels)-1 are passed as sklearn's `labels`.

    Returns:
        The value returned by `sklearn.metrics.multilabel_confusion_matrix`.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    binarised = (scores.cpu() > thresh).float()
    label_indices = list(range(len(labels)))
    return multilabel_confusion_matrix(y_true.cpu(), binarised, labels = label_indices, sample_weight = sample_weight, samplewise = samplewise)

In [None]:
# Metrics handed to the learner, as {'name', 'function'} records in reporting order.
metrics = [
    {'name': metric_name, 'function': metric_fn}
    for metric_name, metric_fn in (
        ('F1_macro', F1_macro),
        ('F1_micro', F1_micro),
        ('roc_auc_score_macro', roc_auc_score_macro),
        ('roc_auc_score_micro', roc_auc_score_micro),
        ('F1_by_label', F1_by_label),
        ('cls_report', cls_report),
    )
]

In [None]:
# Build the learner around the pretrained model named in the configuration.
# NOTE(review): `logging_steps` is hard-coded to 0 here although the
# configuration carries its own logging_steps value; confirm this is intended.
learner = BertLearner.from_pretrained_model(
    databunch,
    args.model_name,
    metrics=metrics,
    device=device,
    logger=logger,
    output_dir=args.output_dir,
    finetuned_wgts_path=FINETUNED_PATH,
    warmup_steps=args.warmup_steps,
    multi_gpu=args.multi_gpu,
    is_fp16=args.fp16,
    multi_label=True,
    logging_steps=0,
)

07/04/2020 22:53:53 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json from cache at /root/.cache/torch/transformers/c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.69e5e35e0b798cab5e473f253752f8bf4d280ee37682281a23eed80f6e2d09c6
07/04/2020 22:53:53 - INFO - transformers.configuration_utils -   Model config XLNetConfig {
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11"
  },
  "initializer_range": 0.0

In [None]:
# Train for the configured number of epochs and learning rate, running
# validation as part of training (validate=True), with a warmup-cosine
# schedule and the LAMB optimizer.
learner.fit(args.num_train_epochs, args.learning_rate, validate=True,
            schedule_type="warmup_cosine", optimizer_type="lamb")

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
07/04/2020 18:08:32 - INFO - root -   ***** Running training *****
07/04/2020 18:08:32 - INFO - root -     Num examples = 2385
07/04/2020 18:08:32 - INFO - root -     Num Epochs = 5
07/04/2020 18:08:32 - INFO - root -     Total train batch size (w. parallel, distributed & accumulation) = 8
07/04/2020 18:08:32 - 

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
07/04/2020 18:13:33 - INFO - root -   Running evaluation
07/04/2020 18:13:33 - INFO - root -     Num examples = 603
07/04/2020 18:13:33 - INFO - root -     Batch size = 16


07/04/2020 18:14:06 - INFO - root -   eval_loss after epoch 1: 0.2613000261940454: 
07/04/2020 18:14:06 - INFO - root -   eval_F1_macro after epoch 1: 0.4595029996501489: 
07/04/2020 18:14:06 - INFO - root -   eval_F1_micro after epoch 1: 0.6017316017316018: 
07/04/2020 18:14:06 - INFO - root -   eval_roc_auc_score_macro after epoch 1: 0.8851981081775055: 
07/04/2020 18:14:06 - INFO - root -   eval_roc_auc_score_micro after epoch 1: 0.8945043926996347: 
07/04/2020 18:14:06 - INFO - root -   lr after epoch 1: 0.001196
07/04/2020 18:14:06 - INFO - root -   train_loss after epoch 1: 0.35565310477413065
07/04/2020 18:14:06 - INFO - root -   





07/04/2020 18:19:08 - INFO - root -   Running evaluation
07/04/2020 18:19:08 - INFO - root -     Num examples = 603
07/04/2020 18:19:08 - INFO - root -     Batch size = 16


07/04/2020 18:19:40 - INFO - root -   eval_loss after epoch 2: 0.22174918337872154: 
07/04/2020 18:19:40 - INFO - root -   eval_F1_macro after epoch 2: 0.536723172210125: 
07/04/2020 18:19:40 - INFO - root -   eval_F1_micro after epoch 2: 0.6674132138857783: 
07/04/2020 18:19:40 - INFO - root -   eval_roc_auc_score_macro after epoch 2: 0.8993081140541799: 
07/04/2020 18:19:40 - INFO - root -   eval_roc_auc_score_micro after epoch 2: 0.9234169804309263: 
07/04/2020 18:19:40 - INFO - root -   lr after epoch 2: 0.0019525093704859154
07/04/2020 18:19:40 - INFO - root -   train_loss after epoch 2: 0.21585697022867842
07/04/2020 18:19:40 - INFO - root -   





07/04/2020 18:24:42 - INFO - root -   Running evaluation
07/04/2020 18:24:42 - INFO - root -     Num examples = 603
07/04/2020 18:24:42 - INFO - root -     Batch size = 16


07/04/2020 18:25:15 - INFO - root -   eval_loss after epoch 3: 0.17429929344277634: 
07/04/2020 18:25:15 - INFO - root -   eval_F1_macro after epoch 3: 0.7035194188798904: 
07/04/2020 18:25:15 - INFO - root -   eval_F1_micro after epoch 3: 0.7405405405405406: 
07/04/2020 18:25:15 - INFO - root -   eval_roc_auc_score_macro after epoch 3: 0.932148859411587: 
07/04/2020 18:25:15 - INFO - root -   eval_roc_auc_score_micro after epoch 3: 0.9529978412395693: 
07/04/2020 18:25:15 - INFO - root -   lr after epoch 3: 0.0013120182954814436
07/04/2020 18:25:15 - INFO - root -   train_loss after epoch 3: 0.17626326503761636
07/04/2020 18:25:15 - INFO - root -   





07/04/2020 18:30:16 - INFO - root -   Running evaluation
07/04/2020 18:30:16 - INFO - root -     Num examples = 603
07/04/2020 18:30:16 - INFO - root -     Batch size = 16


07/04/2020 18:30:48 - INFO - root -   eval_loss after epoch 4: 0.16266499126428052: 
07/04/2020 18:30:48 - INFO - root -   eval_F1_macro after epoch 4: 0.7502065996391788: 
07/04/2020 18:30:48 - INFO - root -   eval_F1_micro after epoch 4: 0.778894472361809: 
07/04/2020 18:30:48 - INFO - root -   eval_roc_auc_score_macro after epoch 4: 0.9455100698499819: 
07/04/2020 18:30:48 - INFO - root -   eval_roc_auc_score_micro after epoch 4: 0.9621240769176235: 
07/04/2020 18:30:48 - INFO - root -   lr after epoch 4: 0.00041349266649147654
07/04/2020 18:30:48 - INFO - root -   train_loss after epoch 4: 0.12915618216812014
07/04/2020 18:30:48 - INFO - root -   





07/04/2020 18:35:49 - INFO - root -   Running evaluation
07/04/2020 18:35:49 - INFO - root -     Num examples = 603
07/04/2020 18:35:49 - INFO - root -     Batch size = 16


07/04/2020 18:36:22 - INFO - root -   eval_loss after epoch 5: 0.16442010061521278: 
07/04/2020 18:36:22 - INFO - root -   eval_F1_macro after epoch 5: 0.772367881344099: 
07/04/2020 18:36:22 - INFO - root -   eval_F1_micro after epoch 5: 0.7799188640973631: 
07/04/2020 18:36:22 - INFO - root -   eval_roc_auc_score_macro after epoch 5: 0.9459749215176255: 
07/04/2020 18:36:22 - INFO - root -   eval_roc_auc_score_micro after epoch 5: 0.9624281419468731: 
07/04/2020 18:36:22 - INFO - root -   lr after epoch 5: 0.0
07/04/2020 18:36:22 - INFO - root -   train_loss after epoch 5: 0.08383220616715609
07/04/2020 18:36:22 - INFO - root -   





(1495, 0.1921523456751403)

In [None]:
# Evaluate the trained model once more; the returned metrics are shown as the cell output.
learner.validate()

07/04/2020 18:39:07 - INFO - root -   Running evaluation
07/04/2020 18:39:07 - INFO - root -     Num examples = 603
07/04/2020 18:39:07 - INFO - root -     Batch size = 16


{'F1_macro': 0.772367881344099,
 'F1_micro': 0.7799188640973631,
 'loss': 0.16442010061521278,
 'roc_auc_score_macro': 0.9459749215176255,
 'roc_auc_score_micro': 0.9624281419468731}

In [None]:
# Persist the fine-tuned model so it can be reloaded by the predictor below.
learner.save_model()

07/04/2020 18:40:10 - INFO - transformers.configuration_utils -   Configuration saved in models/output/model_out/config.json
07/04/2020 18:40:11 - INFO - transformers.modeling_utils -   Model weights saved in models/output/model_out/pytorch_model.bin


In [None]:
import textwrap 
 
# Line width (in characters) used when printing long policy paragraphs.
WRAP_WIDTH = 100
wrapper = textwrap.TextWrapper(width=WRAP_WIDTH)

In [None]:
# Show one test paragraph together with the model's predictions for that SAME
# paragraph. The original printed row 20 but predicted on row 21, so the
# displayed text and the displayed scores belonged to different paragraphs;
# it also parsed the CSV twice.
SAMPLE_INDEX = 20
sample_text = pd.read_csv('./data/test.csv')['text'].values[SAMPLE_INDEX]

print(wrapper.fill(sample_text))
print()
learner.predict_batch([sample_text])

Choice/Opt-Out. We offer you the opportunity to choose not to receive communications from us. If you
would like to take advantage of this opportunity, please let us know by accessing and updating your
profile. Please note that sometimes these requests may take up to ten business days to process and
that we are not responsible for removing information about you from the database of any third party
to whom we were authorized to disclose your User Information prior to processing your request. Even
if you do take advantage of this opportunity, we reserve the right to send you administrative
messages relating to the Services (e.g., about changes to this Privacy Policy) and to contact you
regarding any goods or services you have ordered.

07/04/2020 18:40:22 - INFO - root -   Writing example 0 of 1


[[('First Party Collection/Use', 0.732421875),
  ('Practice not covered', 0.369140625),
  ('Introductory/Generic', 0.11181640625),
  ('User Access Edit and Deletion', 0.033905029296875),
  ('Third Party Sharing/Collection', 0.012725830078125),
  ('Data Security', 0.006744384765625),
  ('User Choice/Control', 0.00646209716796875),
  ('Privacy contact information', 0.006313323974609375),
  ('Data Retention', 0.0027370452880859375),
  ('International and Specific Audiences', 0.00238800048828125),
  ('Policy Change', 0.0013561248779296875),
  ('Do Not Track', 0.00034880638122558594)]]

In [None]:
from fast_bert.prediction import BertClassificationPredictor

# NOTE(review): PATH is not referenced anywhere else in the visible notebook.
PATH = Path("./models/")
# Directory the saved model is loaded from (matches the save location reported in the log above).
OUT_PATH = "./models/output/model_out/"

# Stand-alone predictor that loads the saved model from disk for multi-label inference.
predictor = BertClassificationPredictor(OUT_PATH, LABEL_PATH, multi_label=True, model_type=args.model_type)

07/04/2020 19:53:27 - INFO - transformers.configuration_utils -   loading configuration file ./models/output/model_out/config.json
07/04/2020 19:53:27 - INFO - transformers.configuration_utils -   Model config XLNetConfig {
  "architectures": [
    "XLNetForMultiLabelSequenceClassification"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL

The following examples are taken from the [Readdle Privacy Policy](https://readdle.com/privacy).

In [None]:
# Score a single hand-picked paragraph; the returned (label, score) pairs are shown as the cell output.
predictor.predict_batch(["""Unless no shorter storage period is indicated in this privacy policy, we, 
                        in general, store Personal Data as long (i) as required for the provision of the Services to you, 
                        and/or (ii) as it is necessary with regard to the contractual relationship with you, thereafter only 
                        if and to the extent that we are obliged to do so by mandatory statutory retention obligations. 
                        If we no longer require the respective Personal Data for the purposes described above, such Personal 
                        Data will only be stored during the respective legal retention period and not processed for other purposes."""])

07/04/2020 18:53:25 - INFO - root -   Writing example 0 of 1


[[('Data Retention', 0.96923828125),
  ('First Party Collection/Use', 0.81640625),
  ('Practice not covered', 0.092041015625),
  ('User Access Edit and Deletion', 0.0582275390625),
  ('Third Party Sharing/Collection', 0.046722412109375),
  ('User Choice/Control', 0.040771484375),
  ('Introductory/Generic', 0.024139404296875),
  ('Policy Change', 0.0173797607421875),
  ('International and Specific Audiences', 0.014007568359375),
  ('Data Security', 0.0104522705078125),
  ('Privacy contact information', 0.00426483154296875),
  ('Do Not Track', 0.0014553070068359375)]]

In [None]:
# Predict on every text of the test split; `output` holds one list of (label, score) pairs per text.
output = predictor.predict_batch(list(pd.read_csv("./data/test.csv")['text'].values))

07/04/2020 20:43:42 - INFO - root -   Writing example 0 of 749


In [None]:
# Store the raw predictions on disk.
pd.DataFrame(output).to_csv('./data/output_xlnet.csv')

In [None]:
# Read the stored predictions back.
# NOTE(review): `results` is not used by any later cell in the visible notebook.
results = pd.read_csv('./data/output_xlnet.csv')

In [None]:
def apply_tresh(y_pred, thresh=0.5):
    """Binarise a single predicted score.

    Args:
        y_pred: Predicted score for one label.
        thresh: Decision threshold. Defaults to 0.5, the value that used to be
            hard-coded inside the function, so existing one-argument calls
            behave exactly as before.

    Returns:
        1.0 if `y_pred` is strictly greater than `thresh`, otherwise 0.0.
    """
    return float(y_pred > thresh)

In [None]:
# One row per test text, one column per label, holding the thresholded (0.0 / 1.0) prediction.
preds = pd.DataFrame([
    {label: apply_tresh(score) for label, score in pred}
    for pred in output
])

In [None]:
# Preview the first rows of the thresholded predictions.
preds.head()

Unnamed: 0,First Party Collection/Use,Introductory/Generic,Third Party Sharing/Collection,Practice not covered,Data Retention,User Choice/Control,Data Security,Policy Change,Privacy contact information,User Access Edit and Deletion,International and Specific Audiences,Do Not Track
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Column order of the prediction frame; it differs from the order of `label_cols`
# (compare the table header above with the printed list below).
pred_label_cols = list(preds.columns)
num_labels = len(label_cols)
print('Label columns: ', num_labels ," :", label_cols)

Label columns:  12  : ['First Party Collection/Use', 'Third Party Sharing/Collection', 'User Access Edit and Deletion', 'Data Retention', 'Data Security', 'International and Specific Audiences', 'Do Not Track', 'Policy Change', 'User Choice/Control', 'Introductory/Generic', 'Practice not covered', 'Privacy contact information']


In [None]:
# Collect each row's label values into one vector, ordered as in `pred_label_cols`.
preds['one_hot_labels'] = list(preds[pred_label_cols].values)
preds.head()

Unnamed: 0,First Party Collection/Use,Introductory/Generic,Third Party Sharing/Collection,Practice not covered,Data Retention,User Choice/Control,Data Security,Policy Change,Privacy contact information,User Access Edit and Deletion,International and Specific Audiences,Do Not Track,one_hot_labels
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"


In [None]:
# Load the test split again, this time for its ground-truth label columns.
data_true = pd.read_csv('./data/test.csv')

In [None]:
# Take every column from the third one onwards as a label column.
# Note that this rebinds the global `label_cols` defined earlier in the notebook.
cols = data_true.columns
label_cols = list(cols[2:])
num_labels = len(label_cols)
print('Label columns: ', num_labels ," :", label_cols)

Label columns:  12  : ['First Party Collection/Use', 'Third Party Sharing/Collection', 'User Access Edit and Deletion', 'Data Retention', 'Data Security', 'International and Specific Audiences', 'Do Not Track', 'Policy Change', 'User Choice/Control', 'Introductory/Generic', 'Practice not covered', 'Privacy contact information']


In [None]:
# Preview the first two rows of the ground-truth data.
data_true.head(2)

Unnamed: 0,id,text,First Party Collection/Use,Third Party Sharing/Collection,User Access Edit and Deletion,Data Retention,Data Security,International and Specific Audiences,Do Not Track,Policy Change,User Choice/Control,Introductory/Generic,Practice not covered,Privacy contact information
0,0,"Information that Sci-News.com May Collect Online Sci-News.com may collect and process the following data about you: - information that you provide by filling in forms on our site, including names, e-mail and website addresses; we may also ask you for information for other purposes, for example when you report a problem with our site;",1,0,0,0,0,0,0,0,0,0,0,0
1,1,"During the course of any visit to the Sci-News.com website, the pages you see, along with a cookie, are downloaded to your device. A website does this because cookies enable a publisher to find out whether the device has visited the website before. This is done on a repeat visit by checking to see, and finding, the cookie left there on the last visit.",1,0,0,0,0,0,0,0,0,0,1,0


In [None]:
# Keep only the label columns, reordered to match the prediction frame.
# The explicit .copy() makes the result an independent frame, so adding a
# column to it afterwards does not trigger pandas' SettingWithCopyWarning.
data_true = data_true[pred_label_cols].copy()
data_true.head()

Unnamed: 0,First Party Collection/Use,Introductory/Generic,Third Party Sharing/Collection,Practice not covered,Data Retention,User Choice/Control,Data Security,Policy Change,Privacy contact information,User Access Edit and Deletion,International and Specific Audiences,Do Not Track
0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,0,0


In [None]:
# Collect each row's true label values into one vector, ordered as in `pred_label_cols`.
data_true['one_hot_labels'] = list(data_true[pred_label_cols].values)
data_true.head()

Unnamed: 0,First Party Collection/Use,Introductory/Generic,Third Party Sharing/Collection,Practice not covered,Data Retention,User Choice/Control,Data Security,Policy Change,Privacy contact information,User Access Edit and Deletion,International and Specific Audiences,Do Not Track,one_hot_labels
0,1,0,0,0,0,0,0,0,0,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,1,0,0,1,0,0,0,0,0,0,0,0,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
2,1,0,1,0,0,0,0,0,0,0,0,0,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,1,0,0,0,0,0,0,0,0,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,0,0,1,0,0,0,0,0,0,0,0,0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [None]:
from sklearn.metrics import classification_report

# Per-row label vectors for ground truth and predictions; both follow the
# column order of `pred_label_cols`, which also supplies the report's row names.
y_true = data_true['one_hot_labels'].tolist()
y_pred = preds['one_hot_labels'].tolist()

clf_report = classification_report(y_true, y_pred, target_names=pred_label_cols)
print(clf_report)

                                      precision    recall  f1-score   support

          First Party Collection/Use       0.86      0.88      0.87       287
                Introductory/Generic       0.87      0.65      0.74       161
      Third Party Sharing/Collection       0.90      0.88      0.89       227
                Practice not covered       0.70      0.46      0.56       138
                      Data Retention       0.72      0.75      0.73        24
                 User Choice/Control       0.72      0.66      0.69       130
                       Data Security       0.75      0.81      0.78        59
                       Policy Change       0.88      0.80      0.84        46
         Privacy contact information       0.84      0.82      0.83        72
       User Access Edit and Deletion       0.67      0.80      0.73        40
International and Specific Audiences       0.92      0.89      0.90        61
                        Do Not Track       0.88      1.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
