In [80]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
import pickle
from transformers import *
from tqdm import tqdm, trange
from ast import literal_eval

In [82]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'NVIDIA GeForce GTX 1070 Ti'

In [83]:
df = pd.read_csv(r'Archive\clean_data.csv').drop(columns='Unnamed: 0')

In [84]:
print('Unique comments: ', df.comment_text.nunique() == df.shape[0])
print('Null values: ', df.isnull().values.any())
# df[df.isna().any(axis=1)]

Unique comments:  True
Null values:  False


In [85]:
print('average sentence length: ', df.comment_text.str.split().str.len().mean())
print('stdev sentence length: ', df.comment_text.str.split().str.len().std())

average sentence length:  33.96282969055416
stdev sentence length:  52.52506139045776


In [86]:
cols = df.columns
label_cols = list(cols[2:])
num_labels = len(label_cols)
print('Label columns: ', label_cols)

Label columns:  ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [87]:
print('Count of 1 per label: \n', df[label_cols].sum(), '\n') # Label counts, may need to downsample or upsample
print('Count of 0 per label: \n', df[label_cols].eq(0).sum())

Count of 1 per label: 
 toxic            15102
severe_toxic      1574
obscene           8333
threat             465
insult            7777
identity_hate     1387
dtype: int64 

Count of 0 per label: 
 toxic            142470
severe_toxic     155998
obscene          149239
threat           157107
insult           149795
identity_hate    156185
dtype: int64


In [88]:
df = df.sample(frac=1).reset_index(drop=True) #shuffle rows

In [89]:
df['one_hot_labels'] = list(df[label_cols].values)
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,one_hot_labels
0,6114c09ee934a8eb,ekanti beliefnet article reliable section cited reliable published reliable secondary source reputable media newspapers magazines books documentaries tv programs etc beliefnet article published internet talkemail,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
1,be898eafd4d524c3,list real names professional wrestlers thanks experimenting wikipedia test worked reverted removed please use sandbox tests want take look welcome page would like learn contributing encyclopedia thanks,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
2,961473b62481cc84,minor change line enwikipedia quite insignificant,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
3,48311f95b4c33ef1,regarding cold fusion accepted mediation case regarding cold fusion provide brief summary view points regarding issue thanks contribs,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
4,d27a91d455f281b0,live white owl landing phillips landing road county councilman tim dukes told must write concerns control anything concerning robert graham wildlife area phillips landing roadwell goes time leave development first thing see awful mess across road fallen trees twigs branches etc along area get absolutely disgusted every time leave homes point living nice development like half see mess every time leave anything done cleaning junk make pleasant look maryland letting people cut fallen logs firewood please respond concerns njkelleyaolcom thank attention njk,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"


In [90]:
labels = list(df.one_hot_labels.values)
comments = list(df.comment_text.values)

In [91]:
max_length = 100
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=False)
encodings = tokenizer.batch_encode_plus(comments, max_length=max_length, pad_to_max_length=True, truncation=True)

print('tokenizer outputs: ', encodings.keys())


loading file vocab.json from cache at C:\Users\User\.cache\huggingface\hub\models--roberta-base\snapshots\e2da8e2f811d1448a5b465c236feacd80ffbac7b\vocab.json
loading file merges.txt from cache at C:\Users\User\.cache\huggingface\hub\models--roberta-base\snapshots\e2da8e2f811d1448a5b465c236feacd80ffbac7b\merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\User\.cache\huggingface\hub\models--roberta-base\snapshots\e2da8e2f811d1448a5b465c236feacd80ffbac7b\tokenizer_config.json
loading file tokenizer.json from cache at C:\Users\User\.cache\huggingface\hub\models--roberta-base\snapshots\e2da8e2f811d1448a5b465c236feacd80ffbac7b\tokenizer.json
loading configuration file config.json from cache at C:\Users\User\.cache\huggingface\hub\models--roberta-base\snapshots\e2da8e2f811d1448a5b465c236feacd80ffbac7b\config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-b

tokenizer outputs:  dict_keys(['input_ids', 'attention_mask'])


Since RobertaTokenizer is designed for use with the RoBERTa model, and RoBERTa does not use token_type_ids, these are not included in the output of batch_encode_plus. The presence of input_ids and attention_mask is sufficient for using RoBERTa.

In [92]:
print('tokenizer outputs: ', encodings.keys())

tokenizer outputs:  dict_keys(['input_ids', 'attention_mask'])


In [93]:
input_ids = encodings['input_ids'] # tokenized and encoded sentences
#token_type_ids = encodings['token_type_ids'] # token type ids
attention_masks = encodings['attention_mask'] # attention masks

In [94]:
# Identifying indices of 'one_hot_labels' entries that only occur once - this will allow us to stratify split our training data later
label_counts = df['one_hot_labels'].astype(str).value_counts()
one_freq = label_counts[label_counts==1].keys()
one_freq_idxs = sorted(list(df[df.one_hot_labels.astype(str).isin(one_freq)].index), reverse=True)
print('df label indices with only one instance: ', one_freq_idxs)

df label indices with only one instance:  [151574, 59449]


In [95]:
# Gathering single instance inputs to force into the training set after stratified split
one_freq_input_ids = [input_ids.pop(i) for i in one_freq_idxs]
#one_freq_token_types = [token_type_ids.pop(i) for i in one_freq_idxs]
one_freq_attention_masks = [attention_masks.pop(i) for i in one_freq_idxs]
one_freq_labels = [labels.pop(i) for i in one_freq_idxs]

In [96]:
print(len(input_ids))
print(len(labels))
print(len(attention_masks))

157570
157570
157570


In [101]:
# Use train_test_split to split our data into train and validation sets

train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = train_test_split(
    input_ids, labels, attention_masks,
    random_state=42, test_size=0.1, stratify=labels
)

# Add one frequency data to train data
train_inputs.extend(one_freq_input_ids)
train_labels.extend(one_freq_labels)
train_masks.extend(one_freq_attention_masks)
#train_token_types.extend(one_freq_token_types)

# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
#train_token_types = torch.tensor(train_token_types)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)
#validation_token_types = torch.tensor(validation_token_types)

In [102]:
# Select a batch size for training. For fine-tuning with XLNet, the authors recommend a batch size of 32, 48, or 128. We will use 32 here to avoid memory issues.
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels
                           #, train_token_types
                           )
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels
                                #, validation_token_types
                                )
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [103]:
torch.save(validation_dataloader,'validation_data_loader')
torch.save(train_dataloader,'train_data_loader')

# Load Model & Set Params

In [105]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)
model.cuda()

loading configuration file config.json from cache at C:\Users\User\.cache\huggingface\hub\models--roberta-base\snapshots\e2da8e2f811d1448a5b465c236feacd80ffbac7b\config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [106]:
# setting custom optimization parameters. You may implement a scheduler here as well.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [107]:
optimizer = AdamW(optimizer_grouped_parameters,lr=2e-5,correct_bias=True)
# optimizer = AdamW(model.parameters(),lr=2e-5)  # Default optimization



# Train Model
BCEWithLogitsLoss stands for Binary Cross Entropy with Logits Loss. It is a combination of two components:

Sigmoid Activation: Applies the sigmoid function to the logits (raw outputs) to convert them into probabilities.
Binary Cross Entropy Loss: Computes the binary cross-entropy loss between the predicted probabilities and the true binary labels.

In [110]:
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import f1_score, accuracy_score
from tqdm import trange

# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

    # Training

    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables
    tr_loss = 0 # running loss
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch  # Note: Removed b_token_types

        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()

        # Forward pass for multilabel classification
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]

        # Define the loss function
        loss_func = BCEWithLogitsLoss() # Binary Cross Entropy with Logits Loss
        loss = loss_func(logits.view(-1, num_labels), b_labels.type_as(logits).view(-1, num_labels))

        train_loss_set.append(loss.item())

        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        # scheduler.step()  # Uncomment if you're using a learning rate scheduler

        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss / nb_tr_steps))

    ###########################################################################

    # Validation

    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()

    # Variables to gather full output
    logit_preds, true_labels, pred_labels = [], [], []

    # Predict
    for i, batch in enumerate(validation_dataloader):
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch  # Note: Removed b_token_types

        with torch.no_grad():
            # Forward pass
            outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            b_logit_pred = outs[0]
            pred_label = torch.sigmoid(b_logit_pred) # sigmoid activation function

            b_logit_pred = b_logit_pred.detach().cpu().numpy()
            pred_label = pred_label.cpu().numpy()
            b_labels = b_labels.cpu().numpy()

        logit_preds.append(b_logit_pred)
        pred_labels.append(pred_label)
        true_labels.append(b_labels)

    # Flatten outputs
    pred_labels = [item for sublist in pred_labels for item in sublist]
    true_labels = [item for sublist in true_labels for item in sublist]

    # Calculate Accuracy
    threshold = 0.50
    pred_bools = [pl > threshold for pl in pred_labels]
    true_bools = [tl == 1 for tl in true_labels]
    val_f1_accuracy = f1_score(true_bools, pred_bools, average='micro') * 100
    val_flat_accuracy = accuracy_score(true_bools, pred_bools) * 100

    print('F1 Validation Accuracy: ', val_f1_accuracy)
    print('Flat Validation Accuracy: ', val_flat_accuracy)


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Train loss: 0.05573568981557341


Epoch:  33%|███▎      | 1/3 [32:32<1:05:05, 1952.77s/it]

F1 Validation Accuracy:  75.1995181448577
Flat Validation Accuracy:  91.77508408961097
Train loss: 0.04069234438097426


Epoch:  67%|██████▋   | 2/3 [1:05:13<32:37, 1957.25s/it]

F1 Validation Accuracy:  76.00785142684585
Flat Validation Accuracy:  92.23202386241036
Train loss: 0.03560930162481391


Epoch: 100%|██████████| 3/3 [1:37:44<00:00, 1954.92s/it]

F1 Validation Accuracy:  76.16944942734925
Flat Validation Accuracy:  91.6037316748112





In [111]:
torch.save(model.state_dict(), 'roberta_model_comment')

# Load and Preprocess Test Data

In [124]:
test_df = pd.read_csv('Archive/test.csv')
test_labels_df = pd.read_csv('Archive/test_labels.csv')
test_df = test_df.merge(test_labels_df, on='id', how='left')
test_label_cols = list(test_df.columns[2:])
print('Null values: ', test_df.isnull().values.any()) #should not be any null sentences or labels
print('Same columns between train and test: ', label_cols == test_label_cols) #columns should be the same
test_df.head()

Null values:  False
Same columns between train and test:  True


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,",-1,-1,-1,-1,-1,-1
1,0000247867823ef7,"== From RfC == \n\n The title is fine as it is, IMO.",-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lapland — / """,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,":If you have a look back at the source, the information I updated was the correct form. I can only guess the source hadn't updated. I shall update the information once again but thank you for your message.",-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,-1,-1,-1,-1,-1,-1


In [125]:
test_df = test_df[~test_df[test_label_cols].eq(-1).any(axis=1)] #remove irrelevant rows/comments with -1 values
test_df['one_hot_labels'] = list(test_df[test_label_cols].values)
test_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,one_hot_labels
5,0001ea8717f6de06,Thank you for understanding. I think very highly of you and would not revert without discussion.,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
7,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Religion? Really?? You mean, the way people have invariably kept adding """"Religion"""" to the Samuel Beckett infobox? And why do you bother bringing up the long-dead completely non-existent """"Influences"""" issue? You're just flailing, making up crap on the fly. \n ::: For comparison, the only explicit acknowledgement in the entire Amos Oz article that he is personally Jewish is in the categories! \n\n """,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a type. The """"Type"""" of institution is needed in this case because there are three levels of SUNY schools: \n -University Centers and Doctoral Granting Institutions \n -State Colleges \n -Community Colleges. \n\n It is needed in this case to clarify that UB is a SUNY Center. It says it even in Binghamton University, University at Albany, State University of New York, and Stony Brook University. Stop trying to say it's not because I am totally right in this case.""",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the list, make sure it's relevant == \n\n Before adding a new product to the list, make sure it has a wikipedia entry already, """"proving"""" it's relevance and giving the reader the possibility to read more about it. \n Otherwise it could be subject to deletion. See this article's revision history.""",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"


In [126]:
# Gathering input data
test_labels = list(test_df.one_hot_labels.values)
test_comments = list(test_df.comment_text.values)

In [128]:
# Encoding input data
test_encodings = tokenizer.batch_encode_plus(test_comments,max_length=max_length,pad_to_max_length=True, truncation=True)
test_input_ids = test_encodings['input_ids']
#test_token_type_ids = test_encodings['token_type_ids']
test_attention_masks = test_encodings['attention_mask']

In [129]:
# Make tensors out of data
test_inputs = torch.tensor(test_input_ids)
test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_attention_masks)
# test_token_types = torch.tensor(test_token_type_ids)
# Create test dataloader
test_data = TensorDataset(test_inputs, test_masks, test_labels
                          #, test_token_types
                          )
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Prediction and Metics

In [132]:
# Test

# Put model in evaluation mode to evaluate loss on the validation set
model.eval()

#track variables
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict
for i, batch in enumerate(test_dataloader):
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  with torch.no_grad():
    # Forward pass
    outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    b_logit_pred = outs[0]
    pred_label = torch.sigmoid(b_logit_pred)

    b_logit_pred = b_logit_pred.detach().cpu().numpy()
    pred_label = pred_label.to('cpu').numpy()
    b_labels = b_labels.to('cpu').numpy()

  tokenized_texts.append(b_input_ids)
  logit_preds.append(b_logit_pred)
  true_labels.append(b_labels)
  pred_labels.append(pred_label)

# Flatten outputs
tokenized_texts = [item for sublist in tokenized_texts for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]
# Converting flattened binary values to boolean values
true_bools = [tl==1 for tl in true_labels]

In [133]:
pred_bools = [pl>0.50 for pl in pred_labels] #boolean output after thresholding

# Print and save classification report
print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools,average='micro'))
print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools),'\n')
clf_report = classification_report(true_bools,pred_bools,target_names=test_label_cols)
pickle.dump(clf_report, open('classification_report.txt','wb')) #save report
print(clf_report)

Test F1 Accuracy:  0.6456116260441698
Test Flat Accuracy:  0.8612960705242427 

               precision    recall  f1-score   support

        toxic       0.51      0.90      0.65      6090
 severe_toxic       0.39      0.26      0.31       367
      obscene       0.62      0.74      0.67      3691
       threat       0.42      0.67      0.52       211
       insult       0.65      0.70      0.67      3427
identity_hate       0.46      0.66      0.54       712

    micro avg       0.55      0.78      0.65     14498
    macro avg       0.51      0.65      0.56     14498
 weighted avg       0.56      0.78      0.64     14498
  samples avg       0.08      0.08      0.07     14498



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Optimizing threshold value for F1 score

In [135]:
# Calculate Accuracy - maximize F1 accuracy by tuning threshold values. First with 'macro_thresholds' on the order of e^-1 then with 'micro_thresholds' on the order of e^-2

macro_thresholds = np.array(range(1,10))/10

f1_results, flat_acc_results = [], []
for th in macro_thresholds:
  pred_bools = [pl>th for pl in pred_labels]
  test_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')
  test_flat_accuracy = accuracy_score(true_bools, pred_bools)
  f1_results.append(test_f1_accuracy)
  flat_acc_results.append(test_flat_accuracy)

best_macro_th = macro_thresholds[np.argmax(f1_results)] #best macro threshold value

micro_thresholds = (np.array(range(10))/100)+best_macro_th #calculating micro threshold values

f1_results, flat_acc_results = [], []
for th in micro_thresholds:
  pred_bools = [pl>th for pl in pred_labels]
  test_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')
  test_flat_accuracy = accuracy_score(true_bools, pred_bools)
  f1_results.append(test_f1_accuracy)
  flat_acc_results.append(test_flat_accuracy)

best_f1_idx = np.argmax(f1_results) #best threshold value

# Printing and saving classification report
print('Best Threshold: ', micro_thresholds[best_f1_idx])
print('Test F1 Accuracy: ', f1_results[best_f1_idx])
print('Test Flat Accuracy: ', flat_acc_results[best_f1_idx], '\n')

best_pred_bools = [pl>micro_thresholds[best_f1_idx] for pl in pred_labels]
clf_report_optimized = classification_report(true_bools,best_pred_bools, target_names=label_cols)
pickle.dump(clf_report_optimized, open('classification_report_optimized.txt','wb'))
print(clf_report_optimized)

Best Threshold:  0.65
Test F1 Accuracy:  0.6565349544072948
Test Flat Accuracy:  0.880349495138954 

               precision    recall  f1-score   support

        toxic       0.57      0.84      0.68      6090
 severe_toxic       0.51      0.06      0.10       367
      obscene       0.69      0.67      0.68      3691
       threat       0.51      0.52      0.52       211
       insult       0.74      0.57      0.65      3427
identity_hate       0.57      0.54      0.56       712

    micro avg       0.62      0.69      0.66     14498
    macro avg       0.60      0.53      0.53     14498
 weighted avg       0.64      0.69      0.65     14498
  samples avg       0.07      0.07      0.07     14498



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Bonus

In [148]:
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

In [168]:
# Calculate class weights for each label independently
class_weights_per_label = {}
for column in ['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']:
    y = df[column]
    class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
    class_weights_dict = dict(zip(np.unique(y), class_weights))
    class_weights_per_label[column] = class_weights_dict

# Print class weights for each label
for label, weights in class_weights_per_label.items():
    print(f"Class weights for {label}: {weights}")

Class weights for toxic: {0: 0.5530006317119394, 1: 5.216924910607866}
Class weights for severe_toxic: {0: 0.5050449364735445, 1: 50.05463786531131}
Class weights for obscene: {0: 0.5279183055367564, 1: 9.454698187927518}
Class weights for threat: {0: 0.5014798831369703, 1: 169.43225806451613}
Class weights for insult: {0: 0.525958810374178, 1: 10.13064163559213}
Class weights for identity_hate: {0: 0.5044402471428114, 1: 56.80317231434751}


In [169]:
# Create a dictionary to store PyTorch tensors for class weights
class_weights_tensors = {}

for column in ['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']:
    # Create tensor from class weights
    class_weights = class_weights_per_label[column]
    # Convert to tensor and ensure the tensor has the right shape for the loss function
    class_weights_tensor = torch.tensor([class_weights.get(0, 1.0), class_weights.get(1, 1.0)], dtype=torch.float32)
    class_weights_tensors[column] = class_weights_tensor

# Print the class weights tensors
for label, tensor in class_weights_tensors.items():
    print(f"Class weights tensor for {label}: {tensor}")


Class weights tensor for toxic: tensor([0.5530, 5.2169])
Class weights tensor for severe_toxic: tensor([ 0.5050, 50.0546])
Class weights tensor for obscene: tensor([0.5279, 9.4547])
Class weights tensor for threat: tensor([  0.5015, 169.4323])
Class weights tensor for insult: tensor([ 0.5260, 10.1306])
Class weights tensor for identity_hate: tensor([ 0.5044, 56.8032])


In [170]:
class_weights_tensors

{'toxic': tensor([0.5530, 5.2169]),
 'severe_toxic': tensor([ 0.5050, 50.0546]),
 'obscene': tensor([0.5279, 9.4547]),
 'threat': tensor([  0.5015, 169.4323]),
 'insult': tensor([ 0.5260, 10.1306]),
 'identity_hate': tensor([ 0.5044, 56.8032])}

In [172]:
import torch
import torch.nn as nn
import torch.optim as optim


# Initialize BCEWithLogitsLoss
# The pos_weight parameter is not directly useful for multiclass, so we manually adjust the loss
loss_func = nn.BCEWithLogitsLoss(reduction='none')

# Compute loss
loss = loss_func(logits.view(-1, num_labels), b_labels.type_as(logits).view(-1, num_labels))

# Apply class weights
# Expand class weights to match the shape of logits
weights = class_weights_tensor.view(1, -1).expand_as(logits)  # Shape: (batch_size, num_classes)
weighted_loss = loss * weights

# Compute the mean loss
mean_loss = weighted_loss.mean()

print("Mean Loss with Class Weights:", mean_loss.item())


AttributeError: 'numpy.ndarray' object has no attribute 'type_as'