In [2]:
from transformers import AdamW, RobertaConfig
from transformers import RobertaTokenizer, AutoModelForSequenceClassification, RobertaModel
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd, numpy as np
import random, torch, wget, os
from tqdm import tqdm

In [2]:
pd.set_option('precision', 2)

local = True
gpu = False
epochs = 4
batch_size = 32

# If there's a GPU available...
if torch.cuda.is_available() and not local:
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


### Dowloading dataset

In [3]:
!pip install wget
if not os.path.exists('./data/webis_train.csv'):
    wget.download("https://ml-coding-test.s3.eu-west-1.amazonaws.com/webis_train.csv", './data/webis_train.csv')
if not os.path.exists('./data/webis_test.csv'):
    wget.download("https://ml-coding-test.s3.eu-west-1.amazonaws.com/webis_test.csv", './data/webis_test.csv')



### Reading dataset and splitting training data

In [4]:
def cast_labels(data: pd.core.series.Series):
    data[data=="no-clickbait"] = 0
    data[data=="clickbait"] = 1
    return data

df = pd.read_csv("./data/webis_train.csv", delimiter=',', header=0)
# Removing nan values to simplify training process / I will only use postText to classify each instance
df = df[["postText", "truthClass"]].dropna()
training_sentences = df.postText
training_labels = cast_labels(df.truthClass)

training_sentences, validation_sentences, training_labels, validation_labels = train_test_split(training_sentences, 
                                                                                                training_labels, test_size=0.1, random_state=42, stratify=training_labels)

df = pd.read_csv("./data/webis_test.csv", delimiter=',', header=0)
# Removing nan values to simplify training process / I will only use postText to classify each instance
df = df[["postText", "truthClass"]].dropna()
test_sentences = df.postText
test_labels = cast_labels(df.truthClass)

del df
print('Number of training sentences: {:,}\n'.format(training_sentences.shape[0]))
print('Number of validation sentences: {:,}\n'.format(validation_sentences.shape[0]))
print('Number of test sentences: {:,}\n'.format(test_sentences.shape[0]))
max_length = max([max(map(len, training_sentences)), max(map(len, validation_sentences)), max(map(len, test_sentences))])
print('Max length: {:,}\n'.format(max_length))
# manual setting max_length to 200 to ensure compatibility after tokenization.
max_length = 200

Number of training sentences: 17,535

Number of validation sentences: 1,949

Number of test sentences: 18,913

Max length: 124



In [24]:
if local:
    tokenizer = RobertaTokenizer.from_pretrained("./bert_models/roberta-base/")
    model = RobertaModel.from_pretrained("./bert_models/roberta-base/")
else:
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaModel.from_pretrained("roberta-base")

Some weights of the model checkpoint at ./bert_models/roberta-base/ were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [33]:
def cast_data(data: pd.core.series.Series, max_length):
    input_ids = []
    attention_masks = []
    for sent in data:
        encoded_dict = tokenizer.encode_plus(
                            sent,                      
                            add_special_tokens = True, 
                            truncation=True,
                            max_length = max_length,
                            padding = "max_length",
                            return_attention_mask = True,
                            return_tensors = 'pt'
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    return input_ids, attention_masks 

In [37]:
training_ids, training_attention = cast_data(training_sentences, max_length)
validation_ids, validation_attention = cast_data(validation_sentences, max_length)

In [38]:
training_dataset = TensorDataset(training_ids, training_attention, training_labels)
train_dataloader = DataLoader(training_dataset,  sampler = RandomSampler(training_dataset),  batch_size = batch_size)
validation_dataset = TensorDataset(validation_ids, validation_attention, validation_labels)
validation_dataloader = DataLoader(validation_dataset, sampler = SequentialSampler(validation_dataset), batch_size = batch_size)

In [39]:
if local:
    model = AutoModelForSequenceClassification.from_pretrained("./bert_models/roberta-base/", 
                                                          num_labels = 2, 
                                                          output_attentions = False, output_hidden_states = False)
else:
    model = AutoModelForSequenceClassification.from_pretrained("roberta-base", 
                                                          num_labels = 2, 
                                                          output_attentions = False, output_hidden_states = False)
if gpu:
    model.cuda()

Some weights of the model checkpoint at ./bert_models/roberta-base/ were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./bert_models/roberta-base/ and are newly initialized: ['classi

In [40]:
optimizer = AdamW(model.parameters(), lr = 2e-5)

In [41]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    labels_flat = labels.flatten()
    return np.sum(np.argmax(preds, axis=1).flatten() == labels_flat) / len(labels_flat)

In [42]:
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

history = []

for epoch_i in range(0, epochs):
    total_train_loss = 0
    
    model.train()

    progress_bar = tqdm(enumerate(train_dataloader), desc="Training epoch "+str(epoch_i), miniters=20)
    for step, batch in progress_bar:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        loss, logits = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels).to_tuple()

        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            
    

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))

    print("")
    print("Running Validation...")


    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    progress_bar = tqdm(validation_dataloader, desc="Processing validation dataset", miniters=20)

    # Evaluate data for one epoch
    for batch in progress_bar:        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad():        
            loss, logits = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels).to_tuple()
            
        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))

    
    history.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy
        }
    )

Training epoch 0: 3it [01:20, 26.91s/it]


KeyboardInterrupt: 

### Training stats

In [None]:
df_stats = pd.DataFrame(data=history).set_index('epoch')
df_stats

In [None]:
# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4])
plt.show()

## Testing the model

In [103]:
test_ids, test_attention = cast_data(test_sentences, max_length)
test_labels = torch.tensor(test_labels.to_list())
prediction_data = TensorDataset(test_ids, test_attention, test_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=32)



In [104]:
print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))
model.eval()
predictions , true_labels = [], []
for batch in prediction_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
  
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, 
                          attention_mask=b_input_mask).to_tuple()
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    predictions+=logits.tolist()
    true_labels+=label_ids.tolist()
print('    DONE.')

Predicting labels for 17,535 test sentences...


KeyboardInterrupt: 

In [108]:
precision, recall, fscore, _ = precision_recall_fscore_support(true_labels, predictions, average="macro")
print("Precision", precision)
print("Recall", recall)
print("Fscore", fscore)

[0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0]