# STEP1: Read data files

In [1]:
import os
import json
import datetime
import pandas as pd
import numpy as np

In [51]:
def readData(dataset_type="train"):

    events_f = open("./data/{}.data.txt".format(dataset_type),encoding = "utf-8")        
    list_of_events = [line.strip('\n') for line in events_f.readlines()]
    #get a list of crawled tweet ids
    events_f.close()
    if dataset_type == "test":
        crawl_tobjs = [i.split(".")[0] for i in os.listdir("./data/tweet-objects")]
    else:
        crawl_tobjs = [i.split(".")[0] for i in os.listdir("./data/crawlData/{}_objs".format(dataset_type))]
    
    event_dir_list=[] #[[event0:{obj},{obj}...],[event1:]...[eventn:]]
    # function created to get value, convert str to datetime format 
    def get_create_time(tweet):
        create_at = tweet.get('created_at')
        try:
            to_datetime = datetime.datetime.strptime(create_at, '%Y-%m-%dT%H:%M:%S.000Z')
        except ValueError:
            # Sat Apr 04 17:00:40 +0000 2020
            to_datetime = datetime.datetime.strptime(create_at, '%a %b %d %H:%M:%S +0000 %Y')
        return to_datetime
        
    remove_eve_indexs = []
    for event in list_of_events:
        each_id_lists = event.split(",")
        merge_tweets = []
        # if the source tweet does not exit, we can assume that the event makes no sense for predicting.
        if each_id_lists[0] in crawl_tobjs:

            for tid in each_id_lists:
                if tid in crawl_tobjs:
                    if dataset_type == "test":
                        f = open("./data/tweet-objects/{}.json".format(tid))
                    else:
                        f = open("./data/crawlData/{}_objs/{}.json".format(dataset_type,tid))
                    merge_tweets.append(json.load(f))
                    f.close()
        else:
            remove_eve_indexs.append(list_of_events.index(event))
            continue

        #sorting the tweets by create_at time, the tweet would be replied based on the context of previous one
        event_dir_list.append(sorted(merge_tweets, key=get_create_time))
    
    if dataset_type not in ["test","covid"]:                
        labels_f = open("./data/{}.label.txt".format(dataset_type),encoding = "utf-8")
        labels_list = [0 if label.strip("\n") == "nonrumour" else 1 for label in labels_f.readlines()]
        labels_f.close()
        event_labels = [labels_list[i] for i in range(len(labels_list)) if i not in remove_eve_indexs]
        return event_dir_list,event_labels
    elif dataset_type == "covid":
        return event_dir_list, remove_eve_indexs
    else:
        return event_dir_list

# STEP2: Preprocessing datasets

In [4]:
import emoji
import re
import nltk
import contractions
from nltk.corpus import stopwords
from transformers import BertTokenizer
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

In [5]:
def preprocessing(dataset_type="train"):
    event_labels = []
    if dataset_type in ["train","dev"]:
        event_dir_list,event_labels = readData(dataset_type)
    elif dataset_type == "covid":
        event_dir_list, line_index = readData(dataset_type)
    else:
        event_dir_list = readData(dataset_type)
        
    #load BERT's WordPiece tokenisation model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
    
    #Creating an attention mask - For actual tokens its set to 1, for padding tokens its set to 0
    def create_attention_masks(input_ids):
        attention_masks = []
        for seq in input_ids:
            # containing 1s for no padded tokens
            seq_mask = [float(i>0) for i in seq]
            attention_masks.append(seq_mask)
        return np.array(attention_masks)
    
    def textNormalized(text,lang='en'):
        #convert unicode to emoji and then get its description
        expanded_text = contractions.fix(text)
        text_emoticon = expanded_text.encode('utf-16', 'surrogatepass').decode('utf-16')
        for i in text_emoticon:
            if i in emoji.UNICODE_EMOJI[lang]:
                text_emoticon = re.sub(i, emoji.UNICODE_EMOJI['en'][i], text_emoticon)
        url_replace = re.sub(r'http\S+', 'http', text_emoticon)
        #mention_replace = re.sub(r'@[a-z|0-9]+', '@user', url_replace)
        
        return url_replace
    
    input_ids_list = []
    attn_masks_list = []
    sentence_tokens = []
    max_len = 0
    for event in event_dir_list:
        event_df = pd.DataFrame.from_dict(event, orient='columns') 
        event_df['text'] = event_df['text'].transform(textNormalized)
        event_df['text'].drop_duplicates()
        for each in list(event_df['text']):
            if len(each.split(" ")) > max_len:
                max_len = len(each.split(" "))
        
        tokens_list=[]
        for index, row in event_df.iterrows():
            if index == 0 :
                texts = ['[AUT]']
            else:
                texts = ['[REP]']
            if row['user']['verified']=='true':
                texts.append('[VER]')
            else:
                texts.append('[NON]')
            user_mentions = row['entities']['user_mentions']
            text = row['text']
            if user_mentions  != [] :
                
                for i in user_mentions:
                    text = text.replace('@'+i['screen_name'],'[MEN]'+ i['name'])
                                        
            texts.extend(tokenizer.tokenize(text)+['[SEP]'])
            tokens_list.append(texts)

        #texts = list(event_df['text'].drop_duplicates())
        #tokens_list = list(map(lambda t: tokenizer.tokenize(t)+['[SEP]'], texts))

        merge_tokens = [ele for sub_list in tokens_list for ele in sub_list]
        #tokens_without_sw = [word for word in merge_tokens if not word in stopwords.words()]
        #lemmatizer = WordNetLemmatizer()
        #lem_tokens = list(map(lambda t:lemmatizer.lemmatize(t),tokens_without_sw))
        #print(merge_tokens)        
        sentence_tokens.append(['[CLS]'] + merge_tokens)  
    #tokenizer_texts = [tokens + ['[PAD]' for n in range(MAX_LEN - len(tokens))] if len(tokens) < MAX_LEN else tokens[:MAX_LEN-1] + ['[SEP]'] for tokens in tokenizer_texts]
    #Converting Input words to corresponding ids
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in sentence_tokens]
    input_ids = tf.keras.preprocessing.sequence.pad_sequences(sequences = input_ids,maxlen=128,dtype='long',padding='post',truncating='post',value=0)
    attn_masks = create_attention_masks(input_ids)    
    return input_ids,event_labels,attn_masks
    

# STEP3-a: Loading train/dev/test datasets

In [6]:
import nltk
import tensorflow as tf
import torch
from torch.utils.data import DataLoader,Dataset
from random import sample

In [7]:
train_input_ids, train_labels,train_attn_masks = preprocessing(dataset_type="train")
dev_input_ids, dev_labels, dev_attn_masks = preprocessing(dataset_type="dev")
test_input_ids, test_labels, test_attn_masks = preprocessing(dataset_type="test")

# STEP3-b: Balancing dataset (optional)

In [8]:
from collections import Counter

In [9]:
Counter(train_labels)

Counter({0: 1244, 1: 318})

In [10]:
Counter(dev_labels)

Counter({0: 420, 1: 115})

In [None]:
train_input_ids_list = train_input_ids.tolist()
train_attn_masks_list = train_attn_masks.tolist()
train_df = pd.DataFrame({'input_ids':train_input_ids_list,'labels':train_labels,'attn_masks':train_attn_masks_list})
train_df.to_csv("train_preprocessing.csv",index=False,sep=',')

In [None]:
train_input_ids_list

In [None]:
train_df['labels'].value_counts()

In [None]:
bal_train_indexs = sample(list(train_df[train_df['labels']==0].index),318)
bal_train_indexs += list(train_df[train_df['labels']==1].index)

In [None]:
bal_train_df = train_df.loc[bal_train_indexs]

In [None]:
dev_input_ids_list = dev_input_ids.tolist()
dev_attn_masks_list = dev_attn_masks.tolist()
dev_df = pd.DataFrame({'input_ids':dev_input_ids_list,'labels':dev_labels,'attn_masks':dev_attn_masks_list})
dev_df.to_csv("dev_preprocessing.csv",index=False,sep=',')

In [None]:
dev_df['labels'].value_counts()

In [None]:
bal_dev_indexs = sample(list(dev_df[dev_df['labels']==0].index),115)
bal_dev_indexs += list(dev_df[dev_df['labels']==1].index)

In [None]:
bal_dev_df = dev_df.loc[bal_dev_indexs]

In [None]:
test_input_ids_list = test_input_ids.tolist()
test_attn_masks_list = test_attn_masks.tolist()
test_df = pd.DataFrame({'input_ids':test_input_ids_list,'attn_masks':test_attn_masks_list})
test_df.to_csv("test_preprocessing.csv",index=False,sep=',')

# STEP4: Feeding data

### If BALANCED (unexpected performance)

In [None]:
X_train = torch.tensor(list(bal_train_df['input_ids']))
X_val = torch.tensor(list(bal_dev_df['input_ids']))
y_train = torch.tensor(list(bal_train_df['labels'] ))
y_val = torch.tensor(list(bal_dev_df['labels']))
attention_masks_train = torch.tensor(list(bal_train_df['attn_masks']))
attention_masks_val = torch.tensor(list(bal_dev_df['attn_masks']))

## Otherwise:

In [11]:
X_train = torch.tensor(train_input_ids)
X_val = torch.tensor(dev_input_ids)
y_train = torch.tensor(train_labels )
y_val = torch.tensor(dev_labels)
attention_masks_train = torch.tensor(train_attn_masks)
attention_masks_val = torch.tensor(dev_attn_masks)

In [12]:
test_input_ids = torch.tensor(test_input_ids)
test_attention_masks = torch.tensor(test_attn_masks)
#Dataset wrapping tensors.
train_data = torch.utils.data.TensorDataset(X_train, attention_masks_train, y_train)
val_data = torch.utils.data.TensorDataset(X_val, attention_masks_val, y_val)
test_data = torch.utils.data.TensorDataset(test_input_ids, test_attention_masks)

In [13]:
#Samples elements randomly. If without replacement(default), then sample from a shuffled dataset.
train_sampler = torch.utils.data.RandomSampler(train_data)
val_sampler = torch.utils.data.SequentialSampler(val_data)
test_sampler = torch.utils.data.SequentialSampler(test_data)

# STEP5: Building RumorDetectionClassifier

In [14]:
import torch
import torch.nn as nn
from transformers import BertModel, BertForSequenceClassification
import torch.optim as optim

# Option1: (APPLIED)

In [15]:
class RumorDetectionClassifier(nn.Module):

    def __init__(self):
        super(RumorDetectionClassifier, self).__init__()
        #Instantiating BERT model object 
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        
        #Classification layer
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks):
        
        #Feeding the input to BERT model to obtain contextualized representations
        outputs = self.bert_layer(seq, attention_mask = attn_masks, return_dict=True)
        cont_reps = outputs.last_hidden_state

        #Obtaining the representation of [CLS] head (the first token)
        cls_rep = cont_reps[:, 0]

        #Feeding cls_rep to the classifier layer
        logits = self.cls_layer(cls_rep)

        return logits

# Option2:

In [None]:
def myBertModel():
    x_train, y_train,x_details = readData(dataset_type="train",feature="text")
    
    # Bert layers
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text') 
    
    def get_sentence_embedding(sentences):
    
        bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
        bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

        #keys:['input_word_ids', 'input_mask', 'input_type_ids']
        preprocessed_text = bert_preprocess(sentences)
        #keys: ['sequence_output', 'encoder_outputs', 'default', 'pooled_output']
        bert_results = bert_encoder(preprocessed_text)['pooled_output']
        return bert_results
    
    outputs = get_sentence_embedding(text_input)
    # Neural network layers
    l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs)
    l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

    # Use inputs and outputs to construct a final model
    model = tf.keras.Model(inputs=[text_input], outputs = [l])
    
    METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
    ]

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=METRICS)
    model.fit(x_train, y_train, epochs=2)
    return model

# STEP6: Training model

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import wandb
import tensorflow as tf
import time
import torch
from torch import nn
import torch.optim as optim

In [17]:
criterion = nn.BCEWithLogitsLoss() #cross entropy loss

# --Hyper_parameter by Weights and Biases

In [18]:
#Hyperparameter Optimization for Hugging Face Transformers
#https://wandb.ai/amogkam/transformers/reports/Hyperparameter-Optimization-for-Hugging-Face-Transformers--VmlldzoyMTc2ODI#grid-search-(baseline):

In [19]:
#command shells
#wandb login

In [20]:
import wandb

In [21]:
sweep_config = {
    'method': 'grid', 
    'metric': {
      'name': 'val_accuracy',
      'goal': 'maximize'   
    },
    'parameters': {

        'learning_rate': {
            'values': [ 5e-5, 3e-5 ]
        },
        'batch_size': {
            'values': [8,16,32]
        },
        'epochs':{
            'values':[4]
        },
        'model_name_or_path':{
            'values': ["bert-base-uncased"]
        }
    }
}
sweep_defaults = {
            'learning_rate': 3e-5,
       
            'batch_size': 32,

            'epochs':4
}

sweep_id = wandb.sweep(sweep_config)

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/mist/.netrc


Create sweep with ID: syunmbgv
Sweep URL: https://wandb.ai/nlp_pj/uncategorized/sweeps/syunmbgv


In [22]:
from transformers import get_linear_schedule_with_warmup
import datetime
def ret_dataloader():
    batch_size = wandb.config.batch_size
    print('batch_size = ', batch_size)
    train_dataloader = DataLoader(
                train_data,  # The training samples.
                sampler = train_sampler, 
                batch_size = batch_size # Trains with this batch size.
            )

    validation_dataloader = DataLoader(
                val_data, # The validation samples.
                sampler = val_sampler, # Pull out batches sequentially.
                batch_size = batch_size # Evaluate with this batch size.
            )
    return train_dataloader,validation_dataloader
def ret_optim(model):
    print('Learning_rate = ',wandb.config.learning_rate )
    optimizer = optim.Adam(model.parameters(),
                      lr = wandb.config.learning_rate
                    )
    return optimizer

def ret_scheduler(train_dataloader,optimizer):
    epochs = wandb.config.epochs
    print('epochs =>', epochs)
    # Total number of training steps is [number of batches] x [number of epochs]. 
    # (Note that this is not the same as the number of training samples).
    total_steps = len(train_dataloader) * epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps = 0, # Default value in run_glue.py
                                                num_training_steps = total_steps)
    return scheduler

def get_accuracy(logits, labels):
    probs = torch.sigmoid(torch.tensor(logits)) 
    #probs = F.softmax(logits, dim=0)
    preds = (probs > 0.5).long()
    acc = f1_score(preds, labels, average = 'micro')
    
    return acc

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [23]:
import random
def train():
    wandb.login()
    wandb.init()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    model = RumorDetectionClassifier()
    model.to(device)
    #wandb.init(config=sweep_defaults)
    train_dataloader,val_dataloader = ret_dataloader()
    optimizer = ret_optim(model)
    scheduler = ret_scheduler(train_dataloader,optimizer)

    #print("config ",wandb.config.learning_rate, "\n",wandb.config)
    seed_val = 42
   
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    
    training_stats = []

    # Measure the total training time for the whole run.
    total_t0 = time.time()
    epochs = wandb.config.epochs
    # For each epoch...
    for epoch_i in range(0, epochs):

        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_train_loss = 0

        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):

            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)
                
                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Always clear any previously calculated gradients before performing a backward pass. 
            model.zero_grad()        

            logits = model(b_input_ids, b_input_mask)
            loss = criterion(logits.squeeze(-1), b_labels.float())
            
            wandb.log({'train_batch_loss':loss.item()})

            total_train_loss += loss.item()

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()

            # Update the learning rate.
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)            
        
        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        wandb.log({'avg_train_loss':avg_train_loss})

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))
            
        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.

        print("")
        print("Running Validation...")

        t0 = time.time()

        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()

        # Tracking variables 
        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        # Evaluate data for one epoch
        for batch in val_dataloader:

            b_input_ids = batch[0].cuda()
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            with torch.no_grad():        

                # values prior to applying an activation function like the softmax.
                logits = model(b_input_ids, b_input_mask)
                loss = criterion(logits.squeeze(-1), b_labels.float())
                
            # Accumulate the validation loss.
            total_eval_loss += loss.item()

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Calculate the accuracy for this batch of test sentences, and
            # accumulate it over all batches.
            total_eval_accuracy += get_accuracy(logits, label_ids)
            

        # Report the final accuracy for this validation run.
        avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(val_dataloader)
        
        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)
        wandb.log({'val_accuracy':avg_val_accuracy,'avg_val_loss':avg_val_loss})
        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': avg_val_accuracy,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


In [24]:
wandb.agent(sweep_id,function=train)

[34m[1mwandb[0m: Agent Starting Run: rax7ox6q with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	model_name_or_path: bert-base-uncased
[34m[1mwandb[0m: Currently logged in as: [33mzanqraaa21[0m ([33mnlp_pj[0m). Use [1m`wandb login --relogin`[0m to force relogin


cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


batch_size =  8
Learning_rate =  5e-05
epochs => 4
Training...
  Batch    40  of    196.    Elapsed: 0:00:04.
  Batch    80  of    196.    Elapsed: 0:00:08.
  Batch   120  of    196.    Elapsed: 0:00:12.
  Batch   160  of    196.    Elapsed: 0:00:16.

  Average training loss: 0.36
  Training epcoh took: 0:00:19

Running Validation...
  Accuracy: 0.93
  Validation Loss: 0.22
  Validation took: 0:00:01
Training...
  Batch    40  of    196.    Elapsed: 0:00:04.
  Batch    80  of    196.    Elapsed: 0:00:07.
  Batch   120  of    196.    Elapsed: 0:00:11.
  Batch   160  of    196.    Elapsed: 0:00:15.

  Average training loss: 0.17
  Training epcoh took: 0:00:18

Running Validation...
  Accuracy: 0.93
  Validation Loss: 0.25
  Validation took: 0:00:01
Training...
  Batch    40  of    196.    Elapsed: 0:00:03.
  Batch    80  of    196.    Elapsed: 0:00:07.
  Batch   120  of    196.    Elapsed: 0:00:10.
  Batch   160  of    196.    Elapsed: 0:00:13.

  Average training loss: 0.05
  Training e

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
avg_train_loss,█▄▂▁
avg_val_loss,▁▇█▆
train_batch_loss,▅▅▄▄▅▄▃█▂▇▂▁▆▁▃▇▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▆█

0,1
avg_train_loss,0.01068
avg_val_loss,0.24851
train_batch_loss,0.0006
val_accuracy,0.94776


[34m[1mwandb[0m: Agent Starting Run: cb7etou3 with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	learning_rate: 3e-05
[34m[1mwandb[0m: 	model_name_or_path: bert-base-uncased


cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


batch_size =  8
Learning_rate =  3e-05
epochs => 4
Training...
  Batch    40  of    196.    Elapsed: 0:00:04.
  Batch    80  of    196.    Elapsed: 0:00:08.
  Batch   120  of    196.    Elapsed: 0:00:11.
  Batch   160  of    196.    Elapsed: 0:00:15.

  Average training loss: 0.39
  Training epcoh took: 0:00:18

Running Validation...
  Accuracy: 0.90
  Validation Loss: 0.26
  Validation took: 0:00:01
Training...
  Batch    40  of    196.    Elapsed: 0:00:03.
  Batch    80  of    196.    Elapsed: 0:00:07.
  Batch   120  of    196.    Elapsed: 0:00:11.
  Batch   160  of    196.    Elapsed: 0:00:15.

  Average training loss: 0.20
  Training epcoh took: 0:00:18

Running Validation...
  Accuracy: 0.92
  Validation Loss: 0.27
  Validation took: 0:00:01
Training...
  Batch    40  of    196.    Elapsed: 0:00:04.
  Batch    80  of    196.    Elapsed: 0:00:07.
  Batch   120  of    196.    Elapsed: 0:00:11.
  Batch   160  of    196.    Elapsed: 0:00:15.

  Average training loss: 0.04
  Training e

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
avg_train_loss,█▄▂▁
avg_val_loss,▅▆█▁
train_batch_loss,▂▃▂▃▃▂▂▃▁▂▄▁▁▁▃▃▁▂█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▄▇█

0,1
avg_train_loss,0.01227
avg_val_loss,0.24517
train_batch_loss,0.00066
val_accuracy,0.95336


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: n1m9lna0 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	model_name_or_path: bert-base-uncased


cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


batch_size =  16
Learning_rate =  5e-05
epochs => 4
Training...
  Batch    40  of     98.    Elapsed: 0:00:04.
  Batch    80  of     98.    Elapsed: 0:00:09.

  Average training loss: 0.36
  Training epcoh took: 0:00:11

Running Validation...
  Accuracy: 0.90
  Validation Loss: 0.28
  Validation took: 0:00:01
Training...
  Batch    40  of     98.    Elapsed: 0:00:04.
  Batch    80  of     98.    Elapsed: 0:00:09.

  Average training loss: 0.16
  Training epcoh took: 0:00:10

Running Validation...
  Accuracy: 0.93
  Validation Loss: 0.24
  Validation took: 0:00:01
Training...
  Batch    40  of     98.    Elapsed: 0:00:04.
  Batch    80  of     98.    Elapsed: 0:00:09.

  Average training loss: 0.06
  Training epcoh took: 0:00:10

Running Validation...
  Accuracy: 0.94
  Validation Loss: 0.23
  Validation took: 0:00:01
Training...
  Batch    40  of     98.    Elapsed: 0:00:04.
  Batch    80  of     98.    Elapsed: 0:00:08.

  Average training loss: 0.01
  Training epcoh took: 0:00:10

Ru

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
avg_train_loss,█▄▂▁
avg_val_loss,█▄▂▁
train_batch_loss,█▆▅▇▅▄▅▄▃▆▃▂▁▁▃▅▁▁▄▁▁▁▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▆▇█

0,1
avg_train_loss,0.00809
avg_val_loss,0.22025
train_batch_loss,0.00099
val_accuracy,0.94853


[34m[1mwandb[0m: Agent Starting Run: 5p3xoshf with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	learning_rate: 3e-05
[34m[1mwandb[0m: 	model_name_or_path: bert-base-uncased


cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


batch_size =  16
Learning_rate =  3e-05
epochs => 4
Training...
  Batch    40  of     98.    Elapsed: 0:00:04.
  Batch    80  of     98.    Elapsed: 0:00:09.

  Average training loss: 0.38
  Training epcoh took: 0:00:11

Running Validation...
  Accuracy: 0.85
  Validation Loss: 0.33
  Validation took: 0:00:01
Training...
  Batch    40  of     98.    Elapsed: 0:00:04.
  Batch    80  of     98.    Elapsed: 0:00:09.

  Average training loss: 0.18
  Training epcoh took: 0:00:11

Running Validation...
  Accuracy: 0.92
  Validation Loss: 0.28
  Validation took: 0:00:01
Training...
  Batch    40  of     98.    Elapsed: 0:00:04.
  Batch    80  of     98.    Elapsed: 0:00:09.

  Average training loss: 0.07
  Training epcoh took: 0:00:11

Running Validation...
  Accuracy: 0.94
  Validation Loss: 0.21
  Validation took: 0:00:01
Training...
  Batch    40  of     98.    Elapsed: 0:00:04.
  Batch    80  of     98.    Elapsed: 0:00:09.

  Average training loss: 0.02
  Training epcoh took: 0:00:11

Ru

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
avg_train_loss,█▄▂▁
avg_val_loss,█▅▁▂
train_batch_loss,█▅▄▅▅▄▄▆▃▇▄▃▂▂▃▅▁▁▅▁▁▁▁▂▃▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▆██

0,1
avg_train_loss,0.02052
avg_val_loss,0.22342
train_batch_loss,0.00162
val_accuracy,0.93382


[34m[1mwandb[0m: Agent Starting Run: p6o3r6wl with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	model_name_or_path: bert-base-uncased


cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


batch_size =  32
Learning_rate =  5e-05
epochs => 4
Training...
  Batch    40  of     49.    Elapsed: 0:00:07.

  Average training loss: 0.39
  Training epcoh took: 0:00:09

Running Validation...
  Accuracy: 0.88
  Validation Loss: 0.27
  Validation took: 0:00:01
Training...
  Batch    40  of     49.    Elapsed: 0:00:07.

  Average training loss: 0.17
  Training epcoh took: 0:00:09

Running Validation...
  Accuracy: 0.90
  Validation Loss: 0.26
  Validation took: 0:00:01
Training...
  Batch    40  of     49.    Elapsed: 0:00:07.

  Average training loss: 0.05
  Training epcoh took: 0:00:09

Running Validation...
  Accuracy: 0.93
  Validation Loss: 0.26
  Validation took: 0:00:01
Training...
  Batch    40  of     49.    Elapsed: 0:00:07.

  Average training loss: 0.01
  Training epcoh took: 0:00:09

Running Validation...
  Accuracy: 0.94
  Validation Loss: 0.23
  Validation took: 0:00:01

Training complete!
Total training took 0:00:39 (h:mm:ss)


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
avg_train_loss,█▄▂▁
avg_val_loss,█▆▆▁
train_batch_loss,▇▆▅█▆▆▄▅▅▅▃▂▂▂▄▄▂▁▆▂▁▁▁▁▂▁▁▁▂▁▁▁▁▁▁▂▁▁▁▁
val_accuracy,▁▃▇█

0,1
avg_train_loss,0.00856
avg_val_loss,0.22845
train_batch_loss,0.00262
val_accuracy,0.94118


[34m[1mwandb[0m: Agent Starting Run: rtz7mpqk with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	learning_rate: 3e-05
[34m[1mwandb[0m: 	model_name_or_path: bert-base-uncased


cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


batch_size =  32
Learning_rate =  3e-05
epochs => 4
Training...
  Batch    40  of     49.    Elapsed: 0:00:08.

  Average training loss: 0.41
  Training epcoh took: 0:00:09

Running Validation...
  Accuracy: 0.84
  Validation Loss: 0.39
  Validation took: 0:00:01
Training...
  Batch    40  of     49.    Elapsed: 0:00:07.

  Average training loss: 0.23
  Training epcoh took: 0:00:09

Running Validation...
  Accuracy: 0.89
  Validation Loss: 0.28
  Validation took: 0:00:01
Training...
  Batch    40  of     49.    Elapsed: 0:00:07.

  Average training loss: 0.11
  Training epcoh took: 0:00:09

Running Validation...
  Accuracy: 0.90
  Validation Loss: 0.26
  Validation took: 0:00:01
Training...
  Batch    40  of     49.    Elapsed: 0:00:07.

  Average training loss: 0.05
  Training epcoh took: 0:00:09

Running Validation...
  Accuracy: 0.92
  Validation Loss: 0.24
  Validation took: 0:00:01

Training complete!
Total training took 0:00:40 (h:mm:ss)


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
avg_train_loss,█▅▂▁
avg_val_loss,█▃▂▁
train_batch_loss,█▇▆█▆▇▄▆▆▇▄▄▄▃▄▄▃▃▅▂▂▂▂▃▂▂▂▁▂▁▁▁▁▂▂▃▁▂▁▁
val_accuracy,▁▆▇█

0,1
avg_train_loss,0.045
avg_val_loss,0.24091
train_batch_loss,0.03164
val_accuracy,0.9184


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [25]:
api = wandb.Api()

sweep = api.sweep("nlp_pj/uncategorized/syunmbgv")
runs = sorted(sweep.runs,key=lambda run: run.summary.get("val_accuracy", 0), reverse=True)
val_acc = runs[0].summary.get("val_accuracy")
print(f"Best run {runs[0].name} with {val_acc}% validation accuracy")

#runs[0].file("model.h5").download(replace=True)
#print("Best model saved to model-best.h5")

Best run good-sweep-2 with 0.9533582089552238% validation accuracy


In [26]:
api = wandb.Api()

runs = api.runs("nlp_pj/uncategorized")

summary_list, config_list, name_list = [], [], []
for run in runs: 

    summary_list.append(run.summary._json_dict)

    config_list.append(
        {k: v for k,v in run.config.items()
          if not k.startswith('_')})

    name_list.append(run.name)

runs_df = pd.DataFrame({
    "summary": summary_list,
    "config": config_list,
    "name": name_list
    })

runs_df.to_csv("project.csv")

# Fine-tuning

In [27]:
BATCH_SIZE = 8

#represents a Python iterable over a dataset
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = BATCH_SIZE)
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size = BATCH_SIZE)
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = BATCH_SIZE)

In [28]:
def train(model, criterion, LRate, train_dataloader, val_dataloader, test_dataloader,num_epoch, gpu):
    opti = optim.Adam(model.parameters(), lr = LRate) #,label_smoothing=0.0
    best_acc = 0
    st = time.time()
    cand_list = []
    for ep in range(num_epoch):
        
        model.train()
        for it, (seq, attn_masks, labels) in enumerate(train_dataloader):
            #Clear gradients
            opti.zero_grad()  
            
            #Converting these to cuda tensors
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)
            
            #Obtaining the logits from the model
            logits = model(seq, attn_masks)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()
              
            if it % 100 == 0:
                
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep, loss.item(), acc, (time.time()-st)))
                st = time.time()

        
        dev_acc, dev_loss = evaluate(model, criterion, val_dataloader, gpu)
        
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(model.state_dict(), 'sstcls_{}.dat'.format(ep))
        
        outputs = []
        
        #Iterate over the test_loader 
        for step, batch in enumerate(test_dataloader):
            #Transfer batch to GPUs
            batch = tuple(t.to("cuda") for t in batch)
            #We dont need to update gradients as we are just predicting
            with torch.no_grad():
                #Bring up the next batch of input_texts and attention_masks 
                b_input_ids, b_input_mask = batch
                #Forward propogate the inputs and get output as logits

                logits = model(b_input_ids, b_input_mask)
                #Pass the outputs through a sigmoid function to get the multi-label preditions
                s = nn.Sigmoid() 
                out = s(logits).to('cpu').numpy()
                #Add the predictions for this batch to the final list
                outputs.extend(out)
        cand_list.append(outputs)
    return cand_list

In [29]:
from numpy import exp
import torch.nn.functional as F
def get_accuracy_from_logits(logits, labels):
    probs = torch.sigmoid(logits.unsqueeze(-1)) 
    #probs = F.softmax(logits, dim=0)
    preds = (probs > 0.5).long()
    acc = f1_score(preds.squeeze().detach().cpu().numpy(), labels.detach().cpu().numpy(), average = 'micro')
    return acc

def evaluate(model, criterion, dataloader, gpu):
    model.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)
            logits = model(seq, attn_masks)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            mean_acc += get_accuracy_from_logits(logits, labels)
            count += 1

    return mean_acc / count, mean_loss / count

In [30]:
gpu = 0 #gpu ID

print("Creating the rumor detection classifier, initialised with pretrained BERT-BASE parameters...")
model = RumorDetectionClassifier()
model.cuda(gpu) #Enable gpu support for the model
print("Done creating the rumor detection classifier.")

Creating the rumor detection classifier, initialised with pretrained BERT-BASE parameters...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done creating the rumor detection classifier.


In [31]:
num_epoch = 4
LRate = 0.00003
preds_fine_tuning = train(model, criterion, LRate, train_dataloader, val_dataloader, test_dataloader,num_epoch, gpu)

Iteration 0 of epoch 0 complete. Loss: 0.6060468554496765; Accuracy: 0.75; Time taken (s): 0.08098936080932617
Iteration 100 of epoch 0 complete. Loss: 0.10284268856048584; Accuracy: 1.0; Time taken (s): 6.957259654998779
Epoch 0 complete! Development Accuracy: 0.9123134328358209; Development Loss: 0.21881516344511687
Best development accuracy improved from 0 to 0.9123134328358209, saving model...
Iteration 0 of epoch 1 complete. Loss: 0.1182466596364975; Accuracy: 1.0; Time taken (s): 10.432116985321045
Iteration 100 of epoch 1 complete. Loss: 0.03237377852201462; Accuracy: 1.0; Time taken (s): 7.826496124267578
Epoch 1 complete! Development Accuracy: 0.9421641791044776; Development Loss: 0.14780610948523035
Best development accuracy improved from 0.9123134328358209 to 0.9421641791044776, saving model...
Iteration 0 of epoch 2 complete. Loss: 0.010133877396583557; Accuracy: 1.0; Time taken (s): 10.863122701644897
Iteration 100 of epoch 2 complete. Loss: 0.008637635037302971; Accuracy:

In [32]:
from pandas.core.frame import DataFrame
from collections import Counter
def MajorityVoting(outputs):
    preds_fine_tuning_df = DataFrame(outputs)
    def probToLabel(num):
        prob = float(np.asarray(num))
        if prob > 0.5:
            #print(prob)
            return 1
        else:
            return 0
    new_df = preds_fine_tuning_df.applymap(probToLabel)
    preds_ensembling = []
    for name, values in new_df.iteritems():
        #print(new_df[name])
        preds_ensembling.append(Counter(list(new_df[name])).most_common(1)[0][0])
    return preds_ensembling

In [33]:
preds_ensembling = MajorityVoting(preds_fine_tuning)

In [34]:
id_list = [i for i in range(len(preds_ensembling))]
predicted_df = pd.DataFrame({'Id':id_list,'Predicted':preds_ensembling})
predicted_df.to_csv("test.predicted.csv",index=False,sep=',')

# Ensembling

In [35]:
input_ids = train_input_ids.tolist()+dev_input_ids.tolist()
#input_ids =  np.array(input_ids)
labels = train_labels+dev_labels
attn_masks = train_attn_masks.tolist()+dev_attn_masks.tolist()
#attn_masks = np.array(attn_masks)
preprocess_df = pd.DataFrame({'input_ids':input_ids,'labels':labels,'attn_masks':attn_masks})

In [36]:
gpu = 0 #gpu ID

print("Creating the rumor detection classifier, initialised with pretrained BERT-BASE parameters...")
model = RumorDetectionClassifier()
model.cuda(gpu) #Enable gpu support for the model
print("Done creating the rumor detection classifier.")

Creating the rumor detection classifier, initialised with pretrained BERT-BASE parameters...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done creating the rumor detection classifier.


In [37]:
def train_validate_test_split(df, train_percent=.7, validate_percent=.3, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    train = df.loc[perm[:train_end]]
    validate = df.loc[perm[train_end:]]
    return train, validate

cand_list = []
for r in range(10):
    en_train, en_validate = train_validate_test_split(preprocess_df)
    en_X_train = torch.tensor(list(en_train['input_ids']))
    en_X_val = torch.tensor(list(en_validate['input_ids']))    
    en_y_train = torch.tensor(list(en_train['labels']))    
    en_y_val = torch.tensor(list(en_validate['labels']))    
    en_attention_masks_train = torch.tensor(list(en_train['attn_masks']))
    en_attention_masks_val = torch.tensor(list(en_validate['attn_masks']))    
    en_test_input_ids = torch.tensor(test_input_ids)
    en_test_attention_masks = torch.tensor(test_attn_masks)
 
    en_train_data = torch.utils.data.TensorDataset(en_X_train, en_attention_masks_train, en_y_train)
    en_val_data = torch.utils.data.TensorDataset(en_X_val, en_attention_masks_val, en_y_val)
    en_test_data = torch.utils.data.TensorDataset(en_test_input_ids, en_test_attention_masks)

    en_train_sampler = torch.utils.data.RandomSampler(en_train_data)
    en_val_sampler = torch.utils.data.SequentialSampler(en_val_data)
    en_test_sampler = torch.utils.data.SequentialSampler(en_test_data)
    
    BATCH_SIZE = 8

    en_train_dataloader = DataLoader(en_train_data, sampler = en_train_sampler, batch_size = BATCH_SIZE)
    en_val_dataloader = DataLoader(en_val_data, sampler = en_val_sampler, batch_size = BATCH_SIZE)
    en_test_dataloader = DataLoader(en_test_data, sampler = en_test_sampler, batch_size = BATCH_SIZE)
    
    num_epoch = 4
    LRate = 3e-5
    
    opti = optim.Adam(model.parameters(), lr = LRate) #,label_smoothing=0.0
    best_acc = 0
    st = time.time()
    
    for ep in range(num_epoch):
        
        model.train()
        for it, (seq, attn_masks, labels) in enumerate(en_train_dataloader):

            opti.zero_grad()  
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)
            logits = model(seq, attn_masks)
            loss = criterion(logits.squeeze(-1), labels.float())
            loss.backward()
            opti.step()
              
            if it % 100 == 0:
                
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep, loss.item(), acc, (time.time()-st)))
                st = time.time()

        
        dev_acc, dev_loss = evaluate(model, criterion, en_val_dataloader, gpu)
        
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(model.state_dict(), 'sstcls_{}.dat'.format(ep))
        
        outputs = []
        
 
        for step, batch in enumerate(en_test_dataloader):
            batch = tuple(t.to("cuda") for t in batch)

            with torch.no_grad():

                b_input_ids, b_input_mask = batch

                logits = model(b_input_ids, b_input_mask)
                s = nn.Sigmoid() 
                out = s(logits).to('cpu').numpy()
                outputs.extend(out)
        cand_list.append(outputs)
        



Iteration 0 of epoch 0 complete. Loss: 0.8297592401504517; Accuracy: 0.0; Time taken (s): 0.07875561714172363
Iteration 100 of epoch 0 complete. Loss: 0.36297494173049927; Accuracy: 1.0; Time taken (s): 7.358769178390503
Epoch 0 complete! Development Accuracy: 0.8370253164556962; Development Loss: 0.38422293128752255
Best development accuracy improved from 0 to 0.8370253164556962, saving model...
Iteration 0 of epoch 1 complete. Loss: 0.3603743314743042; Accuracy: 0.75; Time taken (s): 9.718173503875732
Iteration 100 of epoch 1 complete. Loss: 0.38923266530036926; Accuracy: 0.875; Time taken (s): 6.983335971832275
Epoch 1 complete! Development Accuracy: 0.9108649789029535; Development Loss: 0.18514114872941487
Best development accuracy improved from 0.8370253164556962 to 0.9108649789029535, saving model...
Iteration 0 of epoch 2 complete. Loss: 0.009328015148639679; Accuracy: 1.0; Time taken (s): 9.098792314529419
Iteration 100 of epoch 2 complete. Loss: 0.004731050226837397; Accuracy:



Iteration 0 of epoch 0 complete. Loss: 0.0015140671748667955; Accuracy: 1.0; Time taken (s): 0.08220672607421875
Iteration 100 of epoch 0 complete. Loss: 0.07042383402585983; Accuracy: 1.0; Time taken (s): 7.3735411167144775
Epoch 0 complete! Development Accuracy: 0.9778481012658228; Development Loss: 0.08180485971572608
Best development accuracy improved from 0 to 0.9778481012658228, saving model...
Iteration 0 of epoch 1 complete. Loss: 0.016465017572045326; Accuracy: 1.0; Time taken (s): 9.669538497924805
Iteration 100 of epoch 1 complete. Loss: 0.189011350274086; Accuracy: 0.875; Time taken (s): 6.994989395141602
Epoch 1 complete! Development Accuracy: 0.9762658227848101; Development Loss: 0.1050199047723841
Iteration 0 of epoch 2 complete. Loss: 0.015816837549209595; Accuracy: 1.0; Time taken (s): 8.41226840019226
Iteration 100 of epoch 2 complete. Loss: 0.0007060837233439088; Accuracy: 1.0; Time taken (s): 6.950765371322632
Epoch 2 complete! Development Accuracy: 0.98259493670886



Iteration 0 of epoch 0 complete. Loss: 0.000347429740941152; Accuracy: 1.0; Time taken (s): 0.07765364646911621
Iteration 100 of epoch 0 complete. Loss: 0.002340195933356881; Accuracy: 1.0; Time taken (s): 5.889510869979858
Epoch 0 complete! Development Accuracy: 0.9984177215189873; Development Loss: 0.007714324469436431
Best development accuracy improved from 0 to 0.9984177215189873, saving model...
Iteration 0 of epoch 1 complete. Loss: 0.0031031863763928413; Accuracy: 1.0; Time taken (s): 8.088213920593262
Iteration 100 of epoch 1 complete. Loss: 0.0009498874424025416; Accuracy: 1.0; Time taken (s): 7.394997835159302
Epoch 1 complete! Development Accuracy: 0.9968354430379747; Development Loss: 0.014176146583734787
Iteration 0 of epoch 2 complete. Loss: 0.008857407607138157; Accuracy: 1.0; Time taken (s): 8.886963367462158
Iteration 100 of epoch 2 complete. Loss: 0.00034928671084344387; Accuracy: 1.0; Time taken (s): 7.150412321090698
Epoch 2 complete! Development Accuracy: 0.9984177



Iteration 0 of epoch 0 complete. Loss: 0.000390275614336133; Accuracy: 1.0; Time taken (s): 0.08238983154296875
Iteration 100 of epoch 0 complete. Loss: 0.00019720390264410526; Accuracy: 1.0; Time taken (s): 7.299767017364502
Epoch 0 complete! Development Accuracy: 0.9968354430379747; Development Loss: 0.020779271934367863
Best development accuracy improved from 0 to 0.9968354430379747, saving model...
Iteration 0 of epoch 1 complete. Loss: 0.00015537664876319468; Accuracy: 1.0; Time taken (s): 9.715476989746094
Iteration 100 of epoch 1 complete. Loss: 0.003326812759041786; Accuracy: 1.0; Time taken (s): 7.129087686538696
Epoch 1 complete! Development Accuracy: 1.0; Development Loss: 0.0010241193485393224
Best development accuracy improved from 0.9968354430379747 to 1.0, saving model...
Iteration 0 of epoch 2 complete. Loss: 0.0002811567683238536; Accuracy: 1.0; Time taken (s): 9.642694234848022
Iteration 100 of epoch 2 complete. Loss: 0.00019914793665520847; Accuracy: 1.0; Time taken 



Iteration 0 of epoch 0 complete. Loss: 0.0001671163336141035; Accuracy: 1.0; Time taken (s): 0.08664584159851074
Iteration 100 of epoch 0 complete. Loss: 4.024722147732973e-05; Accuracy: 1.0; Time taken (s): 6.381785869598389
Epoch 0 complete! Development Accuracy: 1.0; Development Loss: 0.00013716650062610536
Best development accuracy improved from 0 to 1.0, saving model...
Iteration 0 of epoch 1 complete. Loss: 0.0001391946861986071; Accuracy: 1.0; Time taken (s): 9.480836629867554
Iteration 100 of epoch 1 complete. Loss: 4.9083217163570225e-05; Accuracy: 1.0; Time taken (s): 7.009309768676758
Epoch 1 complete! Development Accuracy: 0.9984177215189873; Development Loss: 0.008798643705942388
Iteration 0 of epoch 2 complete. Loss: 6.627772381762043e-05; Accuracy: 1.0; Time taken (s): 8.199063777923584
Iteration 100 of epoch 2 complete. Loss: 6.15393728367053e-05; Accuracy: 1.0; Time taken (s): 7.739902973175049
Epoch 2 complete! Development Accuracy: 0.9984177215189873; Development Los



Iteration 0 of epoch 0 complete. Loss: 5.881279139430262e-05; Accuracy: 1.0; Time taken (s): 0.08019757270812988
Iteration 100 of epoch 0 complete. Loss: 6.242530071176589e-05; Accuracy: 1.0; Time taken (s): 7.203251838684082
Epoch 0 complete! Development Accuracy: 0.9978902953586497; Development Loss: 0.017623243876521054
Best development accuracy improved from 0 to 0.9978902953586497, saving model...
Iteration 0 of epoch 1 complete. Loss: 2.4124678020598367e-05; Accuracy: 1.0; Time taken (s): 9.680160999298096
Iteration 100 of epoch 1 complete. Loss: 0.0003397318359930068; Accuracy: 1.0; Time taken (s): 7.044120788574219
Epoch 1 complete! Development Accuracy: 1.0; Development Loss: 0.00016628697817453296
Best development accuracy improved from 0.9978902953586497 to 1.0, saving model...
Iteration 0 of epoch 2 complete. Loss: 2.7447556931292638e-05; Accuracy: 1.0; Time taken (s): 9.574346780776978
Iteration 100 of epoch 2 complete. Loss: 5.8124227507505566e-05; Accuracy: 1.0; Time tak



Iteration 0 of epoch 0 complete. Loss: 2.1755457055405714e-05; Accuracy: 1.0; Time taken (s): 0.09182095527648926
Iteration 100 of epoch 0 complete. Loss: 6.213764208951034e-06; Accuracy: 1.0; Time taken (s): 7.257682800292969
Epoch 0 complete! Development Accuracy: 0.9857594936708861; Development Loss: 0.13784790555982449
Best development accuracy improved from 0 to 0.9857594936708861, saving model...
Iteration 0 of epoch 1 complete. Loss: 4.872666977462359e-06; Accuracy: 1.0; Time taken (s): 9.688417434692383
Iteration 100 of epoch 1 complete. Loss: 0.00015050446381792426; Accuracy: 1.0; Time taken (s): 6.942434072494507
Epoch 1 complete! Development Accuracy: 1.0; Development Loss: 8.224865853697315e-05
Best development accuracy improved from 0.9857594936708861 to 1.0, saving model...
Iteration 0 of epoch 2 complete. Loss: 5.32552003278397e-05; Accuracy: 1.0; Time taken (s): 9.550647974014282
Iteration 100 of epoch 2 complete. Loss: 6.396789103746414e-05; Accuracy: 1.0; Time taken (



Iteration 0 of epoch 0 complete. Loss: 1.5869609342189506e-05; Accuracy: 1.0; Time taken (s): 0.09055423736572266
Iteration 100 of epoch 0 complete. Loss: 2.2500730665342417e-06; Accuracy: 1.0; Time taken (s): 6.961956262588501
Epoch 0 complete! Development Accuracy: 1.0; Development Loss: 1.0311975279076746e-06
Best development accuracy improved from 0 to 1.0, saving model...
Iteration 0 of epoch 1 complete. Loss: 1.564620674798789e-06; Accuracy: 1.0; Time taken (s): 9.295673370361328
Iteration 100 of epoch 1 complete. Loss: 1.1473887298052432e-06; Accuracy: 1.0; Time taken (s): 7.103535413742065
Epoch 1 complete! Development Accuracy: 1.0; Development Loss: 5.029297956096781e-07
Iteration 0 of epoch 2 complete. Loss: 7.450577186318696e-07; Accuracy: 1.0; Time taken (s): 9.029592275619507
Iteration 100 of epoch 2 complete. Loss: 5.662439548359544e-07; Accuracy: 1.0; Time taken (s): 6.919949531555176
Epoch 2 complete! Development Accuracy: 1.0; Development Loss: 2.639454118672825e-07
I



Iteration 0 of epoch 0 complete. Loss: 2.9802316703353426e-07; Accuracy: 1.0; Time taken (s): 0.08135986328125
Iteration 100 of epoch 0 complete. Loss: 1.1920928244535389e-07; Accuracy: 1.0; Time taken (s): 7.667430877685547
Epoch 0 complete! Development Accuracy: 1.0; Development Loss: 1.886222823502435e-10
Best development accuracy improved from 0 to 1.0, saving model...
Iteration 0 of epoch 1 complete. Loss: 2.9802320611338473e-08; Accuracy: 1.0; Time taken (s): 9.675573825836182
Iteration 100 of epoch 1 complete. Loss: 0.0; Accuracy: 1.0; Time taken (s): 6.958543062210083
Epoch 1 complete! Development Accuracy: 1.0; Development Loss: 0.0
Iteration 0 of epoch 2 complete. Loss: 0.0; Accuracy: 1.0; Time taken (s): 8.228141784667969
Iteration 100 of epoch 2 complete. Loss: 0.0; Accuracy: 1.0; Time taken (s): 7.266549110412598
Epoch 2 complete! Development Accuracy: 1.0; Development Loss: 0.0
Iteration 0 of epoch 3 complete. Loss: 0.0; Accuracy: 1.0; Time taken (s): 8.89340615272522
Ite



Iteration 0 of epoch 0 complete. Loss: 0.0; Accuracy: 1.0; Time taken (s): 0.0856316089630127
Iteration 100 of epoch 0 complete. Loss: 0.0; Accuracy: 1.0; Time taken (s): 6.9781529903411865
Epoch 0 complete! Development Accuracy: 1.0; Development Loss: 0.0
Best development accuracy improved from 0 to 1.0, saving model...
Iteration 0 of epoch 1 complete. Loss: 0.0; Accuracy: 1.0; Time taken (s): 9.04682183265686
Iteration 100 of epoch 1 complete. Loss: 0.0; Accuracy: 1.0; Time taken (s): 7.098207712173462
Epoch 1 complete! Development Accuracy: 1.0; Development Loss: 0.0
Iteration 0 of epoch 2 complete. Loss: 0.0; Accuracy: 1.0; Time taken (s): 8.894661664962769
Iteration 100 of epoch 2 complete. Loss: 0.0; Accuracy: 1.0; Time taken (s): 7.159968614578247
Epoch 2 complete! Development Accuracy: 1.0; Development Loss: 0.0
Iteration 0 of epoch 3 complete. Loss: 0.0; Accuracy: 1.0; Time taken (s): 9.0114266872406
Iteration 100 of epoch 3 complete. Loss: 0.0; Accuracy: 1.0; Time taken (s): 

In [38]:
preds_sample_ensembling = MajorityVoting(cand_list)

In [39]:
id_list_sample = [i for i in range(len(preds_sample_ensembling))]
predicted_df_sample = pd.DataFrame({'Id':id_list_sample,'Predicted':preds_sample_ensembling})
predicted_df_sample.to_csv("test.predicted_sample.csv",index=False,sep=',')

# TASK2: COVID-19 Rumor Analysis

In [42]:
covid_input_ids, covid_labels, covid_attn_masks = preprocessing(dataset_type="covid")

In [43]:
covid_input_ids = torch.tensor(covid_input_ids)
covid_attention_masks = torch.tensor(covid_attn_masks)
covid_data = torch.utils.data.TensorDataset(covid_input_ids, covid_attention_masks)
covid_sampler = torch.utils.data.SequentialSampler(covid_data)
covid_dataloader = DataLoader(covid_data, sampler = covid_sampler, batch_size = 8)

In [52]:
covid_events, covid_line_index= readData(dataset_type="covid")

In [54]:
len(covid_line_index)

1496

In [53]:
covid_pred_index_df = pd.DataFrame({'predicted_index':covid_line_index})
covid_pred_index_df.to_csv("covid_predicted_index.csv",index=False,sep=',')

In [46]:
gpu = 0 #gpu ID
model = RumorDetectionClassifier()
model.cuda(gpu) #Enable gpu support for the model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RumorDetectionClassifier(
  (bert_layer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [47]:
num_epoch = 4
LRate = 0.00003
covid_preds = train(model, criterion, LRate, train_dataloader, val_dataloader, covid_dataloader,num_epoch, gpu)

Iteration 0 of epoch 0 complete. Loss: 0.7182886004447937; Accuracy: 0.5; Time taken (s): 0.10550618171691895
Iteration 100 of epoch 0 complete. Loss: 0.2919047474861145; Accuracy: 0.875; Time taken (s): 8.79378628730774
Epoch 0 complete! Development Accuracy: 0.9011194029850746; Development Loss: 0.23004813803665675
Best development accuracy improved from 0 to 0.9011194029850746, saving model...
Iteration 0 of epoch 1 complete. Loss: 0.18369704484939575; Accuracy: 1.0; Time taken (s): 40.253735065460205
Iteration 100 of epoch 1 complete. Loss: 0.11061564087867737; Accuracy: 0.875; Time taken (s): 7.688759088516235
Epoch 1 complete! Development Accuracy: 0.8899253731343284; Development Loss: 0.2271653250026614
Iteration 0 of epoch 2 complete. Loss: 0.24670301377773285; Accuracy: 0.875; Time taken (s): 40.18888020515442
Iteration 100 of epoch 2 complete. Loss: 0.0017059332458302379; Accuracy: 1.0; Time taken (s): 8.287480592727661
Epoch 2 complete! Development Accuracy: 0.95708955223880

In [48]:
covid_preds_ensembling = MajorityVoting(covid_preds)

In [49]:
covid_id_list = [i for i in range(len(covid_preds_ensembling))]
covid_preds_df = pd.DataFrame({'Id':covid_id_list,'Predicted':covid_preds_ensembling})
covid_preds_df.to_csv("covid_predictions.csv",index=False,sep=',')