In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import preprocessor as p 
import re
import json
import wordninja
import random
import csv
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import precision_recall_fscore_support
from transformers import AutoModel, BertForMaskedLM, AdamW
from transformers import BertTokenizer, BertModel, AutoTokenizer, BertweetTokenizer

In [None]:
# Data Loading

def load_data(filename):
    """Load tweets, stance labels and targets from a CSV file.

    The tweet text is taken from column 0, the target from column 1 and the
    stance label from column 2 (by position).  The label column must be named
    ``Stance`` because the neutral-row filter below looks it up by name.

    Args:
        filename: Path of the CSV file, decoded as ISO-8859-1.

    Returns:
        A ``pandas.DataFrame`` with the columns ordered text, label, target.
        Labels are remapped ``'FAVOR' -> 1``, ``'AGAINST' -> 0`` and rows
        labelled ``'NONE'`` (mapped to 2) are dropped.  The original row index
        is kept (not reset).
    """
    # Parse the file once; previously it was parsed three times, one column each.
    raw = pd.read_csv(filename, usecols=[0, 1, 2], encoding='ISO-8859-1')
    text_col, target_col, label_col = raw.columns[0], raw.columns[1], raw.columns[2]

    # Remap only the label column so tweet/target text is never rewritten.
    label = raw[[label_col]].replace(['FAVOR', 'NONE', 'AGAINST'], [1, 2, 0])

    # Preserve the historical column order: text, label, target.
    concat_text = pd.concat([raw[[text_col]], label, raw[[target_col]]], axis=1)

    # Drop neutral rows (label 2).
    return concat_text[concat_text['Stance'] != 2]

In [None]:
# Data Cleaning

def data_clean(strings, norm_dict):
    """Clean one raw string and return it as a flat list of lower-cased tokens.

    Steps, in order: strip URLs / emoji / reserved words with the
    ``preprocessor`` library, remove the ``#SemST`` marker, tokenise into
    letter runs (which may carry ``#``/``@``), single punctuation marks and
    digit runs, lower-case, then either normalise a token through
    ``norm_dict`` or split a hashtag/mention with ``wordninja``.

    Args:
        strings: The raw text to clean.
        norm_dict: Mapping from a lower-cased token to its replacement.

    Returns:
        A list of strings.  A normalised token contributes its replacement
        as a single item; a hashtag/mention contributes every piece returned
        by ``wordninja.split``.
    """
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.RESERVED)
    stripped = p.clean(strings)  # library pass: URLs, emoji, reserved words
    stripped = re.sub(r"#SemST", "", stripped)
    pieces = re.findall(r"[A-Za-z#@]+|[,.!?&/\<>=$]|[0-9]+", stripped)

    tokens = []
    for piece in pieces:
        word = piece.lower()
        if word in norm_dict:
            # Dictionary normalisation wins over hashtag splitting.
            tokens.append(norm_dict[word])
        elif word.startswith("#") or word.startswith("@"):
            # Break compound hashtags / mentions into separate words.
            tokens.extend(wordninja.split(word))
        else:
            tokens.append(word)

    return tokens


In [None]:
# Clean All Data

def clean_all(filename, norm_dict):
    """Load a dataset file and clean every tweet and every target string.

    Args:
        filename: CSV path handed to ``load_data``.
        norm_dict: Normalisation mapping handed to ``data_clean``.

    Returns:
        A tuple ``(clean_data, label, x_target)``: the cleaned token list of
        each tweet, the values of the ``Stance`` column, and the cleaned token
        list of each target, all in row order.
    """
    frame = load_data(filename)
    tweets = frame['Tweet'].values.tolist()
    label = frame['Stance'].values.tolist()
    targets = frame['Target'].values.tolist()

    clean_data, x_target = [], []
    # Tweet and target of the same row are cleaned back to back.
    for tweet, target in zip(tweets, targets):
        clean_data.append(data_clean(tweet, norm_dict))
        x_target.append(data_clean(target, norm_dict))

    return clean_data, label, x_target

In [None]:
# Tokenization

def convert_data_to_ids(tokenizer, target, text, max_length=128):
    """Encode (target, sentence) pairs into fixed-length model inputs.

    Args:
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer).
        target: Iterable of token lists, one per example (first segment).
        text: Iterable of token lists, one per example (second segment).
        max_length: Length every encoded pair is padded and truncated to.
            Defaults to 128, the value previously hard-coded.

    Returns:
        Four parallel lists ``(input_ids, seg_ids, attention_masks, sent_len)``
        where ``sent_len[i]`` is the sum of the i-th attention mask.
    """
    input_ids, seg_ids, attention_masks, sent_len = [], [], [], []
    for tar, sent in zip(target, text):
        encoded_dict = tokenizer.encode_plus(
            ' '.join(tar),                  # Target to encode
            ' '.join(sent),                 # Sentence to encode
            add_special_tokens=True,        # Add the special start/separator tokens
            max_length=max_length,          # Pad & truncate all sentences
            padding='max_length',
            # Without this flag max_length only pads: over-long pairs kept their
            # full length, producing ragged rows that cannot become a tensor.
            truncation=True,
            # Requested explicitly because 'token_type_ids' is read below.
            return_token_type_ids=True,
            return_attention_mask=True,     # Construct attention masks
        )

        # Add the encoded sentence to the lists.
        input_ids.append(encoded_dict['input_ids'])
        seg_ids.append(encoded_dict['token_type_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        sent_len.append(sum(encoded_dict['attention_mask']))

    return input_ids, seg_ids, attention_masks, sent_len
    
def data_helper_bert(x_train_all,x_val_all,x_test_all,model_select):
    """Tokenise the train/val/test splits for the selected pretrained model.

    Args:
        x_train_all, x_val_all, x_test_all: Each a sequence
            ``[texts, labels, targets]`` where ``texts`` and ``targets`` are
            lists of token lists and ``labels`` is a list of numbers.
        model_select: ``'Bertweet'`` or ``'Bert'``; picks the tokenizer.

    Returns:
        Three lists (train, val, test), each laid out as
        ``[input_ids, seg_ids, attention_masks, labels, lengths]``.

    Raises:
        ValueError: If ``model_select`` is not one of the supported names.
            (Previously this surfaced as an ``UnboundLocalError`` on
            ``tokenizer``.)
    """
    print('Loading data')

    x_train,y_train,x_train_target = x_train_all[0],x_train_all[1],x_train_all[2]
    x_val,y_val,x_val_target = x_val_all[0],x_val_all[1],x_val_all[2]
    x_test,y_test,x_test_target = x_test_all[0],x_test_all[1],x_test_all[2]

    # The label sum doubles as a quick class-balance check in the log.
    print("Length of x_train: %d, the sum is: %d"%(len(x_train), sum(y_train)))
    print("Length of x_val: %d, the sum is: %d"%(len(x_val), sum(y_val)))
    print("Length of x_test: %d, the sum is: %d"%(len(x_test), sum(y_test)))

    # get the tokenizer
    if model_select == 'Bertweet':
        tokenizer = BertweetTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)
    elif model_select == 'Bert':
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    else:
        raise ValueError(
            "Unsupported model_select %r; expected 'Bertweet' or 'Bert'" % (model_select,))

    # tokenization
    x_train_input_ids, x_train_seg_ids, x_train_atten_masks, x_train_len = \
                    convert_data_to_ids(tokenizer, x_train_target, x_train)
    x_val_input_ids, x_val_seg_ids, x_val_atten_masks, x_val_len = \
                    convert_data_to_ids(tokenizer, x_val_target, x_val)
    x_test_input_ids, x_test_seg_ids, x_test_atten_masks, x_test_len = \
                    convert_data_to_ids(tokenizer, x_test_target, x_test)

    x_train_all = [x_train_input_ids,x_train_seg_ids,x_train_atten_masks,y_train,x_train_len]
    x_val_all = [x_val_input_ids,x_val_seg_ids,x_val_atten_masks,y_val,x_val_len]
    x_test_all = [x_test_input_ids,x_test_seg_ids,x_test_atten_masks,y_test,x_test_len]

    return x_train_all,x_val_all,x_test_all

In [None]:
# BERT/BERTweet

class stance_classifier(nn.Module):
    """Pretrained encoder followed by a two-layer classification head.

    The head is ``Linear(hidden, hidden) -> ReLU -> Linear(hidden, num_labels)``
    applied to the encoder's hidden state at sequence position 0.
    """

    def __init__(self,num_labels,model_select):
        """Build the encoder and the classification head.

        Args:
            num_labels: Size of the output layer.
            model_select: ``'Bertweet'`` or ``'Bert'``; picks the encoder.

        Raises:
            ValueError: If ``model_select`` is not one of the supported names.
                (Previously this surfaced as an ``AttributeError`` on
                ``self.bert``.)
        """
        super(stance_classifier, self).__init__()

        # A rate of 0 makes dropout a no-op; the module is kept as a tuning hook.
        self.dropout = nn.Dropout(0.)
        self.relu = nn.ReLU()
        # Not used in forward(); retained so the module's attributes are unchanged.
        self.tanh = nn.Tanh()

        if model_select == 'Bertweet':
            self.bert = AutoModel.from_pretrained("vinai/bertweet-base")
        elif model_select == 'Bert':
            self.bert = BertModel.from_pretrained("bert-base-uncased")
        else:
            raise ValueError(
                "Unsupported model_select %r; expected 'Bertweet' or 'Bert'" % (model_select,))
        self.linear = nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size)
        self.out = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, x_input_ids, x_seg_ids, x_atten_masks, x_len):
        """Return the output-layer scores for a batch.

        Args:
            x_input_ids: Passed to the encoder as ``input_ids``.
            x_seg_ids: Passed to the encoder as ``token_type_ids``.
            x_atten_masks: Passed to the encoder as ``attention_mask``.
            x_len: Accepted for interface compatibility; not used.

        Returns:
            The output of the final linear layer (no softmax applied).
        """
        encoder_output = self.bert(input_ids=x_input_ids,
                                   attention_mask=x_atten_masks,
                                   token_type_ids=x_seg_ids)

        # First element of the encoder output, taken at sequence position 0.
        query = encoder_output[0][:,0]
        query = self.dropout(query)

        hidden = self.relu(self.linear(query))
        out = self.out(hidden)

        return out

In [None]:
# Evaluation

def compute_f1(preds, y):
    """Compute accuracy and the averaged F1 of classes 0 and 1.

    Args:
        preds: 2-D tensor of per-class scores, one row per example.
        y: 1-D tensor of gold class indices.

    Returns:
        A tuple ``(acc, f1_average, precision, recall)``: ``acc`` is a scalar
        tensor, ``f1_average`` is the mean of the class-0 and class-1 F1
        scores, and ``precision`` / ``recall`` are the per-class values
        returned by ``precision_recall_fscore_support`` for labels ``[0, 1]``.
    """
    # dim=1 names the class axis explicitly. For 2-D input this matches the
    # dimension the deprecated implicit form picked, minus the warning.
    probs = F.softmax(preds, dim=1)
    _, indices = torch.max(probs, 1)

    correct = (indices == y).float()
    acc = correct.sum()/len(correct) # compute accuracy

    y_pred = np.array(indices.cpu().numpy())
    y_true = np.array(y.cpu().numpy())
    result = precision_recall_fscore_support(y_true, y_pred, average=None, labels=[0,1])
    # result is (precision, recall, f1, support); index 2 holds the F1 scores.
    f1_average = (result[2][0]+result[2][1])/2 # average F1 score of Favor and Against

    return acc, f1_average, result[0], result[1]

In [None]:
# Main 

def data_loader(x_all, batch_size, data_type, device=None):
    """Turn one encoded split into tensors and a batched ``DataLoader``.

    Args:
        x_all: Sequence ``[input_ids, seg_ids, attention_masks, labels,
            lengths]``; every entry is converted to a ``torch.long`` tensor.
        batch_size: Batch size of the returned loader.
        data_type: ``'train'`` enables shuffling; any other value keeps the
            original order.
        device: Where the tensors are placed.  Defaults to CUDA when it is
            available and to the CPU otherwise (the previous code always
            called ``.cuda()`` and therefore failed without a GPU).

    Returns:
        ``(x_input_ids, x_seg_ids, x_atten_masks, y, x_len, loader)`` where
        ``loader`` yields batches of the five tensors in that same order.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    x_input_ids = torch.tensor(x_all[0], dtype=torch.long).to(device)
    x_seg_ids = torch.tensor(x_all[1], dtype=torch.long).to(device)
    x_atten_masks = torch.tensor(x_all[2], dtype=torch.long).to(device)
    y = torch.tensor(x_all[3], dtype=torch.long).to(device)
    x_len = torch.tensor(x_all[4], dtype=torch.long).to(device)

    dataset = TensorDataset(x_input_ids,x_seg_ids,x_atten_masks,y,x_len)
    # Only the training split is shuffled; evaluation keeps row order so that
    # predictions stay aligned with the returned label tensor.
    loader = DataLoader(dataset, shuffle=(data_type == 'train'), batch_size=batch_size)

    return x_input_ids, x_seg_ids, x_atten_masks, y, x_len, loader
    
def sep_test_set(input_data, boundaries=(0, 777, 1522, 2157)):
    """Split the combined test set into consecutive per-target slices.

    Args:
        input_data: Any sliceable sequence (list, tensor, ...).
        boundaries: Ascending slice edges; consecutive pairs delimit one
            slice each.  The default reproduces the three slices used for the
            combined Trump / Biden / Bernie test set.

    Returns:
        A list with ``len(boundaries) - 1`` slices of ``input_data``.
    """
    # split the combined test set for Trump, Biden and Bernie
    return [input_data[start:end]
            for start, end in zip(boundaries[:-1], boundaries[1:])]

def _collect_logits(model, loader):
    """Run ``model`` over every batch of ``loader`` without gradients and stack the outputs."""
    batch_outputs = []
    with torch.no_grad():
        for input_ids, seg_ids, atten_masks, _, length in loader:
            batch_outputs.append(model(input_ids, seg_ids, atten_masks, length))
    return torch.cat(batch_outputs, 0)


def run_classifier(input_word_pair, model_select, train_mode,
                   data_dir='/home/ubuntu/Stance_ACL2021'):
    """Train and evaluate the stance classifier over a fixed list of seeds.

    For every entry of ``input_word_pair`` the train/val/test CSV files are
    cleaned once, then one model per random seed is trained for
    ``total_epoch`` epochs.  After each epoch the model is scored on the
    validation and test sets; per seed, the test F1 of the epoch with the best
    validation F1 (last one on ties) is recorded and finally printed.

    The learning rate of the encoder (``lr``), ``batch_size`` and
    ``total_epoch`` are read from module-level globals that must be defined
    before this function is called.

    Args:
        input_word_pair: List of dataset suffixes.  In ``'adhoc'`` mode each
            suffix selects ``raw_{train,val,test}_<suffix>.csv``; in
            ``'unified'`` mode the ``*_all.csv`` files are used for every
            entry.
        model_select: ``'Bertweet'`` or ``'Bert'``.
        train_mode: ``'unified'`` (the test predictions are split with
            ``sep_test_set``) or ``'adhoc'`` (a single test set).
        data_dir: Directory holding the CSV files; defaults to the location
            that was previously hard-coded.

    Returns:
        None.  Results are printed.

    Raises:
        ValueError: If ``train_mode`` or ``model_select`` is not supported.
            Checked before any file is opened; previously a bad value only
            failed later with an ``UnboundLocalError``.
    """
    if train_mode not in ("unified", "adhoc"):
        raise ValueError(
            "Unsupported train_mode %r; expected 'unified' or 'adhoc'" % (train_mode,))
    if model_select not in ("Bertweet", "Bert"):
        raise ValueError(
            "Unsupported model_select %r; expected 'Bertweet' or 'Bert'" % (model_select,))

    random_seeds = [0,1,14,15,16,17,19]
    target_word_pair = input_word_pair

    # Creating Normalization Dictionary
    with open("./noslang_data.json", "r") as f:
        data1 = json.load(f)
    data2 = {}
    with open("./emnlp_dict.txt","r") as f:
        for line in f:
            row = line.split('\t')
            data2[row[0]] = row[1].rstrip()
    # On duplicate keys the emnlp entries override the noslang ones.
    normalization_dict = {**data1,**data2}

    for target_index in range(len(target_word_pair)):
        suffix = 'all' if train_mode == "unified" else target_word_pair[target_index]
        filename1 = data_dir + '/raw_train_' + suffix + '.csv'
        filename2 = data_dir + '/raw_val_' + suffix + '.csv'
        filename3 = data_dir + '/raw_test_' + suffix + '.csv'

        # Loading and cleaning do not depend on the seed (they previously ran
        # before the seeds were set), so do them once per target.
        train_text, train_labels, train_targets = clean_all(filename1, normalization_dict)
        val_text, val_labels, val_targets = clean_all(filename2, normalization_dict)
        test_text, test_labels, test_targets = clean_all(filename3, normalization_dict)
        num_labels = len(set(train_labels))

        best_result, best_val = [], []
        for seed in random_seeds:
            print("current random seed: ", seed)

            # set up the random seed
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)

            # prepare for model
            x_train_all,x_val_all,x_test_all = data_helper_bert(
                [train_text, train_labels, train_targets],
                [val_text, val_labels, val_targets],
                [test_text, test_labels, test_targets],
                model_select)
            _, _, _, y_train, _, trainloader = data_loader(x_train_all, batch_size, 'train')
            _, _, _, y_val, _, valloader = data_loader(x_val_all, batch_size, 'val')
            _, _, _, y_test, _, testloader = data_loader(x_test_all, batch_size, 'test')

            # Put the model wherever data_loader placed the tensors.
            model = stance_classifier(num_labels,model_select).to(y_train.device)

            # Freeze the embedding layer; it is also left out of every
            # optimizer group below.
            for name, param in model.named_parameters():
                if "bert.embeddings" in name:
                    param.requires_grad = False
            optimizer_grouped_parameters = [
                {'params': [param for name, param in model.named_parameters() if name.startswith('bert.encoder')], 'lr': lr},
                {'params': [param for name, param in model.named_parameters() if name.startswith('bert.pooler')], 'lr': 1e-3},
                {'params': [param for name, param in model.named_parameters() if name.startswith('linear')], 'lr': 1e-3},
                {'params': [param for name, param in model.named_parameters() if name.startswith('out')], 'lr': 1e-3}
                ]

            # Summed (not averaged) loss; normalised per example when logged.
            loss_function = nn.CrossEntropyLoss(reduction='sum')
            optimizer = AdamW(optimizer_grouped_parameters)

            sum_loss = []
            val_f1_average = []
            # One F1 history per test subset: three in unified mode, one otherwise.
            if train_mode == "unified":
                test_f1_average = [[] for i in range(3)]
            else:
                test_f1_average = [[]]

            for epoch in range(0, total_epoch):
                print('Epoch:', epoch)
                train_loss = []
                model.train()
                for input_ids,seg_ids,atten_masks,target,length in trainloader:
                    optimizer.zero_grad()
                    output1 = model(input_ids, seg_ids, atten_masks, length)
                    loss = loss_function(output1, target)
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(), 1)
                    optimizer.step()
                    train_loss.append(loss.item())
                sum_loss.append(sum(train_loss)/len(train_text))
                print(sum_loss[epoch])

                # evaluation on dev set
                model.eval()
                val_logits = _collect_logits(model, valloader)
                acc, f1_average, precision, recall = compute_f1(val_logits,y_val)
                val_f1_average.append(f1_average)

                # evaluation on test set
                test_logits = _collect_logits(model, testloader)
                if train_mode == "unified":
                    pred1_list = sep_test_set(test_logits)
                    y_test_list = sep_test_set(y_test)
                else:
                    pred1_list = [test_logits]
                    y_test_list = [y_test]

                for ind in range(len(y_test_list)):
                    acc, f1_average, precision, recall = compute_f1(pred1_list[ind],y_test_list[ind])
                    test_f1_average[ind].append(f1_average)

            # Last epoch that reaches the best dev F1.
            top_val = max(val_f1_average)
            best_epoch = [index for index,v in enumerate(val_f1_average) if v == top_val][-1]
            best_result.append([f1[best_epoch] for f1 in test_f1_average])

            print("******************************************")
            print("dev results with seed {} on all epochs".format(seed))
            print(val_f1_average)
            best_val.append(val_f1_average[best_epoch])
            print("******************************************")
            print("test results with seed {} on all epochs".format(seed))
            print(test_f1_average)
            print("******************************************")

        # model that performs best on the dev set is evaluated on the test set
        print("model performance on the test set: ")
        print(best_result)

In [None]:
# Unified setting: one model trained on all targets together.
# run_classifier reads these three hyperparameters as globals.
lr, batch_size, total_epoch = 2e-5, 32, 3

run_classifier(['all'], 'Bertweet', 'unified')

In [None]:
# Adhoc setting: a separate model is trained for each target.
# run_classifier reads these three hyperparameters as globals.
lr, batch_size, total_epoch = 2e-5, 32, 3

for adhoc_target in ('trump2', 'biden2', 'bernie2'):
    run_classifier([adhoc_target], 'Bertweet', 'adhoc')