### Install dependencies

In [None]:
# Colab setup: install the Hugging Face `transformers` library and PyTorch.
!pip install transformers
!pip install torch

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data paths used below live under this mount point.
drive.mount('/content/drive')

## Import data

In [None]:
import pandas as pd
import bz2

# Both archives are bz2-compressed JSON files; pandas parses the decompressed streams.
with bz2.open('/content/drive/MyDrive/Universitet/Thesis/NLP code/speeches-201718.json.bz2') as source_201718, \
     bz2.open('/content/drive/MyDrive/Universitet/Thesis/NLP code/speeches-201819.json.bz2') as source_201819:
    speeches_201718 = pd.read_json(source_201718)
    speeches_201819 = pd.read_json(source_201819)

# Stack the two frames into a single data set (row order: 2017/18 first, then 2018/19).
speeches = pd.concat((speeches_201718, speeches_201819))

## Global variables

In [None]:
# Number of locations: the training data is split into L subsets and one model
# is created and trained per subset.
L = 5

### Preprocess data


In [None]:
from sklearn.model_selection import train_test_split

# Integer class label for each party abbreviation; a party's position in the
# tuple below is its label.
party_to_int = {
    abbreviation: label
    for label, abbreviation in enumerate(('S', 'M', 'MP', 'SD', 'V', 'C', 'KD', 'L'))
}

def speeches_split(speeches, max_text_length=1000):
  """Turn a speeches frame into parallel lists of texts and integer labels.

  For every row, the "words" value is split on single spaces and only the
  last `max_text_length` words are kept (re-joined with single spaces). The
  row's "party" value is translated to its integer label via `party_to_int`.

  Args:
    speeches: DataFrame with a "words" and a "party" column.
    max_text_length: maximum number of trailing words kept per speech.

  Returns:
    (texts, labels): two lists of equal length, in row order.

  Raises:
    KeyError: if a row's party is not a key of `party_to_int`.
  """
  texts, labels = [], []
  for _, speech in speeches.iterrows():
    # Negative slice keeps the tail of the speech, not its beginning.
    tail_words = speech["words"].split(" ")[-max_text_length:]
    texts.append(' '.join(str(word) for word in tail_words))
    labels.append(party_to_int[speech["party"]])
  return texts, labels

# Build parallel lists of texts and integer party labels from all speeches.
train_texts, train_labels = speeches_split(speeches)

# Split data set into 80/20. A fixed random_state makes the split, and therefore
# every metric computed on the held-out 20%, reproducible across kernel restarts.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_texts, train_labels, test_size=.2, random_state=42)


In [None]:
## Split data into n subsets

def split_into_n_arrays(texts, labels, n):
  """Partition the parallel lists `texts` and `labels` into `n` subsets.

  Elements are handed out starting from the END of the inputs: the first
  subset holds the last `len(texts) // n` items in reverse order, the next
  subset the items before those, and so on. The final subset additionally
  receives the remainder when the length is not divisible by `n`.

  The input lists are NOT modified (earlier versions emptied them via pop()).

  Args:
    texts: list of samples.
    labels: list of labels, parallel to `texts`.
    n: number of subsets to create; values <= 0 produce no subsets.

  Returns:
    (texts_splitted, labels_splitted): two lists containing `n` lists each.

  Raises:
    ValueError: if `texts` and `labels` have different lengths.
  """
  if len(texts) != len(labels):
    raise ValueError(
        "texts and labels must have the same length, got {} and {}".format(len(texts), len(labels)))

  texts_init_len = len(texts)
  print(texts_init_len)

  texts_splitted = []
  labels_splitted = []
  if n <= 0:
    return texts_splitted, labels_splitted

  # Reversed copies reproduce the "pop from the end" ordering without touching the inputs.
  texts_reversed = list(reversed(texts))
  labels_reversed = list(reversed(labels))

  chunk_size = texts_init_len // n
  for subset_idx in range(n):
    start = subset_idx * chunk_size
    # The last subset absorbs whatever is left after the equal-sized chunks.
    stop = texts_init_len if subset_idx == n - 1 else start + chunk_size
    texts_splitted.append(texts_reversed[start:stop])
    labels_splitted.append(labels_reversed[start:stop])
  return texts_splitted, labels_splitted

# Partition the training texts and labels into L subsets (L = number of locations).
train_split, labels_split = split_into_n_arrays(train_texts, train_labels, L)

## Import pre-trained Swedish BERT and tokenize data

In [None]:
from transformers import AutoModel,AutoTokenizer,TFAutoModel
import torch

# Tokenizer belonging to the pre-trained 'KB/bert-base-swedish-cased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('KB/bert-base-swedish-cased')

def encode_arrays(train_split):
  """Tokenize every subset of texts with the module-level `tokenizer`.

  Each subset is encoded in one tokenizer call with truncation to at most
  256 tokens and padding enabled.

  Args:
    train_split: list of subsets, each a list of text strings.

  Returns:
    List with one tokenizer output per subset, in the same order.
  """
  return [
      tokenizer(subset_texts, truncation=True, max_length=256, padding=True)
      for subset_texts in train_split
  ]

# Encode the held-out test texts in one go (truncated to 256 tokens, padded).
test_encodings = tokenizer(test_texts, truncation=True, max_length=256, padding=True)
# Encode each training subset separately; the result has one entry per subset.
train_L_encodings = encode_arrays(train_split)


## Set up PyTorch datasets


In [None]:
import torch

class SpeechDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-sample sequence and
    `labels` is the parallel sequence of labels. Indexing returns one sample
    as a dict of tensors, with the label stored under the key 'labels'.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

def gen_train_datasets(train_multiple_encodings, labels_split):
  """Wrap each encoded subset and its labels in a `SpeechDataset`.

  Args:
    train_multiple_encodings: list of tokenizer outputs, one per subset.
    labels_split: list of label lists; entry i belongs to encodings entry i.

  Returns:
    List of `SpeechDataset` objects, one per subset, in the same order.
  """
  return [
      SpeechDataset(subset_encodings, labels_split[position])
      for position, subset_encodings in enumerate(train_multiple_encodings)
  ]

# One training dataset per subset (location), plus a single shared test dataset.
train_datasets = gen_train_datasets(train_L_encodings, labels_split)
test_dataset = SpeechDataset(test_encodings, test_labels)

## Adapt pre-trained BERT
Add a dropout layer, and a linear layer on top of the pooled BERT output.
Calculate cross entropy loss.

In [None]:
from torch import nn

class CustomBERTModel(nn.Module):
    """Pre-trained BERT encoder with a dropout layer and a linear classification head.

    Args:
        num_labels: number of output classes (default 8, matching the original
            hard-coded head size).
        pretrained_name: checkpoint passed to `AutoModel.from_pretrained`.
    """

    def __init__(self, num_labels=8, pretrained_name='KB/bert-base-swedish-cased'):
        super().__init__()
        self.num_labels = num_labels

        self.bert = AutoModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(0.1)
        # Size the head from the encoder config instead of hard-coding 768.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""Run the encoder and the classification head.

        All arguments except `labels` are forwarded unchanged to the wrapped
        BERT model.

        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Class indices in :obj:`[0, ..., num_labels - 1]`. When given, a
            cross-entropy loss is computed and returned as the first element.

        Returns:
            ``(loss, logits, *extra)`` when `labels` is given, otherwise
            ``(logits, *extra)``, where ``extra`` is everything the encoder
            returned after its first two outputs.
        """

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 1 of the encoder output is the pooled sequence representation.
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # Only compute a loss when labels are supplied; previously a call
        # without labels crashed on `labels.view`.
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output


## Training setup and FedAvg code

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
import transformers
import math
import copy

## TRAINING GLOBAL VARIABLES ##
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Mini-batch size of every training DataLoader.
batch_size = 4
# Number of train / evaluate / average rounds performed by train().
cycles = 10

def fedavg(models):
  """Build a global model whose parameters are the weighted average of `models`.

  Every named parameter of the result is sum_k(param_k * n_samples_k / N),
  where N is the total number of training samples over all given models.

  Args:
    models: non-empty list of dicts with at least the keys 'model'
      (a CustomBERTModel) and 'n_samples' (size of its training set).

  Returns:
    A new CustomBERTModel on `device` carrying the averaged parameters.

  Raises:
    ValueError: if `models` is empty or the models report no samples.
  """
  if not models:
    raise ValueError("fedavg needs at least one model")

  # N = total number of training samples, derived from the models themselves
  # so the weights stay correct when the data set size changes.
  N = sum(model_dict['n_samples'] for model_dict in models)
  if N <= 0:
    raise ValueError("fedavg needs a positive total number of samples")

  # Look up each model's parameters once instead of once per parameter name.
  params_per_model = [dict(model_dict['model'].named_parameters()) for model_dict in models]

  resulting_params = {}
  # Averaging is pure bookkeeping; no autograd graph is needed for it.
  with torch.no_grad():
    for name in params_per_model[0]:
      total = 0
      for model_dict, params in zip(models, params_per_model):
        total += params[name]*model_dict['n_samples']/N
      resulting_params[name] = total

  model = CustomBERTModel()
  model.to(device)
  # strict=False: only named parameters are supplied, buffers keep their
  # freshly initialised values.
  model.load_state_dict(resulting_params, strict=False)
  return model

def create_n_models(n, train_datasets):
  """Create `n` independent models together with their training machinery.

  Args:
    n: number of models to create.
    train_datasets: sequence of datasets; model i is trained on train_datasets[i].

  Returns:
    List of `n` dicts with the keys 'model', 'loader', 'optim', 'scheduler'
    and 'n_samples' (size of the model's training set).
  """
  models = []
  for i in range(n):
    model = CustomBERTModel()
    # `device` already falls back to the CPU; an unconditional .cuda() would
    # crash on hosts without a GPU.
    model.to(device)
    train_loader = DataLoader(train_datasets[i], batch_size=batch_size, shuffle=True)
    optim = AdamW(model.parameters(), lr=5e-5,  weight_decay=0.01)
    # The scheduler is stepped once per batch, so count real batches (the last
    # partial batch included) rather than len(dataset) / batch_size.
    train_steps = len(train_loader) * cycles
    # Warm up over the first 10% of all optimisation steps.
    warm_steps = int(0.1 * train_steps)
    scheduler = transformers.get_linear_schedule_with_warmup(optim, warm_steps, train_steps)
    models.append({'model': model, 'loader': train_loader, 'optim': optim, 'scheduler': scheduler, 'n_samples': len(train_datasets[i])})
  return models


def update_models(global_model, models):
  """Overwrite the weights of every local model with those of `global_model`.

  Only the 'model' entry of each dict is touched; optimizers, schedulers and
  loaders are left as they are.

  Args:
    global_model: model whose state is broadcast.
    models: list of dicts with a 'model' key; all entries are updated,
      regardless of how many there are.

  Returns:
    The same `models` list, updated in place.
  """
  # load_state_dict copies values into the target's own tensors, so no deep
  # copy of the global parameters is needed.
  global_state = global_model.state_dict()
  for model_dict in models:
    model_dict['model'].load_state_dict(global_state, strict=False)
  return models

### Evaluation code

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1, and print a report.

    Args:
        pred: mapping with 'label_ids' (true labels) and 'predictions'
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred['label_ids']
    y_pred = pred['predictions'].argmax(-1)

    macro_scores = precision_recall_fscore_support(y_true, y_pred, average='macro')
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': macro_scores[2],
        'precision': macro_scores[0],
        'recall': macro_scores[1]
    }

    # Per-class breakdown, three decimals.
    print(classification_report(y_true, y_pred, digits=3))
    return metrics

def evaluate(model):
  """Evaluate `model` on the module-level `test_dataset`.

  Runs the model in eval mode without gradients over the whole test set,
  prints the classification report (via `compute_metrics`) and returns the
  summary metrics.

  Args:
    model: module that, when called with input_ids, attention_mask and labels,
      returns a sequence whose element 1 holds the logits.

  Returns:
    The dict produced by `compute_metrics` ('accuracy', 'f1', 'precision', 'recall').

  Raises:
    ValueError: if the test set yields no batches.
  """
  # Order is irrelevant for the metrics, so the test set is not shuffled.
  test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
  batch_logits = []
  batch_labels = []
  model.eval()

  with torch.no_grad():
    for batch in test_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)
      outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
      # Collect per-batch results on the CPU and concatenate once at the end;
      # this also keeps the labels integer-typed.
      batch_logits.append(outputs[1].cpu())
      batch_labels.append(labels.cpu())

  if not batch_labels:
    raise ValueError("test_dataset is empty; nothing to evaluate")

  pred = {
          'label_ids': torch.cat(batch_labels, 0),
          'predictions': torch.cat(batch_logits, 0)
          }

  return compute_metrics(pred)



## Train

In [None]:
def same(m1, m2):
  """Return True when the parameters of `m1` and `m2` are pairwise equal.

  Parameters are paired positionally. On the first differing pair the number
  of mismatching elements is printed and False is returned.
  """
  for left, right in zip(m1.parameters(), m2.parameters()):
    mismatches = left.data.ne(right.data).sum()
    if mismatches > 0:
      print(mismatches)
      return False
  return True

def same2(m1, m2):
  """Return True when every named parameter of `m1` equals its namesake in `m2`.

  Parameters are matched by name. For the first differing parameter its name
  and both tensors are printed, then False is returned.
  """
  other_params = dict(m2.named_parameters())
  for name, param in m1.named_parameters():
    counterpart = other_params[name]
    if torch.eq(param, counterpart).all():
      continue
    print(name)
    print(param)
    print(counterpart)
    return False
  return True

In [None]:
import time
# Wall-clock reference point; the elapsed time is printed once training has finished.
start_time = time.time()

def train():
  """Run the complete federated training loop.

  Creates L local models, then repeats for `cycles` rounds:
    1. train every local model for one pass over its own data loader,
    2. print the current learning rate of every optimizer parameter group,
    3. evaluate every local model on the test set,
    4. average the local models with `fedavg` and evaluate the global model,
    5. load the global weights back into every local model.

  Returns:
    The list of local model dicts after the final round.
  """
  models = create_n_models(L, train_datasets)
  for c in range(cycles):
    print("Cycle: {}".format(c+1))
    for j, model_dict in enumerate(models):
      print("Training model {}".format(j))
      model = model_dict['model']
      model.train()
      train_loader = model_dict['loader']
      optim = model_dict['optim']
      scheduler = model_dict['scheduler']
      for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # Element 0 of the model output is the loss when labels are passed.
        loss = outputs[0]
        loss.backward()
        optim.step()
        # The learning-rate schedule advances once per optimisation step.
        scheduler.step()

    print("learning rates.........")
    for mdl in models:
      for param_group in mdl['optim'].param_groups:
        print(param_group['lr'])

    print()

    for i, m in enumerate(models):
      print("Evaluating model {} ------------------...".format(i))
      evaluate(m['model'])
    print("Evaluating global model...")
    global_model = fedavg(models)
    evaluate(global_model)
    models = update_models(global_model, models)
    # Drop the reference so the global model's memory can be reclaimed
    # before the next round.
    del global_model
  return models

# Run the full training; `mdls` is the list of local model dicts returned by train().
mdls = train()

# Report the wall-clock time elapsed since `start_time` was recorded.
print("--- %s seconds ---" % (time.time() - start_time))

## Save models

In [None]:
# Average the trained local models into one global model and write its weights to Google Drive.
g_m = fedavg(mdls)
torch.save(g_m.state_dict(), '/content/drive/MyDrive/5_run2_models.pth')

## Evaluate saved model

In [None]:
# Rebuild the architecture and restore the weights written by the "Save models" cell.
# The file name must match the one passed to torch.save ('5_run2_models.pth');
# the previous '5_run_2models.pth' pointed at a file this notebook never writes.
model = CustomBERTModel()
# map_location lets the checkpoint load on CPU-only hosts as well.
model.load_state_dict(torch.load('/content/drive/MyDrive/5_run2_models.pth', map_location=device))
model.to(device)
evaluate(model)

## Plot results

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Metric values for 1..5 locations (index 0 = one location).
# NOTE(review): presumably copied by hand from the evaluation output of earlier runs - verify.
precision = [0.651, 0.638, 0.640, 0.630, 0.605]
recall = [0.608, 0.613, 0.612, 0.602, 0.582]
f_one = [0.625, 0.624, 0.623, 0.614, 0.591]
accuracy = [0.676, 0.678, 0.685, 0.673, 0.648]

# NOTE(review): `y` is not used anywhere in this cell; kept in case a later cell reads it.
y = accuracy
fig, axs = plt.subplots(1, 2, figsize=(15, 5))

# 'sans-serif' is a valid family name; the former 'normal' is not a font
# family and only triggered font-lookup fallbacks.
font = {'family' : 'sans-serif',
        'size'   : 14}
matplotlib.rc('font', **font)

# Left panel: one line per metric over the number of locations.
x = np.arange(1, 6, 1)
axs[0].set_xticks(x)
axs[0].plot(x, f_one)
axs[0].set_title('Results')
axs[0].set(xlabel='Number of locations', ylabel='Value')
axs[0].plot(x, precision, 'tab:orange')
axs[0].plot(x, recall, 'tab:green')
axs[0].plot(x, accuracy, 'tab:red')
axs[0].set_ylim((0.57, 0.70))
# Legend entries follow the order in which the lines were plotted above.
axs[0].legend(['F1-Score', 'Precision', 'Recall', 'Accuracy'], loc='upper right')

# Right panel: the same numbers as a table (rows = metrics, columns = number of locations).
columns = ('1', '2', '3', '4', '5')
rows = ['F1-score', 'Precision', 'Recall', 'Accuracy']

data_list = [f_one, precision, recall, accuracy]
# NOTE(review): scatter_x / scatter_y are not used in this cell; kept in case a later cell reads them.
scatter_x = (1, 2, 3)
scatter_y = (1224.53, 1231.76, 1228.70)

table = axs[1].table(cellText=data_list,
           rowLabels=rows,
           colLabels=columns, loc="center",
           colWidths=[0.1 for _ in range(5)]
            )

# The font size is set once here; a separate fontsize argument to table()
# would be overridden by this call anyway.
table.set_fontsize(14)
table.scale(1.5, 1.5)

axs[1].axis("off")

fig.savefig("res.png")

In [None]:
# Google Colab only: download the saved figure "res.png" from the Colab runtime.
from google.colab import files
files.download("res.png") 