# In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight

# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

"""
TODO
- Remove the tweet-dataset preprocessing steps and load the already-preprocessed dataset instead.
- Turn this into a function that takes a bitstring as its only parameter and returns only the F1 score.
    - Maybe just call it `fitness`.
"""

def _decode_hyperparameters(bitstring):
    """Translate a GA chromosome into the hyperparameters used by ``fitness``.

    Every field is read as an unsigned *binary* number. Only the first 28
    bits are read; any extra bits are ignored. Bit layout:

        [0:4]    bert_layer                number of encoder layers to freeze
        [4:6]    fine_tuning_architecture  reserved, currently pinned to 0
        [6:10]   cnn_dropout               reserved, currently pinned to 0.2
        [10:13]  cnn_kernel_size           reserved, currently pinned to 2
        [13:17]  bilstm_dropout            scaled to a probability in [0, 1)
        [17:27]  bilstm_output_neuron      BiLSTM hidden size (at least 1)
        [27]     final_activation_neuron   0 = log-softmax, 1 = sigmoid

    Args:
        bitstring: A string of '0'/'1' characters, or a sequence of 0/1
            integers, with at least 28 entries.

    Returns:
        A dict keyed by the lower-case names listed above.

    Raises:
        ValueError: If fewer than 28 bits are supplied or any of the first
            28 entries is not a binary digit.
    """
    n_bits = 28

    if not isinstance(bitstring, str):
        # Also accept a sequence of 0/1 integers, a common GA representation.
        bitstring = "".join(str(int(bit)) for bit in bitstring)

    bits = bitstring[:n_bits]
    if len(bits) < n_bits or set(bits) - {"0", "1"}:
        raise ValueError(
            f"bitstring must start with {n_bits} binary digits, got {bitstring!r}"
        )

    def field(start, stop):
        # Base 2 is essential: int('0100') would otherwise be one hundred.
        return int(bits[start:stop], 2)

    return {
        "bert_layer": field(0, 4),
        "fine_tuning_architecture": 0,  # field(4, 6)   # td
        "cnn_dropout": 0.2,             # field(6, 10)
        "cnn_kernel_size": 2,           # field(10, 13)
        # NOTE(review): nn.Dropout needs a probability; the 4-bit value is
        # scaled by 2**4 here. Confirm this is the intended mapping.
        "bilstm_dropout": field(13, 17) / 16,
        # A hidden size of 0 is invalid for nn.LSTM, so clamp to at least 1.
        "bilstm_output_neuron": max(1, field(17, 27)),
        "final_activation_neuron": field(27, 28),
    }


def fitness(bitstring):
    """Fine-tune BERT with the encoded hyperparameters and return the test F1.

    The function reads ``phm2017_tweets.csv`` and ``PHM2017.csv`` from the
    current working directory, splits them 70/15/15 into train/validation/test
    (stratified on the label), trains a BERT-based classifier for ``EPOCHS``
    epochs, restores the checkpoint with the lowest validation loss and scores
    it on the test split.

    Side effects: loads ``bert-base-uncased`` through ``transformers``, writes
    the best checkpoint to ``saved_weights.pt`` and prints training progress.

    Args:
        bitstring: GA chromosome; see ``_decode_hyperparameters`` for the
            bit layout.

    Returns:
        The weighted F1 score of the best model on the test split.

    Raises:
        ValueError: If ``bitstring`` is not a valid chromosome. This is
            checked before any file or model is loaded.
    """
    ### GLOBAL VARIABLES ###

    # Decode (and validate) first so a malformed chromosome fails fast.
    hyperparameters = _decode_hyperparameters(bitstring)
    BERT_LAYER = hyperparameters["bert_layer"]
    FINE_TUNING_ARCHITECTURE = hyperparameters["fine_tuning_architecture"]
    CNN_DROPOUT = hyperparameters["cnn_dropout"]
    CNN_KERNEL_SIZE = hyperparameters["cnn_kernel_size"]
    BILSTM_DROPOUT = hyperparameters["bilstm_dropout"]
    BILSTM_OUTPUT_NEURON = hyperparameters["bilstm_output_neuron"]
    FINAL_ACTIVATION_NEURON = hyperparameters["final_activation_neuron"]

    BATCH_SIZE = 32
    EPOCHS = 1  # 10
    # Size of the classifier output layer.
    # NOTE(review): assumes labels are the integers 0..3 -- confirm against the data.
    NUM_CLASSES = 4
    MAX_SEQ_LEN = 280  ## change this maybe?
    WEIGHTS_PATH = 'saved_weights.pt'

    ############### LOAD DATA ###############

    col1_names = ['id', 'tweet_id', 'text', 'username']
    col2_names = ['tweet_id', 'disease', 'label']
    df1 = pd.read_csv("phm2017_tweets.csv", names=col1_names, header=None)
    df2 = pd.read_csv("PHM2017.csv", names=col2_names, header=None)
    df = pd.merge(df1, df2, on="tweet_id")
    df = df.drop(['id', 'tweet_id', 'username', 'disease'], axis=1)
    df['label'] = df['label'].astype(int)

    # 70% train, then the remaining 30% is halved into validation and test.
    train_text, temp_text, train_labels, temp_labels = train_test_split(
        df['text'], df['label'],
        random_state=2018,
        test_size=0.3,
        stratify=df['label'])
    val_text, test_text, val_labels, test_labels = train_test_split(
        temp_text, temp_labels,
        random_state=2018,
        test_size=0.5,
        stratify=temp_labels)

    ############### MODEL AND TOKENIZER ###############

    # return_dict=False makes the model return a plain tuple
    # (sequence output, pooled output).
    bert = AutoModel.from_pretrained('bert-base-uncased', return_dict=False)
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

    ############### CREATE DATALOADERS ###############

    def make_loader(texts, labels, sampler_cls):
        # Tokenize one split and wrap it in a DataLoader using the given sampler.
        encoded = tokenizer.batch_encode_plus(
            texts.tolist(),
            max_length=MAX_SEQ_LEN,
            padding='longest',
            truncation=True,
            return_token_type_ids=False
        )
        dataset = TensorDataset(
            torch.tensor(encoded['input_ids']),
            torch.tensor(encoded['attention_mask']),
            torch.tensor(labels.tolist())
        )
        return DataLoader(dataset, sampler=sampler_cls(dataset), batch_size=BATCH_SIZE)

    train_dataloader = make_loader(train_text, train_labels, RandomSampler)
    val_dataloader = make_loader(val_text, val_labels, SequentialSampler)
    test_dataloader = make_loader(test_text, test_labels, SequentialSampler)

    ############### FREEZE LAYERS ###############

    # The embeddings are always frozen; BERT_LAYER additionally freezes that
    # many encoder layers, capped at the number of layers the model has.
    num_frozen = min(BERT_LAYER, len(bert.encoder.layer))
    for module in [bert.embeddings, *bert.encoder.layer[:num_frozen]]:
        for param in module.parameters():
            param.requires_grad = False

    class BERT_Arch(nn.Module):
        """BERT encoder followed by a CNN or BiLSTM head and a linear classifier."""

        def __init__(self, bert):
            super().__init__()

            self.bert = bert
            hidden_size = bert.config.hidden_size

            # 0 = one-layer CNN, 1 = two-layer CNN
            if FINE_TUNING_ARCHITECTURE in (0, 1):
                self.lstm = None
                self.dropout = nn.Dropout(CNN_DROPOUT)
                layers = [
                    nn.Conv1d(in_channels=hidden_size, out_channels=512,
                              kernel_size=CNN_KERNEL_SIZE, padding='valid'),
                    nn.ReLU(),
                ]
                out_features = 512
                if FINE_TUNING_ARCHITECTURE == 1:
                    layers += [
                        nn.MaxPool1d(kernel_size=CNN_KERNEL_SIZE),
                        nn.Conv1d(in_channels=512, out_channels=256,
                                  kernel_size=CNN_KERNEL_SIZE, padding='valid'),
                        nn.ReLU(),
                    ]
                    out_features = 256
                self.conv = nn.Sequential(*layers)

            # 2 = one-layer BiLSTM, anything else = two-layer BiLSTM
            else:
                self.conv = None
                self.lstm = nn.LSTM(
                    input_size=hidden_size,
                    hidden_size=BILSTM_OUTPUT_NEURON,
                    num_layers=1 if FINE_TUNING_ARCHITECTURE == 2 else 2,
                    batch_first=True,
                    bidirectional=True,
                )
                self.dropout = nn.Dropout(BILSTM_DROPOUT)
                # Forward and backward directions are concatenated.
                out_features = 2 * BILSTM_OUTPUT_NEURON

            # The head is max-pooled over time before this layer, so its input
            # width does not depend on the (variable) padded sequence length.
            self.fc = nn.Linear(out_features, NUM_CLASSES)

            # NOTE(review): the sigmoid output is fed to NLLLoss below, which
            # expects log-probabilities -- confirm this pairing is intended.
            if FINAL_ACTIVATION_NEURON == 0:
                self.activation = nn.LogSoftmax(dim=1)
            else:
                self.activation = nn.Sigmoid()

        def forward(self, sent_id, mask):
            # Per-token hidden states: (batch, seq_len, hidden_size).
            hidden, _ = self.bert(sent_id, attention_mask=mask)

            # Zero out padded positions so they feed a constant into the head.
            hidden = hidden * mask.unsqueeze(-1).to(hidden.dtype)

            if self.lstm is None:
                # Conv1d expects (batch, channels, seq_len).
                x = self.dropout(hidden).transpose(1, 2)
                x = self.conv(x)
            else:
                x, _ = self.lstm(hidden)              # (batch, seq_len, 2 * hidden)
                x = self.dropout(x).transpose(1, 2)   # (batch, 2 * hidden, seq_len)

            # Global max-pooling over time -> (batch, features).
            x = x.max(dim=2).values
            x = self.fc(x)                            # (batch, NUM_CLASSES)
            return self.activation(x)

    # pass the pre-trained BERT to our architecture and move it to the device
    model = BERT_Arch(bert).to(device)

    # define the optimizer
    optimizer = AdamW(model.parameters(), lr=1e-3)

    # class weights counter the label imbalance in the training split
    class_wts = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(train_labels),
                                     y=train_labels)
    weights = torch.tensor(class_wts, dtype=torch.float).to(device)

    # loss function
    cross_entropy = nn.NLLLoss(weight=weights)

    def train():
        # Run one training epoch and return the mean batch loss.
        model.train()
        total_loss = 0.0

        for step, batch in enumerate(train_dataloader):

            # progress update after every 50 batches
            if step % 50 == 0 and step != 0:
                print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

            sent_id, mask, labels = (t.to(device) for t in batch)

            # clear previously calculated gradients
            model.zero_grad()

            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss += loss.item()

            loss.backward()

            # clip gradients to 1.0 to guard against exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()

        return total_loss / len(train_dataloader)

    def evaluate(dataloader):
        # Return (mean batch loss, stacked predictions) for one dataloader.
        print("\nEvaluating...")

        # deactivate dropout layers
        model.eval()

        total_loss = 0.0
        total_preds = []

        for step, batch in enumerate(dataloader):

            # progress update every 50 batches
            if step % 50 == 0 and step != 0:
                print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(dataloader)))

            sent_id, mask, labels = (t.to(device) for t in batch)

            with torch.no_grad():
                preds = model(sent_id, mask)
                total_loss += cross_entropy(preds, labels).item()

            total_preds.append(preds.detach().cpu().numpy())

        # (number of samples, number of classes)
        return total_loss / len(dataloader), np.concatenate(total_preds, axis=0)

    ############### TRAINING LOOP ###############

    best_valid_loss = float('inf')

    # training and validation loss of each epoch
    train_losses = []
    valid_losses = []

    for epoch in range(EPOCHS):

        print('\n Epoch {:} / {:}'.format(epoch + 1, EPOCHS))

        train_loss = train()
        valid_loss, _ = evaluate(val_dataloader)

        # keep the checkpoint with the lowest validation loss
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), WEIGHTS_PATH)

        train_losses.append(train_loss)
        valid_losses.append(valid_loss)

        print(f'\nTraining Loss: {train_loss:.3f}')
        print(f'Validation Loss: {valid_loss:.3f}')

    ############### TEST ###############

    # map_location keeps the load working when the checkpoint was written on
    # a different device than the current one.
    model.load_state_dict(torch.load(WEIGHTS_PATH, map_location=device))

    # Score the test split batch by batch rather than in one giant forward
    # pass; the sequential sampler keeps predictions aligned with test_labels.
    _, test_preds = evaluate(test_dataloader)
    test_preds = np.argmax(test_preds, axis=1)

    f1 = f1_score(test_labels.to_numpy(), test_preds, average='weighted')

    return f1

# Example run: score one sample 28-bit chromosome. Calling fitness() here
# executes the whole load/train/evaluate pipeline as soon as this file runs.
bitstring = '0100101010000011100101001101'
fitness(bitstring)