In [2]:
import json
import os
import random
import shap
import torch
import math
import torch.nn as nn 
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from torch import optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import time
import warnings
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
warnings.filterwarnings("ignore")
import re



from transformers import BertTokenizer, BertModel, XLMTokenizer, XLMConfig, XLMModel, \
    AlbertTokenizer, AlbertModel, RobertaTokenizer, RobertaConfig, RobertaModel


Model

In [48]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


Choose LLM

In [49]:
def atoi(text):
    return int(text) if text.isdigit() else text
def natural_keys(text):
    return [atoi(c) for c in re.split('(\d+)',text) ]

In [192]:
LLM = "bert-base-uncased"
# LLM = "bert-large-uncased"
# LLM = "albert"
# LLM = "roberta"
# LLM = "xlm"

def change_llm(LLM):

    if "bert-" in LLM:
        tokenizer = BertTokenizer.from_pretrained(LLM)
        llm = BertModel.from_pretrained(LLM)


    if LLM == "albert":
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        llm = AlbertModel.from_pretrained('albert-base-v2')

    if LLM == "roberta":
        tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        configuration = RobertaConfig()
        llm = RobertaModel(configuration)

    if LLM == "xlm":
        tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
        configuration = XLMConfig()
        llm = XLMModel(configuration)


    llm.eval()
    sentence = "Paris is a beautiful city" 
    tok = tokenizer(sentence)
    encoded_layers = llm(torch.Tensor([tok.input_ids]).to(torch.int), attention_mask=torch.Tensor([tok.attention_mask]))
    llm_size = (encoded_layers.last_hidden_state)[:, 0, :].size()[1]

    return tokenizer, llm, llm_size
tokenizer, llm, llm_size = change_llm(LLM)

In [362]:
LLMs = ["bert-base-uncased", "bert-large-uncased", "albert", "roberta", "xlm"]
for LLM in LLMs:
    tokenizer, llm, llm_size = change_llm(LLM)
    for root, dirs,files in (os.walk(os. getcwd())):
        if "Group_" in root:
            if "asr" not in root: # (update) change to process rows in csv files instead of seperate txt files
                df = pd.DataFrame()
                for file in files:
                    if 'Oracle.csv' in file:
                        transcripts = pd.read_csv(root+'/'+file)["Transcript"].fillna("")
                        for transc in transcripts:
                            tok = tokenizer(transc)
                            with torch.no_grad():
                                encoded_layers = llm(torch.Tensor([tok.input_ids]).to(torch.int), attention_mask=torch.Tensor([tok.attention_mask]))
                            tensor = (encoded_layers.last_hidden_state)[:, 0, :]
                            tensor_df = pd.DataFrame(tensor.numpy())
                            df = pd.concat((df, tensor_df))
                gr = root.find('Group')
                group = root[gr:gr+8]
                df.to_csv(group+'/'+group+"_"+LLM+".csv", index=False)

In [370]:
lv1_lv3 = dict({0:0, 1:0, 2:0, 3:0, 4:0, 5:1, 6:1, 7:1, 8:1, 9:1, 10:1, 11:1, 12:1, 13:2, 14:2, 15:2, 16:2, 17:2, 18:2})

In [406]:
class DATA():

    def __init__(self):
        self.dataset_llm = []
        self.dataset_opensmile = []
        self.dataset_cps_f = []
        self.dataset_cps_i = []
        self.targets = []

    def openBERT(self, filename):
        data = pd.read_csv(filename)
        for i in range(len(data)):
            self.dataset_llm.append(np.asarray(data.iloc[i]))
    
    
    def openSmile(self,filename):
        data = pd.read_csv(filename)
        for i in range (data.shape[0]):
            # print(f'C:\\Users\\Bbykitty\\OneDrive - Colostate\\Research\\Initial Observations for Fib Weights\\Data\\Segment Analysis\\{filename[filename.index("Group_"):filename.index("Group_")+8]}\\segments_oracle\\{filename[filename.index("Group_"):filename.index("Group_")+8]}_'+str(i)+'.wav')
            row=data[data['file']==f'C:\\Users\\bradf\\OneDrive - Colostate\\Research\\Initial Observations for Fib Weights\\Data\\Segment Analysis\\{filename[filename.index("Group_"):filename.index("Group_")+8]}\\segments_oracle\\{filename[filename.index("Group_"):filename.index("Group_")+8]}_'+str(i)+'.wav']
            tensor=np.asarray(row.values[0][3:],dtype=np.float32).tolist()
            self.dataset_opensmile.append(tensor)


    def openCPS(self, filename):
        data = pd.read_csv(filename).fillna(0)
        for row in range(data.shape[0]):
            cps = data.iloc[row, 10:].fillna(0)
            self.dataset_cps_i.append(cps)
            cps_f = [0, 0, 0]
            for i, c in enumerate(cps):
                if c == 1:
                    cps_f[lv1_lv3[i]] = 1
            self.dataset_cps_f.append(cps_f)
        

    def openTarget(self,filename):
        data = pd.read_csv(filename)
        for row in range(data.shape[0]):
            target = data.iloc[row, 4:].values.astype(int)
            self.targets.append(target)


    def get_datasets(self, rand=False):
        final_dataset=[]
        if len(self.dataset_llm) == 0:
            for opensmile, label in zip(self.dataset_opensmile, self.targets):
                final_dataset.append([opensmile,label])
        if len(self.dataset_opensmile) == 0:
            for bert, label in zip(self.dataset_llm, self.targets):
                final_dataset.append([bert,label])
        if rand:
            random.shuffle(final_dataset)
        return final_dataset

In [407]:
def read_data(dataset, root, file, llm="bert-base-uncased"):

    if LLM in file:
        dataset.openBERT(root+"/"+file)
    if 'opensmile' in file:
        dataset.openSmile(root+"/"+file)
    elif 'CG.csv' in file:
        dataset.openTarget(root+"/"+file)
    elif 'CPS_Oracle' in file:
        dataset.openCPS(root+"/"+file)



        

In [408]:
train_datasets = DATA()

In [409]:
for root, dirs,files in (os.walk(os. getcwd())):
    if "Group_" in root:
        if "asr" not in root:
            for file in files:
                read_data(train_datasets, root, file)

In [164]:
scaler = MinMaxScaler()
train_datasets.dataset_llm = scaler.fit_transform(train_datasets.dataset_llm).tolist()
scaler = MinMaxScaler()
train_datasets.dataset_opensmile = scaler.fit_transform(train_datasets.dataset_opensmile).tolist()

In [6]:
class nlp_dataset(Dataset):
    def __init__(self,xy=None):

        self.bert_data = torch.from_numpy(np.asarray([el[0] for el in xy],dtype=np.float32))
        self.open_data = torch.from_numpy(np.asarray([el[1] for el in xy],dtype=np.float32))
        self.y_data = torch.from_numpy(np.asarray([el[2] for el in xy],dtype=np.float32))
        self.len=len(self.bert_data)


    def __getitem__(self, index):
        return self.bert_data[index], self.open_data[index], self.y_data[index]

    def __len__(self):
        return self.len

In [7]:
train_loader = DataLoader(dataset=nlp_dataset(train_datasets.get_datasets(rand=True)),batch_size=16,shuffle=False)

In [9]:
l = train_datasets.get_datasets()
l = [a+b for a,b,c in l]

In [18]:
n_folds = 3
fold_len = int(np.ceil(len(train_datasets.get_datasets())/n_folds))
fold_len

607

In [19]:
folds = [train_datasets.get_datasets()[x:x+fold_len] for x in range(0, len(train_datasets.get_datasets()), fold_len)] 

Recurrent model

In [165]:
train_list = train_datasets.get_datasets()

In [166]:
train_list = [[a+b, c] for a,b,c in train_list]

variable window size

In [199]:
window_size = 7

In [200]:
rec_train_list = []
pad = [[0]*600, [0]*7]
for utt_id in range(len(train_list)):
    aux = []
    for i in range(window_size):
        if utt_id == i:
            for _ in range(window_size):
                aux.append(pad)
    for i in range(window_size):
        if len(aux) == i:
            aux.append(train_list[utt_id - window_size + i])
    aux.append(train_list[utt_id])
    rec_train_list.append(aux)

In [201]:
class rec_dataset(Dataset):
    def __init__(self,xy=None):
        self.utt_bert_l = []
        self.utt_open_l = []
        self.utt_y_l = []

        for i in range(window_size+1):
            self.utt_bert_l.append(torch.from_numpy(np.asarray([el[i][0][:llm_size] for el in xy],dtype=np.float32)).to(device))
            self.utt_open_l.append(torch.from_numpy(np.asarray([el[i][0][llm_size:] for el in xy],dtype=np.float32)).to(device))
            self.utt_y_l.append(torch.from_numpy(np.asarray([el[i][1] for el in xy],dtype=np.float32)).to(device))
        
        self.len=len(self.utt_bert_l[0])
        

    def __getitem__(self, index):
        return self.utt_bert_l[index], self.utt_open_l[index], self.utt_y_l[index]
    
    def __len__(self):
        return self.len

In [202]:
rec_train_loader = DataLoader(dataset=rec_dataset(rec_train_list),batch_size=16,shuffle=True)

In [57]:
class rec_common_ground(nn.Module):
    def __init__(self, lin_layers=True, bert_b=True, opensmile_b=True, output_size=1):
        super(rec_common_ground, self).__init__()
        self.lin_bert1 = nn.Linear(llm_size, 256)
        self.lin_bert2 = nn.Linear(256, 256)
        if not lin_layers:
            self.lstm_bert = nn.LSTM(input_size=llm_size, batch_first=True, hidden_size=256)
        self.lstm_bert = nn.LSTM(input_size=256, batch_first=True, hidden_size=256)
        self.lin_open1 = nn.Linear(88, 256)
        self.lin_open2 = nn.Linear(256, 256)
        self.lstm_opensmile = nn.LSTM(input_size=256, batch_first=True, hidden_size=256)
        self.relu = nn.ReLU()
        if not bert_b or not opensmile_b:
            self.ff1 = nn.Linear(256, 512)
        self.ff1 = nn.Linear(512, 512)
        self.ff2 = nn.Linear(512, 512)
        self.act1 = nn.Tanh()
        self.act2 = nn.SiLU()
        self.classifier = nn.Linear(512, output_size)

    def forward(self, utt_bert_l, utt_open_l, lin_layers=True, bert_b=True, opensmile_b=True):

        if bert_b:
            if lin_layers:
                utt_bert_l = [self.lin_bert1(utt_x_bert) for utt_x_bert in utt_bert_l]
                utt_bert_l = [self.lin_bert2(utt_x_bert) for utt_x_bert in utt_bert_l]
                utt_bert_l = [self.relu(utt_x_bert) for utt_x_bert in utt_bert_l]
            bert = torch.stack(tuple(utt_bert_l), dim=1)
            bert = self.lstm_bert(bert)[1][0][0]

        if opensmile_b:
            if lin_layers:
                utt_open_l = [self.lin_open1(utt_x_open) for utt_x_open in utt_open_l]
                utt_open_l = [self.lin_open2(utt_x_open) for utt_x_open in utt_open_l]
                utt_open_l = [self.relu(utt_x_open) for utt_x_open in utt_open_l]
            opensmile = torch.stack(tuple(utt_open_l), dim=1)
            opensmile = self.lstm_opensmile(opensmile)[1][0][0]

        if bert_b and opensmile_b:
            x = torch.hstack((bert, opensmile))
        if not bert_b:
            x = opensmile
        if not opensmile_b:
            x = bert
        x = self.ff1(x)
        x = self.act1(x)
        x = self.ff2(x)
        x = self.act2(x)
        predict = self.classifier(x)

        return predict


In [171]:
def rec_train(model, total_epochs, lr, train_iterator, class_to_eval, lin_layers, bert_b, opensmile_b, output_size):
    
    optimizer = optim.Adam(model.parameters(), lr=lr)
    epoch_loss = []
    nepochs = 0
    model = model.to(device)
    model.train()
    criterion = nn.BCELoss(reduction='mean').to(device)
    if output_size != 1:
        criterion = nn.CrossEntropyLoss(reduction='mean').to(device)
    while nepochs < total_epochs :
        optimizer.zero_grad()
        batch_loss = []
        for batch_idx, (utt_bert_l, utt_open_l, utt_y_l) in enumerate(train_iterator):
            output = model(utt_bert_l, utt_open_l, lin_layers, bert_b, opensmile_b, output_size)
            target = utt_y_l[-1]
            if output_size == 1:
                target_binary = torch.zeros(target.size()[0], 1).to(device)
                for i,t in enumerate(target):
                    target_binary[i] = torch.Tensor([t[class_to_eval]])
                loss = criterion(torch.sigmoid(output).to(device), target_binary)
            else:
                loss = criterion(output, target)
            batch_loss.append(loss)
            loss.backward()
            optimizer.step()
        nepochs += 1
        epoch_loss.append(sum(batch_loss)/len(batch_loss))
    return model, epoch_loss

In [172]:
def rec_test(model, test_iterator, class_to_eval,   lin_layers, bert_b, opensmile_b, output_size):
    with torch.no_grad():
        model.eval()
        true, pred = None, None
        for batch_idx, (utt_bert_l, utt_open_l, utt_y_l) in enumerate(test_iterator):
            output = model(utt_bert_l, utt_open_l, lin_layers, bert_b, opensmile_b, output_size)
            pred_tmp = torch.sigmoid(output)
            target = utt_y_l[-1]
            true_binary = torch.zeros(target.size()[0], 1)
            for i, t in enumerate(target):
                true_binary[i] = torch.Tensor([t[class_to_eval]])
            
            if true == None:
                true = true_binary
                pred = pred_tmp > 0.5
                pred_probs = pred_tmp

            else :
                true = torch.cat((true, true_binary))
                pred = torch.cat((pred, pred_tmp > 0.5))
                pred_probs = torch.cat((pred_probs, pred_tmp))
    return true, pred, pred_probs


In [None]:
LLMs = ["bert-base-uncased", "bert-large-uncased", "albert", "roberta", "xlm"]
n_folds = [5, 6, 7] # train test valid
window_sizes = [1, 3, 5, 7, 9]
lrs = [0.01, 0.001, 0.0001]
output_sizes = [1, 8, 9] # [binary, multinomial without negatives, multinomial with all utterances]
bert_b = [True, False]
opensmile_b = [True, False]
action_b = [True, False]
gamr_b = [True, False]
objects_b = [True, False]
cps_b = [True, False]
lin_layers = [True, False]


In [21]:
tokenizer, llm, llm_size = change_llm(LLM)
fold_len = int(np.ceil(len(train_datasets.get_datasets())/n_folds))
folds = [train_datasets.get_datasets()[x:x+fold_len] for x in range(0, len(train_datasets.get_datasets()), fold_len)]

for class_to_eval in range(6):
    print(f"Results for class {class_to_eval}")
    for k in range(len(folds)):
        train_l = []
        for i in range(len(folds)):
            if i != k:
                train_l += folds[i]
        test_l = folds[k]
        train_loader = DataLoader(dataset=rec_dataset(train_l),batch_size=16,shuffle=False)
        test_loader = DataLoader(dataset=rec_dataset(test_l),batch_size=16,shuffle=False)
        
        
        
        model = rec_common_ground(lin_layers, bert_b, opensmile_b, output_size).to(device)
        rec_train(model, 60, train_loader, class_to_eval)
        true, pred, pred_probs = rec_test(model, test_loader, class_to_eval)





        # try:
        #     auroc = roc_auc_score(true, torch.nan_to_num(pred_probs, 0.5))
        # except:
        #     auroc =  0.5
        # print(auroc)
        accuracy = accuracy_score(true.to("cpu"), pred.to("cpu"))
        print(accuracy)



        # torch.save(model, f"save/binary_all_features/{Class_to_eval}_{filt}_{NN_size}_{epochs}_{group_holdout}.pt")


Results for class 0


0.9505766062602965
0.9703459637561779
0.9686468646864687
Results for class 1
0.957166392092257
0.02800658978583196
0.03135313531353135
Results for class 2
0.9390444810543658
0.957166392092257
0.9455445544554455
Results for class 3
0.9604612850082372
0.9686985172981878
0.971947194719472
Results for class 4
0.9901153212520593
0.9950576606260296
0.9867986798679867
Results for class 5
0.9934102141680395
0.9950576606260296
0.9867986798679867


In [58]:
model = rec_common_ground()

In [189]:
model

rec_common_ground(
  (lin_bert1): Linear(in_features=512, out_features=256, bias=True)
  (lin_bert2): Linear(in_features=256, out_features=256, bias=True)
  (lstm_bert): LSTM(256, 256, batch_first=True)
  (lin_open1): Linear(in_features=88, out_features=256, bias=True)
  (lin_open2): Linear(in_features=256, out_features=256, bias=True)
  (lstm_opensmile): LSTM(256, 256, batch_first=True)
  (relu): ReLU()
  (ff1): Linear(in_features=512, out_features=512, bias=True)
  (ff2): Linear(in_features=512, out_features=512, bias=True)
  (act1): Tanh()
  (act2): SiLU()
  (classifier): Linear(in_features=512, out_features=1, bias=True)
)

In [183]:
model, loss = rec_train(model, 120, rec_train_loader, 0)

In [190]:
loss

[tensor(-5.8867e+08, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(-5.8867e+08, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(-5.8867e+08, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(-5.8867e+08, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(-5.8867e+08, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(-5.8867e+08, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(-5.8867e+08, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(-5.8867e+08, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(-5.8867e+08, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(-5.8867e+08, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(-5.8867e+08, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(-5.8867e+08, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(-5.8867e+08, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(-5.8867e+08, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(-5.8867e+08, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(-5.8867e+08, device='cuda:0', grad_fn=<DivBackw