In [1]:
import json
import os
import random
import torch
import torch.nn as nn 
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from torch import optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from scipy.interpolate import splev
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
from datetime import datetime
sns.set(font_scale=1.5)
warnings.filterwarnings("ignore")

Data

In [2]:
com = pd.read_csv("Group10_CGAnnote_Phase1_Ref.csv")
segments = pd.read_csv("Group_10/Group_10_CPS.csv")
seg_com = segments.copy().iloc[:,:4]
seg_com[["Observation", "Statement", "Accept", "Doubt", "Question", "Recommendation"]] = 0

In [3]:
ptr = 0
for i in range(com.shape[0]):
    start, end = float(com.iloc[i, 0]), float(com.iloc[i, 1])
    while True:
        if start > float(seg_com.iloc[ptr, 2]) or (float(seg_com.iloc[ptr, 2]) - start) < (end - seg_com.iloc[min(ptr+1, seg_com.shape[0]-1), 1]):
            ptr += 1
        else:
            if end < seg_com.iloc[ptr, 1]:
                utterance_id = ptr + 1000
            else:
                utterance_id = ptr
            break
    com.at[i, "Utterance_id"] = int(utterance_id)


In [4]:
com.head()

Unnamed: 0,Begin Time - ss.msec,End Time - ss.msec,Duration - ss.msec,test-common ground,Utterance_id
0,23.431,27.45,4.019,S0: STATEMENT(red =10),3.0
1,55.294,56.862,1.568,S1: STATEMENT(red =10),12.0
2,56.882,58.647,1.765,ACCEPT(S1) FACT += (red =10),12.0
3,60.666,65.294,4.628,R1: RECOMMENDATION(bigger ones?) QUD += bigger...,14.0
4,71.274,72.725,1.451,"O1: OBSERVATION: on(RedBlock and BlueBlock, Le...",17.0


In [5]:
for i in range(com.shape[0]):
    idx = int(com.iloc[i]["Utterance_id"])
    if "OBSERVATION" in com.iloc[i]["test-common ground"]:
        seg_com.at[idx, "Observation"] = 1
    elif "STATEMENT" in com.iloc[i]["test-common ground"]:
        seg_com.at[idx, "Statement"] = 1
    elif "ACCEPT" in com.iloc[i]["test-common ground"]:
        seg_com.at[idx, "Accept"] = 1
    elif "DOUBT" in com.iloc[i]["test-common ground"]:
        seg_com.at[idx, "Doubt"] = 1
    elif "QUESTION" in com.iloc[i]["test-common ground"]:
        seg_com.at[idx, "Question"] = 1
    elif "RECOMMENNDATION" in com.iloc[i]["test-common ground"]:
        seg_com.at[idx, "Recommendation"] = 1

In [6]:
seg_com

Unnamed: 0,Utterance,Start,End,Group,Observation,Statement,Accept,Doubt,Question,Recommendation
0,0,4.02,9.60,10,0,0,0,0,0,0
1,1,14.67,19.08,10,0,0,0,0,0,0
2,2,19.71,22.41,10,0,0,0,0,0,0
3,3,24.27,27.03,10,0,1,0,0,0,0
4,4,27.60,30.24,10,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
141,141,868.71,871.92,10,0,0,0,0,0,0
142,142,875.31,878.91,10,0,0,0,0,0,0
143,143,881.04,889.65,10,0,0,0,0,0,0
144,144,889.74,892.77,10,0,0,0,0,0,0


In [81]:
seg_com.iloc[:, 4:].sum()

Observation        5
Statement         12
Accept            11
Doubt              1
Question           0
Recommendation     0
dtype: int64

In [7]:
seg_com.to_csv("Group_10/Group_10_CG.csv", index=False)

Model

In [8]:
class DATA():

    def __init__(self):
        self.dataset = [[], []]
        self.targets = []

    def openBERT(self,filename):
        data=pd.read_csv(filename, names=["index", "embed"])
        data.sort_values(by='index',inplace=True)
        for index, row in data.iterrows():
            try:
                tensor=np.asarray(row[1].replace(',','.').split('[[')[1].split(']]')[0].split(),dtype=np.float32).tolist()
            except:
                print('problem with utterance number ',row[0])
                tensor=self.dataset[0][-1]
            self.dataset[0].append(tensor)
            
    
    def openSmile(self,filename):
        data=pd.read_csv(filename)
        for i in range (data.shape[0]):
            row=data[data['file']==f'D:\\Research\\Weights_Task\\Weights_Task_Audio\\{filename[filename.index("Group_"):filename.index("Group_")+8]}-audio_PCM\\segments\\{filename[filename.index("Group_"):filename.index("Group_")+8]}-audio_PCM_'+str(i)+'.wav']
            tensor=np.asarray(row.values[0][3:],dtype=np.float32).tolist()
            self.dataset[1].append(tensor)
        

    def openTarget(self,filename):
        data=pd.read_csv(filename)
        for row in range(data.shape[0]):
            target = data.iloc[row, 4:].values.astype(int)
            self.targets.append(target)


    def get_datasets(self):
        final_dataset=[]
        for bert,opensmile,label in zip(self.dataset[0],self.dataset[1],self.targets):
            final_dataset.append([bert,opensmile,label])
        random.shuffle(final_dataset)
        return final_dataset

In [9]:
def read_data(dataset, root, file):
    if 'bert' in file:
        dataset.openBERT(root+"/"+file)
    elif 'features' in file:
        dataset.openSmile(root+"/"+file)
    elif 'CG' in file:
        dataset.openTarget(root+"/"+file)

In [10]:
train_datasets = DATA()

In [11]:
for root, dirs,files in (os.walk(os. getcwd())):
    if "Group_10" in root:
        for file in files:
            read_data(train_datasets, root, file)

In [12]:
class nlp_dataset(Dataset):
    def __init__(self,xy=None):

        self.bert_data = torch.from_numpy(np.asarray([el[0] for el in xy ],dtype=np.float32))
        self.open_data = torch.from_numpy(np.asarray([el[1] for el in xy ],dtype=np.float32))
        self.y_data = torch.from_numpy(np.asarray([el[2] for el in xy ],dtype=np.float32))
        self.len=len(self.bert_data)


    def __getitem__(self, index):
        return self.bert_data[index], self.open_data[index], self.y_data[index]

    def __len__(self):
        return self.len

In [13]:
train_loader = DataLoader(dataset=nlp_dataset(train_datasets.get_datasets()),batch_size=16,shuffle=False)

In [26]:
class common_ground(nn.Module):
    def __init__(self):
        super(common_ground, self).__init__()
        self.lin_bert = nn.Linear(512, 256)
        self.lin_open = nn.Linear(88, 256)
        self.ff = nn.Linear(512, 512)
        self.classifier = nn.Linear(512, 1)
    
    def forward(self, bert, opensmile):
        bert = self.lin_bert(bert)
        opensmile = self.lin_open(opensmile)
        x = torch.hstack((bert, opensmile))
        x = self.ff(x)
        predict = self.classifier(x)

        return predict

In [27]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [72]:
def train(model, total_epochs, train_iterator, class_to_eval):
    
    optimizer = optim.Adam(model.parameters())

    nepochs = 0
    model.train()
    model = model.to(device)
    criterion = nn.BCELoss(reduction='mean').to(device)
    while nepochs < total_epochs :
        optimizer.zero_grad()
        for batch_idx, (bert_data, open_data, target) in enumerate(train_iterator):
            output = model(bert_data.to(device), open_data.to(device))
            target_binary = torch.zeros(target.size()[0], 1).to(device)
            for i,t in enumerate(target):
                target_binary[i] = torch.Tensor([t[class_to_eval]])
            loss = criterion(torch.sigmoid(output).to(device), target_binary)
            loss.backward()
            optimizer.step()
        nepochs += 1

    return model

In [73]:
def test(model, test_iterator, class_to_eval):
    with torch.no_grad():
        model.eval()
        true, pred = None, None
        for batch_idx, (bert_data, open_data, target) in enumerate(test_iterator):
            output = model(bert_data.to(device), open_data.to(device))
            pred_tmp = torch.sigmoid(output)
            true_binary = torch.zeros(target.size()[0], 1)
            for i, t in enumerate(target):
                true_binary[i] = torch.Tensor([t[class_to_eval]])
            
            if true == None:
                true = true_binary
                pred = pred_tmp > 0.5
                pred_probs = pred_tmp

            else :
                true = torch.cat((true, true_binary))
                pred = torch.cat((pred, pred_tmp > 0.5))
                pred_probs = torch.cat((pred_probs, pred_tmp))
    return true, pred, pred_probs


In [66]:
class_to_eval = 1
model = common_ground().to(device)

In [67]:
train(model, 20, train_loader, 1)

common_ground(
  (lin_bert): Linear(in_features=512, out_features=256, bias=True)
  (lin_open): Linear(in_features=88, out_features=256, bias=True)
  (ff): Linear(in_features=512, out_features=512, bias=True)
  (classifier): Linear(in_features=512, out_features=1, bias=True)
)

In [69]:
len(test(model, train_loader, 1))

3

In [None]:
# K-folds
# get results

In [82]:
folds = [train_datasets.get_datasets()[x:x+30] for x in range(0, len(train_datasets.get_datasets()), 30)]

In [86]:
for class_to_eval in range(5):
    for k in range(len(folds)):
        train_l = []
        for i in range(len(folds)):
            if i != k:
                train_l += folds[i]
        test_l = folds[k]
        train_loader = DataLoader(dataset=nlp_dataset(train_l),batch_size=16,shuffle=False)
        test_loader = DataLoader(dataset=nlp_dataset(test_l),batch_size=16,shuffle=False)
        model = common_ground().to(device)
        train(model, 20, train_loader, class_to_eval)
        true, pred, pred_probs = test(model, test_loader, class_to_eval)
        # try:
        #     auroc = roc_auc_score(true, torch.nan_to_num(pred_probs, 0))
        # except:
        #     auroc =  0
        # print(auroc)
        accuracy = accuracy_score(true.to("cpu"), pred.to("cpu"))
        print(accuracy)

0.03333333333333333
0.03333333333333333
0.03333333333333333
0.9
0.0
0.03333333333333333
0.9333333333333333
0.03333333333333333
0.06666666666666667
1.0
0.03333333333333333
0.9
1.0
0.13333333333333333
0.9615384615384616
1.0
0.0
0.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
