In [25]:
import torch
from scipy.io import loadmat
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn.decomposition import PCA
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import WeightedRandomSampler
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
import numpy as np
from tqdm import tqdm
import warnings
# NOTE(review): this suppresses *every* warning for the whole kernel session,
# including deprecation warnings raised by torch / sklearn — consider a
# narrower filter so real problems stay visible.
warnings.filterwarnings('ignore')
# Seed torch's global RNG. No numpy or Python `random` seed is set in this cell.
torch.manual_seed(54749110)

<torch._C.Generator at 0x1ac82ed2970>

In [26]:
# Per-file feature matrices and label arrays, one list entry per .mat file.
train_data_list = []
train_label_list = []
test_data_list = []
test_label_list = []

# Exactly one scaler is active; the commented-out lines are alternatives.
# scaler = StandardScaler()
scaler = MinMaxScaler()
# scaler = MaxAbsScaler()

# Training files: data/train/1.mat .. data/train/10.mat.
# fit_transform() re-fits the scaler on every file, so each file is scaled
# independently with its own statistics.
# presumably one file per subject/session — TODO confirm.
for i in range(1, 11):
    mat_data = loadmat("data/train/"+str(i)+".mat")
    train_data_list.append(scaler.fit_transform(mat_data['de_feature']))
    train_label_list.append(mat_data['label'])

# Test files: data/test/11.mat .. data/test/13.mat.
# NOTE(review): the scaler is re-fitted on each test file as well (no statistics
# from the training files are reused) — confirm per-file scaling is intended.
for i in range(11, 14):
    mat_data = loadmat("data/test/"+str(i)+".mat")
    test_data_list.append(scaler.fit_transform(mat_data['de_feature']))
    test_label_list.append(mat_data['label'])

# Stack the per-file arrays along the first axis into one array per split.
# The per-file test lists are kept as well; the later evaluation cell builds
# one dataset per test file from them.
train_datas = np.concatenate(train_data_list)
train_labels = np.concatenate(train_label_list)
test_datas = np.concatenate(test_data_list)
test_labels = np.concatenate(test_label_list)

# Optional PCA step, currently disabled.
# NOTE(review): if re-enabled, the test line below calls fit_transform (a fresh
# fit per test file) rather than reusing the projection fitted on train_datas.
# pca = PCA(n_components=10)
# train_datas = pca.fit_transform(train_datas)
# test_data_list = [pca.fit_transform(x) for x in test_data_list]

In [27]:
class sentimentDataset(Dataset):
    """Dataset of non-overlapping, fixed-length windows over a feature matrix.

    Sample ``idx`` covers rows ``idx * window_size`` up to (but excluding)
    ``(idx + 1) * window_size`` of ``data``. Trailing rows that do not fill a
    complete window are dropped.

    Args:
        data: Array with a ``shape`` attribute that can be sliced along its
            first axis; rows are the steps of a window.
        labels: Optional per-row labels aligned with ``data``. Must support
            slicing and ``.tolist()`` (e.g. a numpy array). When ``None`` the
            dataset yields features only.
        window_size: Number of consecutive rows per sample.
    """

    def __init__(self, data, labels=None, window_size=6):
        self.data = data
        self.labels = labels
        self.window_size = window_size
        # Only whole windows are served.
        self.len = data.shape[0] // window_size

    def __getitem__(self, idx):
        """Return the ``idx``-th window.

        Returns:
            ``(data_tensor, label_tensor)`` when labels were supplied, where
            ``label_tensor`` is the most frequent label inside the window
            (ties go to the label that occurs first in the window) and has
            the shape of a single label row. Without labels a 1-tuple
            ``(data_tensor,)`` is returned so ``sample[0]`` is still the
            features.

        Raises:
            IndexError: if ``idx`` is outside ``[0, len(self))``. This also
                lets plain ``for sample in dataset`` iteration terminate.
        """
        if not 0 <= idx < self.len:
            raise IndexError(
                "window index %d out of range for %d windows" % (idx, self.len))

        start = idx * self.window_size
        stop = start + self.window_size
        data_tensor = torch.tensor(self.data[start:stop], dtype=torch.float32)
        if self.labels is None:
            return (data_tensor,)

        # Build the Python list once instead of once per element.
        window_labels = self.labels[start:stop].tolist()
        majority = max(window_labels, key=window_labels.count)
        label_tensor = torch.tensor(majority, dtype=torch.long)
        return data_tensor, label_tensor

    def __len__(self):
        return self.len

In [28]:
# testsets = [sentimentDataset(test_data_list[i], test_label_list[i]) for i in range(3)]
# testloaders = [DataLoader(testset, batch_size=64) for testset in testsets]
# it = iter(testloaders[1])

In [29]:
EPOCHS = 25        # number of epochs run by train_model
DEV_NUM = 0        # CUDA device index used by train_model when a GPU is available
HIDDEN_SIZE = 512  # default width of the LSTM hidden state

class LSTM_baseline(nn.Module):
    """Single-layer LSTM followed by a two-layer MLP classifier.

    Args:
        input_size: Number of features per time step (default 310).
        hidden_size: LSTM hidden width; ``None`` (default) uses the
            module-level ``HIDDEN_SIZE`` at construction time.
        num_classes: Number of output classes (default 4).
    """

    def __init__(self, input_size=310, hidden_size=None, num_classes=4):
        super().__init__()
        if hidden_size is None:
            hidden_size = HIDDEN_SIZE
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        # The classifier input must match the LSTM hidden width (it used to be
        # hard-coded to 512, which broke as soon as HIDDEN_SIZE was changed).
        self.fc = nn.Sequential(nn.Linear(hidden_size, 128),
                                nn.ReLU(True),
                                nn.Linear(128, num_classes))

    def forward(self, datas, labels=None):
        """Classify a batch of windows.

        Args:
            datas: Float tensor ``[batch_size, window_size, input_size]``.
            labels: Optional integer class labels with ``batch_size`` elements
                in total (any shape; flattened before use).

        Returns:
            ``(logits,)`` without labels, ``(loss, logits)`` with labels.
            ``logits`` are unnormalised scores computed from the final hidden
            state and keep its leading layer axis:
            ``[1, batch_size, num_classes]``. ``loss`` is the mean
            cross-entropy over the batch.
        """
        # hn: [num_layers(=1), batch_size, hidden_size]; the per-step outputs
        # and the cell state are not needed.
        _, (hn, _) = self.lstm(datas)
        logits = self.fc(hn)
        outputs = (logits,)
        if labels is not None:
            # cross_entropy applies log-softmax itself, so raw logits go in.
            loss = F.cross_entropy(logits.reshape(-1, logits.shape[-1]),
                                   labels.reshape(-1))
            outputs = (loss,) + outputs
        return outputs

In [30]:
def get_predictions(model, dataloader, compute_acc=False):
    """Run ``model`` over ``dataloader`` and return its class predictions.

    Args:
        model: ``nn.Module`` whose call ``model(datas)`` returns a tuple with
            the logits as first element; the last logits axis is the class
            axis and any leading axes are flattened into the batch.
        dataloader: Iterable of samples where ``sample[0]`` is the input
            batch and, when ``compute_acc`` is true, ``sample[1]`` holds the
            integer labels (any shape, one per input).
        compute_acc: Also compute the accuracy against ``sample[1]``.

    Returns:
        ``predictions`` — 1-D tensor of predicted class indices in iteration
        order, or ``None`` if the dataloader yielded nothing. With
        ``compute_acc=True`` returns ``(predictions, acc)`` where ``acc`` is
        ``correct / total`` (NaN when no labels were seen).

    Side effects:
        A model that lives on the CPU is moved to CUDA when CUDA is
        available. A model already on a CUDA device stays on that device.
        The model's train/eval mode is restored on exit.
    """
    # Evaluate on the device the model already uses. Unconditionally calling
    # model.to("cuda") would pull a model living on cuda:N back to the default
    # GPU in the middle of training.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    if device.type != "cuda" and torch.cuda.is_available():
        device = torch.device("cuda")
        model.to(device)

    was_training = model.training
    model.eval()
    batch_preds = []
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for sample in dataloader:
                datas = sample[0].to(device)
                logits = model(datas)[0]
                # Flatten to [n_samples, n_classes]. A bare squeeze() would also
                # drop the batch axis when the batch holds a single sample.
                logits = logits.reshape(-1, logits.shape[-1])
                # softmax is monotonic, so argmax on raw logits is equivalent.
                pred = logits.argmax(dim=1)
                if compute_acc:
                    labels = sample[1].to(device).reshape(-1)
                    total += labels.shape[0]
                    correct += (pred == labels).sum().item()
                batch_preds.append(pred)
    finally:
        # Put the model back into whatever mode the caller had it in.
        model.train(was_training)

    # Concatenate once instead of growing a tensor batch by batch.
    predictions = torch.cat(batch_preds) if batch_preds else None
    if compute_acc:
        acc = correct / total if total else float("nan")
        return predictions, acc
    return predictions
        

def train_model(model, trainset, validloaders: list):
    """Train ``model`` on ``trainset`` for ``EPOCHS`` epochs and log accuracy.

    The training set is served in shuffled batches of 64 and optimised with
    Adam (lr 5e-5) on ``cuda:DEV_NUM`` when CUDA is available, else on the
    CPU. ``model(datas, labels)`` must return a tuple whose first element is
    the loss. After every epoch one line is printed with the summed batch
    loss, the training accuracy, and the mean and individual accuracies over
    ``validloaders``. Nothing is returned; the model is updated in place.
    """
    trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
    if torch.cuda.is_available():
        device = torch.device("cuda:" + str(DEV_NUM))
    else:
        device = torch.device("cpu")
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
    model = model.to(device)
    model.train()

    for epoch in range(EPOCHS):
        # Sum (not mean) of the per-batch losses over this epoch.
        epoch_loss = 0.0
        for batch_x, batch_y in trainloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            loss = model(batch_x, batch_y)[0]
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        # Accuracy on the training data first, then on each validation loader.
        train_acc = get_predictions(model, trainloader, compute_acc=True)[1]
        valid_accs = np.array([
            get_predictions(model, loader, compute_acc=True)[1]
            for loader in validloaders
        ])
        summary = "In epoch %d, running_loss: %.3f, train_acc: %.3f, valid_avg_acc: %.3f," % (
            epoch, epoch_loss, train_acc, valid_accs.mean())
        print(summary + " accs: " + str(valid_accs))
    print("Training done...")

In [31]:
# Windowed training set built from the concatenated training features/labels.
trainset = sentimentDataset(train_datas, train_labels)
# NOTE(review): this batch_size=2 loader is not referenced by any later cell
# visible here (train_model builds its own loader) — looks like a debugging
# leftover; confirm before removing.
trainloader = DataLoader(trainset, batch_size=2)

In [32]:
# Fresh, randomly initialised model.
model = LSTM_baseline()
trainset = sentimentDataset(train_datas, train_labels)
# One windowed dataset / loader per test file so accuracy is reported per file.
# Pairing the lists with zip avoids hard-coding the number of test files.
testsets = [sentimentDataset(data, labels)
            for data, labels in zip(test_data_list, test_label_list)]
testloaders = [DataLoader(testset, batch_size=64) for testset in testsets]
train_model(model, trainset, testloaders)

In epoch 0, running_loss: 31.673, train_acc: 0.307, valid_avg_acc: 0.314, accs: [0.30496454 0.30496454 0.33333333]
In epoch 1, running_loss: 31.280, train_acc: 0.330, valid_avg_acc: 0.314, accs: [0.30496454 0.30496454 0.33333333]
In epoch 2, running_loss: 30.646, train_acc: 0.453, valid_avg_acc: 0.319, accs: [0.27659574 0.29787234 0.38297872]
In epoch 3, running_loss: 29.572, train_acc: 0.511, valid_avg_acc: 0.296, accs: [0.29787234 0.31914894 0.26950355]
In epoch 4, running_loss: 27.433, train_acc: 0.530, valid_avg_acc: 0.303, accs: [0.34042553 0.23404255 0.33333333]
In epoch 5, running_loss: 24.798, train_acc: 0.648, valid_avg_acc: 0.310, accs: [0.43262411 0.22695035 0.26950355]
In epoch 6, running_loss: 21.875, train_acc: 0.721, valid_avg_acc: 0.388, accs: [0.43971631 0.34751773 0.37588652]
In epoch 7, running_loss: 18.895, train_acc: 0.740, valid_avg_acc: 0.456, accs: [0.43262411 0.4822695  0.45390071]
In epoch 8, running_loss: 17.495, train_acc: 0.794, valid_avg_acc: 0.468, accs: 