In [1]:
!pip install --upgrade gdown

# Main link
!gdown --id '1N1eVIDe9hKM5uiNRGmifBlwSDGiVXPJe' --output libriphone.zip
# !gdown --id '1qzCRnywKh30mTbWUEjXuNT2isOCAPdO1' --output libriphone.zip

!unzip -q libriphone.zip
!ls libriphone




[notice] A new release of pip is available: 23.0 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Access denied with the following error:



 	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses. 

You may still be able to access the file from the browser:

	 https://drive.google.com/uc?id='1N1eVIDe9hKM5uiNRGmifBlwSDGiVXPJe' 

'unzip' 不是內部或外部命令、可執行的程式或批次檔。
'ls' 不是內部或外部命令、可執行的程式或批次檔。


In [2]:
import numpy as np
import torch
import random

def same_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (plus the CUDA generators when CUDA is available), and switches cuDNN to
    deterministic mode with benchmark auto-tuning turned off.

    Args:
        seed: Integer seed applied to all generators.
    """
    # cuDNN flags are plain attribute writes and independent of the seeding calls.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [3]:
# Report the installed PyTorch build, then pick the compute device:
# the GPU when CUDA is usable, otherwise the CPU.
print("pytorch version:",torch.__version__)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("use",device,"now!")

pytorch version: 1.13.1
use cuda now!


In [4]:
import os
import torch
from tqdm import tqdm

def load_feat(path):
    """Read one serialized feature object from disk.

    Args:
        path: Location of a file readable by ``torch.load``.

    Returns:
        Whatever object ``torch.load`` deserializes from ``path``.
    """
    # NOTE(review): torch.load unpickles its input; only point it at trusted files.
    return torch.load(path)

def shift(x, n):
    """Shift the rows of a 2-D tensor by ``n`` positions, padding with the edge row.

    For ``n > 0`` output row ``t`` is ``x[t + n]`` and the tail is filled with
    copies of the last row; for ``n < 0`` output row ``t`` is ``x[t + n]`` and
    the head is filled with copies of the first row.

    The output always has the same number of rows as ``x``. Shifts larger than
    the sequence are clamped, so the result is then made entirely of the edge
    row (previously such a shift produced ``|n|`` rows, which broke callers
    that write the result back into a buffer of ``len(x)`` rows).

    Args:
        x: Tensor of shape (seq_len, feature_dim).
        n: Signed number of rows to shift by.

    Returns:
        ``x`` itself when ``n == 0`` or ``x`` has no rows, otherwise a new
        tensor of shape (seq_len, feature_dim).
    """
    seq_len = x.size(0)
    if n == 0 or seq_len == 0:
        # Nothing to move (or no edge row to pad with).
        return x

    # Clamp so the padding plus the kept slice add up to exactly seq_len rows.
    n = max(-seq_len, min(n, seq_len))

    if n < 0:
        left = x[0].repeat(-n, 1)
        right = x[:n]
    else:
        left = x[n:]
        right = x[-1].repeat(n, 1)

    return torch.cat((left, right), dim=0)

def concat_feat(x, concat_n):
    """Attach neighbouring frames to every frame of one utterance.

    Each output row holds ``concat_n`` consecutive frames laid side by side
    with the original frame in the middle; neighbours that fall outside the
    utterance are filled by ``shift`` with the first/last frame.

    Args:
        x: Tensor of shape (seq_len, feature_dim).
        concat_n: Odd number of frames per output row.

    Returns:
        ``x`` unchanged when ``concat_n`` is below 2, otherwise a tensor of
        shape (seq_len, concat_n * feature_dim).
    """
    assert concat_n % 2 == 1 # n must be odd
    if concat_n < 2:
        return x

    seq_len, feature_dim = x.shape[0], x.shape[1]

    # Tile the utterance concat_n times, then expose it as one
    # (seq_len, feature_dim) slab per window slot: (concat_n, seq_len, feature_dim).
    stacked = x.repeat(1, concat_n).view(seq_len, concat_n, feature_dim).permute(1, 0, 2)

    centre = concat_n // 2
    for offset in range(1, centre + 1):
        # Slots after the centre look `offset` frames ahead, slots before it
        # look `offset` frames back; the writes go through to the tiled storage.
        stacked[centre + offset, :] = shift(stacked[centre + offset], offset)
        stacked[centre - offset, :] = shift(stacked[centre - offset], -offset)

    # Undo the permute and flatten the window back into one row per frame.
    return stacked.permute(1, 0, 2).view(seq_len, concat_n * feature_dim)

def preprocess_data(split, feat_dir, phone_path, concat_nframes, train_ratio=0.8, random_seed=1213):
    """Load one split of the dataset into dense tensors.

    Args:
        split: ``'train'``, ``'val'`` or ``'test'``. ``'train'`` and ``'val'``
            both read the training files and differ only in which part of the
            shuffled utterance list they keep.
        feat_dir: Directory containing ``train/`` and ``test/`` sub-folders of
            ``<utterance>.pt`` feature files.
        phone_path: Directory containing ``train_labels.txt``,
            ``train_split.txt`` and ``test_split.txt``.
        concat_nframes: Number of frames concatenated per sample (forwarded to
            ``concat_feat``).
        train_ratio: Fraction of the shuffled training utterances used for
            ``'train'``; the remainder is used for ``'val'``.
        random_seed: Seed for the train/val shuffle. Note that this re-seeds
            Python's global ``random`` module.

    Returns:
        ``(X, y)`` for ``'train'``/``'val'`` and ``X`` alone for ``'test'``,
        where ``X`` has shape (num_frames, 39 * concat_nframes) and ``y`` is a
        long tensor of shape (num_frames,).

    Raises:
        ValueError: If ``split`` is not one of the three accepted names.
        RuntimeError: If the split holds more frames than the pre-allocated
            buffer can store.
    """
    class_num = 41 # NOTE: pre-computed, should not need change
    feature_dim = 39  # per-frame feature size of the stored .pt files

    if split in ('train', 'val'):
        mode = 'train'
    elif split == 'test':
        mode = 'test'
    else:
        raise ValueError('Invalid \'split\' argument for dataset: PhoneDataset!')

    label_dict = {}
    if mode == 'train':
        # Each label line is "<utterance id> <label> <label> ...".
        with open(os.path.join(phone_path, f'{mode}_labels.txt')) as label_file:
            for line in label_file:
                line = line.strip('\n').split(' ')
                label_dict[line[0]] = [int(p) for p in line[1:]]

        # split training and validation data
        with open(os.path.join(phone_path, 'train_split.txt')) as split_file:
            usage_list = split_file.readlines()
        random.seed(random_seed)
        random.shuffle(usage_list)
        train_len = int(len(usage_list) * train_ratio)
        usage_list = usage_list[:train_len] if split == 'train' else usage_list[train_len:]
    else:
        with open(os.path.join(phone_path, 'test_split.txt')) as split_file:
            usage_list = split_file.readlines()

    usage_list = [line.strip('\n') for line in usage_list]
    print('[Dataset] - # phone classes: ' + str(class_num) + ', number of utterances for ' + split + ': ' + str(len(usage_list)))

    # Frames are written into one pre-allocated buffer and trimmed afterwards.
    max_len = 3000000
    X = torch.empty(max_len, feature_dim * concat_nframes)
    if mode == 'train':
        y = torch.empty(max_len, dtype=torch.long)

    idx = 0
    # Wrap the list itself (not an enumerate object) so tqdm knows the total.
    for fname in tqdm(usage_list):
        feat = load_feat(os.path.join(feat_dir, mode, f'{fname}.pt'))
        cur_len = len(feat)
        if idx + cur_len > max_len:
            raise RuntimeError(
                f'{split} split needs more than the {max_len} pre-allocated frames '
                f'(overflow while loading {fname})'
            )
        feat = concat_feat(feat, concat_nframes)

        X[idx: idx + cur_len, :] = feat
        if mode == 'train':
            y[idx: idx + cur_len] = torch.LongTensor(label_dict[fname])

        idx += cur_len

    # Keep only the rows that were actually filled.
    X = X[:idx, :]
    if mode == 'train':
        y = y[:idx]

    print(f'[INFO] {split} set')
    print(X.shape)
    if mode == 'train':
        print(y.shape)
        return X, y
    return X


# Dataset

In [5]:
import torch
from torch.utils.data import Dataset

class LibriDataset(Dataset):
    """Map-style dataset over pre-computed frame features.

    Yields ``(feature, label)`` pairs when labels were supplied and bare
    features otherwise.
    """

    def __init__(self, X, y=None):
        """Store the features and, if given, the labels converted to a LongTensor.

        Args:
            X: Indexable collection of samples (one sample per index).
            y: Optional labels aligned with ``X``; ``None`` for unlabeled data.
        """
        self.data = X
        self.label = None if y is None else torch.LongTensor(y)

    def __getitem__(self, idx):
        """Return sample ``idx``, paired with its label when labels exist."""
        sample = self.data[idx]
        if self.label is None:
            return sample
        return sample, self.label[idx]

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)


# Model
Feel free to modify the structure of the model.

參考資料
學習率調整:https://ithelp.ithome.com.tw/articles/10277623
LSTM model 參考 : https://ithelp.ithome.com.tw/articles/10292728 、ChatGPT

In [6]:
import torch.nn as nn

class BiLSTM(nn.Module):
    """Bidirectional LSTM that classifies the centre frame of a frame window.

    Each input row is a flattened window of ``concat_nframes`` frames with
    ``input_dim`` features per frame. The window is run through a stacked
    bidirectional LSTM, the LSTM output at the centre frame is taken, and a
    small fully connected head maps it to ``output_dim`` class scores.

    The size arguments are optional: any that is left as ``None`` is read from
    the notebook-level variable of the same name at construction time, so the
    existing ``BiLSTM()`` calls keep working. The resolved window geometry is
    stored on the instance so ``forward`` does not depend on globals.
    """

    def __init__(self, output_dim = 41, input_dim=None, hidden_dim=None, hidden_layers=None, concat_nframes=None):
        """Build the LSTM and the classification head.

        Args:
            output_dim: Number of output classes.
            input_dim: Features per frame; defaults to the global ``input_dim``.
            hidden_dim: LSTM hidden size per direction; defaults to the global
                ``hidden_dim``.
            hidden_layers: Number of stacked LSTM layers; defaults to the
                global ``hidden_layers``.
            concat_nframes: Frames per input window; defaults to the global
                ``concat_nframes``.

        Raises:
            NameError: If an argument is omitted and the matching notebook
                variable has not been defined yet.
        """
        super(BiLSTM, self).__init__()
        input_dim = self._resolve('input_dim', input_dim)
        hidden_dim = self._resolve('hidden_dim', hidden_dim)
        hidden_layers = self._resolve('hidden_layers', hidden_layers)
        concat_nframes = self._resolve('concat_nframes', concat_nframes)

        # Plain attributes (not parameters/buffers), so checkpoints are unaffected.
        self.input_dim = input_dim
        self.concat_nframes = concat_nframes

        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=hidden_layers,
                            batch_first=True, dropout=0.3, bidirectional=True)
        # The layer order fixes the state-dict keys (fc.1, fc.3, fc.5, fc.7); keep it stable.
        self.fc = nn.Sequential(
            nn.GELU(),
            nn.BatchNorm1d(hidden_dim * 2),   # both LSTM directions are concatenated
            nn.Dropout(0.3),
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.LeakyReLU(0.1),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, output_dim)
        )

    @staticmethod
    def _resolve(name, value):
        """Return ``value``, or the module-level variable ``name`` when ``value`` is None."""
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                f"name '{name}' is not defined; pass {name}=... or run the hyper-parameter cell first"
            ) from None

    def forward(self, x):
        """Score a batch of flattened frame windows.

        Args:
            x: Tensor whose elements can be viewed as
                (batch, concat_nframes, input_dim).

        Returns:
            Tensor of shape (batch, output_dim) with unnormalized class scores.
        """
        # Unflatten each row into a sequence of frames: (batch, concat_nframes, input_dim).
        x = x.view(-1, self.concat_nframes, self.input_dim)
        x, _ = self.lstm(x)
        # The LSTM output is 3-D; keep only the centre frame to get a 2-D tensor.
        x = x[:, self.concat_nframes // 2]
        return self.fc(x)

# Hyper-parameters

In [7]:
# ---- data parameters ----
# Frames per input window; must be odd (2k + 1 frames: k on each side of the centre).
concat_nframes = 59
# Share of the training utterances used for training; the rest is held out for validation.
train_ratio = 0.99

# ---- model parameters ----
input_dim = 39        # features per frame
hidden_dim = 800      # LSTM hidden size per direction
hidden_layers = 3     # number of stacked LSTM layers

# ---- training parameters ----
seed = 24                     # random seed for reproducibility
batch_size = 2048             # samples per batch
num_epoch = 20                # number of training epochs
learning_rate = 1e-3          # initial learning rate
model_path = './model.ckpt'   # where the best checkpoint is written

# Dataloader

In [8]:
from torch.utils.data import DataLoader
import gc

# Fix all random generators before any shuffling happens.
same_seeds(seed)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'DEVICE: {device}')

# Arguments shared by the training and validation passes; both must use the
# same ratio and seed so the two splits partition the same shuffled list.
split_kwargs = dict(
    feat_dir='./libriphone/feat',
    phone_path='./libriphone',
    concat_nframes=concat_nframes,
    train_ratio=train_ratio,
    random_seed=seed,
)
train_X, train_y = preprocess_data(split='train', **split_kwargs)
val_X, val_y = preprocess_data(split='val', **split_kwargs)

# Wrap the tensors as datasets.
train_set = LibriDataset(train_X, train_y)
val_set = LibriDataset(val_X, val_y)

# Drop the notebook-level names for the raw tensors and run a collection pass.
del train_X, train_y, val_X, val_y
gc.collect()

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

DEVICE: cuda
[Dataset] - # phone classes: 41, number of utterances for train: 3394


3394it [00:45, 73.80it/s]


[INFO] train set
torch.Size([2095636, 2301])
torch.Size([2095636])
[Dataset] - # phone classes: 41, number of utterances for val: 35


35it [00:00, 74.83it/s]

[INFO] val set
torch.Size([21158, 2301])
torch.Size([21158])





# Training

In [9]:
# Build the model, the loss, the optimizer and the learning-rate schedule.
model = BiLSTM().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.0001)
# Cosine annealing with warm restarts; stepped once per epoch below.
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=3, T_mult=5, eta_min=0.000001, verbose=True
)

has_validation = len(val_set) > 0
best_acc = 0.0
for epoch in range(num_epoch):
    # Running totals: correct-prediction counts and summed batch losses.
    train_acc, train_loss = 0.0, 0.0
    val_acc, val_loss = 0.0, 0.0

    # ---- training pass ----
    model.train()
    for features, labels in tqdm(train_loader):
        features, labels = features.to(device), labels.to(device)
        outputs = model(features)

        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Predicted class = index of the largest score in each row.
        train_pred = outputs.max(dim=1).indices
        train_acc += (train_pred == labels).sum().item()
        train_loss += loss.item()
    scheduler.step()

    if not has_validation:
        print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f}'.format(
            epoch + 1, num_epoch, train_acc/len(train_set), train_loss/len(train_loader)
        ))
        continue

    # ---- validation pass ----
    model.eval()
    with torch.no_grad():
        for features, labels in tqdm(val_loader):
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            loss = criterion(outputs, labels)

            val_pred = outputs.max(dim=1).indices
            val_acc += (val_pred == labels).sum().item()
            val_loss += loss.item()

    print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f} | Val Acc: {:3.6f} loss: {:3.6f}'.format(
        epoch + 1, num_epoch, train_acc/len(train_set), train_loss/len(train_loader), val_acc/len(val_set), val_loss/len(val_loader)
    ))

    # Checkpoint whenever the validation accuracy beats the best seen so far.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), model_path)
        print('saving model with acc {:.3f}'.format(best_acc/len(val_set)))

# Without a validation set there is no "best" epoch, so keep the final weights.
if not has_validation:
    torch.save(model.state_dict(), model_path)
    print('saving model at last epoch')

Epoch 00000: adjusting learning rate of group 0 to 1.0000e-03.


100%|██████████| 1024/1024 [16:10<00:00,  1.05it/s]


Epoch 00001: adjusting learning rate of group 0 to 7.5025e-04.


100%|██████████| 11/11 [00:03<00:00,  3.57it/s]


[001/020] Train Acc: 0.783643 Loss: 0.693890 | Val Acc: 0.816807 loss: 0.630993
saving model with acc 0.817


100%|██████████| 1024/1024 [16:09<00:00,  1.06it/s]


Epoch 00002: adjusting learning rate of group 0 to 2.5075e-04.


100%|██████████| 11/11 [00:03<00:00,  3.54it/s]


[002/020] Train Acc: 0.910252 Loss: 0.258244 | Val Acc: 0.827158 loss: 0.713703
saving model with acc 0.827


100%|██████████| 1024/1024 [16:10<00:00,  1.06it/s]


Epoch 00003: adjusting learning rate of group 0 to 1.0000e-03.


100%|██████████| 11/11 [00:03<00:00,  3.53it/s]


[003/020] Train Acc: 0.952912 Loss: 0.124572 | Val Acc: 0.832357 loss: 0.787392
saving model with acc 0.832


100%|██████████| 1024/1024 [16:10<00:00,  1.05it/s]


Epoch 00004: adjusting learning rate of group 0 to 9.8908e-04.


100%|██████████| 11/11 [00:03<00:00,  3.53it/s]


[004/020] Train Acc: 0.924875 Loss: 0.207942 | Val Acc: 0.824038 loss: 0.795835


100%|██████████| 1024/1024 [16:10<00:00,  1.05it/s]


Epoch 00005: adjusting learning rate of group 0 to 9.5682e-04.


100%|██████████| 11/11 [00:03<00:00,  3.54it/s]


[005/020] Train Acc: 0.943044 Loss: 0.152163 | Val Acc: 0.825929 loss: 0.818095


100%|██████████| 1024/1024 [16:12<00:00,  1.05it/s]


Epoch 00006: adjusting learning rate of group 0 to 9.0460e-04.


100%|██████████| 11/11 [00:03<00:00,  3.53it/s]


[006/020] Train Acc: 0.952236 Loss: 0.125453 | Val Acc: 0.826969 loss: 0.873131


100%|██████████| 1024/1024 [16:12<00:00,  1.05it/s]


Epoch 00007: adjusting learning rate of group 0 to 8.3473e-04.


100%|██████████| 11/11 [00:03<00:00,  3.54it/s]


[007/020] Train Acc: 0.958389 Loss: 0.108232 | Val Acc: 0.830608 loss: 0.908194


100%|██████████| 1024/1024 [16:12<00:00,  1.05it/s]


Epoch 00008: adjusting learning rate of group 0 to 7.5025e-04.


100%|██████████| 11/11 [00:03<00:00,  3.53it/s]


[008/020] Train Acc: 0.964444 Loss: 0.091892 | Val Acc: 0.834153 loss: 0.900610
saving model with acc 0.834


100%|██████████| 1024/1024 [16:13<00:00,  1.05it/s]


Epoch 00009: adjusting learning rate of group 0 to 6.5485e-04.


100%|██████████| 11/11 [00:03<00:00,  3.55it/s]


[009/020] Train Acc: 0.969805 Loss: 0.077896 | Val Acc: 0.832829 loss: 0.955180


100%|██████████| 1024/1024 [17:00<00:00,  1.00it/s]


Epoch 00010: adjusting learning rate of group 0 to 5.5271e-04.


100%|██████████| 11/11 [00:03<00:00,  3.36it/s]


[010/020] Train Acc: 0.974974 Loss: 0.064369 | Val Acc: 0.832782 loss: 1.030381


100%|██████████| 1024/1024 [16:22<00:00,  1.04it/s]


Epoch 00011: adjusting learning rate of group 0 to 4.4829e-04.


100%|██████████| 11/11 [00:03<00:00,  3.52it/s]


[011/020] Train Acc: 0.980097 Loss: 0.051496 | Val Acc: 0.833207 loss: 1.078811


100%|██████████| 1024/1024 [16:09<00:00,  1.06it/s]


Epoch 00012: adjusting learning rate of group 0 to 3.4615e-04.


100%|██████████| 11/11 [00:03<00:00,  3.58it/s]


[012/020] Train Acc: 0.984720 Loss: 0.040124 | Val Acc: 0.838973 loss: 1.130994
saving model with acc 0.839


100%|██████████| 1024/1024 [16:07<00:00,  1.06it/s]


Epoch 00013: adjusting learning rate of group 0 to 2.5075e-04.


100%|██████████| 11/11 [00:03<00:00,  3.54it/s]


[013/020] Train Acc: 0.988835 Loss: 0.029744 | Val Acc: 0.838170 loss: 1.215840


100%|██████████| 1024/1024 [16:07<00:00,  1.06it/s]


Epoch 00014: adjusting learning rate of group 0 to 1.6627e-04.


100%|██████████| 11/11 [00:03<00:00,  3.56it/s]


[014/020] Train Acc: 0.992108 Loss: 0.021375 | Val Acc: 0.838170 loss: 1.334736


100%|██████████| 1024/1024 [16:07<00:00,  1.06it/s]


Epoch 00015: adjusting learning rate of group 0 to 9.6396e-05.


100%|██████████| 11/11 [00:03<00:00,  3.57it/s]


[015/020] Train Acc: 0.994590 Loss: 0.014883 | Val Acc: 0.838123 loss: 1.436713


100%|██████████| 1024/1024 [16:17<00:00,  1.05it/s]


Epoch 00016: adjusting learning rate of group 0 to 4.4184e-05.


100%|██████████| 11/11 [00:03<00:00,  3.52it/s]


[016/020] Train Acc: 0.996511 Loss: 0.010052 | Val Acc: 0.839021 loss: 1.492390
saving model with acc 0.839


100%|██████████| 1024/1024 [16:19<00:00,  1.04it/s]


Epoch 00017: adjusting learning rate of group 0 to 1.1915e-05.


100%|██████████| 11/11 [00:03<00:00,  3.51it/s]


[017/020] Train Acc: 0.997561 Loss: 0.007309 | Val Acc: 0.839115 loss: 1.530307
saving model with acc 0.839


100%|██████████| 1024/1024 [16:19<00:00,  1.05it/s]


Epoch 00018: adjusting learning rate of group 0 to 1.0000e-03.


100%|██████████| 11/11 [00:03<00:00,  3.48it/s]


[018/020] Train Acc: 0.998135 Loss: 0.005867 | Val Acc: 0.839304 loss: 1.535640
saving model with acc 0.839


100%|██████████| 1024/1024 [16:23<00:00,  1.04it/s]


Epoch 00019: adjusting learning rate of group 0 to 9.9956e-04.


100%|██████████| 11/11 [00:03<00:00,  3.53it/s]


[019/020] Train Acc: 0.951645 Loss: 0.135515 | Val Acc: 0.832026 loss: 0.927101


100%|██████████| 1024/1024 [16:15<00:00,  1.05it/s]


Epoch 00020: adjusting learning rate of group 0 to 9.9825e-04.


100%|██████████| 11/11 [00:03<00:00,  3.54it/s]

[020/020] Train Acc: 0.968606 Loss: 0.082444 | Val Acc: 0.830419 loss: 0.977485





In [10]:
# Drop the training/validation datasets and their loaders, then run a
# garbage-collection pass before the test set is built.
del train_set, val_set, train_loader, val_loader
gc.collect()

0

# Testing
Create the test dataset and load the model from the saved checkpoint.

In [11]:
# Load the test split (features only) and wrap it in an ordered loader so the
# prediction order matches the frame order.
test_X = preprocess_data(
    split='test',
    feat_dir='./libriphone/feat',
    phone_path='./libriphone',
    concat_nframes=concat_nframes,
)
test_set = LibriDataset(test_X)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

[Dataset] - # phone classes: 41, number of utterances for test: 857


857it [00:11, 75.33it/s]

[INFO] test set
torch.Size([527364, 2301])





In [12]:
# load model
# Rebuild the architecture, then restore the checkpoint written during training.
model = BiLSTM().to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU can also be restored on a CPU-only machine.
model.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

Make predictions on the test set.

In [13]:
# Gather the per-batch predictions and join them once at the end; concatenating
# inside the loop would re-copy the growing array on every batch.
pred_chunks = []

model.eval()
with torch.no_grad():
    for features in tqdm(test_loader):
        features = features.to(device)
        outputs = model(features)

        _, test_pred = torch.max(outputs, 1) # get the index of the class with the highest probability
        pred_chunks.append(test_pred.cpu().numpy())

# An empty loader yields the same empty int32 array the loop used to start from.
pred = np.concatenate(pred_chunks, axis=0) if pred_chunks else np.array([], dtype=np.int32)


100%|██████████| 258/258 [01:17<00:00,  3.32it/s]


Write prediction to a CSV file.

After this block finishes running, download the generated prediction CSV file (its name is set in the cell below) from the files section on the left-hand side and submit it to Kaggle.

In [14]:
# Write a header followed by one "<row index>,<predicted class>" line per prediction.
with open('prediction_rnn_test59_hiddendim_800_seed24_dropout0.3_gelu_leakyrelu_learning_0.001.csv', 'w') as csv_file:
    csv_file.write('Id,Class\n')
    csv_file.writelines(
        '{},{}\n'.format(row_id, label) for row_id, label in enumerate(pred)
    )

參考資料
學習率調整:https://ithelp.ithome.com.tw/articles/10277623
LSTM model 參考 : https://ithelp.ithome.com.tw/articles/10292728 、ChatGPT