In [1]:
# Input CSV locations, given relative to the notebook's working directory.
tr_path = "deal_train.csv"  # training data
tt_path = "deal_test.csv"   # testing data
se_path = "season.csv"      # NOTE(review): purpose not shown in the visible cells — confirm

Loading data ...
Size of training data: (1229932, 429)
Size of testing data: (451552, 429)


In [2]:
# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For data preprocess
import numpy as np
import csv
import os

# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Fix every random number generator used below so repeated runs give the same numbers.
myseed = 42069
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
class empDataset(Dataset):
    """Dataset backed by a numeric CSV file with a fixed, index-based train/dev split.

    The first CSV row is treated as a header and skipped; every remaining cell
    must parse as a float. Columns 4..45 (0-based) are the features and column 3
    is the target.

    Args:
        path: path of the CSV file (opened with the Big5 encoding).
        mode: 'train' keeps rows whose index modulo 10 is < 7, 'dev' keeps the
            remaining rows (modulo 10 >= 7), 'test' keeps every row and exposes
            no target.
        target_only: accepted for interface compatibility; it currently has no
            effect because columns 4..45 are selected either way.

    Raises:
        ValueError: if `mode` is not one of 'train', 'dev' or 'test'.
    """

    def __init__(self,
                 path,
                 mode='train',
                 target_only=False):
        # Fail early with a clear message instead of hitting an unbound
        # `indices` variable after the whole file has been parsed.
        if mode not in ('train', 'dev', 'test'):
            raise ValueError(
                "mode must be 'train', 'dev' or 'test', got {!r}".format(mode))
        self.mode = mode

        # Read data into a float numpy array, skipping the header row
        with open(path, 'r', encoding="Big5") as fp:
            rows = list(csv.reader(fp))
        data = np.array([list(map(float, row)) for row in rows[1:]], dtype=float)

        # Feature columns (identical whether or not `target_only` is set)
        feats = list(range(4, 46))

        if mode == 'test':
            self.data = torch.FloatTensor(data[:, feats])
        else:
            target = data[:, 3]
            data = data[:, feats]

            # Splitting training data into train & dev sets by row index:
            # 7 out of every 10 consecutive rows go to train, the other 3 to dev.
            fold = np.arange(len(data)) % 10
            keep = fold < 7 if mode == 'train' else fold >= 7

            # Convert data into PyTorch tensors
            self.data = torch.FloatTensor(data[keep])
            self.target = torch.FloatTensor(target[keep])

        # Number of feature columns per sample
        self.dim = self.data.shape[1]

        print('Finished reading the {} set of Dataset ({} samples found, each dim = {})'
              .format(mode, len(self.data), self.dim))

    def __getitem__(self, index):
        """Return `(features, target)` for labelled modes, or `features` alone for 'test'."""
        if self.mode != 'test':
            return self.data[index], self.target[index]
        # For testing (no target)
        return self.data[index]

    def __len__(self):
        """Return the number of samples kept for this mode."""
        return len(self.data)

Split the labeled data into a training set and a validation set. You can modify the variable `VAL_RATIO` to change the ratio of validation data.

In [3]:
# Fraction of the labelled rows held out (from the end) for validation.
VAL_RATIO = 0.2

# Index of the first validation row; everything before it is used for training.
percent = int((1 - VAL_RATIO) * train.shape[0])
train_x, val_x = train[:percent], train[percent:]
train_y, val_y = train_label[:percent], train_label[percent:]

print(f'Size of training set: {train_x.shape}')
print(f'Size of validation set: {val_x.shape}')

Size of training set: (983945, 429)
Size of validation set: (245987, 429)


In [4]:
# Augment the training set with a copy of every sample whose 11 blocks of 39
# values are in reversed order (presumably 11 frames x 39 features per row —
# TODO confirm). Row counts are derived from `train_x` so the cell keeps
# working when VAL_RATIO or the data changes.
# NOTE: this cell doubles `train_x`/`train_y` in place; run it only once per
# kernel session.
N_BLOCKS, BLOCK_DIM = 11, 39  # 11 * 39 = 429 columns per row
n_rows = train_x.shape[0]

_train = train_x.reshape(n_rows, N_BLOCKS, BLOCK_DIM)
_train = _train[:, ::-1, :]  # reverse the block order within each sample

# Original rows first, then their reversed copies; labels are simply repeated.
train_x = np.concatenate([train_x, _train.reshape(n_rows, N_BLOCKS * BLOCK_DIM)], axis=0)
train_y = np.append(train_y, train_y)
print(train_x.shape)
print(train_y.shape)

(1967890, 429)
(1967890,)


Create a data loader from the dataset. Feel free to tweak the variable `BATCH_SIZE` here.

In [5]:
# Number of samples per mini-batch, shared by the training and validation loaders.
BATCH_SIZE = 256

from torch.utils.data import DataLoader

# NOTE(review): `TIMITDataset` is not defined in the visible cells (only
# `empDataset` is) — confirm it is provided elsewhere before running this cell.
train_set = TIMITDataset(train_x, train_y)
val_set = TIMITDataset(val_x, val_y)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True) # only the training data is shuffled
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)

Clean up the unneeded variables to save memory.<br>

**Note: if you need to use these variables later, you may remove this block or clean up the unneeded variables later.<br>The data size is quite large, so be aware of memory usage in Colab.**

In [6]:
import gc

# Drop the notebook-level names for the raw arrays and the split arrays, then
# force a garbage-collection pass.
del train, train_label, train_x, train_y, val_x, val_y
# Last expression of the cell, so its return value is shown as the cell output.
gc.collect()

252

## Create Model

Define the model architecture. You are encouraged to change and experiment with the model architecture.

In [7]:
import torch
import torch.nn as nn
def init_weights(m):
    """Initialise a linear layer in place; every other module type is left untouched.

    Weights get Xavier-normal initialisation (gain 1.0) and the bias, when the
    layer has one, is filled with 0.01. Intended to be passed to
    `nn.Module.apply`.

    Args:
        m: the module visited by `apply`.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.xavier_normal_(m.weight, gain=1.0)
    # Layers built with bias=False expose `bias` as None; skip them instead of crashing.
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.01)
class Classifier(nn.Module):
    """Fully connected classifier: BatchNorm, four (Linear -> Dropout -> ReLU) stages, Linear output.

    The defaults reproduce the original fixed architecture
    (429 -> 2048 -> 2048 -> 2048 -> 512 -> 39, dropout 0.5), and the
    sub-module names are unchanged.

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width of the first three hidden layers.
        bottleneck_dim: width of the last hidden layer.
        num_classes: number of output logits.
        dropout: drop probability used after every hidden linear layer.
    """
    def __init__(self, input_dim=429, hidden_dim=2048, bottleneck_dim=512,
                 num_classes=39, dropout=0.5):
        super(Classifier, self).__init__()
        # Each layer is initialised right after it is built, so the sequence of
        # random draws is the same as in the fixed-size version.
        self.layer1 = nn.Linear(input_dim, hidden_dim).apply(init_weights)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim).apply(init_weights)
        # Original author's note (translated): "the third layer goes to 512 and
        # then there is no fourth layer". NOTE(review): this does not match the
        # code below, which keeps layer3 at the hidden width and narrows in layer5.
        self.layer3 = nn.Linear(hidden_dim, hidden_dim).apply(init_weights)
        self.layer5 = nn.Linear(hidden_dim, bottleneck_dim).apply(init_weights)
        self.out = nn.Linear(bottleneck_dim, num_classes).apply(init_weights)

        self.act_fn = nn.ReLU()
        # Attribute name kept as `bn429` whatever `input_dim` is, so the
        # parameter names stay stable.
        self.bn429 = nn.BatchNorm1d(input_dim)
        # Spelling of `droupout` kept so existing attribute accesses still work.
        self.droupout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a `(batch, input_dim)` float tensor to `(batch, num_classes)` logits."""
        x = self.bn429(x)
        # Every hidden stage is Linear -> Dropout -> ReLU, in that order.
        for layer in (self.layer1, self.layer2, self.layer3, self.layer5):
            x = self.act_fn(self.droupout(layer(x)))
        return self.out(x)

## Training

In [8]:
# check device
def get_device():
    """Return 'cuda' when PyTorch reports CUDA as available, otherwise 'cpu'."""
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'

Fix random seeds for reproducibility.

In [9]:
# fix random seed
def same_seeds(seed):
    """Seed NumPy and PyTorch (CPU and, when available, CUDA) and set the cuDNN determinism flags.

    Args:
        seed: integer seed applied to every generator.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)

    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        # Current device first, then every visible device.
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    # Deterministic cuDNN kernels with the autotuner switched off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

Feel free to change the training parameters here.

In [10]:
# Seed every RNG before the model is built so weight initialisation is repeatable.
same_seeds(7122)

# Pick the compute device and report it.
device = get_device()
print(f'DEVICE: {device}')

# Training hyper-parameters.
num_epoch = 50            # number of passes over the training set
learning_rate = 0.0002    # step size handed to the optimizer

# File the best checkpoint is written to.
model_path = './model.ckpt'

# Model, loss function and optimizer.
model = Classifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=1.5e-4,
    amsgrad=False,
)
# Alternative optimizer (disabled):
# optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)

DEVICE: cuda


In [None]:
# start training
#
# Each epoch runs one pass over `train_loader`, then (when a validation set
# exists) one pass over `val_loader`. The checkpoint at `model_path` is
# overwritten whenever validation accuracy reaches a new best, and training
# stops early once more than EARLY_STOP_PATIENCE consecutive epochs bring no
# improvement.

EARLY_STOP_PATIENCE = 8  # stop when this many epochs in a row is exceeded without a new best

best_acc = 0.0  # best number of correctly classified validation samples so far
# Consecutive epochs without a new best. This must live OUTSIDE the epoch loop:
# resetting it every epoch would keep it at 0 or 1 and early stopping could
# never trigger.
k = 0
for epoch in range(num_epoch):
    train_acc = 0.0
    train_loss = 0.0
    val_acc = 0.0
    val_loss = 0.0

    # training
    model.train() # set the model to training mode
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        batch_loss = criterion(outputs, labels)
        _, train_pred = torch.max(outputs, 1) # get the index of the class with the highest probability
        batch_loss.backward()
        optimizer.step()

        # Compare on the device; only the scalar count is moved to the host.
        train_acc += (train_pred == labels).sum().item()
        train_loss += batch_loss.item()

    # validation
    if len(val_set) > 0:
        model.eval() # set the model to evaluation mode
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                batch_loss = criterion(outputs, labels)
                _, val_pred = torch.max(outputs, 1) # get the index of the class with the highest probability

                val_acc += (val_pred == labels).sum().item()
                val_loss += batch_loss.item()

        print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f} | Val Acc: {:3.6f} loss: {:3.6f}'.format(
            epoch + 1, num_epoch, train_acc/len(train_set), train_loss/len(train_loader), val_acc/len(val_set), val_loss/len(val_loader)
        ))

        # if the model improves, save a checkpoint at this epoch
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), model_path)
            print('saving model with acc {:.3f}'.format(best_acc/len(val_set)))
            k = 0
        else:
            k += 1
    else:
        print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f}'.format(
            epoch + 1, num_epoch, train_acc/len(train_set), train_loss/len(train_loader)
        ))

    if k > EARLY_STOP_PATIENCE:
        print('early stop!')
        break

# if not validating, save the last epoch
if len(val_set) == 0:
    torch.save(model.state_dict(), model_path)
    print('saving model at last epoch')
# Author's scratch notes (presumably scores from earlier runs — TODO confirm):
#0.746299
#0.748588
#0.755495
#0.756084
#0.756333

[001/050] Train Acc: 0.551085 Loss: 1.489793 | Val Acc: 0.677422 loss: 1.022515
saving model with acc 0.677
[002/050] Train Acc: 0.640246 Loss: 1.145793 | Val Acc: 0.702980 loss: 0.921611
saving model with acc 0.703
[003/050] Train Acc: 0.666134 Loss: 1.049666 | Val Acc: 0.716839 loss: 0.869011
saving model with acc 0.717
[004/050] Train Acc: 0.678531 Loss: 1.000975 | Val Acc: 0.722697 loss: 0.844585
saving model with acc 0.723
[005/050] Train Acc: 0.686929 Loss: 0.970815 | Val Acc: 0.728266 loss: 0.824311
saving model with acc 0.728
[006/050] Train Acc: 0.693106 Loss: 0.949431 | Val Acc: 0.729872 loss: 0.817551
saving model with acc 0.730
[007/050] Train Acc: 0.697183 Loss: 0.934147 | Val Acc: 0.731685 loss: 0.814622
saving model with acc 0.732
[008/050] Train Acc: 0.700403 Loss: 0.922112 | Val Acc: 0.734510 loss: 0.803349
saving model with acc 0.735
[009/050] Train Acc: 0.702603 Loss: 0.913287 | Val Acc: 0.736214 loss: 0.795548
saving model with acc 0.736
[010/050] Train Acc: 0.70481

## Testing

Create a testing dataset and load the model from the saved checkpoint.

In [None]:
# create testing dataset
# NOTE(review): `TIMITDataset` and `test` are not defined in the visible cells —
# confirm they are provided elsewhere before running this cell.
test_set = TIMITDataset(test, None)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

# create model and load weights from checkpoint
model = Classifier().to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can still be restored on a CPU-only machine.
model.load_state_dict(torch.load(model_path, map_location=device))

Make prediction.

In [None]:
# Collect the predicted class index of every test sample, in loader order.
predict = []
model.eval() # set the model to evaluation mode
with torch.no_grad():
    for inputs in test_loader:
        outputs = model(inputs.to(device))
        # index of the class with the highest score for each sample
        _, test_pred = torch.max(outputs, 1)
        predict.extend(test_pred.cpu().numpy())

Write prediction to a CSV file.

After this block finishes running, download the file `prediction.csv` from the files section on the left-hand side and submit it to Kaggle.

In [None]:
# Write one "Id,Class" row per prediction, with ids counting up from 0.
with open('prediction.csv', 'w') as f:
    f.write('Id,Class\n')
    f.writelines('{},{}\n'.format(idx, label) for idx, label in enumerate(predict))