# Speech recognition
for https://www.kaggle.com/c/pass-cmu-dl-together-homework-1-part-2-spring-19/leaderboard

In [None]:
import numpy as np
import torch.utils.data
import torch
import torch.nn as nn


### Dataset loader - given

In [1]:
import numpy as np
import os

class WSJ():
    """ Load the WSJ speech dataset
        
        Ensure WSJ_PATH is path to directory containing 
        all data files (.npy) provided on Kaggle.
        
        Example usage:
            loader = WSJ()
            trainX, trainY = loader.train
            assert(trainX.shape[0] == 24590)
            
    """
  
    def __init__(self):
        self.dev_set = None
        self.train_set = None
        self.test_set = None
  
    @property
    def dev(self):
        if self.dev_set is None:
            self.dev_set = load_raw(os.environ['WSJ_PATH'], 'dev')
        return self.dev_set

    @property
    def train(self):
        if self.train_set is None:
            self.train_set = load_raw(os.environ['WSJ_PATH'], 'train')
        return self.train_set
  
    @property
    def test(self):
        if self.test_set is None:
            self.test_set = (np.load(os.path.join(os.environ['WSJ_PATH'], 'test.npy'), encoding='bytes'), None)
        return self.test_set
    
def load_raw(path, name):
    return (
        np.load(os.path.join(path, '{}.npy'.format(name)), encoding='bytes'), 
        np.load(os.path.join(path, '{}_labels.npy'.format(name)), encoding='bytes')
    )



### Dataset class
Consider a frame in the contenxt
Used to prepend L and append R frames after the current one. Pad frames where needed.

In [16]:
class ContextDataset(torch.utils.data.Dataset):
    """
    L, R : number of elements to add to left and right
    """
    def __init__(self, X, y, L, R, pad_mode='constant', pad_constant=0):
        self.L, self.R = (L, R)
        self.y = None if y is None else np.concatenate(y) 
        #indexes of utterances
        self.cumindex = np.cumsum([i.shape[0] for i in X])
        
        X_pad = []
        
        for utt in X:
            utt_new = np.pad(utt, ((L, R), (0,0)), pad_mode) #, constant_values=pad_constant)
            X_pad.append(utt_new)
            
        self.X = np.concatenate(X_pad)
        
    
    def __len__(self):
        return self.cumindex[-1]

    def __getitem__(self, index):
        #old to new index
        old_utt = np.searchsorted(self.cumindex-1, index)

        new_index = index + self.L + old_utt * (self.L + self.R)
        
        x = self.X[new_index - self.L : new_index + self.R + 1].ravel()
        
        if self.y is None:
            return x
        else:
            return x, self.y[index]

### Load data

In [4]:
os.environ['WSJ_PATH']="/media/data/class-cmu-dl/hw1part2/"
loader = WSJ()

## Train

In [5]:
trainX, trainY = loader.train
assert(trainX.shape[0] == 24590)

## Dev


In [6]:
devX, devY = loader.dev
devX.shape, devY.shape

((1103,), (1103,))

## Merge dev and train, shuffle

In [8]:
trainX = np.append(trainX, devX)
trainY = np.append(trainY, devY)
trainX.shape, trainY.shape

((25693,), (25693,))

In [9]:
from sklearn.utils import shuffle

trainX, trainY = shuffle(trainX, trainY, random_state=17)

## train/test split

In [10]:
split_ratio=0.7
split_index = round(trainX.shape[0] * split_ratio)
split_index

17985

In [None]:
# trainX = devX[:split_index]
# trainY = devY[:split_index]

# devX = devX[split_index:]
# devY = devY[split_index:]

In [11]:
devX = trainX[split_index:]
devY = trainY[split_index:]

trainX = trainX[:split_index]
trainY = trainY[:split_index]

### CUDA or not

In [13]:
#helps to debug CUDA errors
import os
os.environ["CUDA_LAUNCH_BLOCKING"]="1"

In [14]:
train_on_gpu = False
device = 'cpu'
if torch.cuda.is_available():
    train_on_gpu = True
    device = 'cuda'
train_on_gpu

True

### Dataset preparation

In [18]:
from torch.utils.data import TensorDataset, DataLoader

#padding
Left = 7
Right = 3

train_dataset = ContextDataset(
    trainX, # device=device), 
    trainY, #, device=device)
    Left, Right, #L,R
    pad_mode='edge',
)

val_dataset = ContextDataset(
    devX,#, device=device), 
    devY,#, device=device)
    Left, Right,
    pad_mode='edge'
)

In [19]:
batch_size = 512

train_loader = DataLoader(train_dataset, 
                          batch_size=batch_size, 
                          shuffle=True,
                          drop_last=True,
                          num_workers=4) 
val_loader = DataLoader(val_dataset, 
                        batch_size=batch_size, 
                        #shuffle=True,
                        #drop_last=True,
                        num_workers=1) 

## Simple model

In [20]:
frame_size = 40

input_dim = frame_size * (Left + 1 + Right) # in the context
hidden1_dim = 2048
hidden2_dim = 512
output_dim = 138

model = torch.nn.Sequential(
    torch.nn.Linear(input_dim, hidden1_dim, bias=False),
    torch.nn.BatchNorm1d(hidden1_dim, momentum=0.1),
    #orch.nn.ReLU(),
    torch.nn.Sigmoid(),
#     torch.nn.LeakyReLU(),
    torch.nn.Linear(hidden1_dim, hidden2_dim, bias=False),
    torch.nn.BatchNorm1d(hidden2_dim, momentum=0.1),
    torch.nn.Sigmoid(),
#     #torch.nn.ReLU(),
#     torch.nn.LeakyReLU(),
#     torch.nn.Linear(hidden1_dim, hidden2_dim, bias=False),
#     torch.nn.BatchNorm1d(hidden2_dim, momentum=0.1),
#     #torch.nn.ReLU(),
#     torch.nn.LeakyReLU(),
    
    torch.nn.Linear(hidden2_dim, output_dim) #, bias=False),
    #torch.nn.Softmax(dim=1),
)

if train_on_gpu:
    model.cuda()

In [21]:
model

Sequential(
  (0): Linear(in_features=440, out_features=2048, bias=False)
  (1): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): Sigmoid()
  (3): Linear(in_features=2048, out_features=512, bias=False)
  (4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (5): Sigmoid()
  (6): Linear(in_features=512, out_features=138, bias=True)
)

In [22]:
#weight initializtion
#torch.nn.init.xavier_uniform(model.weight)
#https://stackoverflow.com/questions/49433936/how-to-initialize-weights-in-pytorch
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight,  gain=nn.init.calculate_gain('relu'))
        #only fill bias if enabled in the layer
        if hasattr(m.bias, "data"):
            m.bias.data.fill_(0.01)

model = model.apply(init_weights)


In [23]:
#validation helper
def validate(model, val_loader):
    losses = []
    corrects = 0
    n_labels = 0
    
    for data, labels in val_loader:
        if train_on_gpu:
            data = data.to(device)
            labels = labels.to(device)

        output = model(data)
        loss = criterion(output, labels)
        losses.append(loss)
        
        _, pred = torch.max(output, 1)
        correct = (pred == labels).float().sum()
        corrects += correct
        n_labels += len(labels)
    
        return sum(losses)/len(losses), corrects/n_labels

In [24]:
learning_rate=0.001
criterion = nn.CrossEntropyLoss()
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


### Training

In [25]:
n_epochs_total = 0

In [26]:
n_epochs = 10

for epoch in range(n_epochs):
    model.train()
    for data, labels in train_loader:
        if train_on_gpu:
            data = data.to(device)
            labels = labels.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

    _, predicted = torch.max(output, 1)
    correct = (predicted == labels).float().sum()
 
    print("Train Epoch {} - b-loss: {:.3f}, Accuracy: {:.3f}".format(n_epochs_total+epoch+1,
                                                               loss.cpu().item(), 
                                                               correct/output.shape[0]))
    #accuracy on val
    model.eval()
    val_loss, val_acc = validate(model, val_loader)
    print("       Valid Epoch {} - loss: {:.3f}, Accuracy: {:.3f}".format(n_epochs_total+epoch+1,
                                                               val_loss.cpu().item(), 
                                                               val_acc.cpu().item()))
n_epochs_total += n_epochs

Train Epoch 1 - b-loss: 1.746, Accuracy: 0.562
       Valid Epoch 1 - loss: 1.552, Accuracy: 0.588
Train Epoch 2 - b-loss: 1.613, Accuracy: 0.576
       Valid Epoch 2 - loss: 1.465, Accuracy: 0.607
Train Epoch 3 - b-loss: 1.597, Accuracy: 0.576
       Valid Epoch 3 - loss: 1.420, Accuracy: 0.592
Train Epoch 4 - b-loss: 1.786, Accuracy: 0.539
       Valid Epoch 4 - loss: 1.426, Accuracy: 0.600
Train Epoch 5 - b-loss: 1.576, Accuracy: 0.594
       Valid Epoch 5 - loss: 1.394, Accuracy: 0.617
Train Epoch 6 - b-loss: 1.744, Accuracy: 0.529
       Valid Epoch 6 - loss: 1.372, Accuracy: 0.623
Train Epoch 7 - b-loss: 1.669, Accuracy: 0.578
       Valid Epoch 7 - loss: 1.397, Accuracy: 0.633
Train Epoch 8 - b-loss: 1.505, Accuracy: 0.602
       Valid Epoch 8 - loss: 1.349, Accuracy: 0.621
Train Epoch 9 - b-loss: 1.492, Accuracy: 0.588
       Valid Epoch 9 - loss: 1.367, Accuracy: 0.613
Train Epoch 10 - b-loss: 1.643, Accuracy: 0.596
       Valid Epoch 10 - loss: 1.360, Accuracy: 0.619


In [None]:
Stop right here

## Train on full dataset (if previously trained on dev)

In [None]:
trainX, trainY = loader.train
assert(trainX.shape[0] == 24590)


In [None]:
train_dataset = ContextDataset(
    trainX, # device=device), 
    trainY, #, device=device)
    Left, Right #L,R
)
train_loader = DataLoader(train_dataset, 
                          batch_size=batch_size, 
                          shuffle=True,
                          drop_last=True,                          
                          num_workers=4)

In [None]:
if train_on_gpu:
    model.cuda()

In [None]:
model = model.apply(init_weights)

In [None]:
#learning_rate = 0.005
criterion = nn.CrossEntropyLoss()
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
n_epochs = 10

for epoch in range(n_epochs):
    model.train()
    for data, labels in train_loader:
        if train_on_gpu:
            data = data.to(device)
            labels = labels.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

    #accuracy on train (last batch)
    _, predicted = torch.max(output, 1)
    correct = (predicted == labels).float().sum()
 
    print("Train Epoch {} - b-loss: {:.3f}, Accuracy: {:.3f}".format(epoch+1,
                                                               loss.cpu().item(), 
                                                               correct/output.shape[0]))
    #accuracy on val
    model.eval()
    val_loss, val_acc = validate(model, val_loader)
    print("     Valid Epoch {} - loss: {:.3f}, Accuracy: {:.3f}".format(epoch+1,
                                                               val_loss.cpu().item(), 
                                                               val_acc.cpu().item()))


## Inference on test

In [None]:
testX, testY = loader.test

In [None]:
test_dataset = ContextDataset(
    testX,#, device=device), 
    None,
    Left, Right
)
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=1) 

In [None]:
%%time
preds = []
model.eval()
for data in test_loader:
        if train_on_gpu:
            data = data.to(device)
        output = model(data)
        _, pred = torch.max(output, 1)
        preds.extend(pred.cpu().tolist())

In [None]:
data.size()

In [None]:
len(preds)

### Form submission

In [None]:
import pandas as pd
sub = pd.read_csv("sample submission.csv", index_col=0)
sub.head()

In [None]:
len(sub)

In [None]:
sub['label']=preds
sub.head()

In [None]:
with open("sub10.cvs", "w") as subf:
    subf.write(sub.to_csv())