In [46]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

import pandas as pd
from sklearn.model_selection import train_test_split

from torch.utils.data import DataLoader

# Select the compute device: CUDA when a GPU is visible, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [47]:
# Report whether cuDNN is usable, then its version, one value per line
print(torch.backends.cudnn.is_available(), torch.backends.cudnn.version(), sep='\n')

True
8200


# Fetching data

Read the data into a pandas DataFrame.

In [48]:
# Load the syllable dictionary from CSV and discard every row containing a missing value.
df = pd.read_csv('converted_syllable_dict.csv').dropna()
# Tip: for a quick smoke test, shrink the frame here, e.g. df = df.head(500)

# Separate the lower-cased word strings from their syllable counts
word = [entry.lower() for entry in df['word']]
sylla_count = list(df['syllable_count'])
len(df)

125912

# Pre-process data

Create a lookup table and reformat the data.

In [49]:
# Length of the longest word; every word is later padded to this width for embedding
max_len = max((len(entry) for entry in word), default=0)

max_len

28

In [50]:
# Right-pad every word with spaces so that all words in the corpus share the length max_len
word = [entry + " " * (max_len - len(entry)) for entry in word]

In [51]:
# Character vocabulary; the space used for padding sits at index 0
char_li = list(" abcdefghijklmnopqrstuvwxyz/-'1234567890.")
# Forward (character -> index) and reverse (index -> character) lookup tables
word_dict = dict(zip(char_li, range(len(char_li))))
number_dict = dict(enumerate(char_li))

# Train/test split of the data

In [52]:
# Demonstration split: hold out 20% of the words, with a fixed seed for repeatability
x_train_dem, x_test_dem, y_train_dem, y_test_dem = train_test_split(
    word,
    sylla_count,
    test_size=0.2,
    random_state=42,
)
len(x_train_dem)

100729

# Embedding, converting to tensors, and batching the data


In [53]:
# Embedding the data into word vector and convert to tensor

def convert_to_tensor(x, y, word_dict):
    """Encode words as character-index tensors and wrap the labels in a tensor.

    Args:
        x: iterable of strings; every character must be a key of ``word_dict``.
            The strings are expected to have equal length (the notebook pads
            them beforehand) so the rows form a rectangular tensor.
        y: sequence of integer labels, one per word.
        word_dict: mapping from character to integer index.

    Returns:
        Tuple ``(tensor_x, tensor_y)`` of ``torch.LongTensor`` objects: the
        per-character indices of each word, and the labels.

    Raises:
        KeyError: if a word contains a character that is not in ``word_dict``.
    """
    # Replace every character of every word by its index in the lookup table
    encoded_words = [[word_dict[ch] for ch in each_word] for each_word in x]

    # Turn both lists into integer tensors
    tensor_x = torch.LongTensor(encoded_words)
    tensor_y = torch.LongTensor(y)
    return tensor_x, tensor_y

# Encode the demonstration training split and inspect the tensors it produces
tensor_x_train_dem, tensor_y_train_dem = convert_to_tensor(x_train_dem, y_train_dem, word_dict)
print(tensor_x_train_dem, tensor_y_train_dem.size(), sep='\n')

tensor([[ 8,  1,  9,  ...,  0,  0,  0],
        [19,  9, 13,  ...,  0,  0,  0],
        [21, 14, 10,  ...,  0,  0,  0],
        ...,
        [ 1,  4,  1,  ...,  0,  0,  0],
        [ 2, 21, 19,  ...,  0,  0,  0],
        [23,  5, 19,  ...,  0,  0,  0]])
torch.Size([100729])


In [54]:
# Wrap the demonstration tensors in a DataLoader, which serves mini-batches to the model
batch_size = 2
shuffle = True
data = list(zip(tensor_x_train_dem, tensor_y_train_dem))

loader_dem = DataLoader(data, shuffle=shuffle, batch_size=batch_size)

# Show the first mini-batch only, then stop iterating
for x_train, y_train in loader_dem:
    print('input', x_train, 'output', y_train, '', sep='\n')
    break

input
tensor([[ 8,  5, 18,  7, 15, 20, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [13,  9,  3,  8,  1,  5, 12, 12,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])
output
tensor([2, 3])



# LSTM Model

In [55]:
# Hyper-parameters handed to the model constructor
max_syllabus = 20         # width of the output layer (one logit per syllable-count class)
n_class = len(word_dict)  # vocabulary size: one embedding row per character
n_hidden = 128            # size of the LSTM hidden state

In [56]:
# Implement the LSTM model

class TextLSTM(nn.Module):
    """Character-level LSTM classifier producing one logit per syllable-count class.

    Pipeline: character indices -> embedding -> single-layer LSTM -> linear
    layer applied to the LSTM output of the last time step. The forward pass
    returns raw logits (no softmax is applied).

    Args:
        n_class: vocabulary size (number of rows in the embedding table).
        n_hidden: size of the LSTM hidden state.
        max_syllabus: number of output classes.
        embedding_dim: width of each character embedding. Defaults to 5, the
            value that was previously hard-coded in two places.
    """

    def __init__(self, n_class, n_hidden, max_syllabus, embedding_dim=5):
        super().__init__()

        # Keep the configuration around for inspection
        self.n_class = n_class
        self.n_hidden = n_hidden
        self.max_syllabus = max_syllabus
        self.embedding_dim = embedding_dim

        # Embedding layer: [n_class, embedding_dim]
        self.embedd = nn.Embedding(num_embeddings=n_class, embedding_dim=embedding_dim)

        # LSTM layer; its input size must match the embedding width
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=n_hidden)

        # Linear layer: [n_hidden, max_syllabus]
        self.W = nn.Linear(n_hidden, max_syllabus, bias=True)

    def forward(self, tensor_train):
        """Compute class logits for a batch of encoded words.

        Args:
            tensor_train: integer tensor of character indices laid out as
                (batch, sequence length).

        Returns:
            Tensor of shape (batch, max_syllabus) holding unnormalised logits.
        """
        # nn.LSTM defaults to (sequence, batch, feature), so swap the first two axes
        tensor_train = tensor_train.transpose(0, 1)

        embedded = self.embedd(tensor_train)

        # Only the per-step outputs are needed; the final (h, c) state is discarded
        outputs, _ = self.lstm(embedded)
        last_step = outputs[-1]

        return self.W(last_step)

# Training process

In [57]:
# Seed PyTorch's global RNG so that weight initialisation is repeatable across
# kernel restarts, in line with the fixed random_state used for the data split.
torch.manual_seed(42)

# Init the model, loss, and optimizer
model = TextLSTM(n_class, n_hidden, max_syllabus)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# NOTE: the model returns raw logits (its softmax layer is disabled), which is
# what CrossEntropyLoss expects. The original notes report a higher prediction
# rate (~95%) with this loss at lr=0.001 than with the earlier NLLLoss setup;
# going back to nn.NLLLoss() would require re-enabling a LogSoftmax output layer.

In [58]:
# Split the corpus 90/10 into train and test sets, then encode the training part as tensors
x_train, x_test, y_train, y_test = train_test_split(
    word, sylla_count, test_size=0.1, random_state=42
)
tensor_x_train, tensor_y_train = convert_to_tensor(x_train, y_train, word_dict)

# Training loader: shuffled mini-batches of 1024 (word, label) pairs
loader = DataLoader(
    list(zip(tensor_x_train, tensor_y_train)),
    batch_size=1024,
    shuffle=True,
)

# Move the model to the selected device so the computation runs there
model = model.to(device)

In [59]:
# Largest syllable count among the training labels
tensor_y_train.max()

tensor(12)

In [60]:
# Train for `n_epochs` full passes over the training loader
n_epochs = 100
log_every = 10  # report the accumulated loss once every `log_every` epochs

model.train()  # make sure the model is in training mode, even if an evaluation cell ran earlier
for epoch in range(n_epochs):
    tot_loss = 0  # sum of the mini-batch losses seen during this epoch

    # Batch-specific names keep the x_train / y_train split lists intact
    for x_batch, y_batch in loader:

        # Put the batch onto the compute device
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        # Reset the gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass
        output = model(x_batch)

        # Compute the loss, backpropagate, and update the weights
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        tot_loss += loss.item()

    # Print the total loss
    if (epoch + 1) % log_every == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(tot_loss))

Epoch: 0010 cost = 149.519117
Epoch: 0020 cost = 149.560303
Epoch: 0030 cost = 149.567893
Epoch: 0040 cost = 33.668607
Epoch: 0050 cost = 26.131243
Epoch: 0060 cost = 21.884402
Epoch: 0070 cost = 18.665262
Epoch: 0080 cost = 16.597061
Epoch: 0090 cost = 14.464481
Epoch: 0100 cost = 13.051680


# Testing the model

Feed the model our test dataset to see the model's performance.

In [61]:
# Encode the held-out split, then move both resulting tensors to the compute device
tensor_x_test, tensor_y_test = (
    t.to(device) for t in convert_to_tensor(x_test, y_test, word_dict)
)
tensor_x_test

tensor([[16,  9, 12,  ...,  0,  0,  0],
        [ 7, 15, 15,  ...,  0,  0,  0],
        [ 1, 18,  1,  ...,  0,  0,  0],
        ...,
        [ 1, 21,  4,  ...,  0,  0,  0],
        [ 9, 14, 21,  ...,  0,  0,  0],
        [16,  1, 15,  ...,  0,  0,  0]], device='cuda:0')

In [65]:
# Test the model. Evaluation needs no gradients, so the forward pass runs under
# no_grad(), which avoids recording an autograd graph for the whole test set.
model.eval()
with torch.no_grad():
    predict = model(tensor_x_test)

# Predicted class = index of the largest logit in each row
predict_1 = predict.argmax(dim=1).tolist()
# Copy the labels to the host once instead of converting them element by element
actual = tensor_y_test.tolist()

# Count the number of correct predictions
count = 0
for i, (guess, truth) in enumerate(zip(predict_1, actual)):
    if guess == truth:
        count += 1
    else:
        print(x_test[i], guess, truth)  # print out incorrect prediction

# Print the result
print("Accuracy: ", count/len(tensor_y_test)*100, "%")

dinmukhamed                  3 4
ratatouille                  4 3
rolemodel                    4 3
flour                        1 2
nutriclean                   4 3
sarejevo's                   3 4
genuine                      2 3
cecelia                      4 3
deisher                      2 3
fuels                        1 2
puig                         2 1
fukuoka                      3 4
socialite                    4 3
baidoan                      2 3
joelle                       1 2
polyak                       3 2
emeralds                     3 2
hoene                        1 2
lavinia                      4 3
coretech                     3 2
archaic                      2 3
wunderle                     2 3
yeagle                       1 2
imai                         3 2
stoicism                     3 4
machetes                     2 3
riviera                      3 4
miniaturize                  5 4
historically                 4 5
deline                       2 3
gessler   