In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split

In [2]:

RANDOM_STATE = 42

torch.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed_all(RANDOM_STATE)  # For multi-GPU.
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)


def _init_fn(worker_id):
    np.random.seed(int(RANDOM_STATE))

In [3]:
train = pd.read_csv('../data/train.csv', header=None, names=["text"])
test = pd.read_csv('../data/answers.csv', header=None, names=["text"])


def separate_xy(df, prefix="train"):
    df[prefix + "_X"] = df["text"].apply(lambda x: x[:-1])
    df[prefix + "_y"] = df["text"].apply(lambda x: x[-1])
    return df

train_df = separate_xy(train, 'train')
test_df = separate_xy(test, 'test')

In [4]:
train_df

Unnamed: 0,text,train_X,train_y
0,SR2SR2SGR,SR2SR2SG,R
1,SRGSRSRSR,SRGSRSRS,R
2,KFKFKFKFK,KFKFKFKF,K
3,X2LS2FRS2,X2LS2FRS,2
4,MMMMGGGGG,MMMMGGGG,G
...,...,...,...
2538,S2ZR22222,S2ZR2222,2
2539,62SNRM1MM,62SNRM1M,M
2540,ASSRSRSRS,ASSRSRSR,S
2541,2ZZT2Z2ZT,2ZZT2Z2Z,T


In [5]:
train_X, val_X, train_y, val_y = train_test_split(train_df["train_X"], train_df["train_y"], test_size=0.1, random_state=RANDOM_STATE)

In [6]:
vocab = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567'

id2char = {idx: char for idx, char in enumerate(vocab)}
char2id = {char: idx for idx, char in enumerate(vocab)}


# Define the size of the input (number of unique characters), hidden layer, and output (same as input)
input_size = len(vocab)
hidden_size = len(vocab) * 3
output_size = len(vocab)

In [7]:

# Define the RNN model
class SimpleLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_hidden_layers=1, embedding_dim=10):
        super(SimpleLSTM, self).__init__()
        self.hidden_size = hidden_size

        # Embedding layer: 8 characters in the alphabet and embedding_dim can be a small number like 10
        self.embedding = nn.Embedding(input_size, embedding_dim=embedding_dim)
        
        # RNN layer
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_hidden_layers, batch_first=True)

        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Embedding input
        embedded = self.embedding(x)

        # RNN output
        lstm_out, _ = self.lstm(embedded)
        last_lstm_out = lstm_out[:, -1, :]

        # Fully connected layer output
        out = self.fc(last_lstm_out)
        return out

In [16]:

# Create the model
simple_rnn = SimpleLSTM(input_size, hidden_size, output_size)

# Define the loss function and the optimizer
# loss_function = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

In [17]:
model_output = simple_rnn(torch.tensor([[18, 18, 18, 18]], dtype=torch.long))
model_output

tensor([[-0.0495,  0.1151,  0.0054, -0.0834, -0.1550,  0.0626, -0.0536, -0.0534,
         -0.1277, -0.0916, -0.0663, -0.0305,  0.1157,  0.0140,  0.0426, -0.0071,
         -0.0010,  0.0088,  0.0553,  0.0114, -0.0589,  0.0406, -0.0770, -0.0772,
         -0.1052,  0.1195, -0.0918,  0.0785, -0.1174, -0.0656,  0.0600, -0.0671,
         -0.1058]], grad_fn=<AddmmBackward0>)

In [18]:
probabilities = F.softmax(model_output, dim=1)
probabilities

tensor([[0.0294, 0.0347, 0.0311, 0.0284, 0.0265, 0.0329, 0.0293, 0.0293, 0.0272,
         0.0282, 0.0289, 0.0300, 0.0347, 0.0314, 0.0323, 0.0307, 0.0309, 0.0312,
         0.0327, 0.0313, 0.0292, 0.0322, 0.0286, 0.0286, 0.0278, 0.0348, 0.0282,
         0.0334, 0.0275, 0.0290, 0.0328, 0.0289, 0.0278]],
       grad_fn=<SoftmaxBackward0>)

In [19]:
id2char[int(probabilities.argmax())]

'Z'

In [20]:
char2id['S']

18

In [21]:
import torch
from torch.utils.data import Dataset, DataLoader

class CharSequenceDataset(Dataset):
    def __init__(self, sequences, labels):
        """
        Args:
            sequences (list of strings): List of character sequences.
            labels (list of strings): List of single characters (labels).
        """
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # Convert the sequence and label to numeric format
        sequence = torch.tensor([char2id[char] for char in self.sequences[idx]], dtype=torch.long)
        label = torch.tensor(char2id[self.labels[idx]], dtype=torch.long)

        return sequence, label


In [22]:
train_X

2240    2SR22SR2
678     M2SKKZEE
1194    S2LZTK2K
1572    2MSG2M2M
2276    EIVD12E1
          ...   
1638    1111SRER
1095    111R11SR
1130    SSGSRSRS
1294    MZTKE2XZ
860     22MSR1SE
Name: train_X, Length: 2288, dtype: object

In [23]:
train_df["train_X"]

0       SR2SR2SG
1       SRGSRSRS
2       KFKFKFKF
3       X2LS2FRS
4       MMMMGGGG
          ...   
2538    S2ZR2222
2539    62SNRM1M
2540    ASSRSRSR
2541    2ZZT2Z2Z
2542    22MMMZTS
Name: train_X, Length: 2543, dtype: object

In [39]:
train_dataset = CharSequenceDataset(list(train_X), list(train_y))
# train_dataset = CharSequenceDataset(train_df["train_X"], train_df["train_y"])
val_dataset = CharSequenceDataset(list(val_X), list(val_y))
test_dataset = CharSequenceDataset(test_df["test_X"], test_df["test_y"])

train_dataset, val_dataset, test_dataset

(<__main__.CharSequenceDataset at 0x7f48d1b63ed0>,
 <__main__.CharSequenceDataset at 0x7f48d18bfc50>,
 <__main__.CharSequenceDataset at 0x7f48d18be550>)

In [40]:
# Define batch size
batch_size = 64

# Create the DataLoader for training data
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, worker_init_fn=_init_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, worker_init_fn=_init_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, worker_init_fn=_init_fn)

train_loader, val_loader, test_loader

(<torch.utils.data.dataloader.DataLoader at 0x7f48d18bf910>,
 <torch.utils.data.dataloader.DataLoader at 0x7f48d18bded0>,
 <torch.utils.data.dataloader.DataLoader at 0x7f48d18bda50>)

In [41]:
import torch
import torch.nn as nn
import torch.optim as optim

# Best params: {'hidden_size': 128, 'learning_rate': 0.0002, 'embedding_dim': 50, 'num_hidden_layers': 1}
# Hyperparameters
hidden_size = 128
embedding_dim = 50
num_hidden_layers = 1
learning_rate = 0.0002
num_epochs = 70

model = SimpleLSTM(input_size, hidden_size, output_size, num_hidden_layers, embedding_dim)


# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [42]:
import torch

def train_model(model, train_loader, val_loader, num_epochs, criterion, optimizer, model_save_path):
    best_val_loss = float('inf')
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0
        for batch_idx, (data, targets) in enumerate(train_loader):
            # Forward pass
            scores = model(data)
            loss = criterion(scores, targets)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for data, targets in val_loader:
                scores = model(data)
                loss = criterion(scores, targets)
                val_loss += loss.item()

        val_loss /= len(val_loader)

        # Print training and validation loss
        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Check if this is the best model so far and save it if it is
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), model_save_path)
            print(f"Saved model with validation loss: {best_val_loss:.4f}")

# Example usage
model_save_path = 'best_model.pth'  # Define your model save path
train_model(model, train_loader, val_loader, num_epochs, criterion, optimizer, model_save_path)


Epoch [1/70], Training Loss: 3.4038, Validation Loss: 3.2583
Saved model with validation loss: 3.2583
Epoch [2/70], Training Loss: 3.0020, Validation Loss: 2.7635
Saved model with validation loss: 2.7635
Epoch [3/70], Training Loss: 2.6640, Validation Loss: 2.5809
Saved model with validation loss: 2.5809
Epoch [4/70], Training Loss: 2.5420, Validation Loss: 2.4662
Saved model with validation loss: 2.4662
Epoch [5/70], Training Loss: 2.4559, Validation Loss: 2.3734
Saved model with validation loss: 2.3734
Epoch [6/70], Training Loss: 2.3853, Validation Loss: 2.3116
Saved model with validation loss: 2.3116
Epoch [7/70], Training Loss: 2.3286, Validation Loss: 2.2553
Saved model with validation loss: 2.2553
Epoch [8/70], Training Loss: 2.2782, Validation Loss: 2.2101
Saved model with validation loss: 2.2101
Epoch [9/70], Training Loss: 2.2357, Validation Loss: 2.1759
Saved model with validation loss: 2.1759
Epoch [10/70], Training Loss: 2.1931, Validation Loss: 2.1365
Saved model with val

In [43]:
import torch
from sklearn.metrics import classification_report, accuracy_score

def predict(model, data_loader):
    model.eval()  # Set the model to evaluation mode
    predictions = []
    actuals = []

    with torch.no_grad():
        for data, targets in data_loader:
            outputs = model(data)
            _, predicted = torch.max(outputs, 1)

            predictions.extend(predicted.cpu().numpy())
            actuals.extend(targets.cpu().numpy())

    return predictions, actuals


In [44]:
predictions, actuals = predict(model, val_loader)

# Print the classification report
print(classification_report(actuals, predictions, target_names=[id2char[s] for s in set(actuals + predictions)]))

# Calculate and print the accuracy
accuracy = accuracy_score(actuals, predictions)
print(f"Validation Accuracy: {accuracy:.4f}")

              precision    recall  f1-score   support

           A       0.00      0.00      0.00         1
           C       0.33      0.75      0.46         4
           D       0.00      0.00      0.00         0
           E       0.29      0.14      0.19        14
           F       0.00      0.00      0.00         1
           G       0.64      0.35      0.45        26
           H       0.50      1.00      0.67         1
           I       0.25      0.17      0.20         6
           J       0.22      0.25      0.24         8
           K       0.50      0.64      0.56        11
           L       0.00      0.00      0.00         1
           M       0.92      0.61      0.73        18
           N       0.00      0.00      0.00         2
           P       0.68      0.63      0.65        27
           Q       0.00      0.00      0.00         1
           R       0.42      0.74      0.53        31
           S       0.40      0.67      0.50        33
           T       0.80    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
predictions, actuals = predict(model, test_loader)

# Print the classification report
print(classification_report(actuals, predictions, target_names=[id2char[s] for s in set(actuals + predictions)]))

# Calculate and print the accuracy
accuracy = accuracy_score(actuals, predictions)
print(f"Test Accuracy: {accuracy:.4f}")


              precision    recall  f1-score   support

           A       1.00      1.00      1.00         2
           E       0.43      1.00      0.60         3
           F       0.00      0.00      0.00         0
           G       0.83      0.93      0.88        27
           H       0.00      0.00      0.00         0
           I       0.50      1.00      0.67         1
           J       0.88      0.78      0.82         9
           K       1.00      1.00      1.00        12
           M       0.93      0.93      0.93        28
           P       0.95      0.80      0.87       111
           Q       0.00      0.00      0.00         0
           R       0.85      0.90      0.87        92
           S       0.54      0.92      0.68        24
           T       1.00      1.00      1.00        14
           W       0.00      0.00      0.00         0
           1       1.00      0.65      0.79        17
           2       0.88      0.54      0.67        39
           5       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
import torch
import torch.nn as nn
import torch.optim as optim

def create_and_train_model(input_size, hidden_size, output_size, num_hidden_layers, embedding_dim, train_loader, val_loader, learning_rate, num_epochs, model_save_path):
    # Create the model instance
    model = SimpleLSTM(input_size, hidden_size, output_size, num_hidden_layers, embedding_dim)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Initialize the best validation loss to infinity at the start
    best_val_loss = float('inf')

    # Training function
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0
        for batch_idx, (data, targets) in enumerate(train_loader):
            # Forward pass
            scores = model(data)
            loss = criterion(scores, targets)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for data, targets in val_loader:
                scores = model(data)
                loss = criterion(scores, targets)
                val_loss += loss.item()

        val_loss /= len(val_loader)

        # Print training and validation loss
        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Check if this is the best model so far and save it if it is
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), model_save_path)
            print(f"Saved model with validation loss: {best_val_loss:.4f}")

    # Return the trained model and its best validation loss
    return model, best_val_loss


In [None]:
# Best model tracking
best_model = None
best_accuracy = 0.0
best_model_hyperparams = {}
lowest_val_loss = float('inf')

# Hyperparameters to explore
num_hidden_layers_list = [1, 2, 3]
hidden_sizes = [128, 256]
learning_rates = [0.0003, 0.0002, 0.0001]
num_epochs = 50
embedding_dims = [20, 50, 100]

# Iterate over different combinations of hyperparameters
for num_hidden_layers in num_hidden_layers_list:
    for embedding_dim in embedding_dims:
        for hidden_size in hidden_sizes:
            for lr in learning_rates:
                print(f"Training with hidden_size: {hidden_size}, learning_rate: {lr}")
        
                # Create and train the model with given hyperparameters
                model, best_val_loss = create_and_train_model(input_size, hidden_size, output_size, num_hidden_layers, embedding_dim, train_loader, test_loader, lr, num_epochs, model_save_path)
        
                print(f"Model with hidden_size {hidden_size} and learning_rate {lr} achieved validation loss: {best_val_loss}. Embedding dim: {embedding_dim}. num_hidden_layers: {num_hidden_layers}")
        
                predictions, actuals = predict(model, val_loader)
                # Calculate and print the accuracy
                accuracy = accuracy_score(actuals, predictions)
                print(f"===== Accuracy {accuracy:.4f}.")
        
                # Check if this model is the best so far
                # if best_val_loss < lowest_val_loss:
                if accuracy > best_accuracy:
                    lowest_val_loss = best_val_loss
                    best_model = model
                    best_model_hyperparams = {'hidden_size': hidden_size, 'learning_rate': lr, 'embedding_dim': embedding_dim, 'num_hidden_layers': num_hidden_layers}
                    best_accuracy = accuracy

# best_model now holds the best performing model
print(f"Best model hyperparameters: {best_model_hyperparams}")
print(f"Lowest validation loss: {lowest_val_loss}")


Training with hidden_size: 128, learning_rate: 0.0003
Epoch [1/50], Training Loss: 3.3991, Validation Loss: 3.2045
Saved model with validation loss: 3.2045
Epoch [2/50], Training Loss: 2.9280, Validation Loss: 2.3828
Saved model with validation loss: 2.3828
Epoch [3/50], Training Loss: 2.6228, Validation Loss: 2.2065
Saved model with validation loss: 2.2065
Epoch [4/50], Training Loss: 2.5253, Validation Loss: 2.0602
Saved model with validation loss: 2.0602
Epoch [5/50], Training Loss: 2.4511, Validation Loss: 1.8652
Saved model with validation loss: 1.8652
Epoch [6/50], Training Loss: 2.3881, Validation Loss: 1.7802
Saved model with validation loss: 1.7802
Epoch [7/50], Training Loss: 2.3358, Validation Loss: 1.6480
Saved model with validation loss: 1.6480
Epoch [8/50], Training Loss: 2.2847, Validation Loss: 1.5378
Saved model with validation loss: 1.5378
Epoch [9/50], Training Loss: 2.2421, Validation Loss: 1.4673
Saved model with validation loss: 1.4673
Epoch [10/50], Training Loss

In [None]:
# Assuming 'model' is your trained SimpleRNN model
predictions, actuals = predict(best_model, test_loader)

# Print the classification report
print(classification_report(actuals, predictions, target_names=[id2char[s] for s in set(actuals + predictions)]))

# Calculate and print the accuracy
accuracy = accuracy_score(actuals, predictions)
print(f"Accuracy: {accuracy:.4f}")


In [None]:
best_val_loss = float('inf')
best_model_hyperparams = {}
best_model = None

for num_hidden_layers in num_hidden_layers_list:
    for embedding_dim in embedding_dims:
        for hidden_size in hidden_sizes:
            for lr in learning_rates:
                print(f"Training with hidden_size: {hidden_size}, learning_rate: {lr}, embedding_dim: {embedding_dim}, num_hidden_layers: {num_hidden_layers}")
                
                model_save_path = f"model_hs{hidden_size}_lr{lr}_ed{embedding_dim}_nhl{num_hidden_layers}.pth"
                model, val_loss = create_and_train_model(input_size, hidden_size, output_size, num_hidden_layers, embedding_dim, train_loader, val_loader, lr, num_epochs, model_save_path)
                
                print(f"Validation Loss: {val_loss}")
                
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    best_model = model
                    best_model_hyperparams = {'hidden_size': hidden_size, 'learning_rate': lr, 'embedding_dim': embedding_dim, 'num_hidden_layers': num_hidden_layers}
                    # Optionally, save the best model
                    torch.save(best_model.state_dict(), model_save_path)
                    predictions, actuals = predict(model, val_loader)
                    
                    # Calculate and print the accuracy
                    accuracy = accuracy_score(actuals, predictions)
                    print(f"===== Accuracy {accuracy:.4f}.")

print(f"Best Model Hyperparameters: {best_model_hyperparams}, Lowest Validation Loss: {best_val_loss}")


In [None]:
# Assuming 'model' is your trained SimpleRNN model
predictions, actuals = predict(best_model, test_loader)

# Print the classification report
print(classification_report(actuals, predictions, target_names=[id2char[s] for s in set(actuals + predictions)]))

# Calculate and print the accuracy
accuracy = accuracy_score(actuals, predictions)
print(f"Accuracy: {accuracy:.4f}")
