In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd

In [2]:
train = pd.read_csv('../data/train.csv', header=None, names=["text"])
test = pd.read_csv('../data/answers.csv', header=None, names=["text"])

def separate_xy(df, prefix="train"):
    df[prefix + "_X"] = df["text"].apply(lambda x: x[:-1])
    df[prefix + "_y"] = df["text"].apply(lambda x: x[-1])
    return df

train_df = separate_xy(train, 'train')
test_df = separate_xy(test, 'test')

In [3]:
train_df

Unnamed: 0,text,train_X,train_y
0,SR2SR2SGR,SR2SR2SG,R
1,SRGSRSRSR,SRGSRSRS,R
2,KFKFKFKFK,KFKFKFKF,K
3,X2LS2FRS2,X2LS2FRS,2
4,MMMMGGGGG,MMMMGGGG,G
...,...,...,...
2538,S2ZR22222,S2ZR2222,2
2539,62SNRM1MM,62SNRM1M,M
2540,ASSRSRSRS,ASSRSRSR,S
2541,2ZZT2Z2ZT,2ZZT2Z2Z,T


In [4]:
vocab = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567'

id2char = {idx: char for idx, char in enumerate(vocab)}
char2id = {char: idx for idx, char in enumerate(vocab)}


# Define the size of the input (number of unique characters), hidden layer, and output (same as input)
input_size = len(vocab)
hidden_size = len(vocab) * 3
output_size = len(vocab)

In [5]:

# Define the RNN model
class SimpleLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_hidden_layers=1, embedding_dim=10):
        super(SimpleLSTM, self).__init__()
        self.hidden_size = hidden_size

        # Embedding layer: 8 characters in the alphabet and embedding_dim can be a small number like 10
        self.embedding = nn.Embedding(input_size, embedding_dim=embedding_dim)
        
        # RNN layer
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_hidden_layers, batch_first=True)

        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Embedding input
        embedded = self.embedding(x)

        # RNN output
        lstm_out, _ = self.lstm(embedded)
        last_lstm_out = lstm_out[:, -1, :]

        # Fully connected layer output
        out = self.fc(last_lstm_out)
        return out

In [6]:

# Create the model
simple_rnn = SimpleLSTM(input_size, hidden_size, output_size)

# Define the loss function and the optimizer
# loss_function = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

In [7]:
model_output = simple_rnn(torch.tensor([[18, 18, 18, 18]], dtype=torch.long))
model_output

tensor([[ 3.0451e-02,  8.4776e-02, -8.8940e-02,  3.8290e-02, -1.2575e-02,
          5.1000e-02, -6.8009e-02,  2.3998e-01, -6.7139e-03, -1.0598e-01,
          2.0122e-02,  6.3830e-02,  2.6482e-02,  4.5049e-02,  6.4527e-02,
         -8.0579e-02,  5.1712e-03,  1.6801e-02, -1.6177e-02,  4.3081e-02,
         -9.8260e-02,  8.4264e-02, -1.5770e-02,  2.4700e-02, -1.9264e-01,
          2.3203e-04,  3.7557e-03, -9.9411e-03,  9.6765e-03, -1.6552e-01,
         -1.7477e-02,  2.0609e-02,  1.1238e-01]], grad_fn=<AddmmBackward0>)

In [8]:
probabilities = F.softmax(model_output, dim=1)
probabilities

tensor([[0.0310, 0.0328, 0.0275, 0.0313, 0.0297, 0.0317, 0.0281, 0.0383, 0.0299,
         0.0271, 0.0307, 0.0321, 0.0309, 0.0315, 0.0321, 0.0278, 0.0303, 0.0306,
         0.0296, 0.0314, 0.0273, 0.0328, 0.0296, 0.0309, 0.0248, 0.0301, 0.0302,
         0.0298, 0.0304, 0.0255, 0.0296, 0.0307, 0.0337]],
       grad_fn=<SoftmaxBackward0>)

In [9]:
id2char[int(probabilities.argmax())]

'H'

In [10]:
char2id['S']

18

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

class CharSequenceDataset(Dataset):
    def __init__(self, sequences, labels):
        """
        Args:
            sequences (list of strings): List of character sequences.
            labels (list of strings): List of single characters (labels).
        """
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # Convert the sequence and label to numeric format
        sequence = torch.tensor([char2id[char] for char in self.sequences[idx]], dtype=torch.long)
        label = torch.tensor(char2id[self.labels[idx]], dtype=torch.long)

        return sequence, label


In [12]:
train_dataset = CharSequenceDataset(train_df["train_X"], train_df["train_y"])
test_dataset = CharSequenceDataset(test_df["test_X"], test_df["test_y"])

train_dataset

<__main__.CharSequenceDataset at 0x7f27ab55dc10>

In [13]:
# Define batch size
batch_size = 64

# Create the DataLoader for training data
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

test_loader, train_loader

(<torch.utils.data.dataloader.DataLoader at 0x7f27ad1e6110>,
 <torch.utils.data.dataloader.DataLoader at 0x7f27ab55f6d0>)

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim

# Assuming SimpleRNN model is defined as previously
model = SimpleLSTM(input_size, hidden_size, output_size)

# Hyperparameters
learning_rate = 0.00020
num_epochs = 70

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [15]:
import torch

def train_model(model, train_loader, val_loader, num_epochs, criterion, optimizer, model_save_path):
    best_val_loss = float('inf')
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0
        for batch_idx, (data, targets) in enumerate(train_loader):
            # Forward pass
            scores = model(data)
            loss = criterion(scores, targets)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for data, targets in val_loader:
                scores = model(data)
                loss = criterion(scores, targets)
                val_loss += loss.item()

        val_loss /= len(val_loader)

        # Print training and validation loss
        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Check if this is the best model so far and save it if it is
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), model_save_path)
            print(f"Saved model with validation loss: {best_val_loss:.4f}")

# Example usage
model_save_path = 'best_model.pth'  # Define your model save path
train_model(model, train_loader, test_loader, num_epochs, criterion, optimizer, model_save_path)


Epoch [1/70], Training Loss: 3.4638, Validation Loss: 3.3847
Saved model with validation loss: 3.3847
Epoch [2/70], Training Loss: 3.3052, Validation Loss: 3.0237
Saved model with validation loss: 3.0237
Epoch [3/70], Training Loss: 2.8328, Validation Loss: 2.4151
Saved model with validation loss: 2.4151
Epoch [4/70], Training Loss: 2.6928, Validation Loss: 2.3533
Saved model with validation loss: 2.3533
Epoch [5/70], Training Loss: 2.6444, Validation Loss: 2.2975
Saved model with validation loss: 2.2975
Epoch [6/70], Training Loss: 2.6005, Validation Loss: 2.2182
Saved model with validation loss: 2.2182
Epoch [7/70], Training Loss: 2.5574, Validation Loss: 2.1161
Saved model with validation loss: 2.1161
Epoch [8/70], Training Loss: 2.5171, Validation Loss: 2.0449
Saved model with validation loss: 2.0449
Epoch [9/70], Training Loss: 2.4825, Validation Loss: 1.9485
Saved model with validation loss: 1.9485
Epoch [10/70], Training Loss: 2.4520, Validation Loss: 1.9135
Saved model with val

In [16]:
import torch
from sklearn.metrics import classification_report, accuracy_score

def predict(model, data_loader):
    model.eval()  # Set the model to evaluation mode
    predictions = []
    actuals = []

    with torch.no_grad():
        for data, targets in data_loader:
            outputs = model(data)
            _, predicted = torch.max(outputs, 1)

            predictions.extend(predicted.cpu().numpy())
            actuals.extend(targets.cpu().numpy())

    return predictions, actuals


In [17]:
# Assuming 'model' is your trained SimpleRNN model
predictions, actuals = predict(model, test_loader)

# Print the classification report
print(classification_report(actuals, predictions, target_names=[id2char[s] for s in set(actuals + predictions)]))

# Calculate and print the accuracy
accuracy = accuracy_score(actuals, predictions)
print(f"Accuracy: {accuracy:.4f}")


              precision    recall  f1-score   support

           A       1.00      0.50      0.67         2
           E       0.00      0.00      0.00         3
           G       0.68      1.00      0.81        27
           H       0.00      0.00      0.00         0
           I       0.50      1.00      0.67         1
           J       0.75      0.33      0.46         9
           K       0.80      1.00      0.89        12
           M       0.90      0.96      0.93        28
           P       0.94      0.82      0.88       111
           R       0.79      0.83      0.81        92
           S       0.66      0.88      0.75        24
           T       0.93      1.00      0.97        14
           1       0.87      0.76      0.81        17
           2       0.93      0.72      0.81        39

    accuracy                           0.83       379
   macro avg       0.70      0.70      0.67       379
weighted avg       0.84      0.83      0.83       379

Accuracy: 0.8285


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
import torch
import torch.nn as nn
import torch.optim as optim

def create_and_train_model(input_size, hidden_size, output_size, num_hidden_layers, embedding_dim, train_loader, val_loader, learning_rate, num_epochs, model_save_path):
    # Create the model instance
    model = SimpleLSTM(input_size, hidden_size, output_size, num_hidden_layers, embedding_dim)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Initialize the best validation loss to infinity at the start
    best_val_loss = float('inf')

    # Training function
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0
        for batch_idx, (data, targets) in enumerate(train_loader):
            # Forward pass
            scores = model(data)
            loss = criterion(scores, targets)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for data, targets in val_loader:
                scores = model(data)
                loss = criterion(scores, targets)
                val_loss += loss.item()

        val_loss /= len(val_loader)

        # Print training and validation loss
        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Check if this is the best model so far and save it if it is
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), model_save_path)
            print(f"Saved model with validation loss: {best_val_loss:.4f}")

    # Return the trained model and its best validation loss
    return model, best_val_loss


In [21]:
# Best model tracking
best_model = None
best_accuracy = 0.0
best_model_hyperparams = {}
lowest_val_loss = float('inf')

# Hyperparameters to explore
num_hidden_layers_list = [1, 2, 3]
hidden_sizes = [128, 256]
learning_rates = [0.001, 0.0005, 0.0003, 0.0002, 0.0001]
num_epochs = 30
embedding_dims = [10, 20, 50, 100]

# Iterate over different combinations of hyperparameters
for num_hidden_layers in num_hidden_layers_list:
    for embedding_dim in embedding_dims:
        for hidden_size in hidden_sizes:
            for lr in learning_rates:
                print(f"Training with hidden_size: {hidden_size}, learning_rate: {lr}")
        
                # Create and train the model with given hyperparameters
                model, best_val_loss = create_and_train_model(input_size, hidden_size, output_size, num_hidden_layers, embedding_dim, train_loader, test_loader, lr, num_epochs, model_save_path)
        
                print(f"Model with hidden_size {hidden_size} and learning_rate {lr} achieved validation loss: {best_val_loss}. Embedding dim: {embedding_dim}. num_hidden_layers: {num_hidden_layers}")
        
                predictions, actuals = predict(model, test_loader)
                # Calculate and print the accuracy
                accuracy = accuracy_score(actuals, predictions)
                print(f"===== Accuracy {accuracy:.4f}.")
        
                # Check if this model is the best so far
                # if best_val_loss < lowest_val_loss:
                if accuracy > best_accuracy:
                    lowest_val_loss = best_val_loss
                    best_model = model
                    best_model_hyperparams = {'hidden_size': hidden_size, 'learning_rate': lr, 'embedding_dim': embedding_dim, 'num_hidden_layers': num_hidden_layers}
                    best_accuracy = accuracy

# best_model now holds the best performing model
print(f"Best model hyperparameters: {best_model_hyperparams}")
print(f"Lowest validation loss: {lowest_val_loss}")


Training with hidden_size: 128, learning_rate: 0.001
Epoch [1/30], Training Loss: 3.0126, Validation Loss: 2.2954
Saved model with validation loss: 2.2954
Epoch [2/30], Training Loss: 2.5998, Validation Loss: 2.0895
Saved model with validation loss: 2.0895
Epoch [3/30], Training Loss: 2.4684, Validation Loss: 1.8991
Saved model with validation loss: 1.8991
Epoch [4/30], Training Loss: 2.3524, Validation Loss: 1.5295
Saved model with validation loss: 1.5295
Epoch [5/30], Training Loss: 2.2497, Validation Loss: 1.4903
Saved model with validation loss: 1.4903
Epoch [6/30], Training Loss: 2.1753, Validation Loss: 1.2817
Saved model with validation loss: 1.2817
Epoch [7/30], Training Loss: 2.1034, Validation Loss: 1.1972
Saved model with validation loss: 1.1972
Epoch [8/30], Training Loss: 2.0583, Validation Loss: 1.1364
Saved model with validation loss: 1.1364
Epoch [9/30], Training Loss: 1.9978, Validation Loss: 1.1762
Epoch [10/30], Training Loss: 1.9556, Validation Loss: 1.1124
Saved mo

In [22]:
# Assuming 'model' is your trained SimpleRNN model
predictions, actuals = predict(best_model, test_loader)

# Print the classification report
print(classification_report(actuals, predictions, target_names=[id2char[s] for s in set(actuals + predictions)]))

# Calculate and print the accuracy
accuracy = accuracy_score(actuals, predictions)
print(f"Accuracy: {accuracy:.4f}")


              precision    recall  f1-score   support

           A       1.00      1.00      1.00         2
           E       0.50      1.00      0.67         3
           G       0.96      1.00      0.98        27
           H       0.00      0.00      0.00         0
           I       1.00      1.00      1.00         1
           J       0.82      1.00      0.90         9
           K       1.00      1.00      1.00        12
           M       0.90      0.96      0.93        28
           P       1.00      0.86      0.93       111
           R       0.86      0.97      0.91        92
           S       0.74      0.96      0.84        24
           T       1.00      1.00      1.00        14
           1       1.00      0.82      0.90        17
           2       1.00      0.74      0.85        39

    accuracy                           0.91       379
   macro avg       0.84      0.88      0.85       379
weighted avg       0.93      0.91      0.92       379

Accuracy: 0.9129


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
