In [114]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd

In [117]:
train = pd.read_csv('../data/train.csv', header=None, names=["text"])
test = pd.read_csv('../data/answers.csv', header=None, names=["text"])

def separate_xy(df, prefix="train"):
    df[prefix + "_X"] = df["text"].apply(lambda x: x[:-1])
    df[prefix + "_y"] = df["text"].apply(lambda x: x[-1])
    return df

train_df = separate_xy(train, 'train')
test_df = separate_xy(test, 'test')

In [118]:
train_df

Unnamed: 0,text,train_X,train_y
0,SR2SR2SGR,SR2SR2SG,R
1,SRGSRSRSR,SRGSRSRS,R
2,KFKFKFKFK,KFKFKFKF,K
3,X2LS2FRS2,X2LS2FRS,2
4,MMMMGGGGG,MMMMGGGG,G
...,...,...,...
2538,S2ZR22222,S2ZR2222,2
2539,62SNRM1MM,62SNRM1M,M
2540,ASSRSRSRS,ASSRSRSR,S
2541,2ZZT2Z2ZT,2ZZT2Z2Z,T


In [228]:
vocab = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567'

id2char = {idx: char for idx, char in enumerate(vocab)}
char2id = {char: idx for idx, char in enumerate(vocab)}


# Define the size of the input (number of unique characters), hidden layer, and output (same as input)
input_size = len(vocab)
hidden_size = len(vocab) * 3
output_size = len(vocab)

In [244]:

# Define the RNN model
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_hidden_layers=1, embedding_dim=10):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size

        # Embedding layer: 8 characters in the alphabet and embedding_dim can be a small number like 10
        self.embedding = nn.Embedding(input_size, embedding_dim=embedding_dim)
        
        # RNN layer
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_hidden_layers, batch_first=True)

        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Embedding input
        embedded = self.embedding(x)

        # RNN output
        rnn_out, _ = self.rnn(embedded)
        last_rnn_out = rnn_out[:, -1, :]

        # Fully connected layer output
        out = self.fc(last_rnn_out)
        return out

In [103]:

# Create the model
simple_rnn = SimpleRNN(input_size, hidden_size, output_size)

# Define the loss function and the optimizer
# loss_function = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

In [104]:
model_output = simple_rnn(torch.tensor([[18, 18, 18, 18]], dtype=torch.long))
model_output

tensor([[ 0.1534, -0.2244,  0.0186, -0.4392, -0.0921, -0.3821,  0.1990, -0.1727,
          0.0460,  0.1553, -0.1730,  0.1549, -0.3296, -0.1452, -0.1614,  0.2578,
         -0.1436, -0.5646, -0.1443,  0.0574,  0.1103, -0.4886, -0.1095, -0.2364,
         -0.3765, -0.1388,  0.1650,  0.2636, -0.0422, -0.0586, -0.0937,  0.2367,
         -0.1643]], grad_fn=<AddmmBackward0>)

In [105]:
probabilities = F.softmax(model_output, dim=1)
probabilities

tensor([[0.0376, 0.0258, 0.0329, 0.0208, 0.0294, 0.0220, 0.0394, 0.0272, 0.0338,
         0.0377, 0.0272, 0.0377, 0.0232, 0.0279, 0.0275, 0.0418, 0.0280, 0.0184,
         0.0279, 0.0342, 0.0360, 0.0198, 0.0289, 0.0255, 0.0222, 0.0281, 0.0381,
         0.0420, 0.0309, 0.0304, 0.0294, 0.0409, 0.0274]],
       grad_fn=<SoftmaxBackward0>)

In [108]:
id2char[int(probabilities.argmax())]

'2'

In [109]:
char2id['S']

18

In [135]:
import torch
from torch.utils.data import Dataset, DataLoader

class CharSequenceDataset(Dataset):
    def __init__(self, sequences, labels):
        """
        Args:
            sequences (list of strings): List of character sequences.
            labels (list of strings): List of single characters (labels).
        """
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # Convert the sequence and label to numeric format
        sequence = torch.tensor([char2id[char] for char in self.sequences[idx]], dtype=torch.long)
        label = torch.tensor(char2id[self.labels[idx]], dtype=torch.long)

        return sequence, label


In [136]:
train_dataset = CharSequenceDataset(train_df["train_X"], train_df["train_y"])
test_dataset = CharSequenceDataset(test_df["test_X"], test_df["test_y"])

train_dataset

<__main__.CharSequenceDataset at 0x7f3bde495b10>

In [137]:
# Define batch size
batch_size = 64

# Create the DataLoader for training data
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

test_loader, train_loader

(<torch.utils.data.dataloader.DataLoader at 0x7f3be041b050>,
 <torch.utils.data.dataloader.DataLoader at 0x7f3bdfcfb8d0>)

In [219]:
import torch
import torch.nn as nn
import torch.optim as optim

# Assuming SimpleRNN model is defined as previously
model = SimpleRNN(input_size, hidden_size, output_size)

# Hyperparameters
learning_rate = 0.00020
num_epochs = 70

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [220]:
import torch

def train_model(model, train_loader, val_loader, num_epochs, criterion, optimizer, model_save_path):
    best_val_loss = float('inf')
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0
        for batch_idx, (data, targets) in enumerate(train_loader):
            # Forward pass
            scores = model(data)
            loss = criterion(scores, targets)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for data, targets in val_loader:
                scores = model(data)
                loss = criterion(scores, targets)
                val_loss += loss.item()

        val_loss /= len(val_loader)

        # Print training and validation loss
        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Check if this is the best model so far and save it if it is
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), model_save_path)
            print(f"Saved model with validation loss: {best_val_loss:.4f}")

# Example usage
model_save_path = 'best_model.pth'  # Define your model save path
train_model(model, train_loader, test_loader, num_epochs, criterion, optimizer, model_save_path)


Epoch [1/70], Training Loss: 3.4177, Validation Loss: 3.2574
Saved model with validation loss: 3.2574
Epoch [2/70], Training Loss: 3.1802, Validation Loss: 2.8630
Saved model with validation loss: 2.8630
Epoch [3/70], Training Loss: 2.8594, Validation Loss: 2.5088
Saved model with validation loss: 2.5088
Epoch [4/70], Training Loss: 2.7275, Validation Loss: 2.3905
Saved model with validation loss: 2.3905
Epoch [5/70], Training Loss: 2.6805, Validation Loss: 2.3515
Saved model with validation loss: 2.3515
Epoch [6/70], Training Loss: 2.6533, Validation Loss: 2.3096
Saved model with validation loss: 2.3096
Epoch [7/70], Training Loss: 2.6293, Validation Loss: 2.2640
Saved model with validation loss: 2.2640
Epoch [8/70], Training Loss: 2.6041, Validation Loss: 2.2228
Saved model with validation loss: 2.2228
Epoch [9/70], Training Loss: 2.5779, Validation Loss: 2.1734
Saved model with validation loss: 2.1734
Epoch [10/70], Training Loss: 2.5505, Validation Loss: 2.1071
Saved model with val

In [221]:
import torch
from sklearn.metrics import classification_report, accuracy_score

def predict(model, data_loader):
    model.eval()  # Set the model to evaluation mode
    predictions = []
    actuals = []

    with torch.no_grad():
        for data, targets in data_loader:
            outputs = model(data)
            _, predicted = torch.max(outputs, 1)

            predictions.extend(predicted.cpu().numpy())
            actuals.extend(targets.cpu().numpy())

    return predictions, actuals


In [222]:
# Assuming 'model' is your trained SimpleRNN model
predictions, actuals = predict(model, test_loader)

# Print the classification report
print(classification_report(actuals, predictions, target_names=[id2char[s] for s in set(actuals + predictions)]))

# Calculate and print the accuracy
accuracy = accuracy_score(actuals, predictions)
print(f"Accuracy: {accuracy:.4f}")


              precision    recall  f1-score   support

           A       0.00      0.00      0.00         2
           C       0.00      0.00      0.00         0
           E       0.40      0.67      0.50         3
           F       0.00      0.00      0.00         0
           G       0.79      0.96      0.87        27
           I       1.00      1.00      1.00         1
           J       0.88      0.78      0.82         9
           K       1.00      1.00      1.00        12
           M       0.96      0.89      0.93        28
           P       0.94      0.83      0.88       111
           R       0.80      0.86      0.83        92
           S       0.70      0.96      0.81        24
           T       1.00      1.00      1.00        14
           1       1.00      0.76      0.87        17
           2       0.91      0.77      0.83        39
           5       0.00      0.00      0.00         0

    accuracy                           0.85       379
   macro avg       0.65   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [245]:
import torch
import torch.nn as nn
import torch.optim as optim

def create_and_train_model(input_size, hidden_size, output_size, num_hidden_layers, embedding_dim, train_loader, val_loader, learning_rate, num_epochs, model_save_path):
    # Create the model instance
    model = SimpleRNN(input_size, hidden_size, output_size, num_hidden_layers, embedding_dim)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Initialize the best validation loss to infinity at the start
    best_val_loss = float('inf')

    # Training function
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0
        for batch_idx, (data, targets) in enumerate(train_loader):
            # Forward pass
            scores = model(data)
            loss = criterion(scores, targets)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for data, targets in val_loader:
                scores = model(data)
                loss = criterion(scores, targets)
                val_loss += loss.item()

        val_loss /= len(val_loader)

        # Print training and validation loss
        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Check if this is the best model so far and save it if it is
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), model_save_path)
            print(f"Saved model with validation loss: {best_val_loss:.4f}")

    # Return the trained model and its best validation loss
    return model, best_val_loss


In [252]:
# Best model tracking
best_model = None
best_accuracy = 0.0
best_model_hyperparams = {}
lowest_val_loss = float('inf')

# Hyperparameters to explore
num_hidden_layers_list = [1, 2, 3]
hidden_sizes = [128, 256]
learning_rates = [0.001, 0.0005, 0.0003, 0.0002, 0.0001]
num_epochs = 50
embedding_dims = [10, 20, 50]

# Iterate over different combinations of hyperparameters
for num_hidden_layers in num_hidden_layers_list:
    for embedding_dim in embedding_dims:
        for hidden_size in hidden_sizes:
            for lr in learning_rates:
                print(f"Training with hidden_size: {hidden_size}, learning_rate: {lr}")
        
                # Create and train the model with given hyperparameters
                model, best_val_loss = create_and_train_model(input_size, hidden_size, output_size, num_hidden_layers, embedding_dim, train_loader, test_loader, lr, num_epochs, model_save_path)
        
                print(f"Model with hidden_size {hidden_size} and learning_rate {lr} achieved validation loss: {best_val_loss}. Embedding dim: {embedding_dim}. num_hidden_layers: {num_hidden_layers}")
        
                predictions, actuals = predict(model, test_loader)
                # Calculate and print the accuracy
                accuracy = accuracy_score(actuals, predictions)
                print(f"===== Accuracy {accuracy:.4f}.")
        
                # Check if this model is the best so far
                if best_val_loss < lowest_val_loss:
                    lowest_val_loss = best_val_loss
                    best_model = model
                    best_model_hyperparams = {'hidden_size': hidden_size, 'learning_rate': lr, 'embedding_dim': embedding_dim, 'num_hidden_layers': num_hidden_layers}
                    best_accuracy = accuracy

# best_model now holds the best performing model
print(f"Best model hyperparameters: {best_model_hyperparams}")
print(f"Lowest validation loss: {lowest_val_loss}")


Training with hidden_size: 128, learning_rate: 0.001
Epoch [1/50], Training Loss: 2.8705, Validation Loss: 2.2952
Saved model with validation loss: 2.2952
Epoch [2/50], Training Loss: 2.5091, Validation Loss: 1.7820
Saved model with validation loss: 1.7820
Epoch [3/50], Training Loss: 2.3226, Validation Loss: 1.5074
Saved model with validation loss: 1.5074
Epoch [4/50], Training Loss: 2.2176, Validation Loss: 1.3569
Saved model with validation loss: 1.3569
Epoch [5/50], Training Loss: 2.1384, Validation Loss: 1.3401
Saved model with validation loss: 1.3401
Epoch [6/50], Training Loss: 2.0867, Validation Loss: 1.2102
Saved model with validation loss: 1.2102
Epoch [7/50], Training Loss: 2.0305, Validation Loss: 1.1095
Saved model with validation loss: 1.1095
Epoch [8/50], Training Loss: 1.9863, Validation Loss: 1.0803
Saved model with validation loss: 1.0803
Epoch [9/50], Training Loss: 1.9429, Validation Loss: 1.0492
Saved model with validation loss: 1.0492
Epoch [10/50], Training Loss:

In [253]:
# Assuming 'model' is your trained SimpleRNN model
predictions, actuals = predict(best_model, test_loader)

# Print the classification report
print(classification_report(actuals, predictions, target_names=[id2char[s] for s in set(actuals + predictions)]))

# Calculate and print the accuracy
accuracy = accuracy_score(actuals, predictions)
print(f"Accuracy: {accuracy:.4f}")


              precision    recall  f1-score   support

           A       1.00      1.00      1.00         2
           C       0.00      0.00      0.00         0
           E       0.27      1.00      0.43         3
           G       0.76      0.70      0.73        27
           H       0.00      0.00      0.00         0
           I       0.25      1.00      0.40         1
           J       0.60      0.33      0.43         9
           K       1.00      1.00      1.00        12
           M       0.89      0.86      0.87        28
           P       0.97      0.77      0.86       111
           Q       0.00      0.00      0.00         0
           R       0.79      0.74      0.76        92
           S       0.49      0.71      0.58        24
           T       1.00      0.79      0.88        14
           Z       0.00      0.00      0.00         0
           1       0.80      0.71      0.75        17
           2       0.62      0.67      0.64        39
           4       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
