In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
import random
import optuna

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

RANDOM_STATE = 42

torch.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed_all(RANDOM_STATE)  # For multi-GPU.
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)


def _init_fn(worker_id):
    np.random.seed(int(RANDOM_STATE))

In [3]:
train = pd.read_csv('../data/train.csv', header=None, names=["text"])
test = pd.read_csv('../data/answers.csv', header=None, names=["text"])


def separate_xy(df, prefix="train"):
    df[prefix + "_X"] = df["text"].apply(lambda x: x[:-1])
    df[prefix + "_y"] = df["text"].apply(lambda x: x[-1])
    return df

train_df = separate_xy(train, 'train')
test_df = separate_xy(test, 'test')

In [4]:
train_df

Unnamed: 0,text,train_X,train_y
0,SR2SR2SGR,SR2SR2SG,R
1,SRGSRSRSR,SRGSRSRS,R
2,KFKFKFKFK,KFKFKFKF,K
3,X2LS2FRS2,X2LS2FRS,2
4,MMMMGGGGG,MMMMGGGG,G
...,...,...,...
2538,S2ZR22222,S2ZR2222,2
2539,62SNRM1MM,62SNRM1M,M
2540,ASSRSRSRS,ASSRSRSR,S
2541,2ZZT2Z2ZT,2ZZT2Z2Z,T


In [5]:
train_X, val_X, train_y, val_y = train_test_split(train_df["train_X"], train_df["text"], test_size=0.1, random_state=RANDOM_STATE)

In [6]:
vocab = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567'

id2char = {idx: char for idx, char in enumerate(vocab)}
char2id = {char: idx for idx, char in enumerate(vocab)}


# Define the size of the input (number of unique characters), hidden layer, and output (same as input)
input_size = len(vocab)
hidden_size = len(vocab) * 3
output_size = len(vocab)

In [7]:

# Define the RNN model
class SimpleLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_hidden_layers=1, embedding_dim=10):
        super(SimpleLSTM, self).__init__()
        self.hidden_size = hidden_size

        # Embedding layer: 8 characters in the alphabet and embedding_dim can be a small number like 10
        self.embedding = nn.Embedding(input_size, embedding_dim=embedding_dim)
        
        # RNN layer
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_hidden_layers, batch_first=True)

        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Embedding input
        embedded = self.embedding(x)

        # RNN output
        lstm_out, _ = self.lstm(embedded)
        last_lstm_out = lstm_out[:, -1, :]

        # Fully connected layer output
        out = self.fc(last_lstm_out)
        return out

In [8]:

# Create the model
simple_rnn = SimpleLSTM(input_size, hidden_size, output_size)

# Define the loss function and the optimizer
# loss_function = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

In [9]:
model_output = simple_rnn(torch.tensor([[18, 18, 18, 18]], dtype=torch.long))
model_output

tensor([[-0.0763, -0.0340, -0.0236, -0.0842, -0.0150, -0.0044, -0.0131, -0.0194,
          0.0457, -0.0388,  0.0771, -0.0409,  0.0475,  0.1285, -0.0146,  0.0720,
         -0.1960, -0.0074, -0.0617, -0.0261, -0.1313,  0.0337,  0.0061,  0.0036,
         -0.0881,  0.0635, -0.1149,  0.0259,  0.1789, -0.0761,  0.0471,  0.0518,
         -0.0643]], grad_fn=<AddmmBackward0>)

In [10]:
probabilities = F.softmax(model_output, dim=1)
probabilities

tensor([[0.0283, 0.0295, 0.0298, 0.0281, 0.0301, 0.0304, 0.0301, 0.0300, 0.0320,
         0.0294, 0.0330, 0.0293, 0.0320, 0.0347, 0.0301, 0.0328, 0.0251, 0.0303,
         0.0287, 0.0298, 0.0268, 0.0316, 0.0307, 0.0307, 0.0280, 0.0325, 0.0272,
         0.0313, 0.0365, 0.0283, 0.0320, 0.0322, 0.0286]],
       grad_fn=<SoftmaxBackward0>)

In [11]:
id2char[int(probabilities.argmax())]

'3'

In [12]:
char2id['S']

18

In [13]:
import torch
from torch.utils.data import Dataset, DataLoader

class CharSequenceDataset(Dataset):
    def __init__(self, sequences, labels):
        """
        Args:
            sequences (list of strings): List of character sequences.
            labels (list of strings): List of single characters (labels).
        """
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # Convert the sequence and label to numeric format
        sequence = torch.tensor([char2id[char] for char in self.sequences[idx]], dtype=torch.long)
        label = torch.tensor(char2id[self.labels[idx]], dtype=torch.long)

        return sequence, label


In [14]:
train_X

2240    2SR22SR2
678     M2SKKZEE
1194    S2LZTK2K
1572    2MSG2M2M
2276    EIVD12E1
          ...   
1638    1111SRER
1095    111R11SR
1130    SSGSRSRS
1294    MZTKE2XZ
860     22MSR1SE
Name: train_X, Length: 2288, dtype: object

In [18]:
train_df["train_X"]

0       SR2SR2SG
1       SRGSRSRS
2       KFKFKFKF
3       X2LS2FRS
4       MMMMGGGG
          ...   
2538    S2ZR2222
2539    62SNRM1M
2540    ASSRSRSR
2541    2ZZT2Z2Z
2542    22MMMZTS
Name: train_X, Length: 2543, dtype: object

In [19]:
train_dataset = CharSequenceDataset(list(train_X), list(train_y))
# train_dataset = CharSequenceDataset(train_df["train_X"], train_df["train_y"])
val_dataset = CharSequenceDataset(list(val_X), list(val_y))
test_dataset = CharSequenceDataset(test_df["test_X"], test_df["text"])

train_dataset, val_dataset, test_dataset

(<__main__.CharSequenceDataset at 0x7fa6be78cd10>,
 <__main__.CharSequenceDataset at 0x7fa6be78e0d0>,
 <__main__.CharSequenceDataset at 0x7fa6be78ded0>)

In [20]:
# Define batch size
batch_size = 64

# Create the DataLoader for training data
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, worker_init_fn=_init_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, worker_init_fn=_init_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, worker_init_fn=_init_fn)

train_loader, val_loader, test_loader

(<torch.utils.data.dataloader.DataLoader at 0x7fa6be78d210>,
 <torch.utils.data.dataloader.DataLoader at 0x7fa6be78c810>,
 <torch.utils.data.dataloader.DataLoader at 0x7fa6be78e150>)

In [21]:

def predict(model, data_loader):
    model.eval()  # Set the model to evaluation mode
    predictions = []
    actuals = []

    with torch.no_grad():
        for data, targets in data_loader:
            outputs = model(data)
            _, predicted = torch.max(outputs, 1)

            predictions.extend(predicted.cpu().numpy())
            actuals.extend(targets.cpu().numpy())

    return predictions, actuals


In [22]:
def create_and_train_model(input_size, hidden_size, output_size, num_hidden_layers, embedding_dim, train_loader, val_loader, learning_rate, num_epochs, model_save_path):
    # Create the model instance
    model = SimpleLSTM(input_size, hidden_size, output_size, num_hidden_layers, embedding_dim)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Initialize the best validation loss to infinity at the start
    best_val_loss = float('inf')
    patience = 5  # Number of epochs to wait for improvement before stopping
    epochs_no_improve = 0  # Track the number of epochs with no improvement

    # Training function
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0
        for batch_idx, (data, targets) in enumerate(train_loader):
            # Prepare the targets for next character prediction
            input_seq = data[:, :-1]
            target_seq = targets[:, 1:]

            # Forward pass
            scores = model(input_seq)
            loss = criterion(scores.view(-1, scores.size(-1)), target_seq.view(-1))

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for data, targets in val_loader:
                input_seq = data[:, :-1]
                target_seq = targets[:, 1:]

                scores = model(input_seq)
                loss = criterion(scores.view(-1, scores.size(-1)), target_seq.view(-1))
                val_loss += loss.item()

        val_loss /= len(val_loader)

        # Print training and validation loss
        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Check for early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), model_save_path)
            print(f"Saved model with validation loss: {best_val_loss:.4f}")
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"Early stopping triggered after {epoch+1} epochs.")
                break

    # Return the trained model and its best validation loss
    return model, best_val_loss


In [23]:

create_and_train_model(
    33, 256, 33, 2, 100, train_loader, val_loader, 0.0003, 5, f"../trained_models/test_model.pth"
)


KeyError: '622K12ZTM'

In [20]:

def objective(trial):
    # Define the hyperparameter search space
    num_hidden_layers = trial.suggest_int('num_hidden_layers', 1, 3)
    embedding_dim = trial.suggest_categorical('embedding_dim', [50, 100, 128])
    hidden_size = trial.suggest_categorical('hidden_size', [128, 256])
    lr = trial.suggest_categorical('lr', [0.0004, 0.0003, 0.0002])
    num_epochs = trial.suggest_int('num_epochs', 40, 70)

    # Train the model using the suggested hyperparameters
    model_save_path = f"../trained_models/model_trial_{trial.number}.pth"
    model, val_loss = create_and_train_model(input_size, hidden_size, output_size, num_hidden_layers, embedding_dim, train_loader, val_loader, lr, num_epochs, model_save_path)

    predictions, actuals = predict(model, val_loader)
    accuracy = accuracy_score(actuals, predictions)
    print(f"===== Accuracy {accuracy:.4f}.")

    return val_loss


In [21]:
n_trials=50

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=n_trials)  # Set n_trials to the number of trials you want to run

best_trial = study.best_trial
print(f"Best trial: {best_trial.number}")
print(f"Best hyperparameters: {best_trial.params}")
print(f"Best validation loss: {best_trial.value}")

# Load the best model
best_model_path = f"../trained_models/model_trial_{best_trial.number}.pth"
best_model = SimpleLSTM(input_size, best_trial.params['hidden_size'], output_size, best_trial.params['num_hidden_layers'], best_trial.params['embedding_dim'])
best_model.load_state_dict(torch.load(best_model_path))


[I 2023-11-22 09:05:43,276] A new study created in memory with name: no-name-711ed1d9-9e73-428e-b43b-d0324e1a8d68
[W 2023-11-22 09:05:43,511] Trial 0 failed with parameters: {'num_hidden_layers': 3, 'embedding_dim': 100, 'hidden_size': 128, 'lr': 0.0004, 'num_epochs': 52} because of the following error: IndexError('too many indices for tensor of dimension 1').
Traceback (most recent call last):
  File "/home/giyaseddin/miniconda3/envs/chatbot-special-language-modelling-assignment/lib/python3.11/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_77046/3130284487.py", line 11, in objective
    model, val_loss = create_and_train_model(input_size, hidden_size, output_size, num_hidden_layers, embedding_dim, train_loader, val_loader, lr, num_epochs, model_save_path)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

IndexError: too many indices for tensor of dimension 1

In [None]:
# Assuming 'model' is your trained SimpleRNN model
predictions, actuals = predict(best_model, val_loader)

# Print the classification report
print(classification_report(actuals, predictions, target_names=[id2char[s] for s in set(actuals + predictions)]))

# Calculate and print the accuracy
accuracy = accuracy_score(actuals, predictions)
print(f"Accuracy: {accuracy:.4f}")


In [None]:
# Assuming 'model' is your trained SimpleRNN model
predictions, actuals = predict(best_model, test_loader)

# Print the classification report
print(classification_report(actuals, predictions, target_names=[id2char[s] for s in set(actuals + predictions)]))

# Calculate and print the accuracy
accuracy = accuracy_score(actuals, predictions)
print(f"Accuracy: {accuracy:.4f}")


In [22]:

hidden_size = 128
embedding_dim = 100
num_hidden_layers = 1
lr = 0.0004
num_epochs = 50

model_save_path = f"../trained_models/model_trial_full_run_next_char_loss.pth"

model, val_loss = create_and_train_model(
    input_size, hidden_size, output_size, num_hidden_layers, embedding_dim, train_loader, val_loader, lr, num_epochs, model_save_path
)


IndexError: too many indices for tensor of dimension 1

In [None]:
# Assuming 'model' is your trained SimpleRNN model
predictions, actuals = predict(best_model, test_loader)

# Print the classification report
print(classification_report(actuals, predictions, target_names=[id2char[s] for s in set(actuals + predictions)]))

# Calculate and print the accuracy
accuracy = accuracy_score(actuals, predictions)
print(f"Accuracy: {accuracy:.4f}")


In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Dataset class
class CharSequenceDataset(Dataset):
    def __init__(self, dataframe, char2id):
        self.sequences = dataframe['train_X'].tolist()
        self.labels = dataframe['train_y'].tolist()
        self.char2id = char2id

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        input_sequence = self.sequences[idx]
        target_label = self.labels[idx]

        input_sequence_encoded = torch.tensor([self.char2id[char] for char in input_sequence], dtype=torch.long)
        target_label_encoded = torch.tensor(self.char2id[target_label], dtype=torch.long)

        return input_sequence_encoded, target_label_encoded

# LSTM Model
class CharLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, embedding_dim):
        super(CharLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        h0, c0 = self.init_hidden(x.size(0))
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

    def init_hidden(self, batch_size):
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)
        return h0, c0

# Prepare data
char2id = {char: idx for idx, char in enumerate('ABCD1234...')} # Adjust the alphabet as per your requirement
id2char = {idx: char for char, idx in char2id.items()}
data = {
    'train_X': ['SR2SR2SG', 'SRGSRSRS'], # Example data
    'train_y': ['R', 'R']
}
df = pd.DataFrame(data)
dataset = CharSequenceDataset(df, char2id)
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Model hyperparameters
input_size = len(char2id)
hidden_size = 128
output_size = input_size
num_layers = 2
embedding_dim = 100
learning_rate = 0.001
num_epochs = 50

# Model, loss, optimizer
model = CharLSTM(input_size, hidden_size, output_size, num_layers, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(data_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# Function to generate sequences
def generate_sequence(model, start_seq, char2id, id2char, length=9):
    model.eval()
    sequence = start_seq

    for _ in range(length - len(start_seq)):
        input_seq = torch.tensor([char2id[char] for char in sequence], dtype=torch.long).unsqueeze(0)
        with torch.no_grad():
            output = model(input_seq)
            predicted_char_id = output.argmax(dim=1).item()
        sequence += id2char[predicted_char_id]

    return sequence

# Example sequence generation
start_seq = 'SR2SR2SG'
generated_seq = generate_sequence(model, start_seq, char2id, id2char)
print("Generated sequence:", generated_seq)


KeyError: 'S'

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Dataset class
class CharSequenceDataset(Dataset):
    def __init__(self, sequences, char2id):
        """
        sequences (list of strings): List of character sequences.
        char2id (dict): Mapping from character to unique integer ID.
        """
        self.sequences = sequences
        self.char2id = char2id

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        input_sequence = sequence[:-1]  # First 8 characters as input
        target_label = sequence[-1]     # Last character as target

        input_sequence_encoded = torch.tensor([self.char2id[char] for char in input_sequence], dtype=torch.long)
        target_label_encoded = torch.tensor(self.char2id[target_label], dtype=torch.long)

        return input_sequence_encoded, target_label_encoded

# LSTM Model
class CharLSTM(nn.Module):
    # ... (same as before)

# Prepare data
data = {
    'text': ['SR2SR2SGR', 'SRGSRSRSR']  # Example data
}
df = pd.DataFrame(data)
char2id = {char: idx for idx, char in enumerate(sorted(set(''.join(df['text']))))}
id2char = {idx: char for char, idx in char2id.items()}

sequences = df['text'].tolist()
dataset = CharSequenceDataset(sequences, char2id)
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Model hyperparameters
input_size = len(char2id)
hidden_size = 128
output_size = input_size
num_layers = 2
embedding_dim = 100
learning_rate = 0.001
num_epochs = 50

# Model, loss, optimizer
model = CharLSTM(input_size, hidden_size, output_size, num_layers, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(data_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

    
# Example sequence generation
start_seq = 'SR2SR2SG'  # Example start sequence
generated_seq = generate_sequence(model, start_seq, char2id, id2char)
print("Generated sequence:", generated_seq)
