### Problem 1 - Sentiment Analysis using recurrent models


#### 1. Import

In [None]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
# Load the IMDB reviews CSV from the mounted Drive folder
df = pd.read_csv('/content/drive/MyDrive/DS301/IMDB Dataset.csv')
# Encode the sentiment column as integers: 1 where the value is "positive", 0 otherwise
df.sentiment = (df.sentiment == "positive").astype("int")
# Display the first rows as a quick sanity check
df.head()

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
import re
from collections import Counter
from keras.utils import to_categorical

# Keep 70% of the rows for training, then split the remaining 30% in half
# (0.15 / 0.3 == 0.5) into validation and test sets, i.e. a 70/15/15 split.
# The fixed random_state makes both splits repeatable.
train_df, temp_df = train_test_split(df, train_size=0.7, random_state=42)
val_df, test_df = train_test_split(temp_df, train_size=(0.15/0.3), random_state=42)

# Preprocessing function to clean and tokenize text
def process_tokens(text):
    """Lowercase ``text``, drop every character that is not an ASCII letter or
    whitespace, and return the remaining whitespace-separated tokens.

    Note that punctuation is deleted rather than replaced by a space, so
    "it's" becomes the single token "its".
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-zA-Z\s]", "", lowered)
    return letters_only.split()

# Simplified preprocessing function
def preprocessing(data):
    """Tokenize every sentence in ``data`` with ``process_tokens``.

    Returns a list with one token list per input sentence, in input order.
    """
    return list(map(process_tokens, data))

# The text and labels are in the first and second columns respectively.
# Texts are pulled out as NumPy arrays (.values); labels stay pandas Series cast to int.
train_texts, train_labels = train_df.iloc[:, 0].values, train_df.iloc[:, 1].astype('int')
val_texts, val_labels = val_df.iloc[:, 0].values, val_df.iloc[:, 1].astype('int')
test_texts, test_labels = test_df.iloc[:, 0].values, test_df.iloc[:, 1].astype('int')

# Process texts: each split becomes a list of token lists (one per review)
train_data = preprocessing(train_texts)
val_data = preprocessing(val_texts)
test_data = preprocessing(test_texts)

# Vectorizer using a simple Bag of Words model
class Vectorizer:
    """Bag-of-words count vectorizer limited to the most frequent tokens.

    Attributes:
        max_features: number of columns in every matrix produced by ``transform``.
        vocab_list: tokens kept by ``fit``, most frequent first.
        token_to_index: maps each kept token to its column index.
    """

    def __init__(self, max_features=2000):
        self.max_features = max_features
        self.vocab_list = []
        self.token_to_index = {}

    def fit(self, dataset):
        """Learn the vocabulary from ``dataset`` (an iterable of token lists)."""
        # Count tokens sentence by sentence; the counting order matches a flat pass
        counts = Counter()
        for sentence in dataset:
            counts.update(sentence)
        self.vocab_list = [token for token, _ in counts.most_common(self.max_features)]
        self.token_to_index = dict(zip(self.vocab_list, range(len(self.vocab_list))))

    def transform(self, dataset):
        """Return a ``(len(dataset), max_features)`` matrix of token counts.

        Tokens that are not in the fitted vocabulary are ignored.
        """
        counts_matrix = np.zeros((len(dataset), self.max_features))
        lookup = self.token_to_index
        for row, sentence in enumerate(dataset):
            for token in sentence:
                if token in lookup:
                    counts_matrix[row, lookup[token]] += 1
        return counts_matrix

# Initialize and fit the vectorizer (vocabulary is learned from the training split only)
vectorizer = Vectorizer()
vectorizer.fit(train_data)

# Transform datasets into bag-of-words count matrices
X_train = vectorizer.transform(train_data)
X_val = vectorizer.transform(val_data)
X_test = vectorizer.transform(test_data)

# Encode labels as one-hot vectors over the two sentiment classes
y_train = to_categorical(train_labels, 2)
y_val = to_categorical(val_labels, 2)
y_test = to_categorical(test_labels, 2)

# Reshaping the data to fit the model input shape: a new axis of length 1 is
# inserted at position 1, so each review is presented to the recurrent layers
# as a sequence with a single time step holding the whole count vector.
X_train = X_train[:, np.newaxis, :]
X_val = X_val[:, np.newaxis, :]
X_test = X_test[:, np.newaxis, :]

print(f'X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}')


#### 2. RNN

In [None]:
# Report how many tokens the fitted vectorizer actually kept in its vocabulary
actual_vocab_size = len(vectorizer.vocab_list)
print(f"The actual vocabulary size used: {actual_vocab_size}")

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam

# Define and compile the RNN model in a function
def compile_and_train_rnn(X_train, y_train, X_val, y_val, input_shape, units=256, learning_rate=0.01, epochs=10, batch_size=256, test_data=None):
    """Build, train and evaluate a single-layer SimpleRNN classifier.

    Args:
        X_train, y_train: training inputs and one-hot labels.
        X_val, y_val: validation data passed to ``model.fit``.
        input_shape: shape of one sample, given to the SimpleRNN layer.
        units: number of recurrent units.
        learning_rate: learning rate for the Adam optimizer.
        epochs, batch_size: forwarded to ``model.fit``.
        test_data: optional ``(X_test, y_test)`` tuple used for the final
            evaluation. When omitted, the module-level ``X_test`` and
            ``y_test`` are used, so existing calls keep working.

    Returns:
        The accuracy reported by ``model.evaluate`` on the test data.
    """
    if test_data is None:
        # Fallback to the notebook globals keeps the original call signature valid
        test_data = (X_test, y_test)
    eval_inputs, eval_labels = test_data

    # Initialize the Sequential model
    model = Sequential([
        SimpleRNN(units, input_shape=input_shape, activation='tanh'),
        Dense(2, activation='softmax')  # Two-way softmax over the one-hot sentiment labels
    ])

    # Compile the model with the Adam optimizer at the requested learning rate
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val), verbose=1)

    # Evaluate the model on the test set; only the accuracy is returned
    _, test_accuracy = model.evaluate(eval_inputs, eval_labels, verbose=0)

    return test_accuracy

# Read the feature width from the prepared data so it cannot drift from the vectorizer setting
max_features = X_train.shape[-1]
# Train the model and print the test accuracy
test_accuracy = compile_and_train_rnn(X_train, y_train, X_val, y_val, input_shape=(1, max_features))
print(f'Test accuracy: {test_accuracy}')


#### 3. LSTM

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam


# Define and compile the LSTM model in a function
def compile_and_train_lstm(X_train, y_train, X_val, y_val, input_shape, units=256, learning_rate=0.01, epochs=10, batch_size=256, test_data=None):
    """Build, train and evaluate a single-layer LSTM classifier.

    Args:
        X_train, y_train: training inputs and one-hot labels.
        X_val, y_val: validation data passed to ``model.fit``.
        input_shape: shape of one sample, given to the LSTM layer.
        units: number of recurrent units.
        learning_rate: learning rate for the Adam optimizer.
        epochs, batch_size: forwarded to ``model.fit``.
        test_data: optional ``(X_test, y_test)`` tuple used for the final
            evaluation. When omitted, the module-level ``X_test`` and
            ``y_test`` are used, so existing calls keep working.

    Returns:
        The accuracy reported by ``model.evaluate`` on the test data.
    """
    if test_data is None:
        # Fallback to the notebook globals keeps the original call signature valid
        test_data = (X_test, y_test)
    eval_inputs, eval_labels = test_data

    # Initialize the Sequential model
    model = Sequential([
        LSTM(units, input_shape=input_shape, activation='tanh'),
        Dense(2, activation='softmax')  # Two-way softmax over the one-hot sentiment labels
    ])

    # Compile the model with the Adam optimizer at the requested learning rate
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val), verbose=1)

    # Evaluate the model on the test set; only the accuracy is returned
    _, test_accuracy = model.evaluate(eval_inputs, eval_labels, verbose=0)

    return test_accuracy

# Read the feature width from the prepared data so it always matches the text vectorization
max_features = X_train.shape[-1]
# Train the model and print the test accuracy
test_accuracy = compile_and_train_lstm(X_train, y_train, X_val, y_val, input_shape=(1, max_features))
print(f'Test accuracy: {test_accuracy}')


#### 4. GRU

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense
from tensorflow.keras.optimizers import Adam

# Define and compile the GRU model in a function
def compile_and_train_gru(X_train, y_train, X_val, y_val, input_shape, units=256, learning_rate=0.01, epochs=10, batch_size=256, test_data=None):
    """Build, train and evaluate a single-layer GRU classifier.

    Args:
        X_train, y_train: training inputs and one-hot labels.
        X_val, y_val: validation data passed to ``model.fit``.
        input_shape: shape of one sample, given to the GRU layer.
        units: number of recurrent units.
        learning_rate: learning rate for the Adam optimizer.
        epochs, batch_size: forwarded to ``model.fit``.
        test_data: optional ``(X_test, y_test)`` tuple used for the final
            evaluation. When omitted, the module-level ``X_test`` and
            ``y_test`` are used, so existing calls keep working.

    Returns:
        The accuracy reported by ``model.evaluate`` on the test data.
    """
    if test_data is None:
        # Fallback to the notebook globals keeps the original call signature valid
        test_data = (X_test, y_test)
    eval_inputs, eval_labels = test_data

    # Initialize the Sequential model
    model = Sequential([
        GRU(units, input_shape=input_shape, activation='tanh'),
        Dense(2, activation='softmax')  # Two-way softmax over the one-hot sentiment labels
    ])

    # Compile the model with the Adam optimizer at the requested learning rate
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val), verbose=1)

    # Evaluate the model on the test set; only the accuracy is returned
    _, test_accuracy = model.evaluate(eval_inputs, eval_labels, verbose=0)

    return test_accuracy

# Read the feature width from the prepared data so it cannot drift from the vectorizer setting
max_features = X_train.shape[-1]
# Train the model and print the test accuracy
test_accuracy = compile_and_train_gru(X_train, y_train, X_val, y_val, input_shape=(1, max_features))
print(f'Test accuracy: {test_accuracy}')

#### 5. BiLSTM

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Define and compile the BiLSTM model in a function
def compile_and_train_bilstm(X_train, y_train, X_val, y_val, input_shape, units=256, learning_rate=0.01, epochs=10, batch_size=256, test_data=None):
    """Build, train and evaluate a single-layer bidirectional LSTM classifier.

    Args:
        X_train, y_train: training inputs and one-hot labels.
        X_val, y_val: validation data passed to ``model.fit``.
        input_shape: shape of one sample, given to the wrapped LSTM layer.
        units: number of recurrent units in the wrapped LSTM.
        learning_rate: learning rate for the Adam optimizer.
        epochs, batch_size: forwarded to ``model.fit``.
        test_data: optional ``(X_test, y_test)`` tuple used for the final
            evaluation. When omitted, the module-level ``X_test`` and
            ``y_test`` are used, so existing calls keep working.

    Returns:
        The accuracy reported by ``model.evaluate`` on the test data.
    """
    if test_data is None:
        # Fallback to the notebook globals keeps the original call signature valid
        test_data = (X_test, y_test)
    eval_inputs, eval_labels = test_data

    # Initialize the Sequential model
    model = Sequential([
        Bidirectional(LSTM(units, input_shape=input_shape, activation='tanh')),
        Dense(2, activation='softmax')  # Two-way softmax over the one-hot sentiment labels
    ])

    # Compile the model with the Adam optimizer at the requested learning rate
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val), verbose=1)

    # Evaluate the model on the test set; only the accuracy is returned
    _, test_accuracy = model.evaluate(eval_inputs, eval_labels, verbose=0)

    return test_accuracy

# Read the feature width from the prepared data so it cannot drift from the vectorizer setting
max_features = X_train.shape[-1]
# Train the model and print the test accuracy
test_accuracy = compile_and_train_bilstm(X_train, y_train, X_val, y_val, input_shape=(1, max_features))
print(f'Test accuracy: {test_accuracy}')

#### 6. Best accuracy
The LSTM model yields the best test accuracy (0.875).

### Problem 2 - Training a simple chatbot using a seq-to-seq model

#### 1. Chatbot model

In [None]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import json


USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

In [None]:
corpus_name = "movie-corpus"
drive_path = "/content/drive/MyDrive/DS301"
corpus = os.path.join(drive_path, corpus_name)

def printLines(file, n=10):
    """Print the first ``n`` lines of a UTF-8 text file.

    Only the requested lines are read, so previewing a large corpus file does
    not load it fully into memory. Each line keeps its trailing newline, so
    the output shows a blank line between entries.

    Args:
        file: path of the file to preview.
        n: maximum number of lines to print (``None`` prints every line).
    """
    with open(file, 'r', encoding='utf-8') as datafile:
        for line in itertools.islice(datafile, n):
            print(line)

# Preview the raw corpus; `corpus` is the corpus directory built above
# (the previously used name `corpus_path` is never defined in this notebook)
printLines(os.path.join(corpus, "utterances.jsonl"))

{"id": "L1045", "conversation_id": "L1044", "text": "They do not!", "speaker": "u0", "meta": {"movie_id": "m0", "parsed": [{"rt": 1, "toks": [{"tok": "They", "tag": "PRP", "dep": "nsubj", "up": 1, "dn": []}, {"tok": "do", "tag": "VBP", "dep": "ROOT", "dn": [0, 2, 3]}, {"tok": "not", "tag": "RB", "dep": "neg", "up": 1, "dn": []}, {"tok": "!", "tag": ".", "dep": "punct", "up": 1, "dn": []}]}]}, "reply-to": "L1044", "timestamp": null, "vectors": []}

{"id": "L1044", "conversation_id": "L1044", "text": "They do to!", "speaker": "u2", "meta": {"movie_id": "m0", "parsed": [{"rt": 1, "toks": [{"tok": "They", "tag": "PRP", "dep": "nsubj", "up": 1, "dn": []}, {"tok": "do", "tag": "VBP", "dep": "ROOT", "dn": [0, 2, 3]}, {"tok": "to", "tag": "TO", "dep": "dobj", "up": 1, "dn": []}, {"tok": "!", "tag": ".", "dep": "punct", "up": 1, "dn": []}]}]}, "reply-to": null, "timestamp": null, "vectors": []}

{"id": "L985", "conversation_id": "L984", "text": "I hope so.", "speaker": "u0", "meta": {"movie_id"

In [None]:
# Splits each line of the file to create lines and conversations
def loadLinesAndConversations(fileName):
    """Parse a JSON-lines utterance file into line and conversation lookups.

    Each input row must provide "id", "speaker", "text", "conversation_id"
    and "meta" -> "movie_id".

    Returns:
        lines: dict mapping line id to {"lineID", "characterID", "text"}.
        conversations: dict mapping conversation id to
            {"conversationID", "movieID", "lines"}. Every later row of a
            conversation is inserted at the front of its "lines" list, so the
            list ends up in reverse file order.
    """
    lines, conversations = {}, {}
    with open(fileName, 'r', encoding='iso-8859-1') as handle:
        for raw in handle:
            record = json.loads(raw)

            # Line object shared by both lookups
            utterance = {
                "lineID": record["id"],
                "characterID": record["speaker"],
                "text": record["text"],
            }
            lines[utterance["lineID"]] = utterance

            conv_id = record["conversation_id"]
            existing = conversations.get(conv_id)
            if existing is None:
                # First row seen for this conversation: create its object
                conversations[conv_id] = {
                    "conversationID": conv_id,
                    "movieID": record["meta"]["movie_id"],
                    "lines": [utterance],
                }
            else:
                existing["lines"].insert(0, utterance)

    return lines, conversations


# Extracts pairs of sentences from conversations
def extractSentencePairs(conversations):
    """Build [query, response] pairs from consecutive lines of each conversation.

    Both texts are stripped of surrounding whitespace; a pair is dropped when
    either side is empty after stripping. The last line of a conversation has
    no response and therefore never appears as a query.
    """
    qa_pairs = []
    for conversation in conversations.values():
        turns = conversation["lines"]
        # Walk adjacent (line, next line) couples
        for current, following in zip(turns, turns[1:]):
            inputLine = current["text"].strip()
            targetLine = following["text"].strip()
            if inputLine and targetLine:
                qa_pairs.append([inputLine, targetLine])
    return qa_pairs

In [None]:
import codecs
import csv

# Corpus directory on Drive. Note: this rebinds `drive_path`, which an earlier
# cell set to the parent DS301 folder.
drive_path = "/content/drive/MyDrive/DS301/movie-corpus"

# Define path to new file
datafile = os.path.join(drive_path, "formatted_movie_lines.txt")

# A tab separates the query from the response in every output row
delimiter = '\t'

print("\nProcessing corpus into lines and conversations...")
lines, conversations = loadLinesAndConversations(os.path.join(drive_path, "utterances.jsonl"))

# Write new csv file
print("\nWriting newly formatted file...")
# newline='' leaves line-ending handling to the csv writer, as the csv module
# documentation requires for files passed to csv.writer
with open(datafile, 'w', encoding='utf-8', newline='') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
    writer.writerows(extractSentencePairs(conversations))

# Function to print a sample of lines
def printLines(file, n=10):
    """Print the first ``n`` lines of a UTF-8 text file.

    NOTE(review): an identical helper is already defined in an earlier cell;
    this definition simply rebinds the same name.

    Only the requested lines are read, so previewing a large file does not
    load it fully into memory. Each line keeps its trailing newline, so the
    output shows a blank line between entries.

    Args:
        file: path of the file to preview.
        n: maximum number of lines to print (``None`` prints every line).
    """
    with open(file, 'r', encoding='utf-8') as datafile:
        for line in itertools.islice(datafile, n):
            print(line)

# Print a sample of lines
print("\nSample lines from file:")
printLines(datafile)



Processing corpus into lines and conversations...

Writing newly formatted file...

Sample lines from file:
They do to!	They do not!

She okay?	I hope so.

Wow	Let's go.

"I'm kidding.  You know how sometimes you just become this ""persona""?  And you don't know how to quit?"	No

No	Okay -- you're gonna need to learn how to lie.

I figured you'd get to the good stuff eventually.	What good stuff?

What good stuff?	"The ""real you""."

"The ""real you""."	Like my fear of wearing pastels?

do you listen to this crap?	What crap?

What crap?	Me.  This endless ...blonde babble. I'm like, boring myself.



In [None]:
# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token
UNK_token = 3  # Unknown word token

class Voc:
    """Vocabulary mapping words to integer indexes and back.

    Attributes:
        name: label given to this vocabulary.
        trimmed: True once ``trim`` has run (it only runs once).
        word2index: word -> index; starts with only "UNK".
        word2count: word -> number of times ``addWord`` saw it.
        index2word: index -> word; pre-populated with PAD, SOS, EOS and UNK.
        num_words: next free index (the four reserved tokens are counted).
    """

    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self._reset()

    def _reset(self):
        """(Re)create the dictionaries holding only the reserved tokens."""
        self.word2index = {"UNK": UNK_token}
        self.word2count = {"UNK": 0}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS", UNK_token: "UNK"}
        self.num_words = 4  # Count SOS, EOS, PAD, UNK

    def addSentence(self, sentence):
        """Add every space-separated word of ``sentence`` to the vocabulary."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Register ``word`` with a new index, or bump its count if already known."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Call this method to safely get the index of a word
    def getIndex(self, word):
        """Return the index of ``word``, or ``UNK_token`` when it is unknown."""
        return self.word2index.get(word, UNK_token)

    # Remove words below a certain count threshold
    def trim(self, min_count):
        """Keep only words seen at least ``min_count`` times and re-index them.

        Runs at most once per instance. Counts of surviving words restart at 1
        because the words are re-added through ``addWord``.
        """
        if self.trimmed:
            return
        self.trimmed = True

        # Don't trim the "UNK" token: it is restored by the reset below
        keep_words = [
            word for word, count in self.word2count.items()
            if count >= min_count and word != "UNK"
        ]

        # Exclude "UNK" from the original count; guard the ratio so an empty
        # vocabulary reports 0 instead of raising ZeroDivisionError
        total_words = len(self.word2index) - 1
        kept_ratio = len(keep_words) / total_words if total_words else 0.0
        print('keep_words {} / {} = {:.4f}'.format(len(keep_words), total_words, kept_ratio))

        # Reinitialize dictionaries
        self._reset()

        for word in keep_words:
            self.addWord(word)


In [None]:
MAX_LENGTH = 10  # Maximum sentence length to consider

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip combining marks (accents) from ``s``.

    The string is decomposed with NFD normalization and every character in
    Unicode category 'Mn' is dropped. Characters without a decomposition are
    left untouched, so the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Lowercase and simplify ``s`` for tokenization.

    Steps: lowercase, strip, remove accents via ``unicodeToAscii``, put a
    space before each of ``.``, ``!`` and ``?``, replace every run of other
    non-letter characters with one space, then collapse whitespace.
    """
    cleaned = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),        # detach sentence punctuation
        (r"[^a-zA-Z.!?]+", r" "),    # drop everything but letters and . ! ?
        (r"\s+", r" "),              # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    """Read tab-separated query/response rows and create an empty vocabulary.

    Args:
        datafile: path of the UTF-8 file with one tab-separated pair per line.
        corpus_name: name given to the new ``Voc`` object.

    Returns:
        (voc, pairs): a fresh ``Voc`` and a list with, for every line, the list
        of its tab-separated fields after ``normalizeString``.
    """
    print("Reading lines...")
    # Read the file and split into lines; the context manager closes the handle
    with open(datafile, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs

# Returns True if both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
    """Return True when ``p`` is a usable [query, response] pair.

    A pair is kept only if it has exactly two sentences, both are non-empty,
    and both have fewer than ``MAX_LENGTH`` space-separated words (one slot is
    left for the EOS token appended later).

    Rows that did not split into exactly two fields, or whose sentences became
    empty after normalization, are rejected instead of raising IndexError or
    producing an empty-string "word".
    """
    if len(p) != 2:
        return False
    return all(bool(s) and len(s.split(' ')) < MAX_LENGTH for s in p)

# Filter pairs using the ``filterPair`` condition
def filterPairs(pairs):
    """Return the pairs accepted by ``filterPair``, in their original order."""
    return list(filter(filterPair, pairs))

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    """Read ``datafile``, filter its pairs and populate a vocabulary from them.

    ``corpus`` and ``save_dir`` are accepted but not used by this function.

    Returns:
        (voc, pairs): the populated ``Voc`` and the filtered list of pairs.
    """
    print("Start preparing training data ...")
    voc, all_pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(all_pairs)))
    kept_pairs = filterPairs(all_pairs)
    print("Trimmed to {!s} sentence pairs".format(len(kept_pairs)))
    print("Counting words...")
    for pair in kept_pairs:
        # Register the query first, then the response
        for side in (0, 1):
            voc.addSentence(pair[side])
    print("Counted words:", voc.num_words)
    return voc, kept_pairs


# Load/Assemble voc and pairs
# save_dir is a relative "data/save" path; loadPrepareData receives it but does not use it
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

Start preparing training data ...
Reading lines...
Read 221282 sentence pairs
Trimmed to 64313 sentence pairs
Counting words...
Counted words: 18083

pairs:
['they do to !', 'they do not !']
['she okay ?', 'i hope so .']
['wow', 'let s go .']
['what good stuff ?', 'the real you .']
['the real you .', 'like my fear of wearing pastels ?']
['do you listen to this crap ?', 'what crap ?']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['have fun tonight ?', 'tons']


In [None]:
MIN_COUNT = 3    # Minimum word count threshold for trimming

def trimRareWords(voc, pairs, MIN_COUNT):
    """Trim rare words from ``voc`` and drop every pair that uses one.

    Args:
        voc: vocabulary object providing ``trim(min_count)`` and ``word2index``.
        pairs: list of [input_sentence, output_sentence] pairs.
        MIN_COUNT: minimum word count passed to ``voc.trim``.

    Returns:
        The pairs whose input and output sentences contain only words still
        present in ``voc.word2index`` after trimming.
    """
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)
    # Read the mapping after trimming: trim() may replace the dictionary
    known_words = voc.word2index

    def _all_known(sentence):
        """True when every space-separated word of ``sentence`` is in the voc."""
        return all(word in known_words for word in sentence.split(' '))

    # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
    keep_pairs = [pair for pair in pairs if _all_known(pair[0]) and _all_known(pair[1])]

    # Guard the ratio so an empty pair list reports 0 instead of raising ZeroDivisionError
    kept_ratio = len(keep_pairs) / len(pairs) if pairs else 0.0
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), kept_ratio))
    return keep_pairs


# Trim voc and pairs
pairs = trimRareWords(voc, pairs, MIN_COUNT)

keep_words 7833 / 18079 = 0.4333
Trimmed from 64313 pairs to 53131, 0.8261 of total


In [None]:
def indexesFromSentence(voc, sentence):
    """Convert ``sentence`` to a list of word indexes terminated by ``EOS_token``.

    Words are looked up directly in ``voc.word2index``, so a word missing from
    the vocabulary raises KeyError (``Voc.getIndex`` is the variant that falls
    back to the UNK index).
    """
    indexes = [voc.word2index[word] for word in sentence.split(' ')]
    indexes.append(EOS_token)
    return indexes

def zeroPadding(l, fillvalue=PAD_token):
    """Transpose a batch of index sequences, padding short ones with ``fillvalue``.

    Returns a list of tuples: element ``t`` holds the ``t``-th token of every
    sequence in ``l``, so the result is laid out as (time step, batch item).
    """
    padded_columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return list(padded_columns)

def binaryMatrix(l, value=PAD_token):
    """Build a 0/1 mask with the same layout as ``l``.

    Args:
        l: iterable of token sequences (e.g. the output of ``zeroPadding``).
        value: token treated as padding; defaults to ``PAD_token``.

    Returns:
        A list of lists holding 0 where the token equals ``value`` and 1
        elsewhere. The ``value`` argument is honoured here rather than the
        comparison being hard-wired to ``PAD_token``.
    """
    return [[0 if token == value else 1 for token in seq] for seq in l]

# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
    """Encode a batch of input sentences.

    Returns:
        padVar: LongTensor built from the zero-padded, transposed index lists.
        lengths: tensor with the unpadded length (EOS included) of each sentence.
    """
    encoded = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor(list(map(len, encoded)))
    padVar = torch.LongTensor(zeroPadding(encoded))
    return padVar, lengths

# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    """Encode a batch of target sentences.

    Returns:
        padVar: LongTensor built from the zero-padded, transposed index lists.
        mask: BoolTensor of the same layout, False at padding positions.
        max_target_len: length of the longest encoded target (EOS included).
    """
    encoded = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(seq) for seq in encoded])
    padded = zeroPadding(encoded)
    mask = torch.BoolTensor(binaryMatrix(padded))
    return torch.LongTensor(padded), mask, max_target_len

# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    """Turn a batch of [input, output] sentence pairs into training tensors.

    The pairs are ordered by decreasing input word count, which is the order
    ``pack_padded_sequence`` expects by default. A sorted copy is used, so the
    caller's list is left untouched.

    Returns:
        (inp, lengths, output, mask, max_target_len) as produced by
        ``inputVar`` and ``outputVar``.
    """
    ordered = sorted(pair_batch, key=lambda pair: len(pair[0].split(" ")), reverse=True)
    input_batch = [pair[0] for pair in ordered]
    output_batch = [pair[1] for pair in ordered]
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len


# Example for validation: build one small batch from randomly drawn pairs
# (no seed is set here, so the sampled pairs differ between runs)
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

# Show every component of the batch
print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

input_variable: tensor([[ 38,  20,  20, 104,  20],
        [285, 336,  18,  11,  11],
        [ 20,  86,  23,   2,   2],
        [  5, 258, 295,   0,   0],
        [ 51,  26,  11,   0,   0],
        [ 49,  11,   2,   0,   0],
        [ 11,   2,   0,   0,   0],
        [  2,   0,   0,   0,   0]])
lengths: tensor([8, 7, 6, 3, 3])
target_variable: tensor([[  17,   35,  278,  104,    5],
        [  73,  141,   73,   20,   25],
        [ 646,  269,   11,   11,  351],
        [  63,  100,    2,    2,  353],
        [ 202,  188,    0,    0,   73],
        [5741,   11,    0,    0,   11],
        [ 159,    2,    0,    0,    2],
        [  25,    0,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
mask: tensor([[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True, False, False,  True],
        [ True,  True, False, False,  True],
        [ True, 

In [None]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder operating on embedded, packed input batches."""

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # The GRU consumes word embeddings, so its input size equals hidden_size.
        # Dropout is only passed on when the GRU has more than one layer.
        gru_dropout = dropout if n_layers != 1 else 0
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=gru_dropout, bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch.

        Args:
            input_seq: word-index tensor fed to the embedding layer.
            input_lengths: per-sequence lengths used for packing.
            hidden: optional initial hidden state for the GRU.

        Returns:
            outputs: GRU outputs with the two directions summed, so the last
                dimension is ``hidden_size``.
            hidden: final hidden state returned by the GRU.
        """
        # Embed, then pack so the GRU skips padded positions
        packed = nn.utils.rnn.pack_padded_sequence(self.embedding(input_seq), input_lengths)
        packed_out, hidden = self.gru(packed, hidden)
        # Back to a padded tensor
        padded_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out)
        # Sum the forward and backward halves of the feature dimension
        forward_half = padded_out[:, :, :self.hidden_size]
        backward_half = padded_out[:, :, self.hidden_size:]
        return forward_half + backward_half, hidden

In [None]:
# Luong attention layer
class Attn(nn.Module):
    """Luong attention layer supporting the 'dot', 'general' and 'concat' scores.

    Shapes noted below assume ``hidden`` is (1, batch, hidden_size) and
    ``encoder_outputs`` is (max_length, batch, hidden_size) — TODO confirm
    against the callers.
    """

    def __init__(self, method, hidden_size):
        """
        Args:
            method: one of 'dot', 'general' or 'concat'.
            hidden_size: feature size of the decoder state and encoder outputs.

        Raises:
            ValueError: if ``method`` is not a supported scoring method.
        """
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            # Single formatted message (passing two args would render as a tuple)
            raise ValueError("{} is not an appropriate attention method.".format(self.method))
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(n) returns uninitialized memory, which may hold
            # NaN/inf; allocate and initialize the scoring vector explicitly.
            self.v = nn.Parameter(torch.empty(hidden_size))
            bound = hidden_size ** -0.5
            nn.init.uniform_(self.v, -bound, bound)

    def dot_score(self, hidden, encoder_output):
        """Score = sum over features of hidden * encoder_output."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Score = sum over features of hidden * Linear(encoder_output)."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score = sum over features of v * tanh(Linear([hidden; encoder_output]))."""
        # Repeat the decoder state along the first axis to match encoder_output
        expanded = hidden.expand(encoder_output.size(0), -1, -1)
        energy = self.attn(torch.cat((expanded, encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalized attention weights with an added middle axis."""
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [None]:
class LuongAttnDecoderRNN(nn.Module):
    """GRU decoder with Luong attention that decodes one time step per call."""

    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super().__init__()

        # Hyper-parameters kept for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Layers; GRU dropout is only passed on when there is more than one layer
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        gru_dropout = dropout if n_layers != 1 else 0
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Decode a single step.

        Args:
            input_step: word indexes of the current decoder input.
            last_hidden: previous GRU hidden state.
            encoder_outputs: outputs of the encoder to attend over.

        Returns:
            output: softmax probabilities over the output vocabulary.
            hidden: updated GRU hidden state.
        """
        # Embed the current input word and apply dropout
        embedded = self.embedding_dropout(self.embedding(input_step))
        # One step through the unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Weighted sum of encoder outputs gives the context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)).squeeze(1)
        # Luong eq. 5: combine GRU output and context
        combined = torch.cat((rnn_output.squeeze(0), context), 1)
        concat_output = torch.tanh(self.concat(combined))
        # Luong eq. 6: project to the vocabulary and normalize
        output = F.softmax(self.out(concat_output), dim=1)
        return output, hidden

In [None]:
def maskNLLLoss(inp, target, mask):
    """Average negative log-likelihood over the positions selected by ``mask``.

    Args:
        inp: per-row probability distributions (the probability of each row's
            target is gathered along dim 1).
        target: target index for every row.
        mask: boolean tensor choosing which rows contribute to the loss.

    Returns:
        (loss, n): the mean loss over the selected rows, moved to ``device``,
        and the number of selected rows as a Python number.
    """
    target_probs = inp.gather(1, target.view(-1, 1)).squeeze(1)
    negative_log_probs = -target_probs.log()
    loss = negative_log_probs.masked_select(mask).mean().to(device)
    return loss, mask.sum().item()

In [None]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
    """Run one training iteration on a single batch and update both models.

    Relies on the module-level names ``device``, ``SOS_token``,
    ``teacher_forcing_ratio`` and ``maskNLLLoss``. The ``embedding`` and
    ``max_length`` parameters are accepted but not used in the body.

    Args:
        input_variable, lengths: padded input batch and its sequence lengths.
        target_variable, mask: padded target batch and its padding mask.
        max_target_len: number of decoding steps to run.
        encoder, decoder: the models being trained.
        encoder_optimizer, decoder_optimizer: their optimizers.
        batch_size: number of sequences in the batch.
        clip: maximum gradient norm applied to each model.

    Returns:
        The loss averaged over all non-masked target tokens of the batch.
    """
    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # Lengths for RNN packing should always be on the CPU
    lengths = lengths.to("cpu")

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.full((1, batch_size), SOS_token, dtype=torch.long, device=device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    # Forward batch of sequences through decoder one time step at a time
    for t in range(max_target_len):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )
        if use_teacher_forcing:
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
        else:
            # No teacher forcing: next input is decoder's own current output.
            # topi is (batch, 1); transposing yields the (1, batch) input on
            # the same device without a per-element Python loop.
            _, topi = decoder_output.topk(1)
            decoder_input = topi.t().detach()
        # Calculate and accumulate loss
        mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
        loss += mask_loss
        print_losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [None]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):
    """Run the training loop, printing the average loss and saving checkpoints.

    One batch of ``batch_size`` randomly chosen pairs is pre-built for each of
    the ``n_iteration`` iterations, then ``train`` is called once per batch.

    Args:
        model_name, corpus_name: path components of the checkpoint directory.
        voc: vocabulary object; its ``__dict__`` is stored in every checkpoint.
        pairs: sequence of training pairs sampled with ``random.choice``.
        encoder, decoder, embedding: modules whose ``state_dict()`` is saved.
        encoder_optimizer, decoder_optimizer: optimizers passed to ``train``
            and saved in every checkpoint.
        encoder_n_layers, decoder_n_layers: used only to name the checkpoint
            directory.
        save_dir: root directory for checkpoints.
        n_iteration: last iteration number to run (inclusive).
        batch_size: number of pairs sampled per batch.
        print_every: print the running average loss every this many iterations.
        save_every: write a checkpoint every this many iterations.
        clip: forwarded to ``train``.
        loadFilename: when truthy, training resumes after the iteration stored
            in the notebook-global ``checkpoint``.

    Returns:
        None. Results are the printed progress lines and the checkpoint files.

    Note:
        Reads the notebook globals ``checkpoint`` (only when ``loadFilename``
        is truthy) and ``hidden_size`` (for the checkpoint directory name);
        neither is a parameter, so both must be defined before calling.
    """

    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        # ``checkpoint`` is the dict loaded by the model-building cell
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            # exist_ok avoids the check-then-create race of a separate os.path.exists test
            os.makedirs(directory, exist_ok=True)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [None]:
class GreedySearchDecoder(nn.Module):
    """Greedy decoder: at every step feed back the single most likely token.

    Wraps an encoder and a decoder module. Relies on the notebook globals
    ``device`` and ``SOS_token``.
    """

    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode ``max_length`` tokens for one input sequence.

        Args:
            input_seq: tensor passed straight to the encoder
                (assumed (seq_len, 1), as built by ``evaluate`` -- TODO confirm).
            input_length: lengths tensor passed straight to the encoder.
            max_length: number of decoding steps to run; decoding does not
                stop early.

        Returns:
            (all_tokens, all_scores): 1-D tensors with one chosen token index
            and its decoder score per step.
        """
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder.
        # Use the wrapped decoder, not a same-named notebook global, so the
        # searcher stays correct for whichever decoder it was built with.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [None]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    """Decode a single (already normalized) sentence into a list of words.

    ``encoder`` and ``decoder`` are accepted for interface compatibility but
    are not used here; all model work goes through ``searcher``.

    Args:
        searcher: callable taking (input_batch, lengths, max_length) and
            returning (tokens, scores).
        voc: vocabulary providing ``index2word`` and used by
            ``indexesFromSentence``.
        sentence: input sentence string.
        max_length: number of decoding steps requested from ``searcher``.

    Returns:
        List of words, one per token returned by ``searcher``.
    """
    # words -> indexes (a batch holding exactly one sentence)
    sentence_ids = indexesFromSentence(voc, sentence)
    # Lengths tensor is kept on the CPU
    seq_lengths = torch.tensor([len(sentence_ids)]).to("cpu")
    # Transpose the one-row batch, then move it to the model's device
    model_input = torch.LongTensor([sentence_ids]).transpose(0, 1).to(device)
    # Decode; the per-token scores are not needed here
    token_ids, _ = searcher(model_input, seq_lengths, max_length)
    # indexes -> words
    words = []
    for token_id in token_ids:
        words.append(voc.index2word[token_id.item()])
    return words


def evaluateInput(encoder, decoder, searcher, voc):
    """Interactive chat loop: read a line, decode a reply, print it.

    Typing ``q`` or ``quit`` ends the loop. A ``KeyError`` raised while
    handling a line is reported as an unknown-word error and the loop
    continues.
    """
    skipped_tokens = ('EOS', 'PAD')
    while True:
        try:
            # Get input sentence
            user_line = input('> ')
            # Check if it is quit case
            if user_line in ('q', 'quit'):
                break
            # Normalize, then decode the sentence
            normalized = normalizeString(user_line)
            reply = evaluate(encoder, decoder, searcher, voc, normalized)
            # Drop end-of-sentence and padding markers before printing
            visible = [word for word in reply if word not in skipped_tokens]
            print('Bot:', ' '.join(visible))
        except KeyError:
            print("Error: Encountered unknown word.")

In [None]:
# Configure models
model_name = 'cb_model'
# Attention variant passed to the decoder; alternatives are listed below
attn_model = 'dot'
# attn_model = 'general'
# attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch.
# NOTE: the next cell overwrites ``loadFilename`` with a checkpoint path, so
# skip that cell to actually start from scratch.
loadFilename = None
# Iteration number used to build the checkpoint file name
checkpoint_iter = 4000

In [None]:
# Build the path of the checkpoint to resume from:
#   <save_dir>/<model_name>/<corpus_name>/<enc layers>-<dec layers>_<hidden size>/<iter>_checkpoint.tar
# NOTE: ``save_dir`` is reassigned in the following cell, so this path uses
# whatever value ``save_dir`` held before that cell ran.
loadFilename = os.path.join(save_dir, model_name, corpus_name,
                    '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
                    '{}_checkpoint.tar'.format(checkpoint_iter))

In [None]:
save_dir = "/content/drive/MyDrive/DS301/movie-corpus/save_dir"

# ``loadFilename`` may have been built from an earlier ``save_dir`` value.
# When that file is missing, retry the same checkpoint name under the current
# ``save_dir`` (the layout ``trainIters`` writes to); if it is missing there as
# well, fall back to building fresh models instead of crashing in torch.load.
if loadFilename and not os.path.exists(loadFilename):
    relocated_checkpoint = os.path.join(
        save_dir, model_name, corpus_name,
        '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
        '{}_checkpoint.tar'.format(checkpoint_iter))
    if os.path.exists(relocated_checkpoint):
        loadFilename = relocated_checkpoint
    else:
        print('Checkpoint not found at {}; building models from scratch.'.format(loadFilename))
        loadFilename = None

# Load model if a ``loadFilename`` is provided
if loadFilename:
    # map_location=device lets a checkpoint saved on GPU also load on a CPU-only runtime
    checkpoint = torch.load(loadFilename, map_location=device)
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
num_embeddings = voc.num_words # new code
embedding = nn.Embedding(voc.num_words, hidden_size)

if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models (both share the same embedding module)
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

FileNotFoundError: [Errno 2] No such file or directory: 'data/save/cb_model/movie-corpus/2-2_500/4000_checkpoint.tar'

In [None]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
# Decoder learning rate = learning_rate * decoder_learning_ratio
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Move any restored optimizer state onto the same device as the models.
# Using ``device`` (instead of an unconditional .cuda()) keeps this cell
# runnable on a CPU-only runtime.
for state in encoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.to(device)

for state in decoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.to(device)

# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

In [None]:
# Set dropout layers to ``eval`` mode
encoder.eval()
decoder.eval()

# Initialize search module: wraps the encoder and decoder for greedy decoding
searcher = GreedySearchDecoder(encoder, decoder)

In [None]:
# Begin chatting: starts an interactive prompt; type 'q' or 'quit' to stop
evaluateInput(encoder, decoder, searcher, voc)

At iteration 3998, the chatbot model is 100.0% complete, with the average loss being 2.4532. At iteration 4000, the average loss is 2.8663.

#### 2. 3. W&B random search

In [None]:
!pip install wandb -Uq

In [None]:
import wandb

In [None]:
wandb.login()

In [None]:
# W&B sweep definition: random search that minimizes the logged "loss" metric.
# NOTE(review): the sweep's ``train`` function reads only batch_size,
# fc_layer_size, dropout, optimizer, learning_rate and epochs from the config;
# clip, teacher_forcing_ratio and decoder_learning_ratio are sampled but not
# used by it.
sweep_config = {
    'method': 'random',
    'metric': {
      'name': 'loss',
      'goal': 'minimize'
    },
    'parameters': {
        'learning_rate': {
            'values': [0.0001, 0.00025, 0.0005, 0.001]
        },
        'optimizer': {
            'values': ['adam', 'sgd']
        },
        'clip': {
            'values': [0, 25, 50, 100]
        },
        'teacher_forcing_ratio': {
            'values': [0, 0.5, 1.0]
        },
        'decoder_learning_ratio': {
            'values': [1.0, 3.0, 5.0, 10.0]
        },
        'fc_layer_size': {
            'values': [128, 256, 512]
        },
        'dropout': {
            'values': [0.3, 0.4, 0.5]
        },
        # Quantized log-uniform batch size between 32 and 256 in steps of 8
        'batch_size': {
            'distribution': 'q_log_uniform_values',
            'q': 8,
            'min': 32,
            'max': 256
        },
        # Fixed (not searched): one epoch per run
        'epochs': {
          'value': 1}
    }
}

# Pretty-print the configuration for inspection
import pprint
pprint.pprint(sweep_config)

#### 4. Run the hyperparameter sweeps

In [None]:
import torch
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
from torchvision import datasets, transforms

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train(config=None):
    """Execute one W&B run: build data, model and optimizer, then train.

    Args:
        config: optional initial configuration handed to ``wandb.init``. When
            launched by ``wandb.agent`` the effective values come from the
            sweep controller via ``wandb.config``.

    The average epoch loss is logged under "loss" together with the epoch
    index after every epoch.
    """
    # Initialize a new wandb run
    with wandb.init(config=config):
        # Effective hyperparameters for this run
        hparams = wandb.config

        data_loader = build_dataset(hparams.batch_size)
        model = build_network(hparams.fc_layer_size, hparams.dropout)
        opt = build_optimizer(model, hparams.optimizer, hparams.learning_rate)

        for epoch_idx in range(hparams.epochs):
            epoch_loss = train_epoch(model, data_loader, opt)
            metrics = {"loss": epoch_loss, "epoch": epoch_idx}
            wandb.log(metrics)

In [None]:
def build_dataset(batch_size):
    """Return a DataLoader over every fifth MNIST training image.

    Args:
        batch_size: number of samples per batch.

    The MNIST training set is downloaded into the current directory if needed.
    Images are converted to tensors and normalized with mean 0.1307 and
    std 0.3081.
    """
    normalize = transforms.Normalize((0.1307,), (0.3081,))
    preprocess = transforms.Compose([transforms.ToTensor(), normalize])
    # download MNIST training dataset
    mnist_train = datasets.MNIST(".", train=True, download=True,
                                 transform=preprocess)
    # keep indices 0, 5, 10, ... to shrink the dataset to one fifth
    every_fifth = range(0, len(mnist_train), 5)
    subset = torch.utils.data.Subset(mnist_train, indices=every_fifth)
    return torch.utils.data.DataLoader(subset, batch_size=batch_size)


def build_network(fc_layer_size, dropout):
    """Build a one-hidden-layer classifier and move it to ``device``.

    Args:
        fc_layer_size: width of the hidden fully-connected layer.
        dropout: dropout probability applied after the hidden ReLU.

    Returns:
        ``nn.Sequential`` mapping flattened 784-value inputs to 10
        log-probabilities.
    """
    layers = [
        nn.Flatten(),
        nn.Linear(784, fc_layer_size),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.Linear(fc_layer_size, 10),
        nn.LogSoftmax(dim=1),
    ]
    model = nn.Sequential(*layers)
    return model.to(device)


def build_optimizer(network, optimizer, learning_rate):
    """Create the optimizer named by ``optimizer`` for ``network``.

    Args:
        network: module whose parameters are optimized.
        optimizer: ``"sgd"`` (SGD with momentum 0.9) or ``"adam"``.
        learning_rate: learning rate for the optimizer.

    Returns:
        The constructed ``torch.optim`` optimizer.

    Raises:
        ValueError: if ``optimizer`` is not a supported name. Previously the
            unrecognized string was returned unchanged and only failed later
            when used as an optimizer.
    """
    if optimizer == "sgd":
        return optim.SGD(network.parameters(),
                         lr=learning_rate, momentum=0.9)
    if optimizer == "adam":
        return optim.Adam(network.parameters(),
                          lr=learning_rate)
    raise ValueError(
        "Unsupported optimizer {!r}; expected 'sgd' or 'adam'".format(optimizer))


def train_epoch(network, loader, optimizer):
    """Train ``network`` for one pass over ``loader``.

    Each batch's NLL loss is logged to W&B as "batch loss".

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    running_total = 0
    for data, target in loader:
        data = data.to(device)
        target = target.to(device)
        optimizer.zero_grad()

        # Forward pass
        batch_loss = F.nll_loss(network(data), target)
        batch_value = batch_loss.item()
        running_total += batch_value

        # Backward pass + weight update
        batch_loss.backward()
        optimizer.step()

        wandb.log({"batch loss": batch_value})

    return running_total / len(loader)

In [None]:
# Register the sweep under the "pytorch-sweeps-demo" project
sweep_id = wandb.sweep(sweep_config, project="pytorch-sweeps-demo")
# Launch an agent that calls ``train`` for 5 runs of the sweep
wandb.agent(sweep_id, train, count=5)

#### 5. Best hyperparameters

In [None]:
api = wandb.Api()

entity_name = 'huangytelina'
project_name = 'pytorch-sweeps-demo'

# Get the sweep
sweep = api.sweep(f"{entity_name}/{project_name}/{sweep_id}")

# Find the run with the lowest loss; runs without a logged "loss" rank last
best_run = min(sweep.runs, key=lambda r: r.summary.get('loss', float('inf')))
best_run_id = best_run.id
best_hyperparameters = best_run.config

# Print the best hyperparameters
print(f"Best Run ID: {best_run_id}")
print("Best Hyperparameters:")
for key, value in best_hyperparameters.items():
    print(f"{key}: {value}")

# Download the model artifacts for the best run (if any were logged)
model_artifacts = best_run.logged_artifacts()
model_path = 'path_to_save_model'
model_downloaded = False
for artifact in model_artifacts:
    if artifact.type == 'model':
        artifact.download(root=model_path)
        model_downloaded = True

# Only claim success when something was actually downloaded
if model_downloaded:
    print(f"Model downloaded to {model_path}")
else:
    print("No model artifacts were logged for the best run; nothing was downloaded.")

Hyperparameters such as learning rate, batch size, and the architecture depth are critical for model convergence. The learning rate controls the size of the steps taken during optimization and needs to be balanced to ensure that the model converges without overshooting the minimum loss. Batch size affects the stability and speed of the convergence, while the architecture depth can impact the model's ability to capture complex patterns in the data.

The optimal hyperparameters, as highlighted by W&B feature importance, suggest a carefully tuned balance for convergence. A moderate learning rate (0.001) and batch size (80) ensure steady progress, while dropout (0.3) aids generalization. The fully connected layer size (256) captures complex data patterns effectively. The chosen optimizer, SGD, is simple yet effective, indicating robustness for the dataset used. The best run also sampled teacher forcing (0), a decoder learning ratio (3), and gradient clipping (25); however, the `train` function used for this sweep never reads these three values, so they had no effect on the loss and any importance W&B assigns to them is incidental.

### Problem 3 - Attention in Transformer


1. Encoder input vectors are transformed into Query (Q), Key (K), and Value (V) vectors through trained weight matrices, resulting in three distinct vectors per word.

2. Self-attention scores each pair of words by taking the dot product of the query vector with the key vector, divides the scores by the square root of the key dimension ($\sqrt{d_k}$), and then applies softmax so that the scores become probabilities summing to 1.

3. In multi-headed attention, each head has its own set of weight matrices for the query, key, and value transformations. It uses 8 separate sets of weight matrices for transforming them. With 8 heads, each requiring 3 matrices (Q, K, and V), a total of 8*3=24 matrices are learned. Each head's matrices are sized 512x64: they project the 512-dimensional input vectors down to 64-dimensional query, key, and value vectors (64 = 512/8).

4. To create a single matrix input for the feed-forward layer from multiple attention heads, the output vectors from all heads are concatenated. This concatenated output is then multiplied by a specifically trained weight matrix called the output linear layer, resulting in a single, unified vector per word. For 8 heads, each producing 64-dimensional vectors, the concatenated output is 8x64 = 512-dimensional. It is then transformed by the output linear layer, which has a dimension of 512x512, to produce an output vector of size 512 for each word, compatible with the feed-forward layer’s expected input size.

### Problem 4 - Using BERT for Question Answering

In [None]:
# Install the transformers library that will be used for BERT models.
!pip install transformers

#### 1. BertTokenizer

We will use the BertForQuestionAnswering model and the BertTokenizer as our tokenizer.

In [None]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

# Name of the SQuAD-fine-tuned BERT checkpoint shared by the model and its tokenizer
BERT_QA_CHECKPOINT = 'bert-large-uncased-whole-word-masking-finetuned-squad'

# Get the pretrained question-answering model
model = BertForQuestionAnswering.from_pretrained(BERT_QA_CHECKPOINT)

# Similarly, get the tokenizer for the same pretrained checkpoint
tokenizer = BertTokenizer.from_pretrained(BERT_QA_CHECKPOINT)


In [None]:
# Question to answer
question = "What was BERT trained on?"

# Context passage the answer span is extracted from
paragraph = "BERT stands for Bidirectional Encoder Representation of Transformer. I feel that its name itself is descriptive enough to get the gist. Still, to understand it better, it’s encoder part of the encoder-decoder transformer model, it’s also bidirectional in nature, which means that for any input it’s able to learn dependencies from both left and right of any word. It was trained on Wikipedia text and BooksCorpus and open-sourced back in 2018 by Google. You can find the official repository and paper at Github: BERT and BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. There are two models introduced in the paper. BERT base — 12 layers (transformer blocks), 110 million parameters. BERT Large — 24 layers, 340 million parameters. Later google also released Multi-lingual BERT to accelerate the research"

#### 2. encode_plus

In [None]:
# Encode the question and the paragraph together as one text pair, with special tokens added
encoding = tokenizer.encode_plus(text=question, text_pair=paragraph, add_special_tokens=True)

#### 3. Keys

In [None]:
# Show which fields the encoding contains ('input_ids' and 'token_type_ids' are read in the next cell)
print(encoding.keys())

In [None]:
inputs = encoding['input_ids']  # Token ids (vocabulary indices, not embeddings)
sentence_embedding = encoding['token_type_ids']  # Segment ids (presumably 0 = question, 1 = paragraph -- confirm)

# We convert the input ids to tokens
tokens = tokenizer.convert_ids_to_tokens(inputs)  # input tokens

# Run the model on a batch of one sequence; the output carries ``start_logits``
# and ``end_logits``, which the next cell uses to pick the answer span.
scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))
print(scores)

#### 4. Index 5. Print output

In [None]:
import torch

# Positions of the highest start and end logits; the batch holds a single
# sequence, so the flattened argmax is the token position.
start_index = torch.argmax(scores.start_logits)
end_index = torch.argmax(scores.end_logits)

if end_index >= start_index:
    # Convert tokens to string from start to end index (inclusive).
    # Tokens are joined as-is, so WordPiece pieces keep their "##" prefix.
    answer = " ".join(tokens[start_index:end_index + 1])
    print(answer)
else:
    # The predicted end precedes the predicted start: no valid span
    print("I am unable to find the answer to this question. Can you please ask another question?")

#### 5. Unusual tokens
The output looks unusual in the tokens "##corp" and "##us" because the tokens from the BERT tokenizer include special characters to indicate subword units. BERT uses a WordPiece tokenization method that breaks down words into smaller pieces so that the model can deal with a wide range of vocabulary with a limited set of learned embeddings.

### Problem 5 - Hyperparameter Optimization using H2O

#### 1. Grid search

In [None]:
!pip install h2o

Collecting h2o
  Downloading h2o-3.46.0.1-py2.py3-none-any.whl (265.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.6/265.6 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h2o
Successfully installed h2o-3.46.0.1


In [None]:
import h2o
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators import H2ORandomForestEstimator

# Initialize H2O (connects to a running local instance or starts one)
h2o.init()

# Load the airlines sample dataset
airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")

# Define the hyperparameters to search over (4 x 4 = 16 combinations)
hyper_parameters = {'ntrees': [10, 30, 50, 100], 'max_depth': [1, 2, 4, 6]}

# Initialize the Grid Search with Random Forest estimator
grid_search = H2OGridSearch(model=H2ORandomForestEstimator(seed=1234), hyper_params=hyper_parameters)

# set the predictor names and the response column name
predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
              "DayOfWeek", "Month", "Distance", "FlightNum"]
response = "IsDepDelayed"

# split into train (80%) and validation (20%) sets
# NOTE: this rebinds the name ``train``, which earlier cells used for training functions
train, valid = airlines.split_frame(ratios = [.8], seed = 1234)

# Train the models with the grid search, scoring each one on the validation frame
grid_search.train(x=predictors, y=response, training_frame=train, validation_frame=valid)

# Get the grid search results, sorted by accuracy in a decreasing order
sorted_grid = grid_search.get_grid(sort_by='accuracy', decreasing=True)
print(sorted_grid)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.22" 2024-01-16; OpenJDK Runtime Environment (build 11.0.22+7-post-Ubuntu-0ubuntu222.04.1); OpenJDK 64-Bit Server VM (build 11.0.22+7-post-Ubuntu-0ubuntu222.04.1, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpv9wz9yu3
  JVM stdout: /tmp/tmpv9wz9yu3/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpv9wz9yu3/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,26 days
H2O_cluster_name:,H2O_from_python_unknownUser_99ggro
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,12.75 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf Grid Build progress: |███████████████████████████████████████████████████████| (done) 100%
Hyper-Parameter Search Summary: ordered by decreasing accuracy
    max_depth    ntrees    model_ids                                                     accuracy
--  -----------  --------  ------------------------------------------------------------  ----------
    6            30        Grid_DRF_py_2_sid_8a8d_model_python_1712601730295_1_model_8   0.663
    6            50        Grid_DRF_py_2_sid_8a8d_model_python_1712601730295_1_model_12  0.662771
    6            100       Grid_DRF_py_2_sid_8a8d_model_python_1712601730295_1_model_16  0.661969
    6            10        Grid_DRF_py_2_sid_8a8d_model_python_1712601730295_1_model_4   0.660823
    4            50        Grid_DRF_py_2_sid_8a8d_model_python_1712601730295_1_model_11  0.65051
    4            100       Grid_DRF_py_2_sid_8a8d_model_python_

In [None]:
# Identify the best model (first entry of the accuracy-sorted grid) and
# evaluate its performance on the held-out ``valid`` frame
best_model = sorted_grid.models[0]
best_model_perf = best_model.model_performance(valid)
# Report the winner from the model itself so the message cannot go stale on a rerun
best_model_params = best_model.actual_params
print(f"The best model is {best_model.model_id} with max_depth {best_model_params['max_depth']} and {best_model_params['ntrees']} trees")
print(f"AUC Score on validation set: {best_model_perf.auc()}")

The best model is model_8 with max_depth 6 and 30 trees
AUC Score on validation set: 0.7166542527113227


#### 2. Randomized grid search

In [None]:
# Specify search criteria for randomized grid search:
# sample at most 10 of the hyperparameter combinations, reproducibly
search_criteria = {"strategy": "RandomDiscrete", "max_models": 10, 'seed': 42}

train, valid = airlines.split_frame(ratios=[.8], seed=1234)

# Initialize the Randomized Grid Search with Random Forest estimator
grid_search = H2OGridSearch(model=H2ORandomForestEstimator(seed=1234),
                            hyper_params=hyper_parameters,
                            search_criteria=search_criteria)

# Train the models with the randomized grid search.
# Pass the validation frame, as the exhaustive grid search does, so the
# accuracy used for ranking is measured on held-out data rather than on the
# training data.
grid_search.train(x=predictors, y=response, training_frame=train, validation_frame=valid)

# Get the grid search results, sorted by accuracy in a decreasing order
sorted_grid = grid_search.get_grid(sort_by='accuracy', decreasing=True)
print(sorted_grid)

drf Grid Build progress: |███████████████████████████████████████████████████████| (done) 100%
Hyper-Parameter Search Summary: ordered by decreasing accuracy
    max_depth    ntrees    model_ids                                                        accuracy
--  -----------  --------  ---------------------------------------------------------------  ----------
    6            100       Grid_DRF_py_5_sid_8a8d_model_python_1712601730295_1115_model_6   0.671016
    6            30        Grid_DRF_py_5_sid_8a8d_model_python_1712601730295_1115_model_4   0.667697
    4            100       Grid_DRF_py_5_sid_8a8d_model_python_1712601730295_1115_model_8   0.659357
    6            10        Grid_DRF_py_5_sid_8a8d_model_python_1712601730295_1115_model_7   0.658113
    4            30        Grid_DRF_py_5_sid_8a8d_model_python_1712601730295_1115_model_10  0.653258
    4            10        Grid_DRF_py_5_sid_8a8d_model_python_1712601730295_1115_model_1   0.638525
    2            100       Grid_

In [None]:
# Identify the best model (first entry of the accuracy-sorted grid) and
# evaluate its performance on the held-out ``valid`` frame
best_model = sorted_grid.models[0]
best_model_perf = best_model.model_performance(valid)
# Report the winner from the model itself so the message cannot go stale on a rerun
best_model_params = best_model.actual_params
print(f"The best model is {best_model.model_id} with max_depth {best_model_params['max_depth']} and {best_model_params['ntrees']} trees")
print(f"AUC Score on validation set: {best_model_perf.auc()}")

The best model is model_6 with max_depth 6 and 100 trees
AUC Score on validation set: 0.7188120050200918


#### 3. H2O AutoML

In [None]:
# Repeat the preparation process so this section can run on its own
# Initialize H2O (reuses the already running instance if there is one)
h2o.init()

airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")

# Define the hyperparameters to search over
hyper_parameters = {'ntrees': [10, 30, 50, 100], 'max_depth': [1, 2, 4, 6]}

# Initialize the Grid Search with Random Forest estimator
# NOTE(review): this grid object is not referenced by the AutoML cells below
grid_search = H2OGridSearch(model=H2ORandomForestEstimator(seed=1234), hyper_params=hyper_parameters)

# set the predictor names and the response column name
predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
              "DayOfWeek", "Month", "Distance", "FlightNum"]
response = "IsDepDelayed"

# split into train (80%) and validation (20%) sets
train, valid = airlines.split_frame(ratios = [.8], seed = 1234)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 min 25 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,26 days
H2O_cluster_name:,H2O_from_python_unknownUser_99ggro
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,12.74 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [None]:
from h2o.automl import H2OAutoML

# Run AutoML capped at 10 base models and 1000 seconds of runtime
aml = H2OAutoML(max_models=10, seed=1, max_runtime_secs=1000) # Runtime cap of 1000 secs to bound running time
aml.train(x=predictors, y=response, training_frame=train)

# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_AllModels_1_AutoML_1_20240408_184340,0.751711,0.589492,0.764765,0.355465,0.449521,0.202069
StackedEnsemble_BestOfFamily_1_AutoML_1_20240408_184340,0.750975,0.590115,0.764132,0.339008,0.449827,0.202345
GBM_1_AutoML_1_20240408_184340,0.746962,0.593601,0.759656,0.353689,0.451444,0.203801
GBM_4_AutoML_1_20240408_184340,0.745632,0.595072,0.756256,0.353863,0.452035,0.204336
XRT_1_AutoML_1_20240408_184340,0.743993,0.597298,0.754893,0.36765,0.453099,0.205298
XGBoost_2_AutoML_1_20240408_184340,0.743973,0.596507,0.757167,0.364245,0.452756,0.204988
XGBoost_1_AutoML_1_20240408_184340,0.743243,0.598759,0.756401,0.35556,0.453596,0.205749
GBM_3_AutoML_1_20240408_184340,0.742187,0.598218,0.751502,0.364613,0.453418,0.205588
GBM_2_AutoML_1_20240408_184340,0.741271,0.599021,0.751022,0.351863,0.453806,0.20594
XGBoost_3_AutoML_1_20240408_184340,0.736816,0.602151,0.750317,0.361275,0.455632,0.207601


In [None]:
# Identify the best performing model (the AutoML leader) and print its parameters
print(aml.leader)
print("Parameters:", aml.leader.params)

Model Details
H2OStackedEnsembleEstimator : Stacked Ensemble
Model Key: StackedEnsemble_AllModels_1_AutoML_1_20240408_184340


Model Summary for Stacked Ensemble: 
key                                   value
------------------------------------  ----------------
Stacking strategy                     cross_validation
Number of base models (used / total)  7/10
# GBM base models (used / total)      2/4
# XGBoost base models (used / total)  3/3
# DRF base models (used / total)      2/2
# GLM base models (used / total)      0/1
Metalearner algorithm                 GLM
Metalearner fold assignment scheme    Random
Metalearner nfolds                    5
Metalearner fold_column
Custom metalearner hyperparameters    None

ModelMetricsBinomialGLM: stackedensemble
** Reported on train data. **

MSE: 0.17746762744143907
RMSE: 0.42126906774820183
LogLoss: 0.5319711860278803
AUC: 0.8185480248240496
AUCPR: 0.8360351136154335
Gini: 0.6370960496480993
Null degrees of freedom: 10066
Residual degrees of

In [None]:
# Display the AUC score of the best model on held-out data.
# ``valid`` is the 20% split; it was not passed to ``aml.train``, so it serves as the test set here.
leader_perf = aml.leader.model_performance(valid)
print(f"AUC Score of the best model on the test set: {leader_perf.auc()}")

AUC Score of the best model on the test set: 0.7543269369063046


In [None]:
# Identify the best XGBoost model using logloss

# Get the full leaderboard sorted by log loss
import pandas as pd
# Convert leaderboard to a Pandas DataFrame
lb_df = lb.as_data_frame()
# Sort the DataFrame by log loss (lower is better)
lb_df_sorted = lb_df.sort_values(by='logloss', ascending=True)
# Display the sorted DataFrame
print(lb_df_sorted)

# Keep only the XGBoost rows; the first one has the lowest log loss.
# Selecting it in code avoids copying a model id by hand from the table.
xgb_df_sorted = lb_df_sorted[lb_df_sorted['model_id'].str.startswith('XGBoost')]
if xgb_df_sorted.empty:
    print("No XGBoost models found on the leaderboard.")
else:
    best_xgb = xgb_df_sorted.iloc[0]
    print(f"Best XGBoost model by logloss: {best_xgb['model_id']} "
          f"(logloss={best_xgb['logloss']}, auc={best_xgb['auc']})")

                                             model_id       auc   logloss  \
0   StackedEnsemble_AllModels_1_AutoML_1_20240408_...  0.751711  0.589492   
1   StackedEnsemble_BestOfFamily_1_AutoML_1_202404...  0.750975  0.590115   
2                      GBM_1_AutoML_1_20240408_184340  0.746962  0.593601   
3                      GBM_4_AutoML_1_20240408_184340  0.745632  0.595072   
5                  XGBoost_2_AutoML_1_20240408_184340  0.743973  0.596507   
4                      XRT_1_AutoML_1_20240408_184340  0.743993  0.597298   
7                      GBM_3_AutoML_1_20240408_184340  0.742187  0.598218   
6                  XGBoost_1_AutoML_1_20240408_184340  0.743243  0.598759   
8                      GBM_2_AutoML_1_20240408_184340  0.741271  0.599021   
9                  XGBoost_3_AutoML_1_20240408_184340  0.736816  0.602151   
10                     DRF_1_AutoML_1_20240408_184340  0.730975  0.621951   
11                     GLM_1_AutoML_1_20240408_184340  0.689424  0.636708   


with h2o.utils.threading.local_context(polars_enabled=True, datatable_enabled=True):
    pandas_df = h2o_df.as_data_frame()



By log loss, the best XGBoost model is XGBoost_2_AutoML_1_20240408_184340, with AUC 0.743973 and log loss 0.596507.