In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers

In [None]:
# !pip install datasets

In [None]:
# !pip install tensorflow scikit-learn datasets


C-LSTM BINARY CLASSIFICATION ON THE IMDB DATASET

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Fetch the "imdb" dataset through the Hugging Face `datasets` loader
dataset = load_dataset("imdb")

# Pull the review texts and their labels out of the train and test splits
train_sentences, train_labels = dataset['train']['text'], dataset['train']['label']
test_sentences, test_labels = dataset['test']['text'], dataset['test']['label']

# Preprocessing hyper-parameters: vocabulary cap and padded sequence length
VOCAB_SIZE, MAX_LEN = 10000, 250

# The word index is built from the training texts only
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_sentences)

# Encode every text as a sequence of word ids, then pad to MAX_LEN
train_sequences = pad_sequences(
    tokenizer.texts_to_sequences(train_sentences),
    maxlen=MAX_LEN,
)
test_sequences = pad_sequences(
    tokenizer.texts_to_sequences(test_sentences),
    maxlen=MAX_LEN,
)
# Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim):
    """Read a GloVe text file into a ``{token: vector}`` dictionary.

    Every non-blank line must hold a token followed by ``embedding_dim``
    space-separated floats. The vector is taken from the *last*
    ``embedding_dim`` fields, so a token that itself contains spaces stays
    intact instead of breaking the float conversion. Blank lines are skipped.

    Args:
        glove_file_path: Path to the UTF-8 encoded GloVe text file.
        embedding_dim: Number of float components stored per token.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array of
        length ``embedding_dim``.

    Raises:
        ValueError: If a non-blank line has fewer than ``embedding_dim + 1``
            fields, or a vector component cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.rstrip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Split from the right: only the last `embedding_dim` fields are numbers.
            fields = line.rsplit(' ', embedding_dim)
            if len(fields) != embedding_dim + 1:
                raise ValueError(
                    f"{glove_file_path}: line {line_no} has {len(fields) - 1} "
                    f"vector components, expected {embedding_dim}"
                )
            embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return embeddings_index

# Create embedding matrix
def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    """Assemble the initial embedding weights for the tokenizer vocabulary.

    Args:
        word_index: Mapping of word -> integer row index.
        glove_embeddings: Mapping of word -> pretrained vector.
        vocab_size: Number of rows in the result; words whose index is
            ``>= vocab_size`` are ignored.
        embedding_dim: Number of columns in the result.

    Returns:
        A ``(vocab_size, embedding_dim)`` array. Rows of words without a
        pretrained vector (and rows no word maps to) stay all-zero.
    """
    matrix = np.zeros((vocab_size, embedding_dim))
    for token, row in word_index.items():
        if row >= vocab_size:
            continue  # outside the retained vocabulary
        pretrained = glove_embeddings.get(token)
        if pretrained is None:
            continue  # no pretrained vector: keep the zero row
        matrix[row] = pretrained
    return matrix

# Location of the pretrained GloVe vectors (adjust to your environment)
glove_file_path = "/content/drive/MyDrive/glove/glove.6B.300d.txt"

# Parse the vectors, then line them up with the tokenizer's word ids
glove_embeddings = load_glove_embeddings(glove_file_path, 300)
embedding_matrix = create_embedding_matrix(
    word_index=tokenizer.word_index,
    glove_embeddings=glove_embeddings,
    vocab_size=VOCAB_SIZE,
    embedding_dim=300,
)

# Configuration for binary classification
class Config:
    """Hyper-parameter container for the C-LSTM classifier.

    Args:
        max_length: Length of the padded input sequences.
        vocab_size: Number of rows of the embedding table.
        embedding_size: Dimensionality of each word vector.
        l2_reg_lambda: L2 regularisation factor for the output layer.
        keep_prob: Dropout setting consumed by the model's dropout layers.
        num_filters: Number of convolution filters.
        hidden_size: Number of LSTM units.
        num_classes: Number of output classes. Defaults to 2 (binary
            sentiment), so existing callers are unaffected.
    """

    def __init__(self, max_length, vocab_size, embedding_size=300, l2_reg_lambda=0.001, keep_prob=0.5, num_filters=150, hidden_size=150, num_classes=2):
        self.max_length = max_length
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.l2_reg_lambda = l2_reg_lambda
        self.keep_prob = keep_prob
        self.num_filters = num_filters
        self.hidden_size = hidden_size
        self.num_classes = num_classes  # 2 by default: binary classification

# Binary-task hyper-parameters; everything not listed keeps its default
config = Config(MAX_LEN, VOCAB_SIZE)

# Define the C-LSTM model for binary classification
class CLSTMBinaryClassifierIMDB(tf.keras.Model):
    """C-LSTM text classifier: embedding -> convolution -> LSTM -> softmax.

    Args:
        config: Object exposing ``vocab_size``, ``embedding_size``,
            ``keep_prob``, ``num_filters``, ``hidden_size``, ``num_classes``
            and ``l2_reg_lambda``.
        embedding_matrix: Initial embedding weights of shape
            ``(config.vocab_size, config.embedding_size)``; they remain
            trainable.
    """

    def __init__(self, config, embedding_matrix):
        super().__init__()

        # `keep_prob` is the probability of KEEPING a unit, while Keras'
        # Dropout takes the probability of DROPPING one. The two coincide only
        # at 0.5, so convert explicitly.
        drop_rate = 1.0 - config.keep_prob

        # Seed the table through a Constant initializer rather than the
        # `weights=` constructor kwarg. The sequence length is inferred from
        # the input, so the deprecated `input_length` argument is not passed.
        self.embedding = layers.Embedding(
            input_dim=config.vocab_size,
            output_dim=config.embedding_size,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=True)
        self.embedding_dropout = layers.Dropout(rate=drop_rate)

        # Each filter spans 3 consecutive tokens across the full embedding width
        self.conv_layer = layers.Conv2D(filters=config.num_filters,
                                        kernel_size=(3, config.embedding_size),
                                        activation='relu', padding='valid')
        self.batch_norm = layers.BatchNormalization()

        # Only the final LSTM state is kept and fed to the classifier head
        self.lstm = layers.LSTM(config.hidden_size, return_sequences=False)
        self.dropout = layers.Dropout(rate=drop_rate)

        # Softmax head with L2-regularised kernel
        self.fc_binary = layers.Dense(config.num_classes, activation='softmax',
                                      kernel_regularizer=tf.keras.regularizers.L2(config.l2_reg_lambda))

    def call(self, inputs, training=False):
        """Map a batch of padded word-id sequences to class probabilities.

        Args:
            inputs: Integer tensor of word ids, one padded sequence per row.
            training: Enables dropout and batch-statistics updates when True.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with softmax probabilities.
        """
        x = self.embedding(inputs)
        x = self.embedding_dropout(x, training=training)
        # Append a channel axis so the embedded text can be fed to Conv2D
        x = tf.expand_dims(x, -1)

        conv_out = self.conv_layer(x)
        conv_out = self.batch_norm(conv_out, training=training)
        # The kernel covers the whole embedding width, so that axis has size 1
        # after the 'valid' convolution; remove it to get (batch, steps, filters)
        conv_out = tf.squeeze(conv_out, 2)

        rnn_outputs = self.lstm(conv_out, training=training)
        rnn_outputs = self.dropout(rnn_outputs, training=training)

        binary_output = self.fc_binary(rnn_outputs)

        return binary_output

# Training function
def compile_and_train_model(config, embedding_matrix, train_sequences, train_labels, test_sequences, test_labels, batch_size=64, epochs=10):
    """Build, compile and fit the binary C-LSTM classifier.

    The test split is passed to ``fit`` as validation data. The object
    returned by ``fit`` is not kept; only the trained model is returned.

    Args:
        config: Hyper-parameters handed to the model constructor.
        embedding_matrix: Initial embedding weights.
        train_sequences: Padded training inputs.
        train_labels: Integer class labels for ``train_sequences``.
        test_sequences: Padded inputs used for validation.
        test_labels: Integer class labels for ``test_sequences``.
        batch_size: Mini-batch size.
        epochs: Number of passes over the training data.

    Returns:
        The fitted ``CLSTMBinaryClassifierIMDB`` instance.
    """
    classifier = CLSTMBinaryClassifierIMDB(config, embedding_matrix=embedding_matrix)
    classifier.compile(
        optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-4),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    # Labels may arrive as plain sequences; hand Keras NumPy arrays
    y_train = np.array(train_labels)
    y_val = np.array(test_labels)

    classifier.fit(
        train_sequences,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(test_sequences, y_val),
        verbose=1,
    )
    return classifier

# Fit the binary C-LSTM on the prepared sequences
model = compile_and_train_model(
    config=config,
    embedding_matrix=embedding_matrix,
    train_sequences=train_sequences,
    train_labels=train_labels,
    test_sequences=test_sequences,
    test_labels=test_labels,
)

# Report loss and accuracy on the test split
test_loss, test_acc = model.evaluate(x=test_sequences, y=np.array(test_labels))
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Epoch 1/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 40ms/step - accuracy: 0.6346 - loss: 0.6354 - val_accuracy: 0.6104 - val_loss: 0.9065
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 36ms/step - accuracy: 0.8157 - loss: 0.4294 - val_accuracy: 0.7950 - val_loss: 0.4324
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 37ms/step - accuracy: 0.8429 - loss: 0.3775 - val_accuracy: 0.8686 - val_loss: 0.3214
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 37ms/step - accuracy: 0.8627 - loss: 0.3285 - val_accuracy: 0.8820 - val_loss: 0.2910
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 37ms/step - accuracy: 0.8846 - loss: 0.2872 - val_accuracy: 0.8142 - val_loss: 0.4150
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 37ms/step - accuracy: 0.8896 - loss: 0.2753 - val_accuracy: 0.8562 - val_loss: 0.3605
Epoch 7/10
[1m3

C-LSTM FINE-GRAINED CLASSIFICATION ON THE IMDB DATASET

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from sklearn.preprocessing import KBinsDiscretizer
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Fetch the "imdb" dataset
dataset = load_dataset("imdb")

# Review texts and their binary labels for the train and test splits
train_sentences, train_binary_labels = dataset['train']['text'], dataset['train']['label']
test_sentences, test_binary_labels = dataset['test']['text'], dataset['test']['label']

# Convert binary labels to simulated "fine-grained" labels (5 classes by default).
# NOTE: binary inputs can only land in the first and the last bin, so the
# resulting task still has just two populated classes (0 and n_classes - 1).
def convert_to_fine_grained_labels(binary_labels, n_classes=5):
    """Bin labels from the interval [0, 1] into ``n_classes`` equal-width classes.

    The mapping is fixed: bin ``k`` covers ``[k / n_classes, (k + 1) / n_classes)``
    and the value 1 falls into the last bin. It does not depend on which values
    are present in ``binary_labels``, so separate calls for the train and test
    splits always use the same label encoding, even if a split contains a
    single class.

    Args:
        binary_labels: Sequence of labels with values in [0, 1].
        n_classes: Number of target classes (must be >= 1).

    Returns:
        1-D integer NumPy array with values in ``0 .. n_classes - 1``; empty
        input yields an empty array.

    Raises:
        ValueError: If ``n_classes < 1`` or a label lies outside [0, 1]
            (NaN included).
    """
    if n_classes < 1:
        raise ValueError(f"n_classes must be >= 1, got {n_classes}")

    labels = np.asarray(binary_labels, dtype=float).reshape(-1)
    # The `all(...)` form also rejects NaN, which fails both comparisons
    if not np.all((labels >= 0.0) & (labels <= 1.0)):
        raise ValueError("binary_labels must contain values in [0, 1]")

    # Equal-width binning on [0, 1]; the upper edge (1.0) belongs to the last bin
    bins = np.minimum(np.floor(labels * n_classes), n_classes - 1)
    return bins.astype(int)

# Derive the multi-class targets for both splits (train first, then test)
train_fine_labels, test_fine_labels = (
    convert_to_fine_grained_labels(split_labels)
    for split_labels in (train_binary_labels, test_binary_labels)
)

# Preprocessing hyper-parameters: vocabulary cap and padded sequence length
VOCAB_SIZE, MAX_LEN = 10000, 250

# The word index is built from the training texts only
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_sentences)

# Encode every text as a sequence of word ids, then pad to MAX_LEN
train_sequences = pad_sequences(
    tokenizer.texts_to_sequences(train_sentences),
    maxlen=MAX_LEN,
)
test_sequences = pad_sequences(
    tokenizer.texts_to_sequences(test_sentences),
    maxlen=MAX_LEN,
)

# Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim):
    """Read a GloVe text file into a ``{token: vector}`` dictionary.

    Every non-blank line must hold a token followed by ``embedding_dim``
    space-separated floats. The vector is taken from the *last*
    ``embedding_dim`` fields, so a token that itself contains spaces stays
    intact instead of breaking the float conversion. Blank lines are skipped.

    Args:
        glove_file_path: Path to the UTF-8 encoded GloVe text file.
        embedding_dim: Number of float components stored per token.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array of
        length ``embedding_dim``.

    Raises:
        ValueError: If a non-blank line has fewer than ``embedding_dim + 1``
            fields, or a vector component cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.rstrip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Split from the right: only the last `embedding_dim` fields are numbers.
            fields = line.rsplit(' ', embedding_dim)
            if len(fields) != embedding_dim + 1:
                raise ValueError(
                    f"{glove_file_path}: line {line_no} has {len(fields) - 1} "
                    f"vector components, expected {embedding_dim}"
                )
            embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return embeddings_index

# Create embedding matrix
def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    """Build the ``(vocab_size, embedding_dim)`` initial embedding table.

    Row ``i`` receives the pretrained vector of the word that ``word_index``
    maps to ``i``. Words with an index ``>= vocab_size`` are skipped, and
    rows without a pretrained vector remain zero.

    Args:
        word_index: Mapping of word -> integer row index.
        glove_embeddings: Mapping of word -> pretrained vector.
        vocab_size: Number of rows of the returned array.
        embedding_dim: Number of columns of the returned array.

    Returns:
        NumPy array of shape ``(vocab_size, embedding_dim)``.
    """
    table = np.zeros((vocab_size, embedding_dim))
    for term, idx in word_index.items():
        if idx >= vocab_size:
            continue
        vec = glove_embeddings.get(term)
        if vec is None:
            continue
        table[idx] = vec
    return table

# Where the pretrained GloVe vectors are stored
glove_file_path = "/content/drive/MyDrive/glove/glove.6B.300d.txt"

# Read the vectors and arrange them by tokenizer word id
glove_embeddings = load_glove_embeddings(glove_file_path, 300)
embedding_matrix = create_embedding_matrix(
    word_index=tokenizer.word_index,
    glove_embeddings=glove_embeddings,
    vocab_size=VOCAB_SIZE,
    embedding_dim=300,
)

# Configuration for fine-grained classification
class Config:
    """Plain holder for the hyper-parameters of the fine-grained C-LSTM model.

    Every constructor argument is stored unchanged as an attribute of the
    same name.
    """

    def __init__(self, max_length, vocab_size, embedding_size=300, l2_reg_lambda=0.001, keep_prob=0.5, num_filters=150, hidden_size=150, num_classes=5):
        # Input / embedding geometry
        self.max_length, self.vocab_size = max_length, vocab_size
        self.embedding_size = embedding_size
        # Regularisation settings
        self.l2_reg_lambda, self.keep_prob = l2_reg_lambda, keep_prob
        # Layer widths
        self.num_filters, self.hidden_size = num_filters, hidden_size
        # Number of output classes (5 by default for fine-grained sentiment)
        self.num_classes = num_classes

# Fine-grained task: five output classes, remaining settings at their defaults
config = Config(MAX_LEN, VOCAB_SIZE, num_classes=5)

# Define the C-LSTM model for fine-grained classification
class CLSTMFineGrainedClassifierIMDB(tf.keras.Model):
    """C-LSTM text classifier: embedding -> convolution -> LSTM -> softmax.

    Args:
        config: Object exposing ``vocab_size``, ``embedding_size``,
            ``keep_prob``, ``num_filters``, ``hidden_size``, ``num_classes``
            and ``l2_reg_lambda``.
        embedding_matrix: Initial embedding weights of shape
            ``(config.vocab_size, config.embedding_size)``; they remain
            trainable.
    """

    def __init__(self, config, embedding_matrix):
        super().__init__()

        # `keep_prob` is the probability of KEEPING a unit, while Keras'
        # Dropout takes the probability of DROPPING one. The two coincide only
        # at 0.5, so convert explicitly.
        drop_rate = 1.0 - config.keep_prob

        # Seed the table through a Constant initializer rather than the
        # `weights=` constructor kwarg. The sequence length is inferred from
        # the input, so the deprecated `input_length` argument is not passed.
        self.embedding = layers.Embedding(
            input_dim=config.vocab_size,
            output_dim=config.embedding_size,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=True)
        self.embedding_dropout = layers.Dropout(rate=drop_rate)
        # Each filter spans 3 consecutive tokens across the full embedding width
        self.conv_layer = layers.Conv2D(filters=config.num_filters,
                                        kernel_size=(3, config.embedding_size),
                                        activation='relu', padding='valid')
        self.batch_norm = layers.BatchNormalization()
        # Only the final LSTM state is kept and fed to the classifier head
        self.lstm = layers.LSTM(config.hidden_size, return_sequences=False)
        self.dropout = layers.Dropout(rate=drop_rate)
        # Softmax head with L2-regularised kernel
        self.fc_fine = layers.Dense(config.num_classes, activation='softmax',
                                    kernel_regularizer=tf.keras.regularizers.L2(config.l2_reg_lambda))

    def call(self, inputs, training=False):
        """Map a batch of padded word-id sequences to class probabilities.

        Args:
            inputs: Integer tensor of word ids, one padded sequence per row.
            training: Enables dropout and batch-statistics updates when True.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with softmax probabilities.
        """
        x = self.embedding(inputs)
        x = self.embedding_dropout(x, training=training)
        # Append a channel axis so the embedded text can be fed to Conv2D
        x = tf.expand_dims(x, -1)
        conv_out = self.conv_layer(x)
        conv_out = self.batch_norm(conv_out, training=training)
        # The kernel covers the whole embedding width, so that axis has size 1
        # after the 'valid' convolution; remove it to get (batch, steps, filters)
        conv_out = tf.squeeze(conv_out, 2)
        rnn_outputs = self.lstm(conv_out, training=training)
        rnn_outputs = self.dropout(rnn_outputs, training=training)
        fine_output = self.fc_fine(rnn_outputs)
        return fine_output

# Training function
def compile_and_train_model(config, embedding_matrix, train_sequences, train_labels, test_sequences, test_labels, batch_size=64, epochs=10):
    """Create the fine-grained C-LSTM, compile it and run the training loop.

    Validation during ``fit`` uses the supplied test split. The value
    returned by ``fit`` is discarded; callers get the trained model back.

    Args:
        config: Hyper-parameters handed to the model constructor.
        embedding_matrix: Initial embedding weights.
        train_sequences: Padded training inputs.
        train_labels: Integer class labels for ``train_sequences``.
        test_sequences: Padded inputs used for validation.
        test_labels: Integer class labels for ``test_sequences``.
        batch_size: Mini-batch size.
        epochs: Number of passes over the training data.

    Returns:
        The fitted ``CLSTMFineGrainedClassifierIMDB`` instance.
    """
    # Keras is fed NumPy label arrays regardless of the incoming container
    y_train, y_eval = np.array(train_labels), np.array(test_labels)

    net = CLSTMFineGrainedClassifierIMDB(config, embedding_matrix=embedding_matrix)
    net.compile(
        optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-4),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    net.fit(
        train_sequences,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(test_sequences, y_eval),
        verbose=1,
    )
    return net

# Fit the fine-grained C-LSTM on the prepared sequences
model = compile_and_train_model(
    config=config,
    embedding_matrix=embedding_matrix,
    train_sequences=train_sequences,
    train_labels=train_fine_labels,
    test_sequences=test_sequences,
    test_labels=test_fine_labels,
)

# Report loss and accuracy on the test split
test_loss, test_acc = model.evaluate(x=test_sequences, y=np.array(test_fine_labels))
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')

Epoch 1/10




[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 37ms/step - accuracy: 0.6178 - loss: 0.7645 - val_accuracy: 0.7258 - val_loss: 0.5497
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 36ms/step - accuracy: 0.7936 - loss: 0.4774 - val_accuracy: 0.8398 - val_loss: 0.3735
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 37ms/step - accuracy: 0.8262 - loss: 0.4120 - val_accuracy: 0.8600 - val_loss: 0.3455
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 37ms/step - accuracy: 0.8469 - loss: 0.3673 - val_accuracy: 0.8785 - val_loss: 0.3051
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 37ms/step - accuracy: 0.8638 - loss: 0.3303 - val_accuracy: 0.8744 - val_loss: 0.2998
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 37ms/step - accuracy: 0.8757 - loss: 0.3073 - val_accuracy: 0.8855 - val_loss: 0.2794
Epoch 7/10
[1m391/391[0m 