## CNN-RNN

### Import libraries

In [None]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dropout, Conv1D, MaxPooling1D, LSTM
from tensorflow.keras.models import Model
import optuna
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping
import nltk

# Fetch the NLTK resources used below: the stop-word list and the WordNet
# data needed by the lemmatizer.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

### Preprocessing module

In [None]:
# Load the data
# The CSV files are read with header=None, so column names are assigned
# explicitly after loading.
ag_news_columns = ('Class Index', 'Title', 'Description')

train_url = 'https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv'
test_url = 'https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv'

train_df, test_df = (pd.read_csv(url, header=None) for url in (train_url, test_url))

for frame in (train_df, test_df):
    frame.columns = list(ag_news_columns)

In [None]:
# Define stop_words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Patterns are compiled once because preprocess_text runs on every row.
# The country abbreviation is matched on the ORIGINAL casing:
#   * "U.S." / "u.s." - the dotted form; the lookahead replaces a trailing
#     `\b`, which can never match between "." and a space or end of string.
#   * "US"            - upper-case only, so the pronoun "us" is left alone.
_US_PATTERN = re.compile(r'\b[Uu]\.[Ss]\.(?!\w)|\bUS\b')
_HTML_TAG_PATTERN = re.compile(r'<[^>]+>')
_HTML_ENTITY_PATTERN = re.compile(r'&\w+;')
_NON_LETTER_PATTERN = re.compile(r'[^a-z\s]')


def preprocess_text(text):
    """Normalize a raw text string for tokenization.

    Steps, in order: map "U.S."/"US" to "usa", lowercase, strip HTML tags
    and entities, drop every character that is not a lowercase letter or
    whitespace, remove stop-words, and lemmatize the remaining words.

    Args:
        text: Raw input string.

    Returns:
        The cleaned words joined by single spaces.
    """
    # Must run before lowercasing; afterwards "US" and "us" are indistinguishable.
    text = _US_PATTERN.sub('usa', text)
    text = text.lower()  # convert text to lowercase
    text = _HTML_TAG_PATTERN.sub('', text)  # remove HTML tags
    text = _HTML_ENTITY_PATTERN.sub('', text)  # remove HTML entities
    text = _NON_LETTER_PATTERN.sub('', text)  # keep only letters and spaces
    # Whitespace tokenization, stop-word removal and lemmatization in one pass.
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)  # combining words into a string

# Preprocess text data.
# Titles and descriptions are cleaned separately because the model consumes
# them as two separate inputs. The combined cleaned text is only used to
# build the vocabulary.
for _frame in (train_df, test_df):
    _frame['clean_title'] = _frame['Title'].apply(preprocess_text)
    _frame['clean_description'] = _frame['Description'].apply(preprocess_text)
    _frame['clean_text'] = _frame['clean_title'] + ' ' + _frame['clean_description']

# Tokenization
tokenizer = Tokenizer(num_words=5000) # The tokenizer will only consider the top 5000 words in the training data to limit vocabulary size for embedding.
tokenizer.fit_on_texts(train_df['clean_text'])

# Convert the CLEANED title and description columns into sequences of integer
# indices. Encoding the raw columns would bypass stop-word removal and
# lemmatization, so the inputs would not match the vocabulary fitted above.
X_train_title_seq = tokenizer.texts_to_sequences(train_df['clean_title'])
X_test_title_seq = tokenizer.texts_to_sequences(test_df['clean_title'])
X_train_description_seq = tokenizer.texts_to_sequences(train_df['clean_description'])
X_test_description_seq = tokenizer.texts_to_sequences(test_df['clean_description'])

# Padding sequences
# Lengths come from the training data only; test sequences are padded to the same size.
max_length_titles = max(len(x) for x in X_train_title_seq)  # Max length of title sequences
max_length_descriptions = max(len(x) for x in X_train_description_seq)  # Max length of description sequences
X_train_title_pad = pad_sequences(X_train_title_seq, maxlen=max_length_titles)  # Pad title sequences in training data
X_test_title_pad = pad_sequences(X_test_title_seq, maxlen=max_length_titles)  # Pad title sequences in test data
X_train_description_pad = pad_sequences(X_train_description_seq, maxlen=max_length_descriptions)  # Pad description sequences in training data
X_test_description_pad = pad_sequences(X_test_description_seq, maxlen=max_length_descriptions)  # Pad description sequences in test data

# Assign target labels
y_train = train_df['Class Index'].values - 1  # Class indices in training data, adjusted to start at 0
y_test = test_df['Class Index'].values - 1  # Class indices in test data, adjusted to start at 0


### Load GloVe Embeddings

# Download GloVe embeddings
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip

# Creating an index of GloVe embeddings
embedding_dim = 100  # Dimensionality of the GloVe vectors being loaded
# Derive the vocabulary size from the tokenizer instead of repeating the
# literal; fall back to the full vocabulary when no cap was configured.
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1

embedding_index = {}
with open(f'glove.6B.{embedding_dim}d.txt', encoding='utf-8') as f:
    for line in f:
        # Split from the right on single spaces so that exactly
        # `embedding_dim` coefficients are peeled off; a token that itself
        # contains whitespace stays intact in values[0].
        values = line.rstrip().rsplit(' ', embedding_dim)
        word = values[0]  # The first value is the word itself
        coefs = np.asarray(values[1:], dtype='float32')  # The rest are the embedding coefficients
        embedding_index[word] = coefs  # Store the word and its embedding vector in a dictionary

# Building an embedding matrix for the tokens from our dataset
# Rows stay all-zero for words that have no GloVe vector.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < vocab_size:  # Consider only the indices covered by the embedding matrix
        embedding_vector = embedding_index.get(word)  # Retrieve the embedding vector if it exists
        if embedding_vector is not None:  # If an embedding was found
            embedding_matrix[i] = embedding_vector  # Fill the embedding matrix

# Creating the Embedding layer using GloVe embeddings (frozen during training)
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                            weights=[embedding_matrix], trainable=False)


### Basic training module

In [None]:
# Model definition
def _cnn_lstm_branch(branch_input):
    """Apply embedding -> Conv1D -> MaxPooling1D -> LSTM to one input."""
    embedded = embedding_layer(branch_input)
    convolved = Conv1D(filters=128, kernel_size=5, activation='relu')(embedded)
    pooled = MaxPooling1D(pool_size=2)(convolved)
    return LSTM(128, return_sequences=False)(pooled)


# Two parallel branches, one per text field, both using the same embedding layer.
title_input = Input(shape=(max_length_titles,), name='title_input')
title_lstm = _cnn_lstm_branch(title_input)

description_input = Input(shape=(max_length_descriptions,), name='description_input')
description_lstm = _cnn_lstm_branch(description_input)

# Classification head on top of the concatenated branch outputs.
merged = Concatenate()([title_lstm, description_lstm])
dense_1 = Dense(128, activation='relu')(merged)
dense_2 = Dense(64, activation='relu')(dense_1)
output = Dense(4, activation='softmax')(dense_2)

model = Model(inputs=[title_input, description_input], outputs=output)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Early stopping: watch the validation loss and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Train the model
model.fit(
    [X_train_title_pad, X_train_description_pad],
    y_train,
    validation_split=0.1,
    epochs=5,
    batch_size=32,
    verbose=1,
    callbacks=[early_stopping],
)

### Testing module 1

In [None]:
# Test on base model
y_pred = model.predict([X_test_title_pad, X_test_description_pad])
y_pred_classes = np.argmax(y_pred, axis=1)  # most probable class per sample
test_accuracy = np.mean(y_pred_classes == y_test)

print(f'Base Test Accuracy: {test_accuracy:.4f}')
print("Classification Report (Base Model):")
print(classification_report(y_test, y_pred_classes))

# Confusion matrix rendered as an annotated heatmap.
conf_matrix = confusion_matrix(y_test, y_pred_classes)
_, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix (Base Model)')
plt.show()

### Hyperparameter selection module

In [None]:
def model_builder(trial):
    """Build and compile the two-input CNN-LSTM classifier for one trial.

    Args:
        trial: Object providing `suggest_float` / `suggest_int`, used to
            obtain the hyperparameter values.

    Returns:
        A compiled Keras `Model` taking `[title, description]` inputs and
        producing a 4-way softmax.
    """
    # Hyperparameters are requested in a fixed order.
    drop_prob = trial.suggest_float('dropout_rate', 0.0, 0.5)
    step_size = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
    n_lstm = trial.suggest_int('lstm_units', 64, 256)
    n_filters = trial.suggest_int('cnn_filters', 64, 256)
    conv_width = trial.suggest_int('kernel_size', 3, 5)
    pool_width = trial.suggest_int('pool_size', 2, 3)
    n_hidden = trial.suggest_int('num_units', 64, 512)

    def encode(sequence_length):
        """Create an input of the given length and its CNN-LSTM encoding."""
        branch_input = Input(shape=(sequence_length,))
        features = embedding_layer(branch_input)
        features = Conv1D(filters=n_filters, kernel_size=conv_width, activation='relu')(features)
        features = MaxPooling1D(pool_size=pool_width)(features)
        features = LSTM(n_lstm, return_sequences=False)(features)
        return branch_input, features

    title_input, title_features = encode(max_length_titles)
    description_input, description_features = encode(max_length_descriptions)

    # Classification head: concatenate -> dense -> dropout -> softmax.
    joined = Concatenate()([title_features, description_features])
    hidden = Dense(n_hidden, activation='relu')(joined)
    hidden = Dropout(drop_prob)(hidden)
    probabilities = Dense(4, activation='softmax')(hidden)

    classifier = Model(inputs=[title_input, description_input], outputs=probabilities)
    classifier.compile(
        optimizer=Adam(learning_rate=step_size),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

def objective(trial):
    """Train one candidate model and return its validation accuracy.

    The score is taken from the validation split held out by `fit`, NOT from
    the test set. Selecting hyperparameters on test data would leak it into
    model selection and bias the test accuracy reported afterwards.

    Args:
        trial: Optuna trial used by `model_builder` to sample hyperparameters.

    Returns:
        Validation accuracy after the final training epoch.
    """
    model = model_builder(trial)
    history = model.fit([X_train_title_pad, X_train_description_pad], y_train,
                        validation_split=0.1, epochs=3, batch_size=32, verbose=1)
    # The last entry corresponds to the weights the model ends up with.
    return history.history['val_accuracy'][-1]


# Maximize validation accuracy over a small number of trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

### Testing module 2

In [None]:
# Rebuild the model from the best trial found by the study and retrain it.
best_trial = study.best_trial
model = model_builder(best_trial)
model.fit(
    [X_train_title_pad, X_train_description_pad],
    y_train,
    validation_split=0.1,
    epochs=5,
    batch_size=32,
    verbose=1,
)

# Evaluate the retrained model on the test set.
y_pred = model.predict([X_test_title_pad, X_test_description_pad])
y_pred_classes = np.argmax(y_pred, axis=1)  # most probable class per sample
test_accuracy = np.mean(y_pred_classes == y_test)

print(f'Optimized Test Accuracy: {test_accuracy:.4f}')
print("Classification Report (Optimized Model):")
print(classification_report(y_test, y_pred_classes))

# Confusion matrix rendered as an annotated heatmap.
conf_matrix = confusion_matrix(y_test, y_pred_classes)
_, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix (Optimized Model)')
plt.show()