In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, Conv1D, GlobalMaxPooling1D, Dropout, Flatten, LSTM
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from google.colab import files  # For file upload (assuming you are using Colab)
import os

# Upload the dataset
uploaded = files.upload()
if not uploaded:
    # Nothing came back from the upload step; fail with a readable message
    # instead of the bare StopIteration that next(iter(...)) would raise.
    raise ValueError("No file was uploaded; please upload the Excel dataset.")

# Load the dataset (assuming it's in Excel format)
file_name = next(iter(uploaded))  # First (and expected only) uploaded file name
data = pd.read_excel(file_name)  # Read the Excel file into a pandas DataFrame

# Data Preprocessing
data = data.dropna(subset=['comments', 'tag'])  # Drop rows with missing comments or tags
comments = data['comments'].astype(str)
tags = data['tag']

# Encode labels: string tags -> integer ids -> one-hot rows
label_encoder = LabelEncoder()
encoded_tags = label_encoder.fit_transform(tags)
num_classes = len(label_encoder.classes_)
labels = to_categorical(encoded_tags, num_classes=num_classes)

# Train-test split, performed on row positions *before* tokenization so the
# tokenizer vocabulary can be learned from the training rows only.
# NOTE(review): with the same row count, test_size and random_state this is
# expected to give the same partition as splitting the padded arrays directly.
train_idx, test_idx = train_test_split(
    np.arange(len(comments)), test_size=0.2, random_state=42
)

# Tokenize and pad sequences
max_words = 20000
max_len = 100
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
# Fit on training comments only: fitting on the full corpus would leak
# test-set vocabulary into preprocessing. Words first seen at test time
# are mapped to the "<OOV>" token.
tokenizer.fit_on_texts(comments.iloc[train_idx])
sequences = tokenizer.texts_to_sequences(comments)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Materialize the train/test arrays from the precomputed row positions
X_train, X_test = padded_sequences[train_idx], padded_sequences[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]

# Define Model Architectures
def build_dnn():
    """Build the 'DNN' baseline: embedding, global max pooling, dense head.

    Reads the module-level ``max_words``, ``max_len`` and ``num_classes``.
    The returned model is not compiled.
    """
    layer_stack = [
        # Token ids -> 128-dimensional vectors
        Embedding(max_words, 128, input_length=max_len),
        # Collapse the sequence axis by taking the per-feature maximum
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        # One softmax output unit per class
        Dense(num_classes, activation='softmax'),
    ]
    return Sequential(layer_stack)

def build_cnn():
    """Build the 'CNN' classifier: embedding, one Conv1D layer, dense head.

    Reads the module-level ``max_words``, ``max_len`` and ``num_classes``.
    The returned model is not compiled.
    """
    network = Sequential()
    network.add(Embedding(max_words, 128, input_length=max_len))
    # 128 filters, each spanning a window of 5 consecutive tokens
    network.add(Conv1D(128, 5, activation='relu'))
    network.add(GlobalMaxPooling1D())
    network.add(Dense(128, activation='relu'))
    network.add(Dropout(0.5))
    network.add(Dense(num_classes, activation='softmax'))
    return network

def build_bilstm():
    """Build the 'BiLSTM' classifier: embedding, bidirectional LSTM, dense head.

    Reads the module-level ``max_words``, ``max_len`` and ``num_classes``.
    The returned model is not compiled.
    """
    model = Sequential([
        Embedding(max_words, 128, input_length=max_len),
        # Keep the LSTM's default tanh activation: overriding it with ReLU
        # rules out the fused cuDNN kernel on GPU and leaves the recurrent
        # state unbounded, which tends to destabilize training.
        Bidirectional(LSTM(128, return_sequences=True)),
        # return_sequences=True yields one vector per timestep; pool over them
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    return model


def build_mlp():
    """Build the 'MLP' classifier: embedding, flatten, dense head.

    Unlike the pooled variants, the embedded sequence is flattened, so the
    first dense layer sees every position's embedding.
    Reads the module-level ``max_words``, ``max_len`` and ``num_classes``.
    The returned model is not compiled.
    """
    embedding = Embedding(max_words, 128, input_length=max_len)
    flatten = Flatten()
    hidden = Dense(128, activation='relu')
    regularizer = Dropout(0.5)
    classifier = Dense(num_classes, activation='softmax')
    return Sequential([embedding, flatten, hidden, regularizer, classifier])

# Train and Evaluate Model
def train_and_evaluate(model, X_train, y_train, X_test, y_test, *,
                       epochs=20, batch_size=64, validation_split=0.2):
    """Compile and fit ``model``, then score it on the held-out test data.

    Args:
        model: Uncompiled Keras model; it is compiled here with Adam and
            categorical cross-entropy.
        X_train: Training inputs passed to ``model.fit``.
        y_train: Training targets passed to ``model.fit``.
        X_test: Inputs used for the final prediction.
        y_test: One-hot test targets; argmax over axis 1 gives the true class.
        epochs: Maximum number of training epochs (early stopping may end
            training sooner).
        batch_size: Mini-batch size used during fitting.
        validation_split: Fraction of the training data that ``fit`` holds
            out for the ``val_loss`` monitored by both callbacks.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and
        F1 are weighted averages over the classes.
    """
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Stop after 5 epochs without val_loss improvement and roll back to the
    # best weights; halve the learning rate after 3 stagnant epochs.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
              validation_split=validation_split,
              callbacks=[early_stopping, lr_scheduler], verbose=1)

    # Convert class probabilities and one-hot targets back to class indices
    predictions = model.predict(X_test)
    y_pred = np.argmax(predictions, axis=1)
    y_true = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 yields the same scores as the default but without the
    # undefined-metric warning when some class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    return accuracy, precision, recall, f1

# Instantiate every architecture under comparison, in a fixed order
model_builders = (
    ('DNN', build_dnn),
    ('CNN', build_cnn),
    ('BiLSTM', build_bilstm),
    ('MLP', build_mlp),
)
models = {label: builder() for label, builder in model_builders}

# Fit each model in turn and collect its test-set metrics
metric_names = ('Accuracy', 'Precision', 'Recall', 'F1 Score')
results = {}
for model_name, model in models.items():
    print(f"Training {model_name}...")
    acc, prec, rec, f1 = train_and_evaluate(model, X_train, y_train, X_test, y_test)
    results[model_name] = dict(zip(metric_names, (acc, prec, rec, f1)))

# Name the results file after the uploaded file, with a '_Deep_Model_Results' suffix
base_name, _ = os.path.splitext(file_name)
output_file_name = f"{base_name}_Deep_Model_Results.xlsx"

# Write one row per model (model name as the index) to Excel
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.to_excel(output_file_name, index=True)

# Hand the results file to the browser for download
files.download(output_file_name)

print(f"Results saved and ready for download from {output_file_name}")


Saving total cleaned corpus.xlsx to total cleaned corpus.xlsx




Training DNN...
Epoch 1/20
[1m5370/5370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 34ms/step - accuracy: 0.7034 - loss: 0.6981 - val_accuracy: 0.7629 - val_loss: 0.5936 - learning_rate: 0.0010
Epoch 2/20
[1m5370/5370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 32ms/step - accuracy: 0.7881 - loss: 0.5420 - val_accuracy: 0.7743 - val_loss: 0.5746 - learning_rate: 0.0010
Epoch 3/20
[1m5370/5370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 30ms/step - accuracy: 0.8140 - loss: 0.4775 - val_accuracy: 0.7784 - val_loss: 0.5765 - learning_rate: 0.0010
Epoch 4/20
[1m5370/5370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 31ms/step - accuracy: 0.8324 - loss: 0.4298 - val_accuracy: 0.7833 - val_loss: 0.5890 - learning_rate: 0.0010
Epoch 5/20
[1m5370/5370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 31ms/step - accuracy: 0.8495 - loss: 0.3856 - val_accuracy: 0.7883 - val_loss: 0.6021 - learning_rate: 0.0010
Epoch 6/20
[1m5370/5370[0m 