In [3]:
from google.colab import drive

In [7]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import StackingClassifier
from keras.models import Sequential
from keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

df = pd.read_excel('/content/gdrive/My Drive/Colab Notebooks/all_data.xlsx')

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from keras.preprocessing.sequence import pad_sequences
import numpy as np

df=df.dropna(subset=['Cleaned_Text'])


# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Cleaned_Text'], df['Category'], test_size=0.2, random_state=42)

# Preprocess data for MNB (Bag of Words)
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Preprocess data for CNN and ANN (Tokenization and Padding)
tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(df['Cleaned_Text'])
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

max_length = max(len(x) for x in X_train_tokens)  # Get max sequence length
X_train_padded = pad_sequences(X_train_tokens, maxlen=max_length)
X_test_padded = pad_sequences(X_test_tokens, maxlen=max_length)

from sklearn.preprocessing import LabelEncoder

# Encode target variables
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)


# Ensure the input is numpy array format
X_train_padded = np.array(X_train_padded)
X_test_padded = np.array(X_test_padded)
y_train = np.array(y_train)
y_test = np.array(y_test)


# Check unique values
print(np.unique(y_train))

# Use assertions to ensure no non-numeric data slips through
assert np.all(np.isfinite(X_train_padded)), "X_train_padded contains non-numeric data"
assert np.all(np.isin(y_train, [0, 1])), "y_train contains values other than 0 and 1"


# For CNN and ANN
max_length = max([len(x) for x in tokenizer.texts_to_sequences(df['Cleaned_Text'])])  # Global max length
X_train_padded = pad_sequences(X_train_tokens, maxlen=max_length)
X_test_padded = pad_sequences(X_test_tokens, maxlen=max_length)


def create_cnn_model(input_dim):
    model = Sequential([
        Embedding(5000, 50, input_length=input_dim),
        Conv1D(128, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(10, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model



# For CNN and ANN, ensure they're receiving the same type of processed data:
X_train_padded = np.array(X_train_padded).astype(np.float32)
X_test_padded = np.array(X_test_padded).astype(np.float32)

# Check dimensions
print("Padded train shape:", X_train_padded.shape)
print("Padded test shape:", X_test_padded.shape)


from sklearn.base import BaseEstimator, ClassifierMixin

class KerasClassifierWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, build_fn, epochs=10, batch_size=32, verbose=0):
        self.build_fn = build_fn
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        self.model = build_fn()

    def fit(self, X, y):
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=self.verbose)
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        predictions = self.model.predict(X)
        return (predictions > 0.5).astype(int)

    def predict_proba(self, X):
        predictions = self.model.predict(X)
        # Return probabilities for both classes
        return np.hstack([1-predictions, predictions])


cnn = KerasClassifierWrapper(build_fn=lambda: create_cnn_model(max_length), epochs=10, batch_size=32)


import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.models import Model

# Ensure the custom TransformerBlock imports are correct
from tensorflow.keras.layers import Layer, MultiHeadAttention, LayerNormalization


from tensorflow.keras.layers import Input, Embedding, GlobalMaxPooling1D, Dense, LayerNormalization, MultiHeadAttention
from tensorflow.keras.models import Model
import tensorflow as tf

class SimpleTransformerBlock(Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0):
        super(SimpleTransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Dense(ff_dim, activation="relu")
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        return self.layernorm2(out1 + ffn_output)

def create_very_simple_tnn_model(input_dim):
    input_layer = Input(shape=(input_dim,))
    embedding_layer = Embedding(5000, 10, input_length=input_dim)(input_layer)  # Further reduced embedding dimension
    transformer_block = SimpleTransformerBlock(10, 1, 10)  # Reduced complexity
    x = transformer_block(embedding_layer)
    x = GlobalMaxPooling1D()(x)
    output = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=input_layer, outputs=output)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

# Wrap the even simpler model in KerasClassifierWrapper
tnn= KerasClassifierWrapper(build_fn=lambda: create_very_simple_tnn_model(max_length), epochs=5, batch_size=32)


[0 1]
Padded train shape: (65816, 185)
Padded test shape: (16454, 185)


In [4]:
# Reinitialize models
from sklearn.metrics import accuracy_score

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
# Define stacking ensemble with the updated setup
# Define the stacking ensemble
stack_model = StackingClassifier(
    estimators=[
        ('RF', RandomForestClassifier(n_estimators=100)),  # Example model; use your CNN model
        ('KNN', KNeighborsClassifier()),
        ('cnn', cnn),

    ],
    final_estimator=RandomForestClassifier(),
    stack_method='predict_proba',
    passthrough=False
)

# Fit stacked model
stack_model.fit(X_train_padded, y_train)  # Ensure X_train_padded is appropriate for all models
y_pred = stack_model.predict(X_test_padded)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of stacked model: {accuracy}')

Accuracy of stacked model: 0.770754831651878
