In [1]:
import pandas as pd
# Read the training and test splits from the competition's input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/data11234567/train.csv',
        '/kaggle/input/data11234567/test.csv',
    )
)

In [2]:
# How many training rows have no 'Discussion' text?
train['Discussion'].isna().sum()

343

In [3]:
# Discard every training row whose 'Discussion' field is missing.
train = train.dropna(axis=0, how='any', subset=['Discussion'])

In [4]:
# Sanity check: the missing-text count should now be zero.
train['Discussion'].isna().sum()

0

In [5]:
# Read the class-name -> class-id mapping that ships with the dataset.
# The context manager closes the handle even if read() raises, which the
# previous open()/read()/close() sequence did not guarantee.
with open('/kaggle/input/data11234567/ClassesMap.txt', 'r') as file:
    file2 = file.read()
file2

'Politics --> 0\nSports --> 1\nMedia --> 2\nMarket & Economy --> 3\nSTEM --> 4'

In [6]:
# Encode category names as the integer ids listed in ClassesMap.txt.
#
# The result is assigned back to the column instead of calling an inplace
# method on train['Category']: that chained form operates on an intermediate
# Series, triggers a FutureWarning on pandas 2.2 and no longer updates
# `train` once Copy-on-Write is active.
train['Category'] = train['Category'].replace({
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
}).astype('int64')  # explicit integer dtype; raises if a label is not in the map

In [7]:
# Show the first seven encoded labels.
train['Category'].iloc[:7]

0    1
1    4
2    4
3    1
4    0
5    2
6    2
Name: Category, dtype: int64

In [8]:
# %pip installs into the environment of the running kernel; `! pip` uses
# whichever pip happens to be first on the shell PATH.
%pip install wordninja



In [9]:
# %pip installs into the environment of the running kernel; `! pip` uses
# whichever pip happens to be first on the shell PATH.
%pip install tensorflow



In [10]:
import re
import wordninja
import nltk
# stop_words = set(["the", "and", "is", "in", "to", "a", "of", "for", "on", "with", "at", "by", "from", "this", "that", "it","n","nn"])
def preprocess_text(text):
    """Normalise one raw discussion string before tokenisation.

    Steps, in order: lowercase; replace newlines with spaces; delete every
    character that is neither a word character nor whitespace (letters,
    digits and underscores survive); re-segment any token longer than 10
    characters with ``wordninja.split``; join all tokens with single spaces.

    Args:
        text: Raw text. ``None`` and float NaN (what ``pd.read_csv`` yields
            for an empty field) are treated as an empty document instead of
            raising ``AttributeError``; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The cleaned, single-space-separated string.
    """
    # Missing values: only `train` is NaN-filtered earlier in the notebook, so
    # this function must tolerate them when applied to other frames.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Step 1: Convert to lowercase
    text = text.lower()
    # Step 2: Remove newlines (\n)
    text = text.replace('\n', ' ')
    # Step 3: Remove special characters and punctuation but keep numbers
    # (\w already covers digits, so no separate \d is needed).
    text = re.sub(r'[^\w\s]', '', text)
    # Step 4: Apply WordNinja to split words longer than 10 characters;
    # these are typically several words fused together once punctuation is gone.
    text = ' '.join([
        ' '.join(wordninja.split(word)) if len(word) > 10 else word
        for word in text.split()
    ])
    # text = ' '.join([word for word in text.split() if word not in stop_words])

    return text

In [11]:
# Clean the text column of both splits with the same normalisation routine.
train['Discussion'] = train['Discussion'].map(preprocess_text)
test['Discussion'] = test['Discussion'].map(preprocess_text)

In [12]:
# Eyeball the first cleaned training document.
train['Discussion'].iat[0]

'without sitting down and doing it manually you might try some scheduling software there are several here is one that you can download i havent tried it but it seems to do the job nn http www download com the league system pro 30007427 410505040 html tag pdp prod'

In [13]:
# Eyeball a cleaned test document (position 281) that consists of a URL.
test['Discussion'].iat[281]

'http www x rates com d usd mxn graph 120 html'

In [14]:
# URL fragments that are left behind as stand-alone tokens after cleaning
remove_words = ["http", "www", "com"]

def remove_specific_words(text):
    """Return ``text`` with every token listed in ``remove_words`` dropped.

    Tokens are obtained by whitespace splitting and must match an entry of
    ``remove_words`` exactly; the survivors are re-joined with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in remove_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Strip the URL-fragment tokens from the text column of both splits
train['Discussion'] = train['Discussion'].map(remove_specific_words)
test['Discussion'] = test['Discussion'].map(remove_specific_words)


In [15]:
# Show one training and one test document after URL-token removal,
# separated by a dotted rule.
print(
    train['Discussion'].iloc[0],
    "......................................................................................",
    test['Discussion'].iloc[281],
    sep="\n",
)

without sitting down and doing it manually you might try some scheduling software there are several here is one that you can download i havent tried it but it seems to do the job nn download the league system pro 30007427 410505040 html tag pdp prod
......................................................................................
x rates d usd mxn graph 120 html


In [16]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dropout, Dense, LayerNormalization, MultiHeadAttention, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np

# Vocabulary: fit a single tokenizer on the text of both splits.
combined = pd.concat([train['Discussion'], test['Discussion']], axis=0)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined)

# Map every document to its list of integer token ids.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(frame['Discussion'])
    for frame in (train, test)
)

# Force a fixed sequence length; short documents are padded at the end.
max_seq_length = 250  # Adjust this based on text length analysis
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_seq_length, padding='post')
    for sequences in (X_train_seq, X_test_seq)
)

# One-hot encode the integer class ids.
y_train = to_categorical(train['Category'].values)

# Hold out 20% of the training data for validation, stratified by class id.
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_padded,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=train['Category'],
)

In [17]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import numpy as np

def create_transformer_block(inputs, embed_dim, num_heads, ff_dim, dropout=0.1):
    """Run ``inputs`` through one self-attention + feed-forward block.

    Each of the two sub-layers is followed by dropout, a residual addition
    and layer normalisation. Because of the residual additions, the last
    dimension of ``inputs`` has to equal ``embed_dim``.

    Args:
        inputs: Tensor fed to the block; used as both query and value.
        embed_dim: Output width of the feed-forward sub-layer; also divided
            by ``num_heads`` to obtain the per-head key dimension.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden ReLU layer in the feed-forward sub-layer.
        dropout: Dropout rate applied after each sub-layer.

    Returns:
        The output tensor of the second layer normalisation.
    """
    # --- Self-attention sub-layer ---
    self_attention = MultiHeadAttention(
        num_heads=num_heads,
        key_dim=embed_dim // num_heads,
    )
    attended = self_attention(inputs, inputs)
    attended = Dropout(dropout)(attended)
    normed_attention = LayerNormalization(epsilon=1e-6)(inputs + attended)

    # --- Feed-forward sub-layer ---
    hidden = Dense(ff_dim, activation="relu")(normed_attention)
    projected = Dense(embed_dim)(hidden)
    projected = Dropout(dropout)(projected)

    return LayerNormalization(epsilon=1e-6)(normed_attention + projected)

# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation,
# dropout masks and batch shuffling repeat across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# Model parameters
vocab_size = len(tokenizer.word_index) + 1  # one extra slot for id 0
embed_dim = 256
num_heads = 8
ff_dim = 512
num_transformer_blocks = 2
lstm_units = 128
dropout_rate = 0.2

# Model architecture
inputs = Input(shape=(max_seq_length,))

# Embedding layer
# `input_length` is intentionally not passed: the sequence length is already
# fixed by the Input shape, and Keras 3 flags the argument as deprecated.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embed_dim
)(inputs)

# Transformer branch: stack of self-attention blocks over the embeddings
transformer_output = embedding_layer
for _ in range(num_transformer_blocks):
    transformer_output = create_transformer_block(
        transformer_output,
        embed_dim,
        num_heads,
        ff_dim,
        dropout_rate
    )

# LSTM branch: bidirectional LSTM over the same embeddings, keeping one
# output per time step so it can be concatenated with the transformer branch
lstm_output = Bidirectional(LSTM(lstm_units, return_sequences=True))(embedding_layer)
lstm_output = LayerNormalization(epsilon=1e-6)(lstm_output)
lstm_output = Dropout(dropout_rate)(lstm_output)

# Combine transformer and LSTM branches along the feature axis
combined = Concatenate(axis=-1)([transformer_output, lstm_output])

# Final processing: pool over time steps, then a small dense classifier head
x = GlobalAveragePooling1D()(combined)
x = Dense(256, activation="relu")(x)
x = Dropout(dropout_rate)(x)
x = LayerNormalization(epsilon=1e-6)(x)
x = Dense(128, activation="relu")(x)
x = Dropout(dropout_rate)(x)
# One softmax unit per one-hot label column
outputs = Dense(y_train.shape[1], activation="softmax")(x)

# Optimiser: Adam with a small initial learning rate for stability
initial_learning_rate = 0.0001
optimizer = tf.keras.optimizers.Adam(learning_rate=initial_learning_rate)

# Wrap the graph into a Model and attach loss / metric
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])

# Single callback: stop once val_loss has not improved for 7 epochs and
# ask Keras to restore the best weights
callbacks = [EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)]

# Fit on the training split, monitoring the held-out validation split
history = model.fit(
    x=X_train_final,
    y=y_train_final,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32,
    callbacks=callbacks,
    verbose=1,
)

# Class probabilities for the test set, then the most probable class id per row
test_predictions = model.predict(X_test_padded)
predicted_classes = test_predictions.argmax(axis=1)



Epoch 1/50
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 78ms/step - accuracy: 0.2274 - loss: 1.6626 - val_accuracy: 0.4398 - val_loss: 1.3509
Epoch 2/50
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 77ms/step - accuracy: 0.4881 - loss: 1.2459 - val_accuracy: 0.6460 - val_loss: 0.9333
Epoch 3/50
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 77ms/step - accuracy: 0.7115 - loss: 0.7924 - val_accuracy: 0.6937 - val_loss: 0.8803
Epoch 4/50
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 77ms/step - accuracy: 0.8258 - loss: 0.5160 - val_accuracy: 0.6805 - val_loss: 0.9936
Epoch 5/50
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 77ms/step - accuracy: 0.8956 - loss: 0.3303 - val_accuracy: 0.6548 - val_loss: 1.2244
Epoch 6/50
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 77ms/step - accuracy: 0.9312 - loss: 0.2228 - val_accuracy: 0.6637 - val_loss: 1.3686
Epoch 7/50
[1m6

In [20]:
# Pair every test SampleID with its predicted class id and write the CSV
submission = test[['SampleID']].assign(Category=predicted_classes)
submission.to_csv('hybrid_transformer_lstm_predictions333.csv', index=False)


In [None]:
# Score the trained model on the training split and on the validation split.
(final_train_loss, final_train_accuracy), (final_val_loss, final_val_accuracy) = (
    model.evaluate(features, targets, verbose=0)
    for features, targets in ((X_train_final, y_train_final), (X_val, y_val))
)

print(
    f"\nFinal Training Accuracy: {final_train_accuracy:.4f}",
    f"Final Validation Accuracy: {final_val_accuracy:.4f}",
    sep="\n",
)


Final Training Accuracy: 0.8457
Final Validation Accuracy: 0.6937


In [21]:
# Save the whole model to a file
# NOTE(review): the file name is spelled 'hybird' (sic). It is left untouched
# here because anything that later loads this artifact depends on the exact
# path; rename it together with its consumers.
model.save('hybird_transformer_model.h5')
print("Model saving is done")

Model saving is done
