Import Data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix

from xgboost import XGBClassifier

from transformers import AutoTokenizer,AutoModel
import torch

from sklearn.svm import SVC

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout

import seaborn as sns
from wordcloud import WordCloud
from collections import Counter



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount recursively and print the full path of every file.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
train_data = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
train_data.head()

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes, memory usage).
train_data.info()

In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
test_data = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
test_data.head()

Exploratory Data Analysis (EDA)

In [None]:
# Plot target distribution
# Label legend: 0 = not related to a disaster, 1 = related to a disaster
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_data, x='target', palette='viridis', ax=ax)
ax.set_title('Target Distribution')
ax.set_xlabel('Target')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Function to generate a word cloud
def generate_wordcloud(text, title):
    """Draw a word cloud for `text` and display it with `title` as the heading.

    Args:
        text: A single string containing all the words to visualise.
        title: Heading shown above the rendered image.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud.generate(text)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')  # the axes carry no meaning for a word cloud
    ax.set_title(title, fontsize=16)
    plt.show()

In [None]:
# Generate word cloud for all text data
tweets = train_data['text'].astype(str)
all_text = ' '.join(tweets)
generate_wordcloud(all_text, 'Word Cloud for All Tweets')

# Generate word cloud for each target class
for target in train_data['target'].unique():
    # Boolean mask selecting only the tweets that carry this label.
    in_class = train_data['target'] == target
    target_text = ' '.join(train_data.loc[in_class, 'text'].astype(str))
    generate_wordcloud(target_text, f'Word Cloud for Target: {target}')

# Analyze frequent words
def most_common_words(text, num_words=10):
    """Return the most frequent whitespace-separated tokens in `text`.

    Args:
        text: String to analyse; it is split on runs of whitespace.
        num_words: Maximum number of (token, count) pairs to return.

    Returns:
        A list of ``(token, count)`` tuples ordered from most to least frequent.
    """
    return Counter(text.split()).most_common(num_words)

In [None]:
# Clean text
# Remove every character that is not a letter, digit, or whitespace, then
# lower-case and trim. The same normalisation is applied to both splits.
for frame in (train_data, test_data):
    frame["clean_text"] = (
        frame["text"]
        .str.replace(r"[^a-zA-Z0-9\s]", "", regex=True)
        .str.lower()
        .str.strip()
    )


# 

BERT Transformer Model

In [None]:
# Hyperparameters
MAX_VOCAB_SIZE = 20000
# Token cap passed as max_length when tokenizing tweets for BERT.
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 100

# Load BERT Tokenizer & Model
# Both are loaded from a local Kaggle input directory, not by hub model name.
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/transformers/bert-base-uncased")
bert_model = AutoModel.from_pretrained("/kaggle/input/transformers/bert-base-uncased")

# Extract BERT Embeddings
def get_bert_embeddings(texts, batch_size=32):
    """Encode texts into fixed-size vectors using BERT's first-token ([CLS]) state.

    The texts are processed in mini-batches so that the padded input tensors
    and the model activations for the whole corpus never have to be held in
    memory at once (a single forward pass over thousands of tweets is likely
    to exhaust RAM).

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts per forward pass. Must be >= 1.

    Returns:
        A 2-D numpy array of shape ``(len(texts), hidden_size)`` holding the
        last-layer hidden state at position 0 for each text. An empty input
        yields an array of shape ``(0, hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    texts = list(texts)
    if not texts:
        # The tokenizer rejects an empty batch; return a correctly shaped empty result.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    # from_pretrained already returns the model in eval mode; be explicit so
    # dropout stays disabled even if the model was switched to train mode.
    bert_model.eval()

    batch_vectors = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # padding=True pads only to the longest text in this batch.
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=MAX_SEQUENCE_LENGTH, return_tensors="pt")
        with torch.no_grad():
            outputs = bert_model(**inputs)
        # Position 0 of the sequence axis is the [CLS] token.
        batch_vectors.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    return np.concatenate(batch_vectors, axis=0)

# Apply BERT Embeddings
# Embed the cleaned tweets of both splits; y holds the training labels.
X_bert_train = get_bert_embeddings(train_data["clean_text"].tolist())
X_bert_test = get_bert_embeddings(test_data["clean_text"].tolist())
y = np.array(train_data["target"])

# Class Weight Handling
# 'balanced' weights each class inversely to its frequency in y. The resulting
# dict maps class label -> weight and is passed to Keras fit() as class_weight.
classes = np.unique(y)
class_weights = compute_class_weight('balanced', classes=classes, y=y)
class_weight_dict = dict(zip(classes, class_weights))



In [None]:

# Train-Test Split
# Hold out 20% of the embedded training tweets for validation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_bert_train, y, test_size=0.2, random_state=42
)

# Reshape for LSTM
# Insert a length-1 time axis: (samples, features) -> (samples, 1, features).
X_train = np.expand_dims(X_train, axis=1)
X_test = np.expand_dims(X_test, axis=1)


In [None]:
# Build LSTM Model
# The inputs are dense float BERT vectors shaped (samples, 1, hidden_size),
# not integer token ids, so they go straight into the recurrent stack.
# An Embedding layer (an integer-id lookup table) cannot consume them: it
# would add a fourth axis and break the LSTM, which needs 3-D input.
lstm_model = Sequential([
    tf.keras.Input(shape=X_train.shape[1:]),
    Bidirectional(LSTM(256, return_sequences=True)),
    Dropout(0.4),
    LSTM(256),
    Dropout(0.4),
    # Named so the activations can be pulled out as features after training.
    Dense(128, activation='relu', name="lstm_features"),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
lstm_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), class_weight=class_weight_dict)

# Extract LSTM Features
# lstm_model.predict() returns only the final sigmoid probability (one value
# per sample). Expose the 128-dim penultimate Dense activations instead so
# XGBoost receives a real feature vector rather than a single scalar.
feature_extractor = tf.keras.Model(
    inputs=lstm_model.inputs,
    outputs=lstm_model.get_layer("lstm_features").output,
)
X_train_lstm_features = feature_extractor.predict(X_train).reshape(X_train.shape[0], -1)
X_test_lstm_features = feature_extractor.predict(X_test).reshape(X_test.shape[0], -1)

# Train XGBoost Model
xgb_model = XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=10, random_state=42)
xgb_model.fit(X_train_lstm_features, y_train)
y_pred_xgb = xgb_model.predict(X_test_lstm_features)


In [None]:
# Evaluate XGBoost
# Compute the hold-out metrics first, then print them.
hybrid_accuracy = accuracy_score(y_test, y_pred_xgb)
hybrid_report = classification_report(y_test, y_pred_xgb)

print(f"Hybrid BERT + LSTM + XGBoost Accuracy: {hybrid_accuracy:.4f}")
print("\nClassification Report - XGBoost (BERT + LSTM Features)")
print(hybrid_report)


In [None]:
# Additional Evaluation - F1 Score
macro_f1 = f1_score(y_test, y_pred_xgb, average='macro')
print(f"F1 Score (Macro): {macro_f1:.4f}")

# Confusion Matrix Visualization
# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred_xgb)
class_names = ["Class 0", "Class 1"]
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix - XGBoost (BERT + LSTM Features)")
plt.show()
