In [17]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import re
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [18]:
# Enhanced text preprocessing
def clean_text(text):
    """Normalize a raw text string for downstream tokenization.

    Steps, in order: lower-case, remove URLs (tokens starting with ``http``
    or ``www``), remove ``@mentions`` and bare ``#`` symbols, remove every
    character that is neither a word character nor whitespace, then trim
    leading/trailing whitespace.

    Args:
        text: The raw text. Non-string values (``None``, or the float NaN
            pandas uses for missing entries) are treated as missing.

    Returns:
        The cleaned string; ``''`` when ``text`` is not a string.
    """
    # Missing values would otherwise crash on .lower() with AttributeError.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # `http\S+` already covers https URLs, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Drop whole @mentions, but only the '#' of a hashtag (the word is kept).
    text = re.sub(r'@\w+|#', '', text)
    # Remove punctuation; underscores survive because \w matches them.
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()

In [21]:
# Categorical feature preprocessing.
# Every column gets its OWN LabelEncoder / MinMaxScaler, stored by column name,
# so the fitted transformers remain available for transforming new data later.
# (Re-fitting one shared encoder/scaler would discard the 'keyword' fit as soon
# as 'location' is processed.)
# NOTE(review): the transformers are fitted on all of train_ds, i.e. before the
# train/validation/test split performed further down - confirm this is intended.
encoders = {}
scalers = {}

for column in ('keyword', 'location'):
    # Missing values become an explicit 'unknown' category.
    train_ds[column] = train_ds[column].fillna('unknown')

    # Category string -> integer code.
    encoders[column] = LabelEncoder()
    train_ds[column] = encoders[column].fit_transform(train_ds[column])

    # Integer code -> value in [0, 1].
    scalers[column] = MinMaxScaler()
    train_ds[column] = scalers[column].fit_transform(train_ds[[column]])

# Backward-compatible names: as before, `encoder` and `scaler` end up bound to
# the transformers fitted on the last processed column ('location').
encoder = encoders['location']
scaler = scalers['location']

In [22]:
# Drop the identifier column, which is not used as a model feature.
# errors='ignore' keeps the cell idempotent: re-running it after 'id' is
# already gone no longer raises KeyError.
train_ds.drop(columns=['id'], inplace=True, errors='ignore')

In [23]:
# Stratified split: 70% train, then the remaining 30% is divided into
# roughly 20% validation and 10% test (0.33 of the held-out part).
from sklearn.model_selection import train_test_split

y = train_ds['target'].values
X = train_ds.drop(columns=['target'])

# First cut: training set vs. held-out pool, class balance preserved.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Second cut: held-out pool -> validation and test, again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.33,
    stratify=y_temp,
    random_state=42,
)

In [25]:
# Text vectorization: each text becomes a sequence of 50 integer token ids
# drawn from a vocabulary capped at 20000 entries.
text_vectorization = layers.TextVectorization(
    output_mode="int",
    output_sequence_length=50,
    max_tokens=20000,
)

# The vocabulary is learned from the training split only.
text_vectorization.adapt(X_train['text'])

In [26]:
# Create datasets with enhanced preprocessing
def prepare_features(X_data, y_data):
    """Turn one data split into model-ready inputs.

    Args:
        X_data: Frame with 'text', 'keyword' and 'location' columns.
        y_data: Labels for the split; passed through unchanged.

    Returns:
        Tuple ``(token_ids, categorical, y_data)`` where ``token_ids`` is the
        output of the module-level ``text_vectorization`` layer applied to
        the 'text' column and ``categorical`` is a float32 array holding the
        'keyword' and 'location' columns.
    """
    token_ids = text_vectorization(X_data['text'])
    categorical = X_data[['keyword', 'location']].to_numpy(dtype='float32')
    return token_ids, categorical, y_data

# Vectorize text and extract categorical features for each split.
X_train_text, X_train_cat, y_train = prepare_features(X_data=X_train, y_data=y_train)
X_val_text, X_val_cat, y_val = prepare_features(X_data=X_val, y_data=y_val)
X_test_text, X_test_cat, y_test = prepare_features(X_data=X_test, y_data=y_test)


In [36]:
# Enhanced Model Architecture
def get_enhanced_model(max_tokens=20000, embed_dim=128, num_heads=2, sequence_length=50):
    """Build and compile the two-input binary classifier.

    The text branch embeds token ids, applies one self-attention layer,
    averages over sequence positions and applies dropout. The categorical
    branch is a single dense layer. Both branches are concatenated and fed
    through two dense + dropout layers to a sigmoid output.

    Args:
        max_tokens: Vocabulary size of the embedding (input dimension).
        embed_dim: Embedding width; also used as the attention ``key_dim``.
        num_heads: Number of attention heads.
        sequence_length: Length of each token-id sequence fed to
            ``text_input``. Must match the vectorizer's output length.

    Returns:
        A compiled ``keras.Model`` with inputs ``[text_input, cat_input]``,
        trained with Adam (learning rate 0.001) on binary cross-entropy and
        reporting accuracy and AUC.
    """
    # Text input branch
    text_input = layers.Input(shape=(sequence_length,), name='text_input')
    x = layers.Embedding(max_tokens, embed_dim)(text_input)

    # Self-attention over the token embeddings (query and value are both x).
    # This is the attention layer only - no residual connection, layer
    # normalization or feed-forward sub-layer is applied around it.
    transformer_block = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    x = transformer_block(x, x)
    # NOTE(review): the Embedding is created without mask_zero, so padded
    # positions presumably take part in attention and in this average - confirm.
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.2)(x)

    # Categorical features branch (two scalar features per sample)
    cat_input = layers.Input(shape=(2,), name='cat_input')
    cat_features = layers.Dense(32, activation='relu')(cat_input)

    # Combine features
    combined = layers.concatenate([x, cat_features])

    # Plain feed-forward head: dense layers, each followed by dropout
    dense1 = layers.Dense(64, activation='relu')(combined)
    dense1 = layers.Dropout(0.3)(dense1)
    dense2 = layers.Dense(32, activation='relu')(dense1)
    dense2 = layers.Dropout(0.2)(dense2)

    # Output layer: probability of the positive class
    output = layers.Dense(1, activation='sigmoid')(dense2)

    # Create model
    model = keras.Model(inputs=[text_input, cat_input], outputs=output)

    # Fixed initial learning rate; no schedule is attached to the optimizer.
    optimizer = keras.optimizers.Adam(learning_rate=0.001)

    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy', 'AUC']
    )

    return model

In [37]:
# Build the batched tf.data pipelines and instantiate the model
batch_size = 32


def _make_dataset(text_features, cat_features, labels, shuffle=False):
    """Wrap one split as a batched, prefetched ``tf.data.Dataset``.

    Each element is ``((text_features, cat_features), labels)``, matching the
    model's two inputs. When ``shuffle`` is true the samples are shuffled
    with a buffer covering the whole split (seeded) before batching.
    Batching uses the module-level ``batch_size``.
    """
    dataset = tf.data.Dataset.from_tensor_slices(((text_features, cat_features), labels))
    if shuffle:
        # Shuffle before batching so the composition of batches changes too.
        dataset = dataset.shuffle(buffer_size=len(labels), seed=42)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# Only the training data is shuffled; without it every epoch would present
# the exact same batches in the exact same order.
train_dataset = _make_dataset(X_train_text, X_train_cat, y_train, shuffle=True)
val_dataset = _make_dataset(X_val_text, X_val_cat, y_val)
test_dataset = _make_dataset(X_test_text, X_test_cat, y_test)

# Create model
model = get_enhanced_model()

In [38]:
# Training callbacks
callbacks = [
    # Write the model to disk whenever validation accuracy improves.
    keras.callbacks.ModelCheckpoint(
        filepath="enhanced_disaster_model.keras",
        monitor='val_accuracy',
        save_best_only=True,
    ),
    # Stop after 3 epochs without a validation-accuracy improvement and
    # roll the in-memory weights back to the best epoch.
    keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        restore_best_weights=True,
        patience=3,
    ),
    # Halve the learning rate after 2 epochs without a validation-loss improvement.
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        patience=2,
        factor=0.5,
    ),
]

In [39]:
# Fit for at most 20 epochs; the callbacks may end training earlier.
history = model.fit(
    x=train_dataset,
    epochs=20,
    validation_data=val_dataset,
    callbacks=callbacks,
)

Epoch 1/20
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - AUC: 0.5306 - accuracy: 0.5494 - loss: 0.6803 - val_AUC: 0.8292 - val_accuracy: 0.7255 - val_loss: 0.5458 - learning_rate: 0.0010
Epoch 2/20
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - AUC: 0.8673 - accuracy: 0.8074 - loss: 0.4321 - val_AUC: 0.8576 - val_accuracy: 0.7948 - val_loss: 0.4769 - learning_rate: 0.0010
Epoch 3/20
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - AUC: 0.9507 - accuracy: 0.8942 - loss: 0.2759 - val_AUC: 0.8491 - val_accuracy: 0.7797 - val_loss: 0.6317 - learning_rate: 0.0010
Epoch 4/20
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - AUC: 0.9788 - accuracy: 0.9445 - loss: 0.1696 - val_AUC: 0.8369 - val_accuracy: 0.7654 - val_loss: 0.7932 - learning_rate: 0.0010
Epoch 5/20
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - AUC: 0.9905 - accuracy: 0.9645 - lo

In [40]:
# Evaluate on the held-out test set using the checkpoint saved during training.
model = keras.models.load_model(filepath="enhanced_disaster_model.keras")
test_results = model.evaluate(x=test_dataset)

# test_results is indexed positionally: [0] loss, [1] accuracy, [2] AUC.
print(
    f"\nTest accuracy: {test_results[1]:.3f}",
    f"Test AUC: {test_results[2]:.3f}",
    sep="\n",
)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - AUC: 0.8837 - accuracy: 0.8241 - loss: 0.4141

Test accuracy: 0.816
Test AUC: 0.872
