In [1]:
import pandas as pd
from sklearn.metrics import *
from tqdm import tqdm
from utils import Load_Rumours_Dataset_filtering_since_first_post_Transfer_Learning
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import xgboost as xgb
from imblearn.over_sampling import SMOTE

In [2]:
train_dataset = 'charlie_hebdo'
test_dataset = 'sydneysiege'
time_cut =3*60*24
processor = Load_Rumours_Dataset_filtering_since_first_post_Transfer_Learning(train_dataset,\
           test_dataset, time_cut=time_cut,test_size=0.7)

processor.load_data()
processor.process_data()
train,test = processor.get_final_dataframes()

rumour
0    535
1    286
Name: count, dtype: int64


In [3]:
X_train  = train.drop(columns=['rumour'])
X_train = np.hstack([X_train.drop(columns=['embeddings_avg']).values, np.array(pd.DataFrame(X_train.embeddings_avg.tolist()))])


X_test  = test.drop(columns=['rumour'])
X_test = np.hstack([X_test.drop(columns=['embeddings_avg']).values, np.array(pd.DataFrame(X_test.embeddings_avg.tolist()))])

#X = np.hstack([X.drop(columns=['embeddings_avg']).values, np.array(pd.DataFrame(X.embeddings_avg.tolist()))])
y_train =train['rumour']
y_test =test['rumour']

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset



# Model definition
class RumorDetectionLSTM(nn.Module):
    def __init__(self, embedding_dim=100, lstm_hidden_size=32, dense_hidden_size=16):
        super(RumorDetectionLSTM, self).__init__()
        
        # LSTM for the 100-dimensional embeddings
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=lstm_hidden_size, batch_first=True)
        
        # Dense layers for other features
        self.dense1 = nn.Linear(8, 16)  # 8 non-embedding features
        self.dense2 = nn.Linear(16, dense_hidden_size)
        
        # Combine LSTM and dense features
        self.fc1 = nn.Linear(lstm_hidden_size + dense_hidden_size, 64)
        self.fc2 = nn.Linear(64, 1)
        
    def forward(self, x):
        # Separate embeddings and other features
        embeddings = x[:, -100:].unsqueeze(1)  # (batch, seq_len=1, embedding_dim)
        other_features = x[:, :8]  # First 8 features
        
        # LSTM output
        lstm_out, _ = self.lstm(embeddings)
        lstm_out = lstm_out[:, -1, :]  # Get the last LSTM output
        
        # Dense layers for other features
        dense_out = torch.relu(self.dense1(other_features))
        dense_out = torch.relu(self.dense2(dense_out))
        
        # Concatenate LSTM and dense outputs
        combined = torch.cat((lstm_out, dense_out), dim=1)
        
        # Fully connected layers for classification
        x = torch.relu(self.fc1(combined))
        x = torch.sigmoid(self.fc2(x))
        return x.squeeze()


In [7]:
# Assuming X_train, X_test, y_train, and y_test are available as numpy arrays
# Convert them to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Dataset and DataLoader
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [10]:
from sklearn.metrics import recall_score, precision_score

# Model, criterion, optimizer initialization (as before)
model = RumorDetectionLSTM()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop with loss and recall monitoring
epochs = 75  # Adjust as needed
train_recall_interval = 50  # Calculate train recall every 10 epochs
loss_interval = 50  # Print loss every 10 epochs

for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Print loss every 10 epochs
    if (epoch + 1) % loss_interval == 0:
        model.eval()
        train_preds = []
        train_labels = []
        with torch.no_grad():
            for X_batch, y_batch in train_loader:
                output = model(X_batch)
                preds = (output >= 0.5).int()  # Binarize predictions
                train_preds.extend(preds.tolist())
                train_labels.extend(y_batch.tolist())
        
        train_recall = recall_score(train_labels, train_preds)
        train_precision = precision_score(train_labels, train_preds)
        
print(f"Epoch {epoch + 1}, Train Loss: {epoch_loss / len(train_loader):.4f},\
              Train Precision: {train_precision:.4f},Train Recall: {train_recall:.4f}")
    


# Final evaluation on test set with recall and precision
model.eval()
test_preds = []
test_labels = []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        output = model(X_batch)
        preds = (output >= 0.5).int()  # Binarize predictions
        test_preds.extend(preds.tolist())
        test_labels.extend(y_batch.tolist())

# Calculate final test recall and precision
test_recall = recall_score(test_labels, test_preds)
test_precision = precision_score(test_labels, test_preds)

print(f"Final Test Recall: {test_recall:.4f}")
print(f"Final Test Precision: {test_precision:.4f}")


Epoch 75, Train Loss: 19.0257,              Train Precision: 1.0000,Train Recall: 0.3227
Final Test Recall: 0.4650
Final Test Precision: 0.6856


In [4]:
import mlflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")
#mlflow.set_experiment("spyder-experiment")
import mlflow.pytorch
mlflow.set_experiment("LSTM  2025-07-15 Ottawa Shooting")

<Experiment: artifact_location='/workspaces/rumour-detection-gnn/mlruns/103', creation_time=1752687500205, experiment_id='103', last_update_time=1752687500205, lifecycle_stage='active', name='LSTM  2025-07-15 Ottawa Shooting', tags={}>

#### Testing Draf Model

In [None]:
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support
import numpy as np

def find_best_f1_threshold(y_true, y_probs):
    thresholds = np.linspace(0.05, 1, 20)
    best_thresh = 0.5
    best_f1 = 0
    for thresh in thresholds:
        preds = (y_probs >= thresh).astype(int)
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, preds, average="binary")
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = thresh
    return best_thresh

def evaluate(model, loader):
    model.eval()
    all_logits = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            output = model(X_batch).squeeze()
            all_logits.extend(output.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())
    probs = torch.sigmoid(torch.tensor(all_logits)).numpy()
    labels = np.array(all_labels)
    auc = roc_auc_score(labels, probs)
    return labels, probs, auc

for time_cut in range(10, 24*3*60, 15):
    print(time_cut)
    
    train_dataset = 'charlie_hebdo'
    test_dataset = 'ottawa_shooting'
    processor = Load_Rumours_Dataset_filtering_since_first_post_Transfer_Learning(train_dataset, test_dataset, time_cut=time_cut, test_size=0.7)
    
    processor.load_data()
    processor.process_data()
    train, test = processor.get_final_dataframes()

    X_train = train.drop(columns=['rumour'])
    X_train = np.hstack([X_train.drop(columns=['embeddings_avg']).values, np.array(pd.DataFrame(X_train.embeddings_avg.tolist()))])

    X_test = test.drop(columns=['rumour'])
    X_test = np.hstack([X_test.drop(columns=['embeddings_avg']).values, np.array(pd.DataFrame(X_test.embeddings_avg.tolist()))])

    y_train = train['rumour']
    y_test = test['rumour']

    #smote = SMOTE(random_state=42, sampling_strategy='minority')
    #X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.float32)

    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    # Handle class imbalance
    num_pos = sum(y_train)
    num_neg = len(y_train) - num_pos
    pos_weight = torch.tensor([num_neg / num_pos], dtype=torch.float32)

    with mlflow.start_run():

        model = RumorDetectionLSTM()
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        
        epochs = 200
        
        for epoch in range(epochs):
            model.train()
            for X_batch, y_batch in train_loader:
                optimizer.zero_grad()
                output = model(X_batch).squeeze()
                loss = criterion(output, y_batch)
                loss.backward()
                optimizer.step()

            # Evaluate at every epoch
            y_train_true, y_train_probs, auc_train = evaluate(model, train_loader)
            y_test_true, y_test_probs, auc_test = evaluate(model, test_loader)

            best_thresh = find_best_f1_threshold(y_train_true, y_train_probs)

        def compute_metrics(y_true, y_probs, threshold):
            preds = (y_probs >= threshold).astype(int)
            precision, recall, f1, _ = precision_recall_fscore_support(y_true, preds, average='binary')
            return precision, recall, f1

        best_thresh = find_best_f1_threshold(y_train_true, y_train_probs)

        prec_train, rec_train, f1_train = compute_metrics(y_train_true, y_train_probs, best_thresh)
        prec_test, rec_test, f1_test = compute_metrics(y_test_true, y_test_probs, best_thresh)

        print(f"Epoch {epoch+1}")
        print(f"  Train AUC: {auc_train:.4f} | Precision: {prec_train:.4f} | Recall: {rec_train:.4f} | F1: {f1_train:.4f}")
        print(f"  Test  AUC: {auc_test:.4f} | Precision: {prec_test:.4f} | Recall: {rec_test:.4f} | F1: {f1_test:.4f}")
        print(f"  Threshold: {best_thresh:.2f}\n")

        # Log final test results
        mlflow.log_metric("final_test_precision", prec_test)
        mlflow.log_metric("final_test_recall", rec_test)
        mlflow.log_metric("final_test_f1", f1_test)
        mlflow.log_metric("final_test_auc", auc_test)
        mlflow.log_param("best_threshold", best_thresh)
        mlflow.log_param("learning_rate", 0.001)
        mlflow.log_param("epochs", epochs)
        mlflow.log_param("time_cut", time_cut)
