In [8]:
import pandas as pd
from tqdm import tqdm
from utils import Load_Rumours_Dataset_filtering_since_first_post
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pylab as plt
from sklearn.metrics import *
from sklearn.ensemble import RandomForestClassifier
import uuid
from sklearn.utils.class_weight import compute_class_weight

In [9]:
# Pickle files handed to the dataset loader in the next cell
# (replies and posts of the Charlie Hebdo event, judging by the file names).
file_path_replies = "replies_charlie_hebdo.pkl"
file_path_posts = "posts_charlie_hebdo.pkl"

In [10]:
# Load both pickles and build the train/test frames for the widest window.
# time_cut is forwarded to the loader as-is; 3 * 24 * 60 would be three days
# if the unit is minutes -- TODO confirm the unit against the loader in utils.
processor = Load_Rumours_Dataset_filtering_since_first_post(
    file_path_replies,
    file_path_posts,
    time_cut=3 * 24 * 60,
)
processor.load_data()
processor.process_data()
train, test = processor.get_final_dataframes()


In [11]:
def _flatten_features(frame):
    """Turn one split into a 2-D feature array.

    Drops the 'rumour' label column, then places the remaining non-embedding
    columns side by side with 'embeddings_avg' expanded to one column per
    embedding component.
    """
    features = frame.drop(columns=['rumour'])
    scalar_part = features.drop(columns=['embeddings_avg']).values
    embedding_part = np.array(pd.DataFrame(features.embeddings_avg.tolist()))
    return np.hstack([scalar_part, embedding_part])


# Same transformation for both splits; the label is the 'rumour' column.
X_train = _flatten_features(train)
y_train = train['rumour']

X_test = _flatten_features(test)
y_test = test['rumour']

In [12]:
# 'balanced' weights derived from the training labels, keyed by class label
# so they can be passed to an estimator's class_weight argument.
classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train,
)
class_weight_dict = {label: weight for label, weight in zip(classes, class_weights)}


In [17]:
# Balanced class weights from the training labels (recomputed here so the
# cell does not depend on the previous one having been run).
classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train,
)
class_weight_dict = {label: weight for label, weight in zip(classes, class_weights)}

# Hyperparameters of the class-weighted Random Forest.
rf_params = dict(
    n_estimators=250,
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    class_weight=class_weight_dict,  # compensates for class imbalance
    n_jobs=-1,
    random_state=42,
)
model = RandomForestClassifier(**rf_params)

# Fit on the training split.
model.fit(X_train, y_train)

# Hard predictions on the test split, plus column 1 of predict_proba
# (the second entry of model.classes_) for both splits, e.g. for ROC AUC.
y_pred = model.predict(X_test)
y_proba_test = model.predict_proba(X_test)[:, 1]

y_proba_train = model.predict_proba(X_train)[:, 1]

In [18]:
# Scores for the second class (column 1 of predict_proba) on each split.
y_train_prob = model.predict_proba(X_train)[:, 1]
y_test_prob = model.predict_proba(X_test)[:, 1]

# Sweep 100 candidate cut-offs once and keep the one with the highest F1.
# NOTE(review): the cut-off is tuned on the training split itself, so the
# train metrics reported afterwards are optimistic; a held-out validation
# split would give a less biased threshold.
thresholds = np.linspace(0.01, 0.99, 100)
f1_scores = [f1_score(y_train, (y_train_prob > t).astype(int)) for t in thresholds]
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
# A bare expression in the middle of a cell is not displayed, so print it.
print(f"Best threshold (max train F1): {best_threshold:.4f}")

# Apply the selected cut-off to both splits.
y_train_pred = (y_train_prob > best_threshold).astype(int)
y_test_pred = (y_test_prob > best_threshold).astype(int)

# Evaluation function
def evaluate(y_true, y_pred, y_prob, label=""):
    """Print accuracy, precision, recall and ROC AUC for one data split.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard (thresholded) predictions; used for accuracy, precision, recall.
    y_prob : array-like
        Scores passed to roc_auc_score.
    label : str, optional
        Name of the split. When non-empty it is printed as a heading so that
        consecutive reports (e.g. Train / Test) can be told apart.

    Returns
    -------
    None
        The metrics are only printed.
    """
    if label:
        print(f"{label}:")
    print(f"  - Accuracy:  {accuracy_score(y_true, y_pred):.4f}")
    print(f"  - Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"  - Recall:    {recall_score(y_true, y_pred):.4f}")
    print(f"  - AUC:       {roc_auc_score(y_true, y_prob):.4f}")
    print("")

# Print the metric report for each split, train first and then test.
for split_name, split_true, split_pred, split_prob in (
    ("Train", y_train, y_train_pred, y_train_prob),
    ("Test", y_test, y_test_pred, y_test_prob),
):
    evaluate(split_true, split_pred, split_prob, label=split_name)

  - Accuracy:  0.8929
  - Precision: 0.7309
  - Recall:    0.8243
  - AUC:       0.9537

  - Accuracy:  0.8336
  - Precision: 0.6118
  - Recall:    0.6940
  - AUC:       0.8941



In [11]:
import mlflow
# Record runs in a local SQLite database (mlflow.db, resolved relative to the
# current working directory).
mlflow.set_tracking_uri("sqlite:///mlflow.db")
#mlflow.set_experiment("spyder-experiment")
# NOTE(review): mlflow.pytorch is not referenced by any cell visible here; the
# models in this notebook are scikit-learn Random Forests -- confirm it is needed.
import mlflow.pytorch
# Runs started after this call are grouped under this experiment.
mlflow.set_experiment("Random Forest  2025-06-07 Charlie Hebdo")

2025/06/07 12:36:41 INFO mlflow.tracking.fluent: Experiment with name 'Random Forest  2025-06-07 Charlie Hebdo' does not exist. Creating a new experiment.


<Experiment: artifact_location='/workspaces/rumour-detection-gnn/mlruns/68', creation_time=1749299801922, experiment_id='68', last_update_time=1749299801922, lifecycle_stage='active', name='Random Forest  2025-06-07 Charlie Hebdo', tags={}>

#### Testing a draft model across time cuts (one MLflow run per `time_cut`)

In [None]:
# Sweep over time_cut values: for each one rebuild the dataset, fit a small
# class-weighted Random Forest, pick the F1-optimal threshold on the training
# split and log train/test metrics to MLflow as one run.

# Hyperparameters are identical for every run; define them once so the same
# values both configure the model and get logged with each run.
sweep_rf_params = dict(
    n_estimators=50,
    max_depth=2,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    n_jobs=-1,
    random_state=42,
)

# Candidate decision thresholds (loop-invariant).
thresholds = np.linspace(0.01, 0.99, 100)

for time_cut in range(20, (60 * 24 * 3), 15):
    print(f"\n=== Time Cut: {time_cut} ===")

    processor = Load_Rumours_Dataset_filtering_since_first_post(file_path_replies, file_path_posts, time_cut=time_cut)
    processor.load_data()
    processor.process_data()
    train, test = processor.get_final_dataframes()

    # Prepare features and labels: non-embedding columns side by side with the
    # 'embeddings_avg' column expanded to one column per component.
    X_train = train.drop(columns=['rumour'])
    X_train = np.hstack([
        X_train.drop(columns=['embeddings_avg']).values,
        np.array(pd.DataFrame(X_train.embeddings_avg.tolist()))
    ])
    y_train = train['rumour']

    X_test = test.drop(columns=['rumour'])
    X_test = np.hstack([
        X_test.drop(columns=['embeddings_avg']).values,
        np.array(pd.DataFrame(X_test.embeddings_avg.tolist()))
    ])
    y_test = test['rumour']

    # Compute class weights to handle imbalance (depends on this split's labels).
    classes = np.unique(y_train)
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weight_dict = dict(zip(classes, class_weights))

    model = RandomForestClassifier(class_weight=class_weight_dict, **sweep_rf_params)

    with mlflow.start_run(run_name=f"time_cut_{time_cut}"):
        # Record the configuration so each run is self-describing.
        mlflow.log_params({"time_cut": time_cut, "class_weight": "balanced", **sweep_rf_params})

        model.fit(X_train, y_train)

        # Scores for the second class (column 1 of predict_proba).
        y_train_prob = model.predict_proba(X_train)[:, 1]
        y_test_prob = model.predict_proba(X_test)[:, 1]

        # Find best threshold to maximize F1 on train set.
        # NOTE(review): tuning on the training split makes the train metrics
        # optimistic; a validation split would be less biased.
        f1_scores = [f1_score(y_train, (y_train_prob > t).astype(int)) for t in thresholds]
        best_idx = np.argmax(f1_scores)
        best_threshold = thresholds[best_idx]

        # Apply best threshold
        y_train_pred = (y_train_prob > best_threshold).astype(int)
        y_test_pred = (y_test_prob > best_threshold).astype(int)

        # All metrics are computed first and sent in a single call, so a
        # failing metric cannot leave a run with a partial set of metrics.
        mlflow.log_metrics({
            # Train metrics
            "train_accuracy": accuracy_score(y_train, y_train_pred),
            "train_precision": precision_score(y_train, y_train_pred),
            "train_recall": recall_score(y_train, y_train_pred),
            "train_f1": f1_score(y_train, y_train_pred),
            "train_auc": roc_auc_score(y_train, y_train_prob),
            # Test metrics
            "final_acc": accuracy_score(y_test, y_test_pred),
            "final_precision": precision_score(y_test, y_test_pred),
            "final_recall": recall_score(y_test, y_test_pred),
            "final_f1": f1_score(y_test, y_test_pred),
            "final_auc": roc_auc_score(y_test, y_test_prob),
            # Best threshold and time_cut (time_cut is still logged as a
            # metric so existing charts that rely on it keep working).
            "optimal_threshold": best_threshold,
            "time_cut": time_cut,
        })
