In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import warnings
import random
warnings.filterwarnings("ignore")

# Optional: For visualization
import matplotlib.pyplot as plt
import seaborn as sns  # Added for advanced plotting

warnings.filterwarnings("ignore")

# Seed both random number generators used by this notebook (NumPy's global RNG
# and the standard-library `random` module) from one named constant, so the
# seed can be changed in a single place and a fresh run is reproducible.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

## Generation of Synthetic Data

In [2]:
def generate_user_profiles(num_users):
    """Build a DataFrame of `num_users` randomly generated user profiles.

    Every column is drawn from NumPy's global RNG, in the column order below,
    so seeding `np.random` beforehand makes the result reproducible.

    Columns:
        user_id              0 .. num_users - 1
        CRT_score, CMQ_score integers drawn from 0..7
        openness, conscientiousness, extraversion, agreeableness,
        neuroticism          floats in [0, 1)
        political_ideology   'liberal' / 'moderate' / 'conservative'
        age                  integers drawn from 18..64
        gender               'male' / 'female' / 'other'
        education            'high_school' / 'bachelor' / 'master' / 'phd'
        past_interventions   all zeros (nothing applied yet)
        susceptibility_score floats in [0, 1), the starting susceptibility
    """
    n = num_users
    columns = {'user_id': range(n)}

    # Integer questionnaire scores.
    for score_name in ('CRT_score', 'CMQ_score'):
        columns[score_name] = np.random.randint(0, 8, n)

    # Five personality traits, each uniform on [0, 1).
    for trait in ('openness', 'conscientiousness', 'extraversion',
                  'agreeableness', 'neuroticism'):
        columns[trait] = np.random.rand(n)

    # Demographics.
    columns['political_ideology'] = np.random.choice(['liberal', 'moderate', 'conservative'], n)
    columns['age'] = np.random.randint(18, 65, n)
    columns['gender'] = np.random.choice(['male', 'female', 'other'], n)
    columns['education'] = np.random.choice(['high_school', 'bachelor', 'master', 'phd'], n)

    # Simulation state: no interventions received yet, random starting susceptibility.
    columns['past_interventions'] = np.zeros(n)
    columns['susceptibility_score'] = np.random.rand(n)

    return pd.DataFrame(columns)

In [3]:
def generate_content_items(num_items):
    """Build a DataFrame of `num_items` randomly generated content items.

    Every column is drawn from NumPy's global RNG, in the column order below,
    so seeding `np.random` beforehand makes the result reproducible.

    Columns:
        content_id          0 .. num_items - 1
        is_misinformation   1 with probability 0.3, otherwise 0
        topic               'health' / 'politics' / 'technology' / 'sports'
        sentiment           'positive' / 'neutral' / 'negative'
        complexity, readability_score, emotional_impact,
        credibility_indicator   floats in [0, 1)
    """
    n = num_items
    columns = {
        'content_id': range(n),
        'is_misinformation': np.random.choice([0, 1], n, p=[0.7, 0.3]),
    }
    columns['topic'] = np.random.choice(['health', 'politics', 'technology', 'sports'], n)
    columns['sentiment'] = np.random.choice(['positive', 'neutral', 'negative'], n)

    # Continuous content attributes, each uniform on [0, 1).
    for attribute in ('complexity', 'readability_score',
                      'emotional_impact', 'credibility_indicator'):
        columns[attribute] = np.random.rand(n)

    return pd.DataFrame(columns)

## Interaction Simulation

In [4]:

def simulate_interactions(user_profiles, content_items, time_steps):
    """Simulates user-content interactions over multiple time steps.

    At every time step each user is paired with each content item (a full
    cross join), and a binary `susceptible` label is sampled per pair.

    Parameters
    ----------
    user_profiles : pd.DataFrame
        Must contain `susceptibility_score` and `past_interventions`.
        NOTE: `susceptibility_score` is rewritten IN PLACE on this frame at
        every time step (decayed by 10% per past intervention, then clipped
        to [0, 1]), so the caller's frame is modified.
    content_items : pd.DataFrame
        Must contain `is_misinformation`.
    time_steps : int
        Number of time steps to simulate.

    Returns
    -------
    pd.DataFrame
        One row per (time step, user, content item) holding all user and
        content columns plus `susceptible` (0/1) and `time_step`.

    Raises
    ------
    ValueError
        Raised by `pd.concat` when `time_steps` is 0 (nothing to concatenate).
    """
    interactions_list = []
    for t in range(time_steps):
        # Update user susceptibility based on past interventions and keep it
        # between 0 and 1.
        decayed = user_profiles['susceptibility_score'] * (1 - 0.1 * user_profiles['past_interventions'])
        user_profiles['susceptibility_score'] = decayed.clip(0, 1)

        # Cross join: a constant helper key pairs every user with every item.
        interactions = pd.merge(
            user_profiles.assign(key=1), content_items.assign(key=1), on='key'
        ).drop('key', axis=1)

        # Synthetic target: 1 if the item is misinformation and the user's
        # susceptibility exceeds a uniform random draw, 0 otherwise.
        # Vectorized replacement for a per-row Python lambda. Random numbers
        # are drawn only for misinformation rows, in row order, which consumes
        # the global NumPy RNG exactly as the row-by-row version did, so
        # seeded results are unchanged.
        is_misinfo = (interactions['is_misinformation'] == 1).to_numpy()
        scores = interactions['susceptibility_score'].to_numpy()
        draws = np.random.rand(int(is_misinfo.sum()))
        susceptible = np.zeros(len(interactions), dtype='int64')
        susceptible[is_misinfo] = scores[is_misinfo] > draws
        interactions['susceptible'] = susceptible

        # Add time step
        interactions['time_step'] = t
        interactions_list.append(interactions)

    # Concatenate all interactions
    all_interactions = pd.concat(interactions_list, ignore_index=True)
    return all_interactions

## Feature Encoding

In [5]:
def encode_features(df):
    """One-hot encode the categorical columns of an interactions frame.

    A new OneHotEncoder is fitted on `df` on every call, so the generated
    indicator columns reflect only the categories present in `df`.

    Returns a new DataFrame with a fresh 0..n-1 index: the non-categorical
    columns of `df` first, followed by the one-hot indicator columns.
    """
    categorical_cols = ['political_ideology', 'gender', 'education', 'topic', 'sentiment']

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    indicator_values = encoder.fit_transform(df[categorical_cols])
    indicator_names = encoder.get_feature_names_out(categorical_cols)

    # Both halves get a 0..n-1 index so the column-wise concat aligns by position.
    remaining_part = df.drop(categorical_cols, axis=1).reset_index(drop=True)
    indicator_part = pd.DataFrame(indicator_values, columns=indicator_names).reset_index(drop=True)
    return pd.concat([remaining_part, indicator_part], axis=1)

## Model Training

In [6]:
def train_model(interactions):
    """Fit an XGBoost classifier that predicts the `susceptible` label.

    The interactions are one-hot encoded, identifier columns and the
    label-defining columns are removed from the features, and the data is
    split 80/20 (fixed random_state) into train and test sets. Accuracy on
    the held-out 20% is printed.

    Returns the fitted XGBClassifier.
    """
    # Columns that are identifiers, the target, or otherwise excluded from the features.
    excluded_cols = ['user_id', 'content_id', 'susceptible', 'is_misinformation', 'time_step']

    encoded = encode_features(interactions)
    y = encoded['susceptible']
    X = encoded.drop(excluded_cols, axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    model.fit(X_train, y_train)

    # Score on the held-out split.
    accuracy = accuracy_score(y_test, model.predict(X_test))
    print(f"Model Accuracy: {accuracy:.2f}")
    return model

## Interventions

In [7]:
# Baseline interventions, listed in the same order as their selection
# probabilities below (the two lists are matched by position).
baseline_interventions = ['Standard Warning', 'Informational Message', 'Neutral Feedback', 'Engagement Prompt']

# Probability of picking each baseline intervention (they should sum to 1).
baseline_intervention_probs = [0.4, 0.3, 0.2, 0.1]

In [8]:
def select_intervention_personalized(row):
    """Pick an intervention from the user and content features in `row`.

    Rules, checked in order (first match wins):
      1. political_ideology == 'conservative'      -> 'Nudge Warning'
      2. CMQ_score > 4 and topic == 'politics'     -> 'Prebunking (Context)'
      3. CRT_score < 4 and complexity > 0.5        -> 'Boosting (Educational Video)'
      4. otherwise                                 -> 'Standard Warning'
    """
    # The ideology rule takes precedence over every other rule.
    if row['political_ideology'] == 'conservative':
        return 'Nudge Warning'

    high_cmq_on_politics = row['CMQ_score'] > 4 and row['topic'] == 'politics'
    if high_cmq_on_politics:
        return 'Prebunking (Context)'

    low_crt_on_complex_content = row['CRT_score'] < 4 and row['complexity'] > 0.5
    return 'Boosting (Educational Video)' if low_crt_on_complex_content else 'Standard Warning'

def select_intervention_baseline(row):
    """Draw one baseline intervention at random, ignoring the contents of `row`.

    The draw uses the stdlib `random` module, weighted by
    `baseline_intervention_probs`.
    """
    (picked,) = random.choices(baseline_interventions, weights=baseline_intervention_probs, k=1)
    return picked

## Intervention Effectiveness

In [9]:
# Base success probability of each baseline intervention, before the
# content-based adjustment applied when an intervention is evaluated.
baseline_effectiveness = dict([
    ('Standard Warning', 0.5),         # 50%
    ('Informational Message', 0.4),    # 40%
    ('Neutral Feedback', 0.3),         # 30%
    ('Engagement Prompt', 0.6),        # 60%
])

def intervention_effectiveness_personalized(row):
    """Sample whether the personalized intervention in `row` worked (1) or not (0).

    Rows that are not susceptible always return 0 without consuming a random
    number. For susceptible rows the success probability is a base value
    chosen by intervention type, scaled by a content factor that shrinks as
    `complexity` and `emotional_impact` grow. An unrecognised intervention
    has a base value of 0.0 and therefore never succeeds.
    """
    if row['susceptible'] != 1:
        return 0  # Not susceptible, so intervention not needed

    chosen = row['intervention']
    if chosen == 'Prebunking (Context)':
        base = 0.8 if row['CMQ_score'] > 4 else 0.5
    elif chosen == 'Boosting (Educational Video)':
        base = 0.7 if row['CRT_score'] < 4 else 0.5
    elif chosen == 'Standard Warning':
        base = 0.5
    elif chosen == 'Nudge Warning':
        base = 0.6
    else:
        base = 0.0

    # Equal-weight average of "how simple" and "how unemotional" the content is.
    simplicity_term = (1 - row['complexity']) * 0.5
    calmness_term = (1 - row['emotional_impact']) * 0.5
    success_probability = base * (simplicity_term + calmness_term)

    return int(random.random() < success_probability)

In [10]:
def intervention_effectiveness_baseline(row):
    """Sample whether the baseline intervention in `row` worked (1) or not (0).

    Rows that are not susceptible always return 0 without consuming a random
    number. For susceptible rows the success probability is the intervention's
    entry in `baseline_effectiveness` (0.5 if the intervention is not listed),
    scaled by a content factor that shrinks as `complexity` and
    `emotional_impact` grow.
    """
    if row['susceptible'] != 1:
        return 0  # Not susceptible, so intervention not needed

    base = baseline_effectiveness.get(row['intervention'], 0.5)

    # Equal-weight average of "how simple" and "how unemotional" the content is.
    simplicity_term = (1 - row['complexity']) * 0.5
    calmness_term = (1 - row['emotional_impact']) * 0.5
    success_probability = base * (simplicity_term + calmness_term)

    return int(random.random() < success_probability)

## Simulation over Time Period

### Personalized Time Period

In [11]:
def simulate_interventions_over_time_personalized(interactions, model, user_profiles):
    """Apply personalized interventions to one batch of interactions.

    Both input frames are modified in place:
      * `interactions` gains the columns `predicted_susceptibility`,
        `intervention` and `effective`;
      * `user_profiles['past_interventions']` is incremented by 1 for every
        user with at least one effective intervention in this batch.

    A one-line summary is printed; the time step shown is taken from the
    first row of `interactions`.

    Returns the tuple
    (interactions, user_profiles, total_effective, total_susceptible).
    """
    excluded_cols = ['user_id', 'content_id', 'susceptible', 'is_misinformation', 'time_step']
    features = encode_features(interactions).drop(excluded_cols, axis=1)
    interactions['predicted_susceptibility'] = model.predict_proba(features)[:, 1]

    # Choose an intervention per row (personalized rules), then sample its outcome.
    interactions['intervention'] = interactions.apply(select_intervention_personalized, axis=1)
    interactions['effective'] = interactions.apply(intervention_effectiveness_personalized, axis=1)

    # Credit each user who had at least one effective intervention.
    helped_user_ids = interactions.loc[interactions['effective'] == 1, 'user_id'].unique()
    helped_rows = user_profiles['user_id'].isin(helped_user_ids)
    user_profiles.loc[helped_rows, 'past_interventions'] += 1

    # Share of susceptible interactions that were successfully countered.
    total_susceptible = interactions['susceptible'].sum()
    total_effective = interactions['effective'].sum()
    if total_susceptible > 0:
        effectiveness_percentage = (total_effective / total_susceptible * 100)
    else:
        effectiveness_percentage = 0
    print(f"Personalized Intervention Effectiveness at time step {interactions['time_step'].iloc[0]+1}: {total_effective}/{total_susceptible} ({effectiveness_percentage:.2f}%)")

    return interactions, user_profiles, total_effective, total_susceptible



## Baseline Time Period

In [12]:
def simulate_interventions_over_time_baseline(interactions, model, user_profiles):
    """Apply randomly chosen baseline interventions to one batch of interactions.

    Both input frames are modified in place:
      * `interactions` gains the columns `predicted_susceptibility`,
        `intervention` and `effective`;
      * `user_profiles['past_interventions']` is incremented by 1 for every
        user with at least one effective intervention in this batch.

    A one-line summary is printed; the time step shown is taken from the
    first row of `interactions`.

    Returns the tuple
    (interactions, user_profiles, total_effective, total_susceptible).
    """
    excluded_cols = ['user_id', 'content_id', 'susceptible', 'is_misinformation', 'time_step']
    features = encode_features(interactions).drop(excluded_cols, axis=1)
    interactions['predicted_susceptibility'] = model.predict_proba(features)[:, 1]

    # Choose an intervention per row (random baseline), then sample its outcome.
    interactions['intervention'] = interactions.apply(select_intervention_baseline, axis=1)
    interactions['effective'] = interactions.apply(intervention_effectiveness_baseline, axis=1)

    # Credit each user who had at least one effective intervention.
    helped_user_ids = interactions.loc[interactions['effective'] == 1, 'user_id'].unique()
    helped_rows = user_profiles['user_id'].isin(helped_user_ids)
    user_profiles.loc[helped_rows, 'past_interventions'] += 1

    # Share of susceptible interactions that were successfully countered.
    total_susceptible = interactions['susceptible'].sum()
    total_effective = interactions['effective'].sum()
    if total_susceptible > 0:
        effectiveness_percentage = (total_effective / total_susceptible * 100)
    else:
        effectiveness_percentage = 0
    print(f"Baseline Intervention Effectiveness at time step {interactions['time_step'].iloc[0]+1}: {total_effective}/{total_susceptible} ({effectiveness_percentage:.2f}%)")

    return interactions, user_profiles, total_effective, total_susceptible

## Simulation Parameters (Users, Content Items, Time Steps)

In [13]:
# Simulation size: number of users, number of content items, and number of
# time steps to simulate (in that order).
num_users, num_content, time_steps = 100, 200, 100

## Generating Users for Systems

In [14]:
# Create one user population per system. Both calls draw from NumPy's global
# RNG one after the other, so the two frames contain different random users.
print("Generating synthetic user profiles for Personalized and Baseline systems...")
user_profiles_personalized = generate_user_profiles(num_users)
user_profiles_baseline = generate_user_profiles(num_users)  # Separate profiles for baseline

Generating synthetic user profiles for Personalized and Baseline systems...


In [15]:
# Show the baseline-system user profiles as this cell's output.
user_profiles_baseline

Unnamed: 0,user_id,CRT_score,CMQ_score,openness,conscientiousness,extraversion,agreeableness,neuroticism,political_ideology,age,gender,education,past_interventions,susceptibility_score
0,0,1,1,0.625891,0.565732,0.844783,0.727169,0.443750,moderate,25,other,high_school,0.0,0.988275
1,1,3,3,0.885978,0.267028,0.761024,0.742707,0.868142,moderate,20,other,bachelor,0.0,0.443273
2,2,6,2,0.615863,0.878630,0.626220,0.425493,0.177150,conservative,61,female,high_school,0.0,0.047831
3,3,4,4,0.232959,0.797426,0.131245,0.345935,0.692626,moderate,20,male,bachelor,0.0,0.038022
4,4,1,3,0.024401,0.658452,0.032526,0.371039,0.838115,liberal,39,other,master,0.0,0.746207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,3,3,0.480587,0.968193,0.072198,0.201853,0.181438,moderate,41,other,phd,0.0,0.811752
96,96,3,4,0.592408,0.185526,0.030653,0.673432,0.525163,liberal,59,female,high_school,0.0,0.741712
97,97,2,5,0.824681,0.868623,0.257683,0.969912,0.709046,conservative,47,female,master,0.0,0.881451
98,98,5,0,0.347809,0.776597,0.462623,0.093901,0.106877,liberal,31,other,high_school,0.0,0.277902


In [16]:
# Show the personalized-system user profiles as this cell's output.
user_profiles_personalized

Unnamed: 0,user_id,CRT_score,CMQ_score,openness,conscientiousness,extraversion,agreeableness,neuroticism,political_ideology,age,gender,education,past_interventions,susceptibility_score
0,0,6,6,0.031429,0.642032,0.051682,0.103124,0.698162,liberal,54,other,high_school,0.0,0.961070
1,1,3,1,0.636410,0.084140,0.531355,0.902553,0.536096,moderate,50,male,high_school,0.0,0.148663
2,2,4,1,0.314356,0.161629,0.540635,0.505252,0.309528,moderate,39,male,master,0.0,0.414624
3,3,6,3,0.508571,0.898554,0.637430,0.826457,0.813795,liberal,38,male,phd,0.0,0.085350
4,4,2,1,0.907566,0.606429,0.726091,0.320050,0.684731,moderate,23,other,phd,0.0,0.996874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,5,1,0.349210,0.522243,0.930757,0.353352,0.473962,moderate,28,other,bachelor,0.0,0.117751
96,96,1,1,0.725956,0.769994,0.858413,0.583656,0.667558,conservative,58,female,high_school,0.0,0.576516
97,97,3,3,0.897110,0.215821,0.428994,0.077735,0.172320,conservative,54,male,high_school,0.0,0.274055
98,98,5,4,0.887086,0.622890,0.750871,0.974395,0.192289,liberal,31,female,master,0.0,0.554178


## Generating Content for Systems

In [17]:
# Create the pool of content items used by the interaction simulation below.
print("Generating synthetic content items...")
content_items = generate_content_items(num_content)

Generating synthetic content items...


In [18]:
# Show the generated content items as this cell's output.
content_items

Unnamed: 0,content_id,is_misinformation,topic,sentiment,complexity,readability_score,emotional_impact,credibility_indicator
0,0,1,technology,neutral,0.721393,0.528578,0.710135,0.405751
1,1,0,sports,negative,0.862881,0.793394,0.937378,0.220488
2,2,0,sports,negative,0.002104,0.990546,0.603292,0.762483
3,3,1,politics,negative,0.717572,0.200651,0.473401,0.542471
4,4,1,sports,negative,0.829588,0.994649,0.059716,0.149601
...,...,...,...,...,...,...,...,...
195,195,0,health,positive,0.632544,0.606147,0.252814,0.911168
196,196,0,technology,positive,0.644470,0.291144,0.271439,0.354861
197,197,0,politics,neutral,0.819798,0.207095,0.125356,0.424772
198,198,0,technology,negative,0.333407,0.152328,0.336965,0.215344


## Simulating Interactions for the Personalized System

In [19]:
# Pair every personalized-system user with every content item at each time
# step (num_users * num_content * time_steps rows in total).
# Note: simulate_interactions also rewrites `susceptibility_score` on
# user_profiles_personalized in place.
interactions_personalized = simulate_interactions(user_profiles_personalized, content_items, time_steps)

In [20]:
# Show the simulated interactions (all time steps) as this cell's output.
interactions_personalized

Unnamed: 0,user_id,CRT_score,CMQ_score,openness,conscientiousness,extraversion,agreeableness,neuroticism,political_ideology,age,...,content_id,is_misinformation,topic,sentiment,complexity,readability_score,emotional_impact,credibility_indicator,susceptible,time_step
0,0,6,6,0.031429,0.642032,0.051682,0.103124,0.698162,liberal,54,...,0,1,technology,neutral,0.721393,0.528578,0.710135,0.405751,1,0
1,0,6,6,0.031429,0.642032,0.051682,0.103124,0.698162,liberal,54,...,1,0,sports,negative,0.862881,0.793394,0.937378,0.220488,0,0
2,0,6,6,0.031429,0.642032,0.051682,0.103124,0.698162,liberal,54,...,2,0,sports,negative,0.002104,0.990546,0.603292,0.762483,0,0
3,0,6,6,0.031429,0.642032,0.051682,0.103124,0.698162,liberal,54,...,3,1,politics,negative,0.717572,0.200651,0.473401,0.542471,1,0
4,0,6,6,0.031429,0.642032,0.051682,0.103124,0.698162,liberal,54,...,4,1,sports,negative,0.829588,0.994649,0.059716,0.149601,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999995,99,4,2,0.779876,0.085347,0.754543,0.986211,0.040869,moderate,47,...,195,0,health,positive,0.632544,0.606147,0.252814,0.911168,0,99
1999996,99,4,2,0.779876,0.085347,0.754543,0.986211,0.040869,moderate,47,...,196,0,technology,positive,0.644470,0.291144,0.271439,0.354861,0,99
1999997,99,4,2,0.779876,0.085347,0.754543,0.986211,0.040869,moderate,47,...,197,0,politics,neutral,0.819798,0.207095,0.125356,0.424772,0,99
1999998,99,4,2,0.779876,0.085347,0.754543,0.986211,0.040869,moderate,47,...,198,0,technology,negative,0.333407,0.152328,0.336965,0.215344,0,99


## Model Training

In [21]:
# Fit the susceptibility classifier on the simulated interactions; train_model
# prints accuracy on its held-out 20% split and returns the fitted model.
model_personalized = train_model(interactions_personalized)

Model Accuracy: 0.92


## Simulating over Time Period