## Import and Setup


In [13]:
import sys
import os
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, QuantileTransformer, StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# To allow for importing of 'utils' module from parent directory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

from utils import import_data

dataset_path = "../data"
dataset_version = "automated-v1.0"

# Import the dataset (returns a dictionary)
automated_dataset = import_data(dataset_path, dataset_version)

# Extract the dataframe from the dictionary
df = automated_dataset['dataframe']

print(f"Dataset shape: {df.shape}")
print(f"Dataset columns: {df.columns.tolist()}")
df.head()

def extract_timestamp_features(timestamps):
    # Sort timestamps
    sorted_timestamps = sorted(timestamps)
    
    # Calculate intervals between posts
    intervals = [sorted_timestamps[i+1] - sorted_timestamps[i] 
                for i in range(len(sorted_timestamps)-1)]
    
    # Key features
    features = {
        "interval_std": np.std(intervals),
    }
    
    return features

# Split into features and target
X = df.drop('automated_behaviour', axis=1)  
y = df['automated_behaviour']  

# Check for columns containing lists
list_columns = []
for col in X.columns:
    if isinstance(X[col].iloc[0], list):
        list_columns.append(col)

print(f"List columns that need preprocessing: {list_columns}")

# Process list columns to extract numeric features
X_processed = X.copy()

# Define binary and numeric columns
binary_columns = ['media_comments_are_disabled', 'media_has_location_info']
numeric_columns = ['media_comment_numbers', 'media_hashtag_numbers', 'media_like_numbers']

for i, col in enumerate(X_processed.columns, 1):
    print(f"{i}. {col}")
print(f"\nTotal number of features: {len(X_processed.columns)}")

for col in list_columns:
    if col == 'mediaUpload_times':  # Special handling for timestamps
        # Extract timestamp features
        X_processed[f'{col}_interval_std'] = X_processed[col].apply(
            lambda x: extract_timestamp_features(x)['interval_std'] if len(x) > 1 else 0
        )
        X_processed = X_processed.drop(col, axis=1)
    elif col in binary_columns:
        # For binary features, sum up the 1s
        X_processed[col] = X_processed[col].apply(np.sum)
    elif col in numeric_columns:
        # For numeric features, take the average
        X_processed[col] = X_processed[col].apply(lambda x: np.mean(x) if len(x) > 0 else 0)

# Splitting the dataset (20% Testing, 20% Validating, 60% Training)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_processed, y,   # Changed X to X_processed here
    test_size=0.2, 
    random_state=42, 
    shuffle=True, 
    stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, 
    test_size=0.25, 
    random_state=42, 
    shuffle=True, 
    stratify=y_train_val
)

print(np.unique(y_train, return_counts=True))

# Apply StandardScaler to the numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)    # Added validation set scaling
X_test_scaled = scaler.transform(X_test)

# Print shapes to verify
print(f"Processed features shape: {X_processed.shape}")
print(f"Training features shape: {X_train.shape}")
print(f"Validation features shape: {X_val.shape}")    # Added validation shape
print(f"Testing features shape: {X_test.shape}")
print(f"Scaled training data shape: {X_train_scaled.shape}")

columns_to_drop = ['media_comments_are_disabled', 'username_digit_count']
X_processed = X_processed.drop(columns=columns_to_drop)
print("\nColumns in X_processed:")
for i, col in enumerate(X_processed.columns, 1):
    print(f"{i}. {col}")
print(f"\nTotal number of features: {len(X_processed.columns)}")



Dataset shape: (1400, 17)
Dataset columns: ['user_media_count', 'user_follower_count', 'user_following_count', 'user_has_highligh_reels', 'user_has_external_url', 'user_tags_count', 'follower_following_ratio', 'user_biography_length', 'username_length', 'username_digit_count', 'media_comment_numbers', 'media_comments_are_disabled', 'media_has_location_info', 'media_hashtag_numbers', 'media_like_numbers', 'mediaUpload_times', 'automated_behaviour']
List columns that need preprocessing: ['media_comment_numbers', 'media_comments_are_disabled', 'media_has_location_info', 'media_hashtag_numbers', 'media_like_numbers', 'mediaUpload_times']
1. user_media_count
2. user_follower_count
3. user_following_count
4. user_has_highligh_reels
5. user_has_external_url
6. user_tags_count
7. follower_following_ratio
8. user_biography_length
9. username_length
10. username_digit_count
11. media_comment_numbers
12. media_comments_are_disabled
13. media_has_location_info
14. media_hashtag_numbers
15. media_l

In [14]:


c_values = [0.001, 0.01, 0.1, 1, 10, 100]  # We'll use these as alpha=1/C for regularization

# Prepare transformed datasets (reuse your previous code)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)
X_test_poly = poly.transform(X_test_scaled)

X_train_log = np.log1p(X_train_scaled - X_train_scaled.min(axis=0) + 1)
X_val_log = np.log1p(X_val_scaled - X_train_scaled.min(axis=0) + 1)
X_test_log = np.log1p(X_test_scaled - X_train_scaled.min(axis=0) + 1)

quantile = QuantileTransformer(output_distribution='normal', random_state=42)
X_train_quant = quantile.fit_transform(X_train_scaled)
X_val_quant = quantile.transform(X_val_scaled)
X_test_quant = quantile.transform(X_test_scaled)

transformations = {
    'Polynomial': (X_train_poly, X_val_poly, X_test_poly),
    'Logarithmic': (X_train_log, X_val_log, X_test_log),
    'Quantile (Normal)': (X_train_quant, X_val_quant, X_test_quant)
}



In [12]:
all_results = []
all_val_preds = {}
all_test_preds = {}

for trans_name, (Xtr, Xv, Xte) in transformations.items():
    best_val_acc = -np.inf
    best_c = None
    best_val_prec = None
    best_val_rec = None
    best_test_acc = None
    for c in c_values:
        alpha = 1.0 / c  # Regularization parameter for MLPClassifier
        model = MLPClassifier(
            hidden_layer_sizes=(64, 32),  # You can adjust the architecture
            activation='relu',
            solver='adam',
            alpha=alpha,
            max_iter=300,
            random_state=42
        )
        model.fit(Xtr, y_train)
        # Training metrics
        y_train_pred = model.predict(Xtr)
        train_acc = accuracy_score(y_train, y_train_pred)
        # Validation metrics
        y_val_pred = model.predict(Xv)
        val_acc = accuracy_score(y_val, y_val_pred)
        # Test metrics
        y_test_pred = model.predict(Xte)
        test_acc = accuracy_score(y_test, y_test_pred)
        all_results.append({
            'Transformation': trans_name,
            'C': c,
            'Train Accuracy': train_acc,
            'Val Accuracy': val_acc,
            'Test Accuracy': test_acc
        })
        all_val_preds[(trans_name, c)] = y_val_pred
        all_test_preds[(trans_name, c)] = y_test_pred
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_c = c
            best_val_prec = precision_score(y_val, y_val_pred, zero_division=0)
            best_val_rec = recall_score(y_val, y_val_pred, zero_division=0)
            best_test_acc = test_acc
    print(f"\nBest for {trans_name}:")
    print(f"  C={best_c}, Val Accuracy={best_val_acc:.4f}, Test Accuracy={best_test_acc:.4f}")
    print(f"  Final Validation Precision: {best_val_prec:.4f}")
    print(f"  Final Validation Recall:    {best_val_rec:.4f}")




Best for Polynomial:
  C=1, Val Accuracy=0.9214, Test Accuracy=0.9393
  Final Validation Precision: 0.9609
  Final Validation Recall:    0.8786

Best for Logarithmic:
  C=10, Val Accuracy=0.9250, Test Accuracy=0.9357
  Final Validation Precision: 0.9685
  Final Validation Recall:    0.8786





Best for Quantile (Normal):
  C=1, Val Accuracy=0.9464, Test Accuracy=0.9179
  Final Validation Precision: 0.9845
  Final Validation Recall:    0.9071


