<a href="https://colab.research.google.com/github/Rahulappu2004/Fake-Account-Detection-in-Twitter-X/blob/main/EGSLA_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-learn networkx




In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
# Load the Twitter dataset CSV from the Colab working directory (/content).
df = pd.read_csv(filepath_or_buffer='/content/twitter_data.csv')

In [4]:
# Handle missing values: impute every numeric column with its own mean.
# numeric_only=True keeps the mean computation from raising a TypeError on
# text/object columns (the default behaviour in pandas >= 2.0); non-numeric
# columns are simply left as they are. Re-binding instead of inplace=True keeps
# the cell idempotent on re-run.
df = df.fillna(df.mean(numeric_only=True))

# Extract features (every column except the target) and the target itself.
TARGET_COLUMN = 'Fake Or Not Category'
X = df.drop(columns=TARGET_COLUMN)
y = df[TARGET_COLUMN]

In [5]:
# Fit the scaler on the feature table, then apply it, so every feature is
# standardized before similarities are computed.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [7]:
# Create labeled and unlabeled data: a random 30% of the rows keep their label,
# every other row is treated as unlabeled (marked with -1 in y_semi).
np.random.seed(42)
labeled_indices = np.random.choice(len(y), size=int(0.3 * len(y)), replace=False)
unlabeled_indices = np.setdiff1d(np.arange(len(y)), labeled_indices)

# labeled_indices are row *positions*, so read the targets from the underlying
# array. `y[labeled_indices]` on a pandas Series is a lookup by index label and
# is only correct while the Series still carries its default RangeIndex.
y_values = np.asarray(y)
y_semi = -1 * np.ones_like(y_values)
y_semi[labeled_indices] = y_values[labeled_indices]

In [8]:
# RBF similarity is exp(-gamma * squared_distance): a lower gamma makes the
# kernel wider, i.e. similarity decays more slowly with distance.
gamma = 0.1
similarity_matrix = rbf_kernel(X_scaled, gamma=gamma)

# Build the similarity graph: one node per sample and an edge between every
# pair of samples whose similarity exceeds the threshold.
similarity_threshold = 0.3
G = nx.Graph()
# Register every sample up front so that samples without any sufficiently
# similar partner still exist in the graph as isolated nodes.
G.add_nodes_from(range(len(X_scaled)))
# The strict upper triangle (k=1) visits each unordered pair exactly once and
# skips self-pairs, the same pairs as `for i ... for j in range(i + 1, n)`,
# but without an O(n^2) Python-level loop.
pair_rows, pair_cols = np.nonzero(np.triu(similarity_matrix > similarity_threshold, k=1))
G.add_weighted_edges_from(
    zip(
        pair_rows.tolist(),
        pair_cols.tolist(),
        similarity_matrix[pair_rows, pair_cols].tolist(),
    )
)

In [9]:
# Step 2: graph enhancement (EGSLA-specific refinements can be added here).
# Currently: treat edges whose weight is below 0.4 as noise and drop them.
edges_to_remove = []
for node_a, node_b, edge_weight in G.edges(data='weight'):
    if edge_weight < 0.4:
        edges_to_remove.append((node_a, node_b))
G.remove_edges_from(edges_to_remove)

# Step 3: label propagation works on its own copy of the semi-supervised
# labels, so y_semi itself is not modified.
labels = y_semi.copy()

In [10]:
# Perform label propagation: on every sweep each unlabeled node adopts the most
# frequent label among its neighbours that currently carry a label.
max_iterations = 10
# Set lookup is O(1). Testing `node in labeled_indices` against the NumPy array
# rescans the whole array for every node on every sweep.
labeled_nodes = set(np.asarray(labeled_indices).tolist())
for iteration in range(max_iterations):
    # Write into a copy so that every node in this sweep reads the labels of
    # the previous sweep (synchronous update).
    new_labels = np.copy(labels)
    for node in G.nodes():
        if node in labeled_nodes:  # Originally labeled nodes keep their label
            continue
        # Collect the labels of neighbours that are already labeled (-1 = none)
        neighbor_labels = [
            labels[neighbor] for neighbor in G.neighbors(node) if labels[neighbor] != -1
        ]
        if neighbor_labels:  # Nodes without any labeled neighbour stay as they are
            new_labels[node] = max(set(neighbor_labels), key=neighbor_labels.count)
    # Converged: a full sweep changed nothing
    if np.array_equal(labels, new_labels):
        break
    labels = new_labels

In [11]:
# Synthetic dataset: uniform-random features and an independent random binary
# label per sample. Re-seeding makes the draw reproducible.
np.random.seed(42)
num_samples, num_features = 1000, 7
X = np.random.random_sample((num_samples, num_features))  # feature matrix, values in [0, 1)
y = np.random.randint(2, size=num_samples)  # binary labels: 0 or 1



In [12]:
# Standardize the feature matrix: fit the scaler, then apply it.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Perturb the standardized features with Gaussian noise (mean 0, std 0.1).
noise = np.random.normal(loc=0, scale=0.1, size=X_scaled.shape)
X_scaled_noisy = np.add(X_scaled, noise)


In [13]:
# Randomly choose which samples keep their label; all others are unlabeled.
labeled_ratio = 0.2  # fraction of the samples treated as labeled
labeled_indices = np.random.choice(
    a=len(y),
    size=int(labeled_ratio * len(y)),
    replace=False,
)
# Unlabeled positions = every position not drawn above, in ascending order.
is_unlabeled = np.ones(len(y), dtype=bool)
is_unlabeled[labeled_indices] = False
unlabeled_indices = np.flatnonzero(is_unlabeled)

In [14]:
# Simulate annotation errors: flip the binary label of 20% of the labeled
# samples. The clean labels in y are left untouched; y_noisy is a copy.
num_mislabeled = int(0.2 * len(labeled_indices))
mislabeled_indices = np.random.choice(
    labeled_indices,
    size=num_mislabeled,
    replace=False,
)
y_noisy = np.array(y, copy=True)
y_noisy[mislabeled_indices] = np.subtract(1, y_noisy[mislabeled_indices])  # 0 <-> 1

In [15]:
# Similarity matrix using an RBF kernel. A small gamma gives a wide kernel, so
# similarity falls off slowly with distance.
gamma = 0.1
similarity_matrix = rbf_kernel(X_scaled_noisy, gamma=gamma)

# Construct a sparse graph (threshold-based)
threshold = 0.4  # Higher threshold reduces graph connectivity
G = nx.Graph()
# Register every sample as a node first. Otherwise samples with no similarity
# above the threshold are missing from the graph, and the node count no longer
# matches the number of samples that node ids are used to index.
G.add_nodes_from(range(len(X_scaled_noisy)))
# The strict upper triangle (k=1) yields each unordered pair once and no
# self-pairs, the same pairs as `for i ... for j in range(i + 1, n)`,
# without an O(n^2) Python-level loop.
pair_rows, pair_cols = np.nonzero(np.triu(similarity_matrix > threshold, k=1))
G.add_weighted_edges_from(
    zip(
        pair_rows.tolist(),
        pair_cols.tolist(),
        similarity_matrix[pair_rows, pair_cols].tolist(),
    )
)

In [16]:
# Semi-supervised target vector: start with every sample marked unlabeled (-1).
y_semi = np.full(shape=len(y), fill_value=-1)
# Only the labeled samples expose a label, and it is the noisy one.
y_semi[labeled_indices] = y_noisy.take(labeled_indices)

In [17]:
# Semi-supervised label propagation (EGSLA-inspired)
def label_propagation(graph, y_semi, max_iterations=50, alpha=0.9, num_classes=None):
    """Propagate class labels from labeled to unlabeled nodes of a weighted graph.

    On every sweep each unlabeled node in the graph receives the edge-weighted
    sum of its neighbours' label distributions, blended with its own previous
    distribution, after which all rows are renormalized. Labeled nodes keep
    their distribution throughout.

    Parameters
    ----------
    graph : graph-like
        Must provide ``graph.nodes``, ``graph.neighbors(node)`` and
        ``graph[node][neighbor]['weight']`` (e.g. a ``networkx.Graph``).
        Node ids are used as row positions into ``y_semi``, so they must be
        integers in ``range(len(y_semi))``. Samples that are not nodes of the
        graph are allowed and are treated as isolated.
    y_semi : array-like of int
        One entry per sample: the class index for labeled samples, ``-1`` for
        unlabeled samples.
    max_iterations : int, default 50
        Number of propagation sweeps.
    alpha : float, default 0.9
        Weight of the neighbour aggregate; ``1 - alpha`` is the weight of the
        node's own previous distribution.
    num_classes : int, optional
        Number of classes. When omitted it is inferred from the largest label
        in ``y_semi``, with a minimum of 2.

    Returns
    -------
    numpy.ndarray
        Predicted class index for every sample, shape ``(len(y_semi),)``.
        A sample that never receives any label mass (isolated, or only
        connected to other unreached samples) has an all-zero row and is
        therefore reported as class 0.
    """
    y_semi = np.asarray(y_semi)
    labeled_mask = y_semi != -1
    labeled_rows = np.flatnonzero(labeled_mask)
    labeled_classes = y_semi[labeled_rows].astype(int)

    if num_classes is None:
        highest_label = int(labeled_classes.max()) if labeled_classes.size else -1
        num_classes = max(2, highest_label + 1)

    # Size the label matrix by the number of samples, not by the number of
    # graph nodes: node ids index rows, and a thresholded graph may lack the
    # samples that ended up without any edge.
    node_labels = np.zeros((len(y_semi), num_classes))
    node_labels[labeled_rows, labeled_classes] = 1  # one-hot for labeled samples

    # Only unlabeled nodes that are part of the graph are ever updated.
    unlabeled_nodes = [node for node in graph.nodes if not labeled_mask[node]]

    for _ in range(max_iterations):
        # Start from the previous state so labeled and out-of-graph rows carry
        # over unchanged; updates read node_labels (previous sweep) only.
        new_labels = node_labels.copy()
        for node in unlabeled_nodes:
            aggregated = np.zeros(num_classes)
            for neighbor in graph.neighbors(node):
                weight = graph[node][neighbor]['weight']
                aggregated += weight * node_labels[neighbor]
            new_labels[node] = alpha * aggregated + (1 - alpha) * node_labels[node]

        # Normalize rows; the epsilon keeps all-zero rows from dividing by zero.
        new_labels = new_labels / (new_labels.sum(axis=1, keepdims=True) + 1e-6)
        node_labels = new_labels

    # Final predicted labels
    return np.argmax(node_labels, axis=1)

# Propagate the partially observed (noisy) labels over the similarity graph.
predicted_labels = label_propagation(graph=G, y_semi=y_semi)

In [18]:
# Metrics on the labeled nodes. label_propagation keeps labeled nodes fixed at
# the (noisy) label they were given, so these numbers only reflect how many of
# the input labels were flipped. They say nothing about how well propagation
# predicts unseen nodes.
true_labels = y[labeled_indices]
predicted_on_labeled = predicted_labels[labeled_indices]

accuracy = accuracy_score(true_labels, predicted_on_labeled)
precision = precision_score(true_labels, predicted_on_labeled)
recall = recall_score(true_labels, predicted_on_labeled)
f1 = f1_score(true_labels, predicted_on_labeled)

print("Performance Metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Hold-out metrics on the unlabeled nodes. Their labels were never shown to the
# propagation step, so this is the estimate of actual predictive quality.
true_unlabeled = y[unlabeled_indices]
predicted_on_unlabeled = predicted_labels[unlabeled_indices]

holdout_accuracy = accuracy_score(true_unlabeled, predicted_on_unlabeled)
holdout_precision = precision_score(true_unlabeled, predicted_on_unlabeled)
holdout_recall = recall_score(true_unlabeled, predicted_on_unlabeled)
holdout_f1 = f1_score(true_unlabeled, predicted_on_unlabeled)

print("Hold-out Metrics (unlabeled nodes):")
print(f"Accuracy: {holdout_accuracy:.2f}")
print(f"Precision: {holdout_precision:.2f}")
print(f"Recall: {holdout_recall:.2f}")
print(f"F1 Score: {holdout_f1:.2f}")

Performance Metrics:
Accuracy: 0.80
Precision: 0.83
Recall: 0.79
F1 Score: 0.81
