# Importing Libraries and Datasets

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score, roc_curve, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Print the full path of every file found under the Kaggle input directory
input_files = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

In [2]:
import warnings

# Install a global filter that silences every warning for the rest of the session
warnings.filterwarnings(action="ignore")


def fxn():
    """Emit a single DeprecationWarning with the message "deprecated"."""
    warnings.warn("deprecated", category=DeprecationWarning)


# Call fxn() inside a temporary context where DeprecationWarning is ignored;
# the previous filter list is restored when the context exits.
with warnings.catch_warnings():
    warnings.filterwarnings(action="ignore", category=DeprecationWarning)
    fxn()

In [None]:
# Load the training dataset.
# The first cell walks the absolute '/kaggle/input' directory, whereas this path was
# relative and therefore only resolves when the notebook's working directory happens
# to contain a 'kaggle/input' copy (Kaggle kernels typically run from elsewhere).
# Prefer the absolute location and fall back to the original relative path.
train_csv = 'kaggle/input/instagram-fake-spammer-genuine-accounts/train.csv'
if os.path.exists('/' + train_csv):
    train_csv = '/' + train_csv
instagram_df_train = pd.read_csv(train_csv)
instagram_df_train

In [None]:
# Load the testing dataset.
# The first cell walks the absolute '/kaggle/input' directory, whereas this path was
# relative and therefore only resolves when the notebook's working directory happens
# to contain a 'kaggle/input' copy (Kaggle kernels typically run from elsewhere).
# Prefer the absolute location and fall back to the original relative path.
test_csv = 'kaggle/input/instagram-fake-spammer-genuine-accounts/test.csv'
if os.path.exists('/' + test_csv):
    test_csv = '/' + test_csv
instagram_df_test = pd.read_csv(test_csv)
instagram_df_test

# Statistical Analysis

In [None]:
# Preview the first five rows of the training set
instagram_df_train.head(n=5)

In [None]:
# Preview the last five rows of the training set
instagram_df_train.tail(n=5)

In [None]:
# Print column names, non-null counts, dtypes and memory usage of the training frame
instagram_df_train.info(verbose=None)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each column
summary_stats = instagram_df_train.describe()
summary_stats

In [None]:
# Count missing values per column (isna is the canonical name for isnull)
instagram_df_train.isna().sum()

In [None]:
# Frequency of each distinct value in the "profile pic" feature
profile_pic_column = instagram_df_train.loc[:, 'profile pic']
profile_pic_column.value_counts()

In [None]:
# Frequency of each class in "fake" (the target column)
target_column = instagram_df_train.loc[:, 'fake']
target_column.value_counts()

# Data Visualization

In [None]:
# Plot the distribution of genuine vs fake profiles
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='fake', data=instagram_df_train, palette=['#3498db', '#e74c3c'])
plt.title('Distribution des Profils Instagram: Réels vs Faux', fontsize=16)
plt.xlabel('Faux Profil (0 = Non, 1 = Oui)', fontsize=12)
plt.ylabel('Nombre de Profils', fontsize=12)

# Write the count above each bar. Bar heights are reported as floats, so they are
# converted to integers to print "288" rather than "288.0"; bars without a finite
# height are skipped.
for p in ax.patches:
    height = p.get_height()
    if not np.isfinite(height):
        continue
    ax.annotate(f'{int(round(height)):,}',
                (p.get_x() + p.get_width() / 2., height),
                ha = 'center', va = 'bottom', fontsize=12)

plt.xticks([0, 1], ['Réel (0)', 'Faux (1)'])
plt.show()

In [None]:
# Plot the distribution of public vs private profiles
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='private', data=instagram_df_train, palette=['#2ecc71', '#9b59b6'])
plt.title('Distribution des Profils: Publics vs Privés', fontsize=16)
plt.xlabel('Profil Privé (0 = Non, 1 = Oui)', fontsize=12)
plt.ylabel('Nombre de Profils', fontsize=12)

# Write the count above each bar. Bar heights are reported as floats, so they are
# converted to integers to print "288" rather than "288.0"; bars without a finite
# height are skipped.
for p in ax.patches:
    height = p.get_height()
    if not np.isfinite(height):
        continue
    ax.annotate(f'{int(round(height)):,}',
                (p.get_x() + p.get_width() / 2., height),
                ha = 'center', va = 'bottom', fontsize=12)

plt.xticks([0, 1], ['Public (0)', 'Privé (1)'])
plt.show()

In [None]:
# Plot the distribution of profiles with vs without a profile picture
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='profile pic', data=instagram_df_train, palette=['#f39c12', '#1abc9c'])
plt.title('Distribution des Profils: Avec vs Sans Photo de Profil', fontsize=16)
plt.xlabel('Photo de Profil (0 = Non, 1 = Oui)', fontsize=12)
plt.ylabel('Nombre de Profils', fontsize=12)

# Write the count above each bar. Bar heights are reported as floats, so they are
# converted to integers to print "288" rather than "288.0"; bars without a finite
# height are skipped.
for p in ax.patches:
    height = p.get_height()
    if not np.isfinite(height):
        continue
    ax.annotate(f'{int(round(height)):,}',
                (p.get_x() + p.get_width() / 2., height),
                ha = 'center', va = 'bottom', fontsize=12)

plt.xticks([0, 1], ['Sans Photo (0)', 'Avec Photo (1)'])
plt.show()

In [None]:
# Histogram (with KDE overlay) of the digits-to-length ratio of usernames
username_ratio = instagram_df_train['nums/length username']
fig, ax = plt.subplots(figsize=(14, 8))
sns.histplot(username_ratio, bins=30, kde=True, color='#3498db', ax=ax)
ax.set_title('Distribution du Ratio Chiffres/Longueur des Noms d\'Utilisateur', fontsize=16)
ax.set_xlabel('Ratio Chiffres/Longueur du Nom d\'Utilisateur', fontsize=12)
ax.set_ylabel('Fréquence', fontsize=12)
ax.grid(axis='y', alpha=0.3)

# Mark the mean with a dashed vertical line and label it at 90% of the y-axis height
mean_val = username_ratio.mean()
ax.axvline(x=mean_val, color='#e74c3c', linestyle='--', linewidth=2)
label_height = ax.get_ylim()[1] * 0.9
ax.text(mean_val + 0.02, label_height, f'Moyenne: {mean_val:.3f}', color='#e74c3c', fontsize=12)

plt.show()

In [None]:
# Alternative view: box plot of the digits-to-length ratio, split by profile type
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(
    x='fake',
    y='nums/length username',
    data=instagram_df_train,
    palette=['#3498db', '#e74c3c'],
    ax=ax,
)
ax.set_title('Distribution du Ratio Chiffres/Longueur par Type de Profil', fontsize=16)
ax.set_xlabel('Type de Profil', fontsize=12)
ax.set_ylabel('Ratio Chiffres/Longueur du Nom d\'Utilisateur', fontsize=12)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Réel (0)', 'Faux (1)'])
ax.grid(axis='y', alpha=0.3)
plt.show()

In [None]:
# Lower-triangle heatmap of the feature correlation matrix
plt.figure(figsize=(16, 14))

# Compute the correlation matrix once and reuse it for both the mask and the plot
corr = instagram_df_train.corr()

# Hide the upper triangle (diagonal included) with a purely positional boolean mask.
# Deriving the mask from the correlation values themselves would leave any
# coefficient that is exactly 0 unmasked.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

sns.heatmap(corr, annot=True, fmt='.2f', cmap=cmap, linewidths=0.5,
            mask=mask, vmin=-1, vmax=1, center=0, square=True, cbar_kws={"shrink": .8})

plt.title('Matrice de Corrélation des Caractéristiques', fontsize=18, pad=20)
plt.tight_layout()
plt.show()

# Data Modelling

In [None]:
# Model inputs: every column except the target, for both the train and test frames
target_name = 'fake'
X_train = instagram_df_train.drop(labels=target_name, axis='columns')
X_test = instagram_df_test.drop(labels=target_name, axis='columns')
X_train

In [None]:
# Model outputs: the 'fake' target column of the train and test frames
y_train, y_test = instagram_df_train['fake'], instagram_df_test['fake']
y_train

In [20]:
# Scale the data before training the model.
# The scaler is fitted on the training features only and then applied unchanged to the
# test features, so no test-set statistics influence the transformation.
# Note: X_train / X_test are rebound from DataFrames to NumPy arrays here.
scaler_x = StandardScaler()
X_train = scaler_x.fit_transform(X_train)
X_test = scaler_x.transform(X_test)

In [21]:
# Convert the scaled feature arrays to float32 PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

In [22]:
# One-hot encode the labels as float tensors with two columns (class 0, class 1).
# The class indices are first materialised as a plain int64 array: indexing a tensor
# directly with a pandas Series depends on implicit sequence conversion (and hence on
# the Series' index labels), which is easy to break.
train_class_idx = torch.tensor(np.asarray(y_train, dtype=np.int64))
y_train_tensor = nn.functional.one_hot(train_class_idx, num_classes=2).float()
test_class_idx = torch.tensor(np.asarray(y_test, dtype=np.int64))
y_test_tensor = nn.functional.one_hot(test_class_idx, num_classes=2).float()

In [23]:
# Wrap each (features, one-hot labels) tensor pair in a PyTorch dataset
train_dataset, test_dataset = (
    TensorDataset(features, targets)
    for features, targets in (
        (X_train_tensor, y_train_tensor),
        (X_test_tensor, y_test_tensor),
    )
)

In [24]:
# Build the data loaders: training batches are reshuffled each epoch, test batches keep
# their order. (train_loader is rebuilt further down once the validation split is made.)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [25]:
# PyTorch model definition
class NeuralNetwork(nn.Module):
    """Fully connected classifier with five linear layers and two outputs.

    Layer widths are input_dim -> 50 -> 150 -> 150 -> 25 -> 2. Every hidden layer
    uses ReLU, and dropout (p=0.3) follows the 2nd, 3rd and 4th layers. ``forward``
    applies a softmax over dim 1, so it returns class probabilities, not logits.

    Args:
        input_dim: number of input features per sample (default 11).
    """

    def __init__(self, input_dim=11):
        super().__init__()
        # Create layer1..layer5 in order so parameter names and the order in which
        # the weights are initialised stay exactly as before.
        widths = (input_dim, 50, 150, 150, 25, 2)
        for position, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f'layer{position}', nn.Linear(n_in, n_out))

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to a (batch, 2) tensor of class probabilities."""
        hidden = self.relu(self.layer1(x))
        # Layers 2-4 share the same pattern: linear -> ReLU -> dropout
        for layer in (self.layer2, self.layer3, self.layer4):
            hidden = self.dropout(self.relu(layer(hidden)))
        return self.softmax(self.layer5(hidden))

In [None]:
# Create the model.
# Seed PyTorch first so the random weight initialisation is the same on every run.
torch.manual_seed(42)
# Size the input layer from the feature tensor instead of relying on the class default,
# so the model keeps matching the data if the number of feature columns changes.
model = NeuralNetwork(input_dim=X_train_tensor.shape[1])
print(model)

In [27]:
# Define the loss function and the optimiser.
# The model's forward pass already ends with a softmax, i.e. it outputs probabilities.
# nn.CrossEntropyLoss expects raw logits and applies log-softmax itself, so feeding it
# probabilities applies softmax twice, which flattens the loss and weakens gradients.
# nn.BCELoss takes probabilities directly; with two softmax outputs and one-hot float
# targets its mean equals the categorical cross-entropy.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [28]:
# Training setup: use the first CUDA device when available, otherwise the CPU,
# and move the model's parameters onto it
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
model.to(device)

# Number of passes over the training data and per-epoch loss bookkeeping
num_epochs = 50
history = dict(train_loss=[], val_loss=[])

In [29]:
# Validation split: hold out 10% of the training samples for validation.
val_size = int(0.1 * len(train_dataset))
train_size = len(train_dataset) - val_size

# Use a dedicated, seeded generator so the same samples land in the validation set on
# every run (and on every re-execution of this cell), independent of global RNG state.
split_generator = torch.Generator().manual_seed(42)
train_subset, val_subset = torch.utils.data.random_split(
    train_dataset, [train_size, val_size], generator=split_generator
)

# train_loader is rebuilt here so that it no longer contains the held-out samples
train_loader = DataLoader(dataset=train_subset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_subset, batch_size=batch_size, shuffle=False)

In [None]:
# Train for num_epochs, recording the mean training and validation loss of each epoch.
# Each batch loss is weighted by its batch size (assuming the criterion returns a
# per-batch mean, as default-reduction losses do), so a shorter final batch does not
# skew the epoch average.
for epoch in range(num_epochs):
    # Training pass (dropout active)
    model.train()
    train_loss_total = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss_total += loss.item() * inputs.size(0)

    epoch_loss = train_loss_total / len(train_loader.dataset)
    history['train_loss'].append(epoch_loss)

    # Validation pass (dropout disabled, no gradient tracking)
    model.eval()
    val_loss_total = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss_total += loss.item() * inputs.size(0)

    val_loss = val_loss_total / len(val_loader.dataset)
    history['val_loss'].append(val_loss)

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}')

# Model Validation and Results

In [None]:
# Show which loss series were recorded during training
recorded_series = history.keys()
print(recorded_series)

In [None]:
# Plot the training and validation loss per epoch
train_curve, val_curve = history['train_loss'], history['val_loss']
epochs = range(1, len(train_curve) + 1)

fig, ax = plt.subplots(figsize=(12, 7))
ax.plot(epochs, train_curve, 'b-', linewidth=2, label='Perte Entraînement')
ax.plot(epochs, val_curve, 'r-', linewidth=2, label='Perte Validation')

ax.set_title('Évolution des Pertes Pendant l\'Entraînement et la Validation', fontsize=16)
ax.set_ylabel('Perte', fontsize=14)
ax.set_xlabel('Époque', fontsize=14)
ax.grid(True, linestyle='--', alpha=0.6)
ax.legend(fontsize=12)

# Highlight the epoch (1-based, first occurrence) with the lowest validation loss
min_val_loss = min(val_curve)
min_val_epoch = 1 + val_curve.index(min_val_loss)
ax.plot(min_val_epoch, min_val_loss, 'ro', markersize=8)
ax.annotate(
    f'Min: {min_val_loss:.4f}',
    xy=(min_val_epoch, min_val_loss),
    xytext=(min_val_epoch+1, min_val_loss+0.05),
    arrowprops=dict(facecolor='black', shrink=0.05, width=1.5),
    fontsize=10,
)

plt.show()

In [33]:
# Predict on the test set: collect the predicted class and the true class of every sample
model.eval()
predicted, test_targets = [], []

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs.to(device))
        # The index of the largest output is the predicted class; the index of the 1
        # in the one-hot label is the true class.
        predicted_batch = torch.max(outputs, 1).indices
        labels_batch = torch.max(labels, 1).indices

        predicted.extend(predicted_batch.cpu().numpy())
        test_targets.extend(labels_batch.numpy())

In [None]:
# Model evaluation: text report of the per-class metrics on the test set
report_text = classification_report(y_true=test_targets, y_pred=predicted)
print(report_text)

In [None]:
# Confusion matrix heatmap annotated with counts and row percentages
plt.figure(figsize=(10, 8))
labels = ['Réel (0)', 'Faux (1)']
# Pin the class order to [0, 1] so the matrix is always 2x2 and lines up with the two
# tick labels, even if one class is missing from the targets or predictions.
cm = confusion_matrix(test_targets, predicted, labels=[0, 1])

# Percentage of each true class (row) assigned to each predicted class.
# Rows with no samples are left at 0 instead of producing NaN from a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_percent = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0) * 100

# Build the cell annotations directly as strings: count on the first line, row
# percentage on the second. (Casting an uninitialised int array to str yields a
# fixed-width string dtype that can silently truncate longer text.)
annot = np.array([
    [f'{cm[i, j]}\n({cm_percent[i, j]:.1f}%)' for j in range(cm.shape[1])]
    for i in range(cm.shape[0])
])

ax = sns.heatmap(cm, annot=annot, fmt='', cmap='Blues', linewidths=1, linecolor='black',
                 xticklabels=labels, yticklabels=labels, cbar=False, annot_kws={"size": 12})

plt.title('Matrice de Confusion', fontsize=16, pad=20)
plt.xlabel('Valeurs Prédites', fontsize=14)
plt.ylabel('Valeurs Réelles', fontsize=14)
plt.tight_layout()

# Add the overall accuracy (correct predictions / all predictions) below the plot
accuracy = np.trace(cm) / np.sum(cm) * 100
plt.figtext(0.5, 0.01, f'Précision Globale: {accuracy:.2f}%', ha='center', fontsize=12)

plt.show()

In [None]:
# ROC curve on the test set
model.eval()
y_true, y_pred_proba = [], []

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs.to(device))

        # True class = position of the 1 in the one-hot label;
        # score = the model's output for class 1
        labels_batch = torch.max(labels, 1).indices
        y_true.extend(labels_batch.numpy())
        y_pred_proba.extend(outputs[:, 1].cpu().numpy())

# Compute the ROC curve and the area under it
fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
roc_auc = metrics.auc(fpr, tpr)

# Draw the ROC curve together with the diagonal reference line
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'Courbe ROC (AUC = {roc_auc:.3f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Ligne de référence')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('Taux de Faux Positifs', fontsize=12)
ax.set_ylabel('Taux de Vrais Positifs', fontsize=12)
ax.set_title('Courbe ROC (Receiver Operating Characteristic)', fontsize=16)
ax.legend(loc="lower right", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

In [None]:
# Heatmap of the classification metrics.
# The report dict is turned into a frame with one row per report entry; the 'accuracy'
# row is removed if present, and the last remaining row is left out of the plot.
report = classification_report(test_targets, predicted, output_dict=True)
report_df = pd.DataFrame(report).T.drop(index='accuracy', errors='ignore')
plotted_rows = report_df.iloc[:-1].astype(float)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(plotted_rows, annot=True, cmap='YlGnBu', fmt='.3f', ax=ax)
ax.set_title('Métriques de Classification par Classe', fontsize=16)
plt.tight_layout()
plt.show()