# Importing Libraries and Datasets

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, roc_curve, confusion_matrix
from xgboost import XGBClassifier

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

In [2]:
import warnings

# Install a global filter that ignores all warnings.
warnings.simplefilter("ignore")


def fxn():
    """Emit a DeprecationWarning carrying the message "deprecated"."""
    warnings.warn("deprecated", DeprecationWarning)


# Call fxn() once with DeprecationWarning explicitly ignored; the previous
# filter state is restored when the with-block exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=DeprecationWarning)
    fxn()

In [None]:
# Load the training dataset.
# Use the absolute /kaggle/input root (the same directory walked in the first
# cell) so the read does not depend on the notebook's working directory.
instagram_df_train = pd.read_csv('/kaggle/input/instagram-fake-spammer-genuine-accounts/train.csv')
instagram_df_train

In [None]:
# Load the testing dataset.
# Use the absolute /kaggle/input root (the same directory walked in the first
# cell) so the read does not depend on the notebook's working directory.
instagram_df_test = pd.read_csv('/kaggle/input/instagram-fake-spammer-genuine-accounts/test.csv')
instagram_df_test

# Statistical Analysis

In [None]:
# Preview the first five rows of the training data
instagram_df_train.head(5)

In [None]:
# Preview the last five rows of the training data
instagram_df_train.tail(5)

In [None]:
# Print a concise summary of the training dataframe (pandas DataFrame.info)
instagram_df_train.info()

In [None]:
# Descriptive statistics of the training dataframe's columns
train_summary = instagram_df_train.describe()
train_summary

In [None]:
# Count the missing values in each column of the training data
instagram_df_train.isna().sum()

In [None]:
# Count how many rows carry each distinct value of the "profile pic" feature
profile_pic_column = instagram_df_train['profile pic']
profile_pic_column.value_counts()

In [None]:
# Count how many rows carry each distinct value of "fake" (the target column)
target_column = instagram_df_train['fake']
target_column.value_counts()

# Data Visualization

In [None]:
# Bar chart of how many training profiles fall under each value of 'fake'
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='fake', data=instagram_df_train, palette=['#3498db', '#e74c3c'])
plt.title('Instagram Profile Distribution: Real vs Fake', fontsize=16)
plt.xlabel('Fake Profile (0 = No, 1 = Yes)', fontsize=12)
plt.ylabel('Number of Profiles', fontsize=12)

# Label each bar with its count. Bar heights may come back as floats, so
# format with zero decimals to show e.g. "288" instead of "288.0".
for p in ax.patches:
    ax.annotate(f'{p.get_height():,.0f}',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='bottom', fontsize=12)

plt.xticks([0, 1], ['Real (0)', 'Fake (1)'])
plt.show()

In [None]:
# Bar chart of how many training profiles fall under each value of 'fake'
# NOTE(review): this cell repeats the plot drawn by the cell above; consider
# removing one of them.
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='fake', data=instagram_df_train, palette=['#3498db', '#e74c3c'])
plt.title('Instagram Profile Distribution: Real vs Fake', fontsize=16)
plt.xlabel('Fake Profile (0 = No, 1 = Yes)', fontsize=12)
plt.ylabel('Number of Profiles', fontsize=12)

# Label each bar with its count. Bar heights may come back as floats, so
# format with zero decimals to show e.g. "288" instead of "288.0".
for p in ax.patches:
    ax.annotate(f'{p.get_height():,.0f}',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='bottom', fontsize=12)

plt.xticks([0, 1], ['Real (0)', 'Fake (1)'])
plt.show()


In [None]:
# Bar chart of how many training profiles fall under each value of 'fake'
# NOTE(review): this cell repeats the plot drawn by the two cells above;
# consider keeping only one copy.
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='fake', data=instagram_df_train, palette=['#3498db', '#e74c3c'])
plt.title('Instagram Profile Distribution: Real vs Fake', fontsize=16)
plt.xlabel('Fake Profile (0 = No, 1 = Yes)', fontsize=12)
plt.ylabel('Number of Profiles', fontsize=12)

# Label each bar with its count. Bar heights may come back as floats, so
# format with zero decimals to show e.g. "288" instead of "288.0".
for p in ax.patches:
    ax.annotate(f'{p.get_height():,.0f}',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='bottom', fontsize=12)

plt.xticks([0, 1], ['Real (0)', 'Fake (1)'])
plt.show()

In [None]:
# Histogram (with KDE overlay) of the 'nums/length username' column
username_ratio = instagram_df_train['nums/length username']
mean_val = username_ratio.mean()

plt.figure(figsize=(14, 8))
ax = sns.histplot(username_ratio, bins=30, kde=True, color='#3498db')
ax.set_title('Digits/Length Ratio Distribution of Usernames', fontsize=16)
ax.set_xlabel('Digits/Length Ratio of Username', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.grid(axis='y', alpha=0.3)

# Dashed vertical marker at the column mean, with a text label placed at
# 90% of the current y-axis upper limit
ax.axvline(x=mean_val, color='#e74c3c', linestyle='--', linewidth=2)
label_height = ax.get_ylim()[1] * 0.9
ax.text(mean_val + 0.02, label_height, f'Mean: {mean_val:.3f}', color='#e74c3c', fontsize=12)

plt.show()

In [None]:
# Histogram (with KDE overlay) of the 'nums/length username' column
# NOTE(review): this cell repeats the plot drawn by the cell above.
username_ratio = instagram_df_train['nums/length username']
mean_val = username_ratio.mean()

plt.figure(figsize=(14, 8))
ax = sns.histplot(username_ratio, bins=30, kde=True, color='#3498db')
ax.set_title('Digits/Length Ratio Distribution of Usernames', fontsize=16)
ax.set_xlabel('Digits/Length Ratio of Username', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.grid(axis='y', alpha=0.3)

# Dashed vertical marker at the column mean, with a text label placed at
# 90% of the current y-axis upper limit
ax.axvline(x=mean_val, color='#e74c3c', linestyle='--', linewidth=2)
label_height = ax.get_ylim()[1] * 0.9
ax.text(mean_val + 0.02, label_height, f'Mean: {mean_val:.3f}', color='#e74c3c', fontsize=12)

plt.show()

In [None]:
# Lower-triangle heatmap of the pairwise correlations between the columns of
# the training dataframe
plt.figure(figsize=(16, 14))

# Compute the correlation matrix once and reuse it for the mask and the plot.
corr_matrix = instagram_df_train.corr()

# Boolean mask covering the upper triangle (diagonal included). A boolean
# mask hides those cells regardless of their value; using the correlation
# values themselves as the mask would leave exact-zero correlations visible.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap=cmap, linewidths=0.5,
            mask=mask, vmin=-1, vmax=1, center=0, square=True, cbar_kws={"shrink": .8})

plt.title('Feature Correlation Matrix', fontsize=18, pad=20)
plt.tight_layout()
plt.show()

# Data Modelling

In [None]:
# Model inputs: the training and testing dataframes with the 'fake' column
# removed
X_train, X_test = (
    frame.drop(columns=['fake'])
    for frame in (instagram_df_train, instagram_df_test)
)
X_train

In [None]:
# Model outputs: the 'fake' column of the training and testing dataframes
y_train, y_test = instagram_df_train['fake'], instagram_df_test['fake']
y_train