# Fraud Detection in Crypto (ETH Blockchain)  
Dataset Link: https://www.kaggle.com/datasets/vagifa/ethereum-frauddetection-dataset/code

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

In [None]:
# Load the dataset from a CSV file located in the working directory
df = pd.read_csv('final.csv')

# Preview the first rows
df.head()

## Correlation Relationship

In [None]:
def cor_matrix(data):
    """
    Plots a lower-triangle heatmap of the correlations between numeric columns.

    Parameters:
    - data: pandas DataFrame; non-numeric columns are ignored

    Returns:
    - None (the figure is displayed with plt.show())
    """
    correlations = data.select_dtypes(include='number').corr()

    # True on and above the diagonal: those cells are hidden so each pair
    # is drawn only once
    hidden_cells = np.triu(np.ones(correlations.shape, dtype=bool))

    plt.figure(figsize=(15, 15))
    sns.heatmap(
        correlations,
        mask=hidden_cells,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        cbar_kws={'shrink': 0.8},
    )

    # Steep rotation of the x tick labels
    plt.xticks(rotation=80)
    plt.title("Correlation Matrix", fontsize=20)
    plt.show()

In [None]:
def corelated(data, target_col, threshold):
    """
    Finds columns in `data` with correlation >= threshold (absolute) with the target column.

    Parameters:
    - data: pandas DataFrame
    - target_col: string, name of the column to compare against
    - threshold: float, minimum absolute correlation to include

    Returns:
    - pandas Series: correlation values (signed) of the selected columns,
      sorted from the most positive to the most negative. The target column
      itself is never included. The Series is empty when nothing qualifies.

    Raises:
    - ValueError: if `target_col` is not a numeric column of `data`
    """
    # Select numeric columns only
    numeric_data = data.select_dtypes(include='number')

    # Check if the target column is numeric and in the DataFrame
    if target_col not in numeric_data.columns:
        raise ValueError(f"'{target_col}' must be a numeric column in the DataFrame")

    # Compute correlation with target column
    corrs = numeric_data.corrwith(numeric_data[target_col])

    # Remove the target's self-correlation BEFORE filtering. Dropping it after
    # the filter raises KeyError whenever the target itself was filtered out
    # (threshold > 1, or a constant target whose correlation is NaN).
    corrs = corrs.drop(labels=[target_col])

    # Filter based on threshold (NaN correlations compare False and are excluded)
    result = corrs[corrs.abs() >= threshold]

    return result.sort_values(ascending=False)

In [None]:
def get_high_corr_pairs(df, threshold=0.3):
    """
    Lists every pair of distinct numeric columns whose absolute correlation
    is strictly greater than `threshold`.

    Parameters:
    - df: pandas DataFrame; non-numeric columns are ignored
    - threshold: float, exclusive lower bound on the absolute correlation

    Returns:
    - pandas Series indexed by (column, column) pairs holding the absolute
      correlation, sorted from strongest to weakest. Each unordered pair
      appears exactly once.
    """
    # Only use numeric columns
    corr_matrix = df.select_dtypes(include='number').corr().abs()

    # Keep only cells strictly below the diagonal. This removes the
    # self-correlations and the mirrored duplicate of every pair by POSITION.
    # De-duplicating on the correlation VALUE instead would discard distinct
    # pairs that happen to share a value, and a `< 1` filter would discard
    # perfectly correlated pairs.
    below_diagonal = np.tril(np.ones(corr_matrix.shape, dtype=bool), k=-1)

    # Unstack the masked matrix to get one entry per (column, row) pair;
    # masked-out cells are NaN
    pairs = corr_matrix.where(below_diagonal).unstack()

    # NaN compares False, so masked cells and undefined correlations drop out
    high_corr_pairs = pairs[pairs > threshold]

    return high_corr_pairs.sort_values(ascending=False)


In [None]:
# Numeric columns whose absolute correlation with FLAG is at least 0.05
corelated(df, 'FLAG', 0.05)

In [None]:
def drop_highly_correlated_features(df, threshold=0.8):
    """
    Removes numeric columns that are highly correlated with an earlier,
    still-kept numeric column.

    Columns are visited in DataFrame order. A column is dropped when its
    absolute correlation with any earlier column that has not itself been
    dropped is strictly greater than `threshold`; the earlier column of the
    pair is the one that survives.

    Parameters:
    - df: pandas DataFrame; non-numeric columns are never dropped
    - threshold: float, exclusive upper bound on the allowed absolute correlation

    Returns:
    - tuple (reduced_df, dropped): a new DataFrame without the dropped columns,
      and the list of dropped column names in their original column order
    """
    # Step 1: Compute correlation matrix (absolute values)
    corr_matrix = df.select_dtypes(include='number').corr().abs()
    strength = corr_matrix.to_numpy()
    columns = list(corr_matrix.columns)

    # Step 2: Walk the columns once, comparing each only with the earlier
    # columns (the part of the matrix above the diagonal)
    kept = np.ones(len(columns), dtype=bool)
    to_drop = []  # a list, not a set, so the reported order is deterministic
    for pos, col in enumerate(columns):
        earlier_kept = strength[:pos, pos][kept[:pos]]
        # NaN correlations compare False and never trigger a drop
        if np.any(earlier_kept > threshold):
            kept[pos] = False
            to_drop.append(col)

    # Step 3: Drop selected columns
    reduced_df = df.drop(columns=to_drop)

    return reduced_df, to_drop

## ML Models Comparison

In [None]:
from pycaret.classification import *

In [None]:
# Set up the PyCaret classification experiment on the raw frame with FLAG as
# the target; session_id is presumably the random seed (confirm in PyCaret docs)
setup(df,target="FLAG",session_id=85)

In [None]:
# Compare the candidate classifiers available in the PyCaret experiment
# configured by setup() above
compare_models()

In [None]:
# Numeric columns of df, skipping the first column by position
check = df.iloc[:, 1:].select_dtypes(include='number')
# True for every column whose variance is exactly zero (a constant column)
no_var = check.var().eq(0)
zero_var_cols = check.columns[no_var]

In [None]:
# Drop the constant (zero-variance) features found above; df itself is left unchanged
data = df.drop(columns = zero_var_cols)
# Display the resulting (rows, columns) shape
data.shape

In [None]:
# Remove numeric features whose absolute correlation with an earlier column exceeds 0.5.
# NOTE(review): the numeric target FLAG is not exempted from this pruning —
# confirm it is still present in data_ after this step.
data_, dropped_cols = drop_highly_correlated_features(data, threshold=0.5)
data_.shape

In [None]:
# Keep only the numeric columns for the remaining feature screening and modelling
num_data = data_.select_dtypes(include = ['number'])
num_data.head()

In [None]:
# Collect low-cardinality features (fewer than 10 distinct non-null values) as
# drop candidates and print their value distributions for manual review.
# The first column of num_data is skipped by position (presumably the target —
# verify against the column order).
drop = []
for col in num_data.columns[1:]:
    if num_data[col].nunique() < 10:
        drop.append(col)
        # Inspect the same frame that is being screened
        print(num_data[col].value_counts())
        print('------------------')

In [None]:
# Hand-picked list of features to remove. This assignment replaces whatever
# `drop` held before (e.g. the candidates collected by the loop in the previous cell).
drop = ['min value sent to contract',
 'max val sent to contract',
 'ERC20 uniq sent addr.1']

In [None]:
# Remove the hand-picked features from the numeric frame (mutates num_data)
num_data.drop(columns=drop, inplace = True)

# Model features: every column after the first one. The target is also
# excluded by NAME so that FLAG can never leak into the feature matrix if it
# is not the first column of num_data.
feature = num_data.columns[1:].drop('FLAG', errors='ignore')
print(feature)

In [None]:
# Plot the correlation heatmap of the features that survived the screening
cor_matrix(num_data)

## Predictive Model

Class 1 represents the minority class (~22%), which could lead the model to favor predicting Class 0. To address this imbalance, SMOTE (Synthetic Minority Over-sampling Technique) is applied as the first step of each pipeline below to resample the training data and ensure better representation of the minority class during model learning.

In [None]:
# Feature matrix: the columns listed in `feature`
X = num_data[feature]
# Target vector: the FLAG column
y = num_data['FLAG']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score


def model(X, y, pipeline):
    """
    Fits `pipeline` on a stratified 80/20 train/test split and reports its
    performance.

    Printed/plotted output, in order: the classification report and confusion
    matrix on the test split, training and test accuracy, and the ROC curve
    (with AUC) on the test split.

    Parameters:
    - X: feature matrix
    - y: target labels, also used to stratify the split
    - pipeline: unfitted estimator/pipeline exposing fit, predict and
      predict_proba; it is fitted in place

    Returns:
    - None
    """
    # Fixed seed so every model is evaluated on the same split
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )

    pipeline.fit(X_fit, y_fit)

    # Hold-out predictions and the per-class report
    holdout_pred = pipeline.predict(X_holdout)
    print(classification_report(y_holdout, holdout_pred))

    matrix = confusion_matrix(y_holdout, holdout_pred)
    print(f'Confusion Matrix: \n{matrix}')
    print('\n')

    # Training vs. test accuracy, printed side by side to expose overfitting
    fit_pred = pipeline.predict(X_fit)
    train_accuracy = accuracy_score(y_fit, fit_pred)
    test_accuracy = accuracy_score(y_holdout, holdout_pred)
    print(f"Training accuracy: {train_accuracy:.4f}")
    print(f"Test accuracy:     {test_accuracy:.4f}")

    # Second probability column (assumed to be the positive class, label 1)
    positive_proba = pipeline.predict_proba(X_holdout)[:, 1]

    # ROC curve and its area
    fpr, tpr, _ = roc_curve(y_holdout, positive_proba)
    auc_score = roc_auc_score(y_holdout, positive_proba)

    plt.figure(figsize=(6, 5))
    plt.plot(fpr, tpr, label=f"AUC = {auc_score:.2f}")
    # Diagonal reference line
    plt.plot([0, 1], [0, 1], linestyle="--", color="red")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.title("ROC Curve")
    plt.legend(loc='lower right')
    plt.show()

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# SMOTE resampling -> standardisation -> 10-nearest-neighbours classifier
pipeline = Pipeline(steps=[('smote', SMOTE(random_state=42)),
                           ('scaler', StandardScaler()),
                           ('knn', KNeighborsClassifier(n_neighbors=10))])

# Fit on the training split and report test metrics
model(X, y, pipeline)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# SMOTE resampling -> standardisation -> logistic regression
# (iteration cap raised to 1000)
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(max_iter=1000)),
])

# Fit on the training split and report test metrics
model(X, y, pipeline)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier 

# SMOTE resampling -> standardisation -> random forest (100 trees, fixed seed)
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Fit on the training split and report test metrics
model(X, y, pipeline)


### XGBoost
XGBoost achieved a test accuracy of 95%, demonstrating strong overall performance in classifying both fraudulent and non-fraudulent cases.

Based on a test dataset of 1,964 samples:

* Out of 436 actual fraud cases, the model correctly identified 373 cases, achieving a recall of 86%.

* Out of 416 predicted fraud cases, 373 were correct, yielding a precision of 90%.

An AUC (Area Under the ROC Curve) of 0.98 indicates that the model is highly effective at distinguishing between the two classes, suggesting near-perfect separability between fraudulent and non-fraudulent transactions.

In [None]:
from xgboost import XGBClassifier

# SMOTE resampling -> standardisation -> XGBoost classifier evaluated with log-loss.
# NOTE(review): `use_label_encoder` appears to be deprecated in recent XGBoost
# releases — confirm against the installed version before removing it.
pipeline = Pipeline(steps=[('smote', SMOTE(random_state=42)),
                           ('scaler', StandardScaler()),
                           ('xgb', XGBClassifier(use_label_encoder=False,
                                                 eval_metric='logloss',
                                                 random_state=42))])

# Fit on the training split and report test metrics
model(X, y, pipeline)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier  # or your model

# Use a dedicated name for the estimator. Binding it to `model` would replace
# the evaluation helper `model(X, y, pipeline)` defined earlier in this
# notebook, so re-running any pipeline cell afterwards would fail.
rf_cv_model = RandomForestClassifier()

# 5-fold cross-validated accuracy on the full feature matrix
scores = cross_val_score(rf_cv_model, X, y, cv=5, scoring='accuracy')  # use 'roc_auc' for AUC

print("Cross-Validation Accuracy Scores:", scores)
print("Mean CV Accuracy:", scores.mean())


In [None]:
# Second name for the same DataFrame object (an alias, not a copy) used by the
# statsmodels logistic regression below
logit_data = num_data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn import metrics
import statsmodels.api as sm

# Render floats with thousands separators and six decimals in pandas output
pd.set_option('display.float_format', '{:,.6f}'.format)

# Design matrix: the selected features plus a constant (intercept) column
X = sm.add_constant(logit_data[feature])
y = logit_data['FLAG']

# 80/20 split with a fixed seed (not stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=109, test_size=0.2
)

# Fit the logistic model and print its summary table
log_model = sm.Logit(y_train, X_train).fit()
print(log_model.summary())

# Predictions on the test data (thresholded at 0.5 in the following cell)
y_pred = log_model.predict(X_test)
    

In [None]:
#Confusion Matrix
# Work on a copy so the frame returned by train_test_split is not modified
X_test = X_test.copy()
# Hard 0/1 label per test row: 1 where the predicted value exceeds 0.5, else 0
X_test['prediction'] = (y_pred > 0.5).astype('int64')

# Confusion matrix and per-class metrics (three decimals) for the thresholded predictions
print("\nConfusion Matrix: \n", confusion_matrix(y_test, X_test['prediction']))
print("\n", classification_report(y_test, X_test['prediction'], digits=3))