In [3]:
import numpy as np
import os
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn import svm
from sklearn.datasets import make_moons, make_blobs
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, classification_report, make_scorer, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler

In [15]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")

df = pd.read_csv(os.path.join(path, os.listdir(path)[0]))
df['Class'].replace({1: -1}, inplace=True)
df['Class'].replace({0: 1}, inplace=True)

df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,1
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,1
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,1
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,1


In [16]:
df['Class'].value_counts(normalize=True)

 1    0.998273
-1    0.001727
Name: Class, dtype: float64

In [17]:
target = df['Class']
sns.countplot(target)

<Axes: ylabel='count'>

Error in callback <function _draw_all_if_interactive at 0x113c92440> (for post_execute), with arguments args (),kwargs {}:


KeyboardInterrupt: 

In [None]:
df_inliers = df[df['Class']==1]
df_outliers = df[df['Class']==-1]

df_inliers = df_inliers[:20000]
df = pd.concat([df_inliers, df_outliers])
target = df['Class']

print(df_inliers.shape)
print(df_outliers.shape)
print(df.shape)

sns.countplot(target)

In [None]:
frac = df[df['Class']==-1]['Class'].count()/df['Class'].count()
frac


In [None]:
X_train, X_test, y_train, y_test= train_test_split(df, target, test_size=0.2)
X_train = X_train.drop('Class', axis = 1)
X_test = X_test.drop('Class', axis = 1)
X_train_notime = X_train.drop('Time', axis=1)
X_test_notime = X_test.drop('Time', axis=1)


In [None]:
isof = IsolationForest(contamination=0.03, n_estimators=100, n_jobs=-1)


In [None]:
isof.fit(X_train_notime)

y_pred = isof.predict(X_test_notime)

pd.crosstab(y_test, y_pred, rownames=['Classes réelles'], colnames=['Classes prédites'])


In [None]:
print(classification_report(y_test, y_pred))


In [None]:
# We manually cut our cross-validation dataset within the training sample
skf = StratifiedKFold(n_splits=3) 
folds = list(skf.split(X_train_notime, y_train))
forest = IsolationForest()

# In the situation where the contamination parameter is not known a priori,
# we will add the contamination in the search grid

resc = make_scorer(recall_score,pos_label=-1)

params = {'contamination': np.linspace(0.01, 0.05, 10), 'n_estimators': [100,200,300]}

search = GridSearchCV(estimator=forest, param_grid=params, scoring=resc, cv=folds, n_jobs=-1)
search.fit(X_train_notime, y_train)

# predict
optimal_forest = search.best_estimator_
y_pred = optimal_forest.predict(X_test_notime)


pd.crosstab(y_test, y_pred, rownames=['Classes réelles'], colnames=['Classes prédites'])


In [None]:
# We select only the labels that correspond to 1
y_inliers = y_train[y_train.values==1]

# We create a list of indexes to be able to recover the explanatory variables of these indexes
liste = list(y_inliers.index.values)

# We recover these explanatory variables in a new DataFrame
X_train_inliers = X_train.loc[liste]


# Creation of the Scaler object
scaling = MinMaxScaler(feature_range=(-1, 1)).fit(X_train_inliers)

# Training of the scaler from the dataset used in the learning phase
X_train_inliers = scaling.transform(X_train_inliers)

# We transform X_test with the information of the learning DataFrame
X_test = scaling.transform(X_test)


# We define the object OneClassSVM
clf_osvm = svm.OneClassSVM(kernel='rbf', nu=0.005, gamma='scale')

# Model training
clf_osvm.fit(X_train_inliers)

# Prediction from the explanatory variables of the test sample
y_pred = clf_osvm.predict(X_test)

# Display of results in the form of a confusion matrix
pd.crosstab(y_test, y_pred, rownames=['Real classes'], colnames=['Predicted classes'])

