In [59]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

## Load the data

In [60]:
df = pd.read_csv("../data/01_raw/creditcard.csv", sep=',')

In [61]:
df.shape

(284807, 31)

In [62]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Prepare the data

In [63]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

No missing values

Let's scale the columns "Time" and "Money", as the other columns are

In [64]:
scaler = RobustScaler()

In [65]:
df['scaled_amount'] = scaler.fit_transform(df['Amount'].values.reshape(-1,1))
df['scaled_time'] = scaler.fit_transform(df['Time'].values.reshape(-1,1))

In [66]:
df.drop(['Time','Amount'], axis=1, inplace=True)

In [67]:
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,scaled_amount,scaled_time
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,1.783274,-0.994983
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,-0.269825,-0.994983
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,4.983721,-0.994972
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,1.418291,-0.994972
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,0.670579,-0.99496


## Generate a training / testing set

In [68]:
X = df.drop('Class', axis=1)
y = df['Class']

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [70]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(190820, 30) (190820,)
(93987, 30) (93987,)


## Define evaluation metrics

In [71]:
#Cette fonction est à compléter avec les metrics manquante, qu'on trouve dans la librairie imblearn
def compute_metrics(y_test, y_pred):
    res = []
    res.append(accuracy_score(y_test, y_pred))
    res.append(precision_score(y_test, y_pred, average='weighted'))
    res.append(recall_score(y_test, y_pred, average='weighted'))
    res.append(f1_score(y_test, y_pred,average='weighted'))
    return res

We create a score dataframe were the results will be stored

In [72]:
scores_df = pd.DataFrame(index = ['accuracy', 'precision', 'recall', 'f1_score'], columns=['Logistic Reg', 'SVM', 'Random Forest'])

In [73]:
scores_df.head()

Unnamed: 0,Logistic Reg,SVM,Random Forest
accuracy,,,
precision,,,
recall,,,
f1_score,,,


## Model 1: Logistic Regression

In [74]:
LR = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').fit(X_train, y_train)



In [75]:
scores_df['Logistic Reg'] = compute_metrics(y_test, LR.predict(X_test))

In [76]:
scores_df

Unnamed: 0,Logistic Reg,SVM,Random Forest
accuracy,0.999276,,
precision,0.999222,,
recall,0.999276,,
f1_score,0.99922,,


## Model 2: Support Vector Machines

In [77]:
SVM = svm.LinearSVC()
SVM.fit(X, y)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [78]:
scores_df['SVM'] = compute_metrics(y_test, SVM.predict(X_test))

In [79]:
scores_df

Unnamed: 0,Logistic Reg,SVM,Random Forest
accuracy,0.999276,0.999404,
precision,0.999222,0.999371,
recall,0.999276,0.999404,
f1_score,0.99922,0.999376,


## Model 3: Random Forest

In [80]:
RF = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
RF.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [81]:
scores_df['Random Forest'] = compute_metrics(y_test, RF.predict(X_test))

In [82]:
scores_df

Unnamed: 0,Logistic Reg,SVM,Random Forest
accuracy,0.999276,0.999404,0.999191
precision,0.999222,0.999371,0.999121
recall,0.999276,0.999404,0.999191
f1_score,0.99922,0.999376,0.999114
