In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [3]:
credit_cards = pd.read_csv('creditcard.csv')

In [4]:
credit_cards.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
scaler = StandardScaler() 

In [6]:
new_df = credit_cards.drop(['Amount','Class'],axis='columns',inplace=False)

In [7]:
scaled_df = scaler.fit_transform(new_df) 

In [8]:
scaled_df = pd.DataFrame(scaled_df, columns=new_df.columns)

In [9]:
scaled_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28
0,-1.996583,-0.694242,-0.044075,1.672773,0.973366,-0.245117,0.347068,0.193679,0.082637,0.331128,...,0.496282,0.326118,-0.024923,0.382854,-0.176911,0.110507,0.246585,-0.39217,0.330892,-0.063781
1,-1.996583,0.608496,0.161176,0.109797,0.316523,0.043483,-0.06182,-0.0637,0.071253,-0.232494,...,-0.179086,-0.089611,-0.307377,-0.880077,0.162201,-0.561131,0.320694,0.261069,-0.022256,0.044608
2,-1.996562,-0.6935,-0.811578,1.169468,0.268231,-0.364572,1.351454,0.639776,0.207373,-1.378675,...,-2.778561,0.680975,0.337632,1.063358,1.45632,-1.138092,-0.628537,-0.288447,-0.137137,-0.181021
3,-1.996562,-0.493325,-0.112169,1.182516,-0.609727,-0.007469,0.93615,0.192071,0.316018,-1.262503,...,-1.514205,-0.269855,-0.147443,0.007267,-0.304777,-1.941027,1.241904,-0.460217,0.155396,0.186189
4,-1.996541,-0.59133,0.531541,1.021412,0.284655,-0.295015,0.071999,0.479302,-0.22651,0.744326,...,0.987037,0.529939,-0.012839,1.100011,-0.220123,0.23325,-0.395202,1.041611,0.54362,0.651816


In [10]:
X = scaled_df
y = credit_cards['Class']

# train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [11]:
print( y_train)

258937    0
171605    0
51294     0
67691     0
41636     0
         ..
193408    0
280392    0
12613     0
283892    0
66141     0
Name: Class, Length: 227845, dtype: int64


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

In [13]:
clf = LogisticRegression()
# Fit the model on the trainng data.
lrmodel = clf.fit(X_train, y_train)
# Print the accuracy from the testing data.
print("accuracy:",accuracy_score(clf.predict(X_test), y_test))
print("f1 score:",f1_score(y_test, clf.predict(X_test)))
print("Precision:",precision_score(y_test,clf.predict(X_test)))
print("Recall:",recall_score(y_test,clf.predict(X_test)))
scores = cross_val_score(lrmodel,X_train,y_train,cv=5,scoring='r2')
print('CV scores: ', scores)
params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
logreg_cv = GridSearchCV(clf, params, cv=5)
logreg_cv.fit(X, y)
print('Best Score: ', logreg_cv.best_score_)
print('Logistic Regression parameters: ',logreg_cv.best_params_)

accuracy: 0.9990695551420246
f1 score: 0.7225130890052357
Precision: 0.8414634146341463
Recall: 0.6330275229357798
CV scores:  [0.56506409 0.53870434 0.46663121 0.4926492  0.58371216]
Best Score:  0.9991467905596252
Logistic Regression parameters:  {'C': 0.01}


In [14]:
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(n_estimators=20, learning_rate = learning_rate, max_features=2, max_depth = 2, random_state = 0)
    gbmodel = gb.fit(X_train, y_train)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb.score(X_test, y_test)))
    print("F1 score:",f1_score( y_test,gb.predict(X_test)))
    print("Precision:",precision_score(y_test,gb.predict(X_test)))
    print("Recall:",recall_score(y_test,gb.predict(X_test)))
    scores = cross_val_score(gbmodel,X_train,y_train,cv=5,scoring='r2')
    print('CV scores: ', scores)
    
    print()

Learning rate:  0.05
Accuracy score (training): 0.999
Accuracy score (validation): 0.999
F1 score: 0.38848920863309355
Precision: 0.9
Recall: 0.24770642201834864
CV scores:  [0.42008545 0.1169483  0.27149629 0.15441533 0.36255925]

Learning rate:  0.1
Accuracy score (training): 0.998
Accuracy score (validation): 0.998
F1 score: 0.4444444444444445
Precision: 0.6129032258064516
Recall: 0.3486238532110092
CV scores:  [-0.97698142  0.05104892  0.16742433 -0.23585451  0.53167618]

Learning rate:  0.25
Accuracy score (training): 0.998
Accuracy score (validation): 0.998
F1 score: 0.5793650793650794
Precision: 0.5104895104895105
Recall: 0.6697247706422018
CV scores:  [-0.19936873 -1.95229226  0.38857724 -0.23585451  0.50565819]

Learning rate:  0.5
Accuracy score (training): 0.998
Accuracy score (validation): 0.998
F1 score: 0.0
Precision: 0.0
Recall: 0.0
CV scores:  [ 0.18284768  0.07740867 -0.15780054  0.12839734  0.54468518]

Learning rate:  0.75
Accuracy score (training): 0.999
Accuracy sc

In [15]:
parameters = {
    "learning_rate": [0.05, 0.1, 0.25, 0.5, 0.75, 1],
    "max_depth":[2],
    "n_estimators":[20]
    }

gb_cv = GridSearchCV(gb, parameters, cv=5)
gb_cv.fit(X, y)
print('Best Score: ', gb_cv.best_score_)
print('GradientBoostingClassifier parameters: ',gb_cv.best_params_)

Best Score:  0.9984586084693488
GradientBoostingClassifier parameters:  {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 20}


In [16]:
from sklearn.ensemble import RandomForestClassifier


In [20]:
clf = RandomForestClassifier(n_estimators=20, random_state = 1,n_jobs=-1)
model_res = clf.fit(X_train, y_train)
y_pred = model_res.predict(X_test)
y_pred_prob = model_res.predict_proba(X_test)
lr_probs = y_pred_prob[:,1]
ac = accuracy_score(y_test, y_pred)
print('Random Forest: Accuracy=%.3f' % (ac))
f1 = f1_score(y_test, y_pred, average='weighted')
print("f1 score",f1_score(y_test, y_pred, average='weighted'))
print("Precision:",precision_score(y_test, y_pred))
print("Recall:",recall_score(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
scores = cross_val_score(model_res,X_train,y_train,cv=5,scoring='r2')
print('CV scores: ', scores)



Random Forest: Accuracy=1.000
f1 score 0.999583442406364
Precision: 0.9479166666666666
Recall: 0.8348623853211009
CV scores:  [0.69686285 0.68368297 0.72681111 0.66176613 0.72681111]


In [21]:
param_grid = { 
    'n_estimators': [20],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [2],
    'criterion' :['gini', 'entropy']
}
rf_cv = GridSearchCV(clf, param_grid, cv=5)
rf_cv.fit(X, y)
print('Best Score: ', rf_cv.best_score_)
print('RandomForest parameters: ',rf_cv.best_params_)

Best Score:  0.9990555002740693
RandomForest parameters:  {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'n_estimators': 20}
