# Pipeline

In [15]:
import pandas as pd
import matplotlib.pyplot as plt

### Data input

In [16]:
import pandas as pd

# Read the CSV stored at data/creditcard.csv (relative to the notebook's
# working directory) into a DataFrame that the following cells split and model.
df = pd.read_csv(filepath_or_buffer='data/creditcard.csv')

### Data preprocessing

There is no missing data and no categorical data. For the other preprocessing steps, we will use a pipeline.

We will separate the data into test and train sets, with 20% and 80% of the data respectively.

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the remaining 80% is used for training.
# The fixed random_state makes the shuffle (and therefore the split) repeatable.
train_set, test_set = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

# Preview the first five training rows.
train_set.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
223361,143352.0,1.955041,-0.380783,-0.315013,0.330155,-0.509374,-0.086197,-0.627978,0.035994,1.05456,...,0.238197,0.968305,0.053208,-0.278602,-0.044999,-0.21678,0.045168,-0.047145,9.99,0
165061,117173.0,-0.400975,-0.626943,1.555339,-2.017772,-0.107769,0.16831,0.017959,-0.401619,0.040378,...,-0.153485,0.421703,0.113442,-1.004095,-1.176695,0.361924,-0.370469,-0.144792,45.9,0
238186,149565.0,0.072509,0.820566,-0.561351,-0.709897,1.080399,-0.359429,0.787858,0.117276,-0.131275,...,-0.314638,-0.872959,0.083391,0.148178,-0.431459,0.11969,0.206395,0.070288,11.99,0
150562,93670.0,-0.535045,1.014587,1.750679,2.76939,0.500089,1.00227,0.847902,-0.081323,0.371579,...,0.063525,0.443431,-0.072754,0.448192,-0.655203,-0.181038,-0.093013,-0.064931,117.44,0
138452,82655.0,-4.026938,1.897371,-0.429786,-0.029571,-0.855751,-0.480406,-0.435632,1.31376,0.536044,...,-0.480691,-0.230369,0.250717,0.066399,0.470787,0.245335,0.286904,-0.322672,25.76,0


### Model pipeline

In [18]:
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV

# Chain preprocessing and the classifier into a single estimator so that every
# step is re-fitted inside each cross-validation fold of the grid search.
# make_pipeline names each step after its lowercased class name; those names
# are the prefixes used in the parameter grid below.
pipeline = make_pipeline(
    PolynomialFeatures(),
    RobustScaler(),
    VarianceThreshold(),
    # Fixed seed so repeated runs grow the same forests and the cross-validation
    # scores of different candidates are comparable.
    RandomForestClassifier(random_state=42),
)

# The degree grid is capped at 2. The previous grid ([2, 3, 4] with the same
# other values) produced 72 candidates x 5 folds = 360 fits, and a degree-3 or
# degree-4 expansion of the ~30 input columns yields thousands to tens of
# thousands of features per row; the search could not finish and had to be
# interrupted. Degree 1 keeps the raw features as a baseline (the added constant
# column has zero variance and is removed by VarianceThreshold); degree 2 adds
# squares and pairwise interactions.
params_grid = {
    'randomforestclassifier__n_estimators': [10, 40, 70, 100],
    'randomforestclassifier__criterion': ['gini', 'entropy'],
    'variancethreshold__threshold': [0.1, 0.3, 0.5],
    'polynomialfeatures__degree': [1, 2],
}

# n_jobs=-1 evaluates candidates/folds on all available cores.
# NOTE(review): accuracy is kept as the selection metric; if the Class column
# turns out to be heavily imbalanced, consider 'f1' or 'average_precision'.
grid = GridSearchCV(pipeline, params_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [19]:
# Split the training frame into predictors (every column except 'Class')
# and the target column before handing them to the grid search.
train_features = train_set.drop(columns='Class')
train_target = train_set['Class']

# Run the cross-validated search over the parameter grid.
grid.fit(train_features, train_target)

# Show the parameter combination that scored best.
grid.best_params_

KeyboardInterrupt: 

In [None]:
# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# re-trained on the full training set with the winning parameters. Fitting it
# again would only repeat that expensive work (and, with an unseeded forest,
# replace the selected model with a differently-randomised one).
model = grid.best_estimator_

# Evaluate on the held-out rows, which played no part in the search.
test_features = test_set.drop(columns='Class')
test_target = test_set['Class']

# For a pipeline ending in a classifier, score() reports mean accuracy.
model.score(test_features, test_target)