In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [2]:
# Load the already-scaled transactions table from the working directory.
df = pd.read_csv(filepath_or_buffer='base_scaled.csv')

In [3]:

# Class balance of the target: number of rows per distinct isFraud value.
df['isFraud'].value_counts()

0.0    559392
1.0     20073
Name: isFraud, dtype: int64

In [4]:
# Preview the first ten rows of the loaded frame.
df.head(n=10)

Unnamed: 0.1,Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1
0,0,0.0,0.0,0.0009,0.0,0.100885,0.608,0.381679,0.666667,0.014599,1.0,0.511364,0.836957,0.011521
1,1,0.0,4e-06,0.00184,0.0,0.210566,0.78,0.381679,0.333333,0.481752,0.0,0.522727,0.836957,0.027902
2,2,0.0,6e-06,0.001558,0.0,0.984824,0.934,0.381679,0.666667,0.124088,0.0,0.854545,0.836957,0.011521
3,3,0.0,7e-06,0.001558,0.75,0.201023,0.828,0.381679,0.666667,0.014599,1.0,0.727273,0.836957,0.011521
4,4,0.0,7e-06,0.001526,0.0,0.283801,0.91,0.381679,0.333333,0.919708,0.0,0.390909,0.836957,0.0035
5,5,0.0,8e-06,0.004971,0.0,0.650034,0.52,0.381679,0.333333,0.481752,0.0,0.059091,0.836957,0.0
6,6,0.0,8e-06,0.013221,0.0,0.672281,0.78,0.381679,0.333333,0.919708,0.0,0.511364,0.836957,0.011521
7,7,0.0,9e-06,0.000462,0.75,0.103645,0.0,0.381679,0.333333,0.919708,0.0,0.538636,0.836957,0.011521
8,8,0.0,9e-06,0.003656,0.0,0.942688,0.022,0.381679,0.666667,0.905109,0.0,0.236364,0.836957,0.001847
9,9,0.0,9e-06,0.002368,0.25,0.890779,0.504,0.129771,0.666667,0.248175,1.0,0.452273,0.836957,0.011521


In [5]:
# Remove the leftover index columns that earlier CSV round-trips added
# ('Unnamed: 0', 'Unnamed: 0.1', ...). Selecting them by prefix and rebinding
# `df` keeps this cell idempotent: re-running it no longer raises KeyError,
# and no stray 'Unnamed: 0.1' column is carried into the sampled data.
colunas_indice = [col for col in df.columns if str(col).startswith('Unnamed: 0')]
df = df.drop(columns=colunas_indice)

In [6]:
# Display the column labels of df.
df.columns

Index(['isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1',
       'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1'],
      dtype='object')

### Tirando uma amostra balanceada (todas as fraudes + 40.000 transações não fraudulentas)

In [10]:
# Keep every fraudulent transaction (isFraud == 1), in shuffled order.
# frac=1 takes all matching rows instead of relying on a hard-coded row count,
# and the fixed random_state makes the shuffle repeatable after a kernel restart.
dados_teste = df[df['isFraud'] == 1].sample(frac=1, random_state=5)

In [11]:
# Undersample the majority class: draw 40000 non-fraudulent rows (isFraud == 0).
# The fixed random_state makes the draw repeatable, so the metrics computed
# further down do not change from one run of the notebook to the next.
dados_teste1 = df[df['isFraud'] == 0].sample(n=40000, random_state=5)

In [12]:
# Stack the fraud sample on top of the non-fraud sample (row-wise);
# the original index labels of both frames are kept.
dados = pd.concat(objs=[dados_teste, dados_teste1], axis=0)

In [14]:
# Print a summary of the combined sample: row count, columns, non-null counts and dtypes.
dados.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60073 entries, 290694 to 397397
Data columns (total 13 columns):
isFraud           60073 non-null float64
TransactionDT     60073 non-null float64
TransactionAmt    60073 non-null float64
ProductCD         60073 non-null float64
card1             60073 non-null float64
card2             60073 non-null float64
card3             60073 non-null float64
card4             60073 non-null float64
card5             60073 non-null float64
card6             60073 non-null float64
addr1             60073 non-null float64
addr2             60073 non-null float64
dist1             60073 non-null float64
dtypes: float64(13)
memory usage: 6.4 MB


In [15]:
# Target vector: the isFraud label of every sampled row.
classe = dados['isFraud']

# Feature matrix: the twelve predictor columns, selected explicitly by name.
colunas_previsoras = [
    "TransactionDT", "TransactionAmt", "ProductCD",
    "card1", "card2", "card3", "card4", "card5", "card6",
    "addr1", "addr2", "dist1",
]
previsores = dados[colunas_previsoras]

In [17]:
# Seed NumPy's global generator; the split below passes no random_state,
# so it draws from that global generator.
SEED = 5
np.random.seed(SEED)

# 75% / 25% split, stratified on the target.
treino_x, teste_x, treino_y, teste_y = train_test_split(
    previsores,
    classe,
    test_size=0.25,
    stratify=classe,
)

tamanhos = (len(treino_x), len(teste_x))
print("Treino com %d elementos e teste com %d elementos" % tamanhos)

Treino com 45054 elementos e teste com 15019 elementos


In [18]:
# Baseline: linear SVM with default settings on the features as they are.
# fit() returns the estimator itself, so construction and training are chained.
modelo = LinearSVC().fit(treino_x, treino_y)
previsoes = modelo.predict(teste_x)

# Accuracy on the held-out rows, expressed as a percentage.
acuracia = 100 * accuracy_score(teste_y, previsoes)
print("A acurácia foi %.2f%%" % acuracia)

A acurácia foi 73.14%


### SVM com StandardScaler

In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Re-seed NumPy's global generator before splitting; the split passes no
# random_state, so it draws from that generator.
SEED = 5
np.random.seed(SEED)
raw_treino_x, raw_teste_x, treino_y, teste_y = train_test_split(previsores, classe, test_size = 0.25,
                                                         stratify = classe)
# Report the sizes of the split made in THIS cell. The previous version printed
# len(treino_x) / len(teste_x), which at this point are leftovers from an
# earlier cell and only matched by coincidence.
print("Treinaremos com %d elementos e testaremos com %d elementos" % (len(raw_treino_x), len(raw_teste_x)))

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler()
scaler.fit(raw_treino_x)
treino_x = scaler.transform(raw_treino_x)
teste_x = scaler.transform(raw_teste_x)

# SVC with default settings, trained on the standardized features.
modelo = SVC()
modelo.fit(treino_x, treino_y)
previsoes = modelo.predict(teste_x)

# Accuracy on the held-out rows, expressed as a percentage.
acuracia = accuracy_score(teste_y, previsoes) * 100
print("A acurácia foi %.2f%%" % acuracia)

Treinaremos com 45054 elementos e testaremos com 15019 elementos
A acurácia foi 76.74%


### Modelo com DecisionTreeClassifier

In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Re-seed NumPy's global generator before splitting; the split passes no
# random_state, so it draws from that generator.
SEED = 5
np.random.seed(SEED)
raw_treino_x, raw_teste_x, treino_y, teste_y = train_test_split(previsores, classe, test_size = 0.25,
                                                         stratify = classe)
# Report the sizes of the split made in THIS cell. The previous version printed
# len(treino_x) / len(teste_x), which this cell never assigns (they are
# leftovers from an earlier cell).
print("Treinaremos com %d elementos e testaremos com %d elementos" % (len(raw_treino_x), len(raw_teste_x)))

# Depth-limited decision tree trained directly on the unstandardized split.
modelo = DecisionTreeClassifier(max_depth=3)
modelo.fit(raw_treino_x, treino_y)
previsoes = modelo.predict(raw_teste_x)

# Accuracy on the held-out rows, expressed as a percentage.
acuracia = accuracy_score(teste_y, previsoes) * 100
print("A acurácia foi %.2f%%" % acuracia)

Treinaremos com 45054 elementos e testaremos com 15019 elementos
A acurácia foi 74.55%


### Modelo com RandomForest

In [24]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Split BEFORE scaling. The previous version fitted the scaler on the full
# feature matrix (leaking test-set statistics into preprocessing) and rebound
# `previsores` to the scaled array, which made the cell non-idempotent and
# silently changed the input of every earlier cell on re-run. `previsores`
# is now left untouched.
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.15, random_state=0)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to the test rows.
scaler = StandardScaler()
previsores_treinamento = scaler.fit_transform(previsores_treinamento)
previsores_teste = scaler.transform(previsores_teste)

# Random forest: 40 trees, entropy criterion, fixed seed.
classificador = RandomForestClassifier(n_estimators=40, criterion='entropy', random_state=0)
classificador.fit(previsores_treinamento, classe_treinamento)
previsoes = classificador.predict(previsores_teste)

# NOTE: despite its name, `precisao` holds the accuracy (accuracy_score),
# not the precision metric. The name and the printed label are kept as they were.
precisao = accuracy_score(classe_teste, previsoes)
matriz = confusion_matrix(classe_teste, previsoes)

print('precisao {}'.format(precisao * 100))

precisao 87.33769836866053


In [26]:
# Rebuild the combined sample (fraud rows followed by non-fraud rows) under the name `data`.
data = pd.concat((dados_teste, dados_teste1), axis=0)

In [27]:
# Display (rows, columns) of the combined sample.
data.shape

(60073, 13)

In [28]:
# Persist the combined sample as CSV with pandas' default options.
# NOTE(review): this is the same path the notebook reads at the top, so the
# input file is replaced by the sample; confirm that this is intended.
caminho_saida = 'base_scaled.csv'
data.to_csv(caminho_saida)