# Hackaton Bernas da Street

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from IPython.display import Image
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn import metrics
import matplotlib.pyplot as plt

import category_encoders as ce
from utils_SLU13 import *

In [2]:
df = pd.read_csv("data/train.csv")
df.head()

Unnamed: 0,id,timestamp,product_id,product_department,product_category,card_id,user_id,C15,C16,C17,C18,C19,C20,C21,amount,isfraud
0,202,1413851368636,fa41b2d8,ef5ed6b8,50e219e0,ecad2386,bd544220,300,50,2312,0,167,100075,16,195.65,0
1,9909,1413851725946,fa41b2d8,ef5ed6b8,50e219e0,ecad2386,bd544220,300,50,2312,0,167,100075,16,195.65,0
2,22692,1413851873924,fa41b2d8,ef5ed6b8,50e219e0,ecad2386,bd544220,300,50,2312,0,167,100075,16,195.65,0
3,22938,1413851651534,fa41b2d8,ef5ed6b8,50e219e0,ecad2386,bd544220,300,50,2312,0,167,100075,16,195.65,0
4,29187,1413853025188,fa41b2d8,ef5ed6b8,50e219e0,ecad2386,bd544220,300,50,2312,0,167,100075,16,195.65,0


# Data Cleaning 101

In [3]:
df.nunique()

id                    522412
timestamp             521650
product_id              1959
product_department      1251
product_category           4
card_id                 1668
user_id                82591
C15                        8
C16                        9
C17                      293
C18                        4
C19                       56
C20                      157
C21                       50
amount                  1187
isfraud                    2
dtype: int64

In [4]:
df.card_id.value_counts()[:5]

f53417e1    162255
ecad2386     63340
7358e05e     25178
d292c32f     20410
e2fcccd2     18810
Name: card_id, dtype: int64

# Droping duplicates and ID

In [5]:
df.head()

Unnamed: 0,id,timestamp,product_id,product_department,product_category,card_id,user_id,C15,C16,C17,C18,C19,C20,C21,amount,isfraud
0,202,1413851368636,fa41b2d8,ef5ed6b8,50e219e0,ecad2386,bd544220,300,50,2312,0,167,100075,16,195.65,0
1,9909,1413851725946,fa41b2d8,ef5ed6b8,50e219e0,ecad2386,bd544220,300,50,2312,0,167,100075,16,195.65,0
2,22692,1413851873924,fa41b2d8,ef5ed6b8,50e219e0,ecad2386,bd544220,300,50,2312,0,167,100075,16,195.65,0
3,22938,1413851651534,fa41b2d8,ef5ed6b8,50e219e0,ecad2386,bd544220,300,50,2312,0,167,100075,16,195.65,0
4,29187,1413853025188,fa41b2d8,ef5ed6b8,50e219e0,ecad2386,bd544220,300,50,2312,0,167,100075,16,195.65,0


## Sort timestamp and split test/train

In [6]:
df = df.sort_values(by = "timestamp").reset_index(drop = True)
df.head()

Unnamed: 0,id,timestamp,product_id,product_department,product_category,card_id,user_id,C15,C16,C17,C18,C19,C20,C21,amount,isfraud
0,96186,1413849604595,c4e18dd6,85f751fd,50e219e0,92e72531,a99f214a,320,50,2480,3,297,100111,61,191.77,0
1,114679,1413849611766,c4e18dd6,85f751fd,50e219e0,e71aba61,a99f214a,320,50,1722,0,35,-1,79,191.77,1
2,60688,1413849613367,dd7026ee,15d93b0b,50e219e0,ecad2386,5c7c1b02,320,50,2495,2,167,100173,23,227.63,0
3,45825,1413849619068,c4e18dd6,85f751fd,50e219e0,5e3f096f,ba2d210a,320,50,2161,0,35,100051,157,191.77,0
4,87991,1413849625209,c4e18dd6,85f751fd,50e219e0,39947756,0ddad6d9,320,50,1955,3,163,100192,71,191.77,0


In [7]:
df_train = df.iloc[0:int(0.8*len(df))].copy()

#df_test = df.iloc[int(0.8*len(df)):].copy()
df_test = pd.read_csv("data/test.csv") #uncoment if you are blind testing
df_test['isfraud']=0  #uncoment if you are blind testing


x_train = df_train.drop("isfraud",axis=1)
y_train = df_train[["isfraud"]]

x_test = df_test.drop("isfraud",axis=1)



y_test = df_test[["isfraud"]]


# Feature Engeneering

In [8]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = scaler.fit_transform(x_train[["amount"]])
x_train_scaled = pd.DataFrame(x_train_scaled)
x_train.amount = x_train_scaled


In [9]:
user_by_card = x_train.groupby('card_id').nunique().user_id.to_dict()

x_train['number_users_by_card'] = x_train.card_id.map(user_by_card)


numberoftimes_card_was_used = x_train.card_id.value_counts()

x_train['numberoftimes_card_was_used'] = x_train.card_id.map(numberoftimes_card_was_used)

In [10]:
x_test['number_users_by_card'] = x_test.card_id.map(user_by_card).fillna(1)

x_test['numberoftimes_card_was_used'] = x_test.card_id.map(numberoftimes_card_was_used).fillna(1)


In [11]:
columns_drop = ['timestamp','user_id']

In [12]:
x_train = x_train.drop(columns=columns_drop,axis=1)
x_test = x_test.drop(columns=columns_drop,axis=1)


In [13]:
df = df.drop(["id"],axis=1).drop_duplicates(keep = "first")
df = df_test.drop(["id"],axis=1).drop_duplicates(keep = "first")


In [14]:
# initialize transformer with desired options
ordinalencoder = ce.ordinal.OrdinalEncoder()
# fit transformer to data
ordinalencoder.fit(x_train)
# create new feature by transforming the datab
x_train = ordinalencoder.transform(x_train)
x_test = ordinalencoder.transform(x_test)

# Baseline Basic Model for Dummies

In [15]:
x_test

Unnamed: 0,id,product_id,product_department,product_category,card_id,C15,C16,C17,C18,C19,C20,C21,amount,number_users_by_card,numberoftimes_card_was_used
0,36412270,1,1,1,557,320,50,2647,2,35,100148,23,191.77,5431.0,23615.0
1,36444068,1,1,1,557,320,50,2647,2,35,100148,23,191.77,5431.0,23615.0
2,37853449,1,1,1,557,320,50,2647,2,39,100148,23,191.77,5431.0,23615.0
3,36396815,1,1,1,557,320,50,2647,2,35,100148,23,191.77,5431.0,23615.0
4,36432507,1,1,1,557,320,50,2647,2,35,100148,105,191.77,5431.0,23615.0
5,38314434,1,1,1,557,320,50,2741,0,163,-1,17,191.77,5431.0,23615.0
6,38317400,1,1,1,557,320,50,2741,0,163,-1,105,191.77,5431.0,23615.0
7,36932920,1,1,1,177,320,50,2726,3,803,100200,229,191.77,94.0,347.0
8,36678951,1,1,1,177,320,50,2726,3,803,100200,229,191.77,94.0,347.0
9,36623925,1,1,1,15,320,50,2726,3,803,100233,229,191.77,8611.0,16879.0


## Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(class_weight={0:0.9,1:0.1})
lr.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight={0: 0.9, 1: 0.1}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
predicted_proba_logistic = lr.predict_proba(x_test)[:,1]
predicted_proba_logistic

array([1.18100031e-04, 1.17571561e-04, 9.63577620e-05, ...,
       1.58074901e-03, 7.73396855e-04, 3.24397645e-04])

In [18]:
y_pred_train = lr.predict(x_train)

In [21]:
y_pred_test = lr.predict(x_test)


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [None]:
pd.DataFrame({'id:data})
y_pred_test.savetxt("out.csv", a, delimiter=",",hea)

In [20]:
roc_auc_logistic = metrics.roc_auc_score(y_test,predicted_proba_logistic)
roc_auc_logistic

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [None]:
lr.predict(x_test)

fpr, tpr, thresholds = metrics.roc_curve(y_score=predicted_proba_logistic, y_true=y_test.isfraud)
def plot_roc_curve(roc_auc, fpr, tpr):
    # Function to plot ROC Curve
    # Note: this is ALL just matplotlib stuff, nothing scientific about it! 
    
    # Inputs: 
    #     roc_auc - AU ROC value (float)
    #     fpr - false positive rate (output of roc_curve()) array
    #     tpr - true positive rate (output of roc_curve()) array
    
    plt.figure(figsize=(8,6))
    lw = 2
    plt.plot(fpr, tpr, color='orange', lw=lw, label='ROC curve (AUROC = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--', label='random')
    plt.xlim([-0.05, 1.0])
    plt.ylim([0.0, 1.05])
    plt.grid()
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
plot_roc_curve(roc_auc_logistic, fpr, tpr)

## Random Forest

# 

In [None]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier


clf = RandomForestClassifier(criterion='entropy')

clf.fit(x_train, y_train)

In [None]:
feature_importances = pd.Series(data=clf.feature_importances_, index=x_train.columns)
feature_importances.sort_values(ascending=False)

In [None]:
y_pred_train = clf.predict(x_train)
y_pred_test = clf.predict(x_test)
roc_auc_logistic = metrics.roc_auc_score(y_test,lr.predict_proba(x_test)[:,1])
roc_auc_logistic

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y_score=predicted_proba_logistic, y_true=y_train.isfraud)
def plot_roc_curve(roc_auc, fpr, tpr):
    # Function to plot ROC Curve
    # Note: this is ALL just matplotlib stuff, nothing scientific about it! 
    
    # Inputs: 
    #     roc_auc - AU ROC value (float)
    #     fpr - false positive rate (output of roc_curve()) array
    #     tpr - true positive rate (output of roc_curve()) array
    
    plt.figure(figsize=(8,6))
    lw = 2
    plt.plot(fpr, tpr, color='orange', lw=lw, label='ROC curve (AUROC = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--', label='random')
    plt.xlim([-0.05, 1.0])
    plt.ylim([0.0, 1.05])
    plt.grid()
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
plot_roc_curve(roc_auc_logistic, fpr, tpr)