# Hackathon Bernas da Street

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from IPython.display import Image
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn import metrics
import matplotlib.pyplot as plt

import category_encoders as ce
from utils_SLU13 import *

In [None]:
# Load the training set and preview its first rows.
TRAIN_CSV = "data/train.csv"
df = pd.read_csv(TRAIN_CSV)
df.head()

# Data Cleaning 101

In [None]:
# Summary statistics for the columns of the training frame.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# How many rows each card_id value appears in.
df["card_id"].value_counts()

In [None]:
# Remove the "id" column from the frame.
df = df.drop(columns=["id"])

In [None]:
# Preview the first rows of the frame.
df.head()

## Card id investigation

## Sort timestamp and split test/train

In [None]:
# Order the rows by timestamp, then renumber the index from zero so that
# positional slicing below follows that order.
df = df.sort_values("timestamp")
df = df.reset_index(drop=True)
df.head()

In [None]:
# Total number of rows.
df.shape[0]

In [None]:
# Number of rows in the first 80% of the frame.
df.iloc[: int(0.8 * len(df))].shape[0]

In [None]:
# Number of rows in the remaining 20% of the frame.
df.iloc[int(0.8 * len(df)) :].shape[0]

In [None]:
# Positional 80/20 split: the first 80% of rows (in the frame's current
# order) become the training set, the rest the test set.
split_at = int(0.8 * len(df))
df_train = df.iloc[:split_at].copy()
df_test = df.iloc[split_at:].copy()

# Features are every column except the label; the labels are kept as
# single-column DataFrames.
target = "isfraud"
x_train, y_train = df_train.drop(columns=target), df_train[[target]]
x_test, y_test = df_test.drop(columns=target), df_test[[target]]

In [None]:
# initialize transformer with desired options
ordinalencoder = ce.ordinal.OrdinalEncoder()
# fit transformer to data
ordinalencoder.fit(x_train)
# create new feature by transforming the datab
x_train = ordinalencoder.transform(x_train)
x_test = ordinalencoder.transform(x_test)

# Baseline Basic Model for Dummies

In [None]:
# Display the encoded training features.
x_train

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Make sure no "id" column reaches the model. The original
# `x_train.drop(['id'])` dropped along the row axis and raised KeyError;
# `errors="ignore"` makes this a no-op when the column is already gone.
x_cleaned = x_train.drop(columns=["id"], errors="ignore")

# NOTE(review): these weights penalise errors on class 0 nine times more
# than errors on class 1. If class 1 (fraud) is the rare class this is
# probably inverted -- confirm, or consider class_weight="balanced".
lr = LogisticRegression(class_weight={0: 0.9, 1: 0.1})
# y_train is a single-column DataFrame; flatten it to the 1-D target
# scikit-learn expects.
lr.fit(x_cleaned, y_train.values.ravel())

In [None]:
# Second column of predict_proba (class index 1) for every training row.
train_proba = lr.predict_proba(x_train)
predicted_proba_logistic = train_proba[:, 1]
predicted_proba_logistic

In [None]:
# Hard class predictions of the logistic model on the training features.
y_pred_train = lr.predict(x_train)

In [None]:
# Hard class predictions of the logistic model on the test features.
y_pred_test = lr.predict(x_test)

In [None]:
# ROC AUC of the logistic model, scored on the training data it was fitted on.
roc_auc_logistic = metrics.roc_auc_score(
    y_true=y_train,
    y_score=lr.predict_proba(x_train)[:, 1],
)
roc_auc_logistic

In [None]:
# ROC operating points of the logistic model on the training set.
# (The stray `lr.predict(x_test)` that used to open this cell was removed:
# its result was neither stored nor displayed.)
fpr, tpr, thresholds = metrics.roc_curve(
    y_true=y_train.isfraud, y_score=predicted_proba_logistic
)
def plot_roc_curve(roc_auc, fpr, tpr):
    """Draw a ROC curve next to the diagonal "random" reference line.

    Pure matplotlib presentation code; nothing is computed here.

    Args:
        roc_auc: AUROC value (float) shown in the legend.
        fpr: false positive rates, as returned by ``roc_curve()``.
        tpr: true positive rates, as returned by ``roc_curve()``.
    """
    line_width = 2
    _, ax = plt.subplots(figsize=(8, 6))

    ax.plot(fpr, tpr, color='orange', lw=line_width,
            label='ROC curve (AUROC = %0.2f)' % roc_auc)
    ax.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--',
            label='random')

    ax.set_xlim([-0.05, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.grid()
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    ax.legend(loc="lower right")
    plt.show()
# Plot the logistic model's ROC curve.
plot_roc_curve(roc_auc=roc_auc_logistic, fpr=fpr, tpr=tpr)

## Decision Tree

# 

In [None]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor


# Decision tree classifier using the entropy split criterion.
clf = DecisionTreeClassifier(criterion="entropy")

clf.fit(X=x_train, y=y_train)

In [None]:
# Importance the fitted tree assigned to each feature column, largest first.
feature_importances = pd.Series(clf.feature_importances_, index=x_train.columns)
feature_importances.sort_values(ascending=False)

In [None]:
y_pred_train = clf.predict(x_train)
y_pred_test = clf.predict(x_test)
roc_auc_logistic = metrics.roc_auc_score(y_train,lr.predict_proba(x_train)[:,1])
roc_auc_logistic

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y_score=predicted_proba_logistic, y_true=y_train.isfraud)
def plot_roc_curve(roc_auc, fpr, tpr):
    # Function to plot ROC Curve
    # Note: this is ALL just matplotlib stuff, nothing scientific about it! 
    
    # Inputs: 
    #     roc_auc - AU ROC value (float)
    #     fpr - false positive rate (output of roc_curve()) array
    #     tpr - true positive rate (output of roc_curve()) array
    
    plt.figure(figsize=(8,6))
    lw = 2
    plt.plot(fpr, tpr, color='orange', lw=lw, label='ROC curve (AUROC = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--', label='random')
    plt.xlim([-0.05, 1.0])
    plt.ylim([0.0, 1.05])
    plt.grid()
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
plot_roc_curve(roc_auc_logistic, fpr, tpr)