https://www.kaggle.com/c/ieee-fraud-detection/leaderboard#score

In [1]:
%autosave 120

Autosaving every 120 seconds


In [2]:
%load_ext blackcellmagic

In [3]:
import pandas as pd
from sklearn import preprocessing
import numpy as np

In [4]:
%%time
# Every competition CSV is keyed by the same column, so read each one indexed by it.
read_options = {'index_col': 'TransactionID'}

train_transaction = pd.read_csv('data/train_transaction.csv', **read_options)
test_transaction = pd.read_csv('data/test_transaction.csv', **read_options)
train_identity = pd.read_csv('data/train_identity.csv', **read_options)
test_identity = pd.read_csv('data/test_identity.csv', **read_options)
sample_submission = pd.read_csv('data/sample_submission.csv', **read_options)

# Left join on the index: transactions without an identity row are kept.
join_options = {'how': 'left', 'left_index': True, 'right_index': True}
train = pd.merge(train_transaction, train_identity, **join_options)
test = pd.merge(test_transaction, test_identity, **join_options)

# The four source frames are no longer needed once merged.
del train_transaction, train_identity, test_transaction, test_identity

print(train.shape, test.shape, sep='\n')

# Separate the target from the features and replace NaNs with the -999 sentinel.
y_train = train['isFraud'].copy()
X_train = train.drop(columns='isFraud').fillna(-999)
X_test = test.copy().fillna(-999)

del train, test


def _label_encode_pair(train_column, test_column):
    """Integer-encode one column using labels learned from train and test together.

    Fitting on the concatenation of both columns ensures a value that occurs
    in only one of the two frames still receives a code.
    """
    train_values = list(train_column.values)
    test_values = list(test_column.values)
    encoder = preprocessing.LabelEncoder().fit(train_values + test_values)
    return encoder.transform(train_values), encoder.transform(test_values)


# Label Encoding: applied to every column that is object-typed in either frame.
for col in X_train.columns:
    if X_train[col].dtype != 'object' and X_test[col].dtype != 'object':
        continue
    X_train[col], X_test[col] = _label_encode_pair(X_train[col], X_test[col])

(590540, 433)
(506691, 432)
Wall time: 1min 37s


# TODO
Split the data frames into transactions that have identity records and transactions that do not.

### Majority Class baseline

In [17]:
#1 - t.isFraud.sum()/590540

In [18]:
#sample.isFraud = np.ones((sample.isFraud.shape[0],))

In [19]:
#sample.to_csv("majority_class.csv",index=False)

#### Result
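The majority-class baseline scored 0.50 on the Kaggle leaderboard. I was hoping the class imbalance would make a high score easy, but the leaderboard metric is ROC AUC, so a constant prediction earns 0.5 no matter how imbalanced the classes are. Oh well.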


### Stupid XGBoost 
- just seeing what happens
- inspired by this https://www.kaggle.com/xhlulu/ieee-fraud-xgboost-with-gpu-fit-in-40s

In [12]:
import xgboost as xgb

In [13]:
# Hyper-parameters collected in one mapping so they can be inspected or reused.
xgb_params = {
    "n_estimators": 500,
    "max_depth": 9,
    "learning_rate": 0.05,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    # Same sentinel the data-loading cell writes with fillna(-999).
    "missing": -999,
    "random_state": 2019,
    # THE MAGICAL PARAMETER
    "tree_method": "gpu_hist",
}

clf = xgb.XGBClassifier(**xgb_params)

In [14]:
# X_train and y_train are created by the data-loading cell; that cell must have
# been run in the current kernel session first, otherwise this line raises
# NameError (as in the recorded output below).
%time clf.fit(X_train, y_train)

NameError: name 'X_train' is not defined

### More analysis

In [None]:
import seaborn
import matplotlib.pyplot as plt
import pandas as pd


class CrawtoML:
    """Exploratory-analysis helper around a DataFrame with one target column.

    On construction the instance works out which columns are features, which
    of those are numeric (``int64`` / ``float64``), which are not, and which
    contain no missing values. The remaining methods plot or print summaries
    based on those lists.

    Attributes:
        data: The DataFrame passed to the constructor.
        target: Name of the target column.
        id_column: Name of an identifier column to exclude from the features,
            or ``None``.
        features: Column names of ``data`` without the target and id column.
        numeric_columns: Features whose dtype is ``int64`` or ``float64``.
        other_types: Features that are not in ``numeric_columns``.
        nan_free: Features that contain no missing values.
    """

    def __init__(self, data, target,id_column=None):
        """Store the inputs and precompute the column groupings.

        Args:
            data: DataFrame holding the features and the target column.
            target: Name of the target column in ``data``.
            id_column: Optional name of an identifier column that should not
                be treated as a feature.

        Raises:
            ValueError: If ``target`` is not a column of ``data``.
        """
        self.data = data
        self.target = target
        self.id_column = id_column
        self.features = self.get_features()
        self.numeric_columns = self.numerics()
        # The two assignments below replace the same-named methods on this
        # instance with their results, so after construction `other_types`
        # and `nan_free` are plain lists, not callables.
        self.other_types = self.other_types()
        self.nan_free = self.nan_free()

    def get_features(self):
        """Return the column names of ``self.data`` minus target and id column.

        Raises:
            ValueError: If the target column is not present.
        """
        # Read from the instance's own frame, not from a notebook-level global.
        columns = list(self.data.columns.values)
        columns.remove(self.target)
        # The id column may be held in the index rather than in the columns
        # (e.g. a frame read with index_col=...); then there is nothing to drop.
        if self.id_column is not None and self.id_column in columns:
            columns.remove(self.id_column)
        return columns

    def numerics(self):
        """Compute, store and return the features with dtype int64 or float64."""
        self.numeric_columns = [
            name
            for name in self.features
            if self.data[name].dtypes in ["int64", "float64"]
        ]
        return self.numeric_columns

    def other_types(self):
        """Compute, store and return the features that are not numeric columns."""
        others = [name for name in self.features if name not in self.numeric_columns]
        self.other_types = others
        return self.other_types

    def nan_free(self):
        """Return the features whose count of missing values is zero."""
        nan_counts = self.data[self.features].isna().sum()
        return [name for name, count in zip(nan_counts.index, nan_counts.values) if count == 0]

    def nan_chart(self):
        """Plot the share of missing values per feature, sorted ascending."""
        nan_share = self.data[self.features].isna().sum().sort_values() / len(self.data)
        # xticks=[] hides the feature names on the x axis.
        p = nan_share.plot(title="NAN as a % of all values", xticks=[])
        # Relabel the existing y ticks as percentages.
        p.set_yticklabels(['{:,.2%}'.format(x) for x in p.get_yticks()])

    def correlation_report(self):
        """Draw a heatmap of the pairwise correlations of the numeric columns."""
        seaborn.heatmap(self.data[self.numeric_columns].corr())

    def distribution_report(self):
        """Print the normality table, then plot the target and feature/target pairs."""
        self.distribution_r()
        # NOTE(review): `distplot` is reported as deprecated in recent seaborn
        # releases; confirm against the installed version before relying on it.
        print(seaborn.distplot(self.data[self.target]))
        print(
            seaborn.PairGrid(self.data, x_vars=self.features, y_vars=self.target).map(
                seaborn.scatterplot
            )
        )

    def distribution_r(self):
        """Print a table of Shapiro-Wilk results for each numeric column and the target."""
        columns = self.numeric_columns + [self.target]
        # distribution_fit takes only the column name; the data comes from self.
        rows = [self.distribution_fit(name) for name in columns]
        print(pd.DataFrame(rows, index=columns))

    def distribution_fit(self,columns):
        """Run the Shapiro-Wilk test on ``self.data[columns]``.

        Args:
            columns: Column name used to index ``self.data``.

        Returns:
            dict with the test statistic, the p-value, and a boolean that is
            True when the p-value is greater than 0.05.
        """
        from scipy.stats import shapiro

        shapiro_values = shapiro(self.data[columns])
        test_indication = bool(shapiro_values[1] > 0.05)

        # anderson_values = anderson(automl.data[numeric_column], dist=i)

        return {
            "Shapiro-Wilks_Test_Statistic": shapiro_values[0],
            "Shapiro-Wilks_p_Value": shapiro_values[1],
            "Normal distribution ?": test_indication
            # "Anderson_Darling_Test_Statistic_Normal": anderson_values[0][0],
        }

    def __repr__(self):
        """Return a three-line summary: target, feature and numeric columns."""
        # The template must be applied with `%`; calling the string raised TypeError.
        template = (
            "Target Column: %s\n"
            "Feature columns: %s\n"
            "Numeric Columns: %s"
        )
        return template % (self.target, self.features, self.numeric_columns)

In [None]:
# NOTE(review): `t` is not defined in any visible cell of this notebook;
# presumably it is the merged training frame that still contains "isFraud" —
# confirm and define it before this cell.
c = CrawtoML(data=t,target="isFraud",id_column="TransactionID")

In [None]:
# Show only the feature columns that contain no missing values.
c.data[c.nan_free]

In [None]:
# Passes the whole list of NaN-free columns, so the Shapiro-Wilk test receives
# a multi-column selection rather than one column.
# NOTE(review): confirm this is intended rather than one call per column.
c.distribution_fit(c.nan_free)