In [None]:
pip install plotly

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import plotly
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode
import cufflinks
import cufflinks as cf
import plotly.figure_factory as ff

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from imblearn import over_sampling, under_sampling
from imblearn.pipeline import Pipeline

In [None]:
# Helper functions for plotting and summary statistics
def resumetable(df):
    """Build a per-column summary table for ``df``.

    Prints the shape of ``df`` and returns a DataFrame with one row per column
    of ``df`` and these columns:

    * ``Name`` / ``dtypes`` - column name and dtype
    * ``Missing`` - number of null values
    * ``Uniques`` - number of distinct non-null values
    * ``First Value`` / ``Second Value`` / ``Third Value`` - the first three
      rows taken by position (NaN when ``df`` has fewer rows)
    * ``Entropy`` - base-2 entropy of the column's value frequencies,
      rounded to 2 decimals
    """
    # Imported locally: the file's import cell does not bring `stats` into scope.
    from scipy import stats

    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame({'Name': list(df.columns), 'dtypes': df.dtypes.values})
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    # Positional access (iloc) so the sample rows do not depend on the index
    # labels being 0, 1, 2; short frames get NaN instead of raising.
    n_rows = len(df)
    for pos, label in enumerate(('First Value', 'Second Value', 'Third Value')):
        summary[label] = df.iloc[pos].values if pos < n_rows else np.nan

    for name in df.columns:
        freqs = df[name].value_counts(normalize=True)
        summary.loc[summary['Name'] == name, 'Entropy'] = round(stats.entropy(freqs, base=2), 2)

    return summary

def plot_distribution(df, var_select=None, title=None, bins=1.0):
    """Show an interactive histogram + KDE of ``var_select`` for fraud vs. non-fraud rows.

    Parameters
    ----------
    df : DataFrame containing an ``isFraud`` column and the ``var_select`` column.
    var_select : name of the column to plot.
    title : optional text placed before the correlation in the figure title.
    bins : histogram bin size passed to ``create_distplot``.

    The figure title reports the correlation between ``isFraud`` and
    ``var_select`` rounded to 3 decimals. Nothing is returned; the figure is
    rendered with ``iplot``.
    """
    # Correlation between the selected variable and the target.
    corr = np.round(df['isFraud'].corr(df[var_select]), 3)

    # One sample per class, with missing values removed.
    fraud_values = df[df['isFraud'] == 1][var_select].dropna()
    no_fraud_values = df[df['isFraud'] == 0][var_select].dropna()
    hist_data = [fraud_values, no_fraud_values]

    group_labels = ['Fraud', 'No Fraud']
    colors = ['seagreen', 'indianred']

    fig = ff.create_distplot(hist_data,
                             group_labels,
                             colors=colors,
                             show_hist=True,
                             curve_type='kde',
                             bin_size=bins)

    # `title` defaults to None; leave it out rather than failing on None + str.
    prefix = '' if title is None else title + ' '
    fig['layout'].update(title=prefix + '(corr target =' + str(corr) + ')')

    iplot(fig, filename='Density plot')
    
def plot_dist_churn(df, col, binary=None):
    """Plot per-category counts of ``col`` split by a 0/1 column, plus the flagged share.

    Draws two bar traces (rows where ``df[binary] == 1`` named 'Fraud', rows
    where it equals 0 named 'No Fraud') and a line on a secondary y-axis with
    the percentage of flagged rows in each category.

    Parameters
    ----------
    df : DataFrame holding ``col`` and the ``binary`` column.
    col : name of the column whose categories are counted.
    binary : name of the 0/1 column used to split the rows.
    """
    flagged_counts = df[df[binary] == 1][col].value_counts().sort_index()
    unflagged_counts = df[df[binary] == 0][col].value_counts().sort_index()
    overall_counts = df[col].value_counts().sort_index()

    # Share of flagged rows per category: ratio rounded to 2 decimals, then scaled to percent.
    flagged_pct = (round(flagged_counts / overall_counts, 2) * 100).sort_index()

    print(f'Distribution of {col}: ')

    fraud_bars = go.Bar(
        x=flagged_counts.index,
        y=flagged_counts.values,
        name='Fraud',
        opacity=0.8,
        marker={'color': 'seagreen', 'line': {'color': '#000000', 'width': 1}},
    )

    no_fraud_bars = go.Bar(
        x=unflagged_counts.index,
        y=unflagged_counts.values,
        name='No Fraud',
        opacity=0.8,
        marker={'color': 'indianred', 'line': {'color': '#000000', 'width': 1}},
    )

    # Percentage line, drawn against the right-hand axis (y2).
    pct_line = go.Scatter(
        x=flagged_pct.index,
        y=flagged_pct.values,
        yaxis='y2',
        name='% Fraud',
        opacity=0.6,
        marker={'color': 'black', 'line': {'color': '#000000', 'width': 2}},
    )

    layout = {
        'title': f'Distribution of {str(col)} feature by %Fraud',
        'xaxis': {'type': 'category'},
        'yaxis': {'title': 'Count'},
        'yaxis2': {
            'range': [0, 15],
            'overlaying': 'y',
            'anchor': 'x',
            'side': 'right',
            'zeroline': False,
            'showgrid': False,
            'title': 'Percentual Fraud Transactions',
        },
    }

    fig = go.Figure(data=[fraud_bars, no_fraud_bars, pct_line], layout=layout)
    iplot(fig)

In [None]:
# Paths of the input CSV files, relative to the notebook's working directory.
train_transaction_data_file = "../input/ieee-fraud-detection/train_transaction.csv"
test_transaction_data_file = "../input/ieee-fraud-detection/test_transaction.csv"
train_identity_data_file = "../input/ieee-fraud-detection/train_identity.csv"
test_identity_data_file = "../input/ieee-fraud-detection/test_identity.csv"
sample_submission_file = "../input/ieee-fraud-detection/sample_submission.csv"

# **Load data**

In [None]:
# Load the transaction and identity tables (train and test) and the sample submission.
train_transaction_data = pd.read_csv(train_transaction_data_file)
train_identity_data = pd.read_csv(train_identity_data_file)
test_transaction_data = pd.read_csv(test_transaction_data_file)
test_identity_data = pd.read_csv(test_identity_data_file)
sample_submission = pd.read_csv(sample_submission_file)
# The path variables are not needed once the files have been read.
del train_transaction_data_file, test_transaction_data_file, train_identity_data_file, test_identity_data_file, sample_submission_file

# **Khám phá data**

In [None]:
# Report (rows, columns) for each of the four loaded tables.
print(f'train_transaction shape is {train_transaction_data.shape}')
print(f'test_transaction shape is {test_transaction_data.shape}')
print(f'train_identity shape is {train_identity_data.shape}')
print(f'test_identity shape is {test_identity_data.shape}')

In [None]:
#train_transaction_data.head()

In [None]:
#test_transaction_data.head()

In [None]:
#train_identity_data.head()

In [None]:
#test_identity_data.head()

OK, there are a lot of **NaN** and **interesting columns**:
* C1, C2 ... D1, V300, V339 ...
* id_01 ... id_38

**Missing value**

In [None]:
#train_transaction_data.info()

In [None]:
# Null count per column of the train transaction table (first 10 columns shown).
missing_values_count = train_transaction_data.isnull().sum()
print (missing_values_count.head(10))
# np.prod replaces np.product, which was removed in NumPy 2.0.
total_cells = np.prod(train_transaction_data.shape)
total_missing = missing_values_count.sum()
# Percentage of all cells in the table that are missing.
print ("Phần trăm của missing data = ",(total_missing/total_cells) * 100)

In [None]:
# Drop the temporaries of the missing-value summary from the kernel namespace.
del missing_values_count, total_cells, total_missing

Train_identity_data

In [None]:
# Print the column summary of the train identity table.
train_identity_data.info()

In [None]:
# Null count per column of the train identity table (first 10 columns shown).
missing_values_count = train_identity_data.isnull().sum()
print (missing_values_count.head(10))
# np.prod replaces np.product, which was removed in NumPy 2.0.
total_cells = np.prod(train_identity_data.shape)
total_missing = missing_values_count.sum()
# Percentage of all cells in the table that are missing.
print ("% of missing data = ",(total_missing/total_cells) * 100)

In [None]:
# Drop the temporaries of the missing-value summary from the kernel namespace.
del missing_values_count, total_cells, total_missing

**Vấn đề thứ hai: Mất cân bằng giữa các class.**

Lưu ý rằng tập dữ liệu ban đầu mất cân bằng như thế nào! Hầu hết các giao dịch là không gian lận. Nếu ta sử dụng khung dữ liệu này làm cơ sở cho các mô hình dự đoán và phân tích, ta có thể gặp rất nhiều lỗi và các thuật toán có thể sẽ bị quá khớp (overfit) vì nó sẽ "giả định" rằng hầu hết các giao dịch không phải là gian lận. Nhưng ở đây tôi không muốn mô hình của mình giả định, tôi muốn mô hình của mình phát hiện ra các mẫu có dấu hiệu gian lận!

Không cân bằng có nghĩa là số lượng điểm dữ liệu có sẵn cho các lớp khác nhau là khác nhau.

In [None]:
# Rows per class, ordered by class label so counts[0] is class 0 and counts[1]
# is class 1 regardless of which class is more frequent.
counts = train_transaction_data['isFraud'].value_counts().sort_index().values
# seaborn expects x / y as keyword arguments.
ax = sns.barplot(x=[0, 1], y=counts)
ax.set(title='Fraud class Histogram', xlabel = 'Classes', ylabel='Frequency')

# Percentages are printed in the order announced by the label: class 0 first, then class 1.
print("The ratio of class 0: class 1 is:",len(train_transaction_data[train_transaction_data['isFraud']==0])/len(train_transaction_data)*100,":",
     len(train_transaction_data[train_transaction_data['isFraud']==1])/len(train_transaction_data)*100)

**Time vs features**

> TransactionDT features là thời gian giao dịch từ một mốc thời gian tham chiếu (không phải là một mốc thời gian thực tế).


Train: min = 86400 max = 15811131

Test: min = 18403224 max = 34214345

Sự khác biệt giữa train.min () và test.max () là x = 34214345 - 86400 = 34127945 nhưng chúng ta không biết nó tính bằng giây, phút hay giờ.

* Khoảng thời gian của total dataset là 394.9993634259259 ngày
* Khoảng thời gian của train set là 181.99920138888888 ngày
* Khoảng thời gian của test set là 182.99908564814814 ngày
* Khoảng cách giữa train và test là 30.00107638888889 ngày

If it is in seconds then dataset timespan will be x/(3600*24*365) = 1.0821 years which seems reasonable to me. So if the transactionDT is in seconds then

* Time span of the total dataset is 394.9993634259259 days
* Time span of Train dataset is  181.99920138888888 days
* Time span of Test dataset is  182.99908564814814 days
* The gap between train and test is 30.00107638888889 days


In [None]:
# Count how many transactions have a matching row in the identity table.
# The match is made on the TransactionID column; the frames were read without an
# index column, so their row index is only a position and is not a transaction key.
print(np.sum(train_transaction_data['TransactionID'].isin(train_identity_data['TransactionID'].unique())))
print(np.sum(test_transaction_data['TransactionID'].isin(test_identity_data['TransactionID'].unique())))

24.4% of TransactionIDs in train (144233 / 590540) have an associated train_identity.

28.0% of TransactionIDs in test (141907 / 506691) have an associated test_identity.

In [None]:
#train_transaction_data['TransactionDT'].head()

In [None]:
# Number of rows vs. number of distinct TransactionDT values.
train_transaction_data['TransactionDT'].shape[0] , train_transaction_data['TransactionDT'].nunique()

TransactionDT không phải là mốc thời gian thực tế, nhưng tôi sẽ vận dụng nó để tính toán thời gian

In [None]:
# The 10 most frequent TransactionDT values and how often each occurs.
train_transaction_data['TransactionDT'].value_counts().head(10)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(18,4))

time_val = train_transaction_data['TransactionDT'].values
log_time_val = np.log(time_val)

# Left panel: raw TransactionDT. Its x-limits are set on ax[0], the axis it is drawn on.
sns.distplot(time_val, ax=ax[0], color='r')
ax[0].set_title('Distribution of TransactionDT', fontsize=14)
ax[0].set_xlim([time_val.min(), time_val.max()])

# Right panel: log-transformed TransactionDT.
sns.distplot(log_time_val, ax=ax[1], color='b')
ax[1].set_title('Distribution of LOG TransactionDT', fontsize=14)
ax[1].set_xlim([log_time_val.min(), log_time_val.max()])

plt.show()


In [None]:
fig, ax = plt.subplots(1, 2, figsize=(18,4))

# Left panel: log TransactionDT of fraudulent transactions; limits go on ax[0].
time_val = train_transaction_data.loc[train_transaction_data['isFraud'] == 1]['TransactionDT'].values
log_time_val = np.log(time_val)
sns.distplot(log_time_val, ax=ax[0], color='r')
ax[0].set_title('Distribution of LOG TransactionDT, isFraud=1', fontsize=14)
ax[0].set_xlim([log_time_val.min(), log_time_val.max()])

# Right panel: log TransactionDT of non-fraudulent transactions.
time_val = train_transaction_data.loc[train_transaction_data['isFraud'] == 0]['TransactionDT'].values
log_time_val = np.log(time_val)
sns.distplot(log_time_val, ax=ax[1], color='b')
ax[1].set_title('Distribution of LOG TransactionDT, isFraud=0', fontsize=14)
ax[1].set_xlim([log_time_val.min(), log_time_val.max()])


plt.show()

TransactionDT là thời gian giao dịch từ một ngày giờ tham chiếu nhất định (không phải dấu thời gian thực tế). Một khám phá ban đầu về dữ liệu là train và test dường như được phân chia theo thời gian. Có một khoảng cách nhỏ ở giữa, nhưng nếu không thì tập huấn luyện là từ một khoảng thời gian trước đó và tập kiểm tra là từ một khoảng thời gian sau. Điều này sẽ ảnh hưởng đến việc nên sử dụng phương pháp cross-validation nào.

In [None]:
# Overlay the TransactionDT histograms of the train and test tables on the same axes.
train_transaction_data['TransactionDT'].plot.hist(
    bins=50,
    figsize=(15, 5),
    label='train',
    title='Train vs Test TransactionDT distribution',
)
test_transaction_data['TransactionDT'].plot.hist(bins=50, label='test')
plt.legend()
plt.show()

**C features: C1-C14**

In [None]:
# Use np.corrcoef to measure the correlation between each C feature and
# TransactionDT, and plot the train and test values of the feature against time.
c_features = train_transaction_data.columns[17:31].tolist()
for i in c_features:
    cor = np.corrcoef(train_transaction_data['TransactionDT'], train_transaction_data[i])[0, 1]
    train_transaction_data.set_index('TransactionDT')[i].plot(
        style='.', figsize=(15, 3), title=f"{i} corr= {str(round(cor, 3))}")
    test_transaction_data.set_index('TransactionDT')[i].plot(
        style='.', figsize=(15, 3), title=f"{i} corr= {str(round(cor, 3))}")
    plt.show()

In [None]:
# The C-feature name list is not used after this point.
del c_features

**D features: D1 ... D15**

In [None]:
# Same view for the D features: correlation with TransactionDT in the title,
# train and test values plotted against time.
d_features = train_transaction_data.columns[31:46].tolist()

for i in d_features:
    cor = np.corrcoef(train_transaction_data['TransactionDT'], train_transaction_data[i])[0, 1]
    train_transaction_data.set_index('TransactionDT')[i].plot(
        style='.', figsize=(15, 3), title=f"{i} corr= {str(round(cor, 3))}")
    test_transaction_data.set_index('TransactionDT')[i].plot(
        style='.', figsize=(15, 3), title=f"{i} corr= {str(round(cor, 3))}")
    plt.show()

Vấn đề ở đây là các giá trị của D features chủ yếu là NaN.

In [None]:
# First rows of the D feature columns.
train_transaction_data[d_features].head()

In [None]:
# Number of missing values in each D feature column (shown as the cell output).
missing_values_count = train_transaction_data[d_features].isnull().sum()
missing_values_count

In [None]:
# Total number of cells in the D feature columns and how many of them are missing.
# np.prod replaces np.product, which was removed in NumPy 2.0.
total_cells = np.prod(train_transaction_data[d_features].shape)
total_missing = missing_values_count.sum()
# Percentage of D feature cells that are missing.
(total_missing/total_cells) * 100

In [None]:
# Remove the D-feature name list and the last correlation value left over from the loops.
del d_features, cor

**M features: M1 .. M9**

In [None]:
# Columns at positions 46..54 of the train transaction table, shown for the first rows.
m_features = list(train_transaction_data.columns[46:55])
train_transaction_data[m_features].head()

In [None]:
# The M-feature name list is not used after this point.
del m_features

**V150**

In [None]:
# V150 against TransactionDT for train and test; missing values are replaced by -1
# both for the correlation and for the plot.
i = "V150"
cor_tr = np.corrcoef(train_transaction_data['TransactionDT'], train_transaction_data[i].fillna(-1))[0, 1]
cor_te = np.corrcoef(test_transaction_data['TransactionDT'], test_transaction_data[i].fillna(-1))[0, 1]
train_transaction_data.set_index('TransactionDT')[i].fillna(-1).plot(
    style='.',
    figsize=(15, 3),
    title=f"{i} corr_tr= {str(round(cor_tr, 3))} || corr_te= {str(round(cor_te, 3))}",
)
test_transaction_data.set_index('TransactionDT')[i].fillna(-1).plot(
    style='.',
    figsize=(15, 3),
    title=f"{i} corr_tr= {str(round(cor_tr, 3))}  || corr_te= {str(round(cor_te, 3))}",
)
plt.show()

In [None]:
#train_transaction_data.loc[:,train_transaction_data.columns[train_transaction_data.columns.str.startswith('V')]].isnull().sum().head(20)

**TransactionAmt**

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(18,4))

time_val = train_transaction_data['TransactionAmt'].values
log_time_val = np.log(time_val)

# Left panel: raw TransactionAmt. Its x-limits are set on ax[0], the axis it is drawn on.
sns.distplot(time_val, ax=ax[0], color='r')
ax[0].set_title('Distribution of TransactionAmt', fontsize=14)
ax[0].set_xlim([time_val.min(), time_val.max()])

# Right panel: log-transformed TransactionAmt.
sns.distplot(log_time_val, ax=ax[1], color='b')
ax[1].set_title('Distribution of LOG TransactionAmt', fontsize=14)
ax[1].set_xlim([log_time_val.min(), log_time_val.max()])

plt.show()

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(18,4))

# Left panel: log TransactionAmt of fraudulent transactions; limits go on ax[0].
time_val = train_transaction_data.loc[train_transaction_data['isFraud'] == 1]['TransactionAmt'].values
log_time_val = np.log(time_val)
sns.distplot(log_time_val, ax=ax[0], color='r')
ax[0].set_title('Distribution of LOG TransactionAmt, isFraud=1', fontsize=14)
ax[0].set_xlim([log_time_val.min(), log_time_val.max()])

# Right panel: log TransactionAmt of non-fraudulent transactions.
time_val = train_transaction_data.loc[train_transaction_data['isFraud'] == 0]['TransactionAmt'].values
log_time_val = np.log(time_val)
sns.distplot(log_time_val, ax=ax[1], color='b')
ax[1].set_title('Distribution of LOG TransactionAmt, isFraud=0', fontsize=14)
ax[1].set_xlim([log_time_val.min(), log_time_val.max()])


plt.show()

In [None]:
# Release the array that was reused by the plotting cells.
del time_val

In [None]:
# Amount distribution by class for the first 100,000 rows, restricted to amounts of at most 800.
tmp = train_transaction_data.iloc[:100000][['TransactionAmt', 'isFraud']]
plot_distribution(
    tmp.loc[tmp['TransactionAmt'] <= 800],
    'TransactionAmt',
    'Transaction Amount Distribution',
    bins=10.0,
)
del tmp

# **Preprocessing**

**Reduce data's size**

In [None]:
# Memory-reduction helper
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place and return ``df``.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    converted to the first candidate dtype whose range strictly contains the
    column's min and max: int8 -> int16 -> int32 -> int64 for integer columns,
    float16 -> float32 (falling back to float64) for float columns. Other
    columns are left untouched.

    When ``verbose`` is true, the resulting memory usage and the percentage
    saved are printed.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    bytes_per_mb = 1024**2

    start_mem = df.memory_usage().sum() / bytes_per_mb
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min, c_max = df[col].min(), df[col].max()
        if str(col_type).startswith('int'):
            # No fitting candidate means the column keeps its current dtype.
            target = next(
                (t for t in int_candidates
                 if c_min > np.iinfo(t).min and c_max < np.iinfo(t).max),
                None,
            )
        else:
            # Floats that fit neither float16 nor float32 are stored as float64.
            target = next(
                (t for t in float_candidates
                 if c_min > np.finfo(t).min and c_max < np.finfo(t).max),
                np.float64,
            )

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
# Downcast the numeric columns of all four tables to cut memory before merging.
train_transaction_data = reduce_mem_usage(train_transaction_data)
train_identity_data = reduce_mem_usage(train_identity_data)
test_transaction_data = reduce_mem_usage(test_transaction_data)
test_identity_data = reduce_mem_usage(test_identity_data)

**Merge data**

In [None]:
# Left-join identity onto transaction by TransactionID: every transaction row is kept,
# and identity columns are missing where no identity row matches.
train_data = train_transaction_data.merge(train_identity_data, how="left", on="TransactionID")
test_data = test_transaction_data.merge(test_identity_data, how="left", on="TransactionID")

# The separate tables are no longer needed once merged.
del train_transaction_data, train_identity_data, test_transaction_data, test_identity_data

In [None]:
# Rename the test columns id-01 .. id-38 to id_01 .. id_38 so they use the same
# naming as the id_XX columns of the train data.
test_data.rename(columns={f'id-{n:02d}': f'id_{n:02d}' for n in range(1, 39)}, inplace=True)

The datasets contain missing values. There are several ways to handle them:
1. Drop all rows that contain missing values
2. Assign a default value to the missing data using the fillna() function
3. Replace missing values with the median of that column

In this code, I will choose second solution.

In [None]:
# Separate the target from the features.
Y = train_data['isFraud'].copy()
X = train_data.drop('isFraud', axis=1)
# Missing values are deliberately left in X at this point. The earlier bare
# `X.fillna(-999)` never modified X (its result was discarded) and only built and
# rendered a full copy of the frame. Constant-value imputation is applied by the
# SimpleImputer steps of the preprocessing pipeline instead.

Categorical and numerical data affect the model differently, so we need to handle the two types of data separately.
With numerical data, we can normalize it with "StandardScaler" or "MinMaxScaler".

In [None]:
# Column summary of the merged training frame after memory reduction.
train_data.info()
# X and Y now hold everything that is needed from train_data.
del train_data

In [None]:
# Categorical columns: those stored with the object dtype.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Numerical columns: those stored as int8/int16/int32 or float16/float32.
numerical_cols = X.select_dtypes(
    include=['int8', 'int16', 'int32', 'float16', 'float32']
).columns.tolist()

In [None]:
# Numerical columns: fill missing values with a constant, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('scale', StandardScaler()),
])

# Categorical columns: fill missing values with a constant, then one-hot encode
# (categories unseen during fit are ignored at transform time).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(dtype=np.int8, handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

**Split data**

In [None]:
# Hold out 30% of the rows for validation; the seed makes the split reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(X,Y, test_size=0.3, train_size=0.7, random_state=0)
# Fit the preprocessing on the training part only, then apply it to the validation part.
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)

# The unsplit frames are no longer needed.
del X, Y

**Oversampling**

In [None]:
# Seed the samplers so the resampled training set is the same on every run.
oversampling = over_sampling.RandomOverSampler(sampling_strategy=0.05, random_state=0)
undersampling = under_sampling.RandomUnderSampler(sampling_strategy=0.4, random_state=0)
# Only oversampling is active; append ('u', undersampling) to the steps to enable undersampling too.
steps = [('o', oversampling)]
pipeline = Pipeline(steps=steps)
# Series.to_numpy() replaces Series.ravel(), which is deprecated in recent pandas releases.
X_train_oversample, Y_train_oversample = pipeline.fit_resample(X_train, Y_train.to_numpy())

# **Train and inference**

In [None]:
# Baseline model: trained on the preprocessed training split without resampling.
# NOTE(review): `missing=-999` only matters if -999 actually occurs in the feature
# matrix; X_train here is the output of `preprocessor` (imputed and scaled) — confirm
# the sentinel is still meaningful.
# NOTE(review): tree_method='gpu_hist' presumably requires a GPU-enabled XGBoost — verify for the target environment.
model = XGBClassifier(n_estimators=900,
    max_depth=10,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    missing=-999,
    random_state=2020,
    tree_method='gpu_hist')
model.fit(X_train, Y_train)

In [None]:
# Hard class predictions of the baseline model on the validation split.
predict = model.predict(X_val)
# The non-resampled training data is not used after this point.
del X_train, Y_train

In [None]:
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
print(f1_score(Y_val, predict))
# ROC AUC is a ranking metric: score it on the predicted fraud probability
# rather than on hard 0/1 labels.
print(roc_auc_score(Y_val, model.predict_proba(X_val)[:, 1]))
print(accuracy_score(Y_val, predict))
# X_val and Y_val are kept: the model trained on the oversampled data is
# evaluated on the same validation split.

In [None]:
# Ground truth first, so rows are the true classes and columns the predicted classes.
print(confusion_matrix(Y_val, predict))

In [None]:
# Same hyper-parameters as the baseline, trained on the oversampled training set.
# This rebinds `model`, so cells below use this model rather than the baseline.
model = XGBClassifier(n_estimators=900,
    max_depth=10,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    missing=-999,
    random_state=2020,
    tree_method='gpu_hist')
model.fit(X_train_oversample, Y_train_oversample)

In [None]:
# Hard class predictions of the oversampled-data model on the validation split.
predict = model.predict(X_val)
# The resampled training data is not used after this point.
del X_train_oversample, Y_train_oversample

In [None]:
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
print(f1_score(Y_val, predict))
# ROC AUC is a ranking metric: score it on the predicted fraud probability
# rather than on hard 0/1 labels.
print(roc_auc_score(Y_val, model.predict_proba(X_val)[:, 1]))
print(accuracy_score(Y_val, predict))
del X_val

In [None]:
# Ground truth first, so rows are the true classes and columns the predicted classes.
print(confusion_matrix(Y_val, predict))
del Y_val

We can see that the results are better when we don't use sampling methods.

In [None]:
# Apply the preprocessing fitted on the training split to the test data.
# The earlier bare `test_data.fillna(-999)` was dropped: its result was discarded,
# so it only built a throw-away copy of the frame. Missing values are filled by
# the SimpleImputer steps inside `preprocessor`.
X_test = preprocessor.transform(test_data)
del test_data

In [None]:
# Class probabilities for the test set.
# NOTE(review): `model` is whichever model was fit last — at this point the one
# trained on the oversampled data. Confirm that is the model intended for submission.
test_predict = model.predict_proba(X_test)
del X_test

In [None]:
# Pair each TransactionID of the sample submission with the predicted probability
# taken from column 1 of predict_proba.
# NOTE(review): assumes sample_submission rows are in the same order as the test data — TODO confirm.
submission = pd.DataFrame({
    'TransactionID' : sample_submission.TransactionID,
    'isFraud' : test_predict[:,1]
})
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)