In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

import umap

from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

from utils import crowdai_score

In [None]:
# Apply seaborn's 'talk' plotting context to every figure in this notebook.
sns.set_context('talk')

# Load data

## Train data

In [None]:
# Load the labelled training data, indexed by the `customer` column.
df_raw = pd.read_csv('data/train.csv').set_index('customer')

In [None]:
# Columns excluded from modelling. They are removed in a single call that
# rebinds `df_raw` and ignores columns that are already gone, so re-running
# this cell is harmless (the previous in-place loop raised KeyError on a
# second run).
del_columns = ['category', 'nationality', 'is_pep']
df_raw = df_raw.drop(columns=del_columns, errors='ignore')

In [None]:
# Number of training rows flagged as suspicious (label cast to int, then summed).
df_raw['suspicious'].astype(int).sum()

In [None]:
# Preview the first rows of the training data after the column drop.
df_raw.head()

## Test data

In [None]:
# Load the test data, indexed by `customer` like the training set.
df_test = pd.read_csv('data/test.csv').set_index('customer')

In [None]:
# Remove the same excluded columns from the test set. Rebinding with
# errors='ignore' keeps the cell re-runnable (the in-place loop raised
# KeyError once the columns were gone).
df_test = df_test.drop(columns=del_columns, errors='ignore')

In [None]:
# Shape of the test set after dropping the excluded columns.
df_test.shape

## Subset for custom benchmark

In [None]:
# Customer ids held out for the custom benchmark. The file is read without a
# header row, so the column is named `customer` explicitly.
exclude_customers = pd.read_csv('data/validation_customer_ids.csv', header=None, names=['customer'])['customer']
exclude_customers.head()

In [None]:
# Split the labelled data into the training subset (`df`) and the customers
# held out for the custom benchmark (`df_cust`).
# A boolean mask replaces the former `.loc[set(...)]` lookup: pandas 2.x
# rejects sets as indexers, and the mask keeps the rows in file order instead
# of arbitrary set order.
df = df_raw.loc[~df_raw.index.isin(exclude_customers)]
df_cust = df_raw.loc[exclude_customers]
print(df.shape, df_cust.shape)

# Basic statistics

In [None]:
# Shape of the training subset (benchmark customers excluded).
df.shape

In [None]:
# Column dtypes and non-null counts of the training subset.
df.info()

In [None]:
# Summary statistics of the training subset's columns.
df.describe()

In [None]:
# Distribution of customer age. The series is passed as `x=` explicitly:
# a bare positional argument is read as `x` by older seaborn releases but as
# `data` by seaborn >= 0.12, which changes the plot orientation.
sns.boxplot(x=df['age'])

# Create balanced dataset

In [None]:
def extract_balanced_data(df, random_state=None):
    """Return a class-balanced subset of ``df`` by downsampling the larger class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``suspicious`` column whose values compare equal to
        1 (suspicious) or 0 (normal).
    random_state : int, numpy RandomState/Generator or None, optional
        Forwarded to ``DataFrame.sample`` so the draw can be made
        reproducible. The default (``None``) keeps the previous
        non-deterministic behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled normal rows followed by the suspicious rows, with the
        same number of rows in each class.

    Notes
    -----
    The shapes of both parts are printed as a quick sanity check. When there
    are fewer normal than suspicious rows the suspicious rows are downsampled
    instead (previously this raised ``ValueError`` inside ``sample``).
    """
    cases_susp = df[df['suspicious'] == 1]
    cases_norm = df[df['suspicious'] == 0]

    # Size of the smaller class decides how many rows each class contributes.
    n_per_class = min(len(cases_susp), len(cases_norm))

    # The normal class is always drawn through `sample`, as before.
    cases_norm = cases_norm.sample(n=n_per_class, random_state=random_state)
    if len(cases_susp) > n_per_class:
        cases_susp = cases_susp.sample(n=n_per_class, random_state=random_state)

    print('Normal:', cases_norm.shape)
    print('Suspicious:', cases_susp.shape)
    return pd.concat([cases_norm, cases_susp])

In [None]:
# Balance the training subset. No seed is passed, so the sampled normal rows
# differ between runs.
df_bal = extract_balanced_data(df)
df_bal.shape

In [None]:
# NOTE(review): disabled one-hot encoding. The three columns named here are
# dropped from df_raw earlier in the notebook, so re-enabling this would
# require keeping them first.
# df_bal = pd.get_dummies(df_bal, columns=['category', 'nationality', 'is_pep'])
# df_bal.shape

# Model

## Train

In [None]:
# Data used for training; the commented-out `.sample(n=100)` would restrict it to 100 rows.
sub = df_bal  #.sample(n=100)

In [None]:
# Separate the label column from the feature columns.
y = sub['suspicious']
X = sub.drop(columns='suspicious')

In [None]:
# Feature matrix shape (rows, features).
X.shape

### Create pipeline

In [None]:
# Standardise the features, then classify with a 1000-tree random forest.
# The forest is seeded so repeated fits on the same data give the same model;
# without a seed every fit (and every reported score) differed between runs.
clf = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=1000, n_jobs=2, verbose=1, random_state=42)),
])

### Splitting

In [None]:
# Two-fold stratified splitter (no shuffling requested) for a quick cross-validated check.
skf = StratifiedKFold(n_splits=2)

In [None]:
%%time
# Fit the pipeline on each training fold and print a classification report for
# the matching held-out fold. `clf` is refitted in place on every iteration,
# so after the loop it holds the model from the last fold.
for train_index, test_index in tqdm(skf.split(X, y), total=skf.get_n_splits(X, y)):
    # split() yields positional indices, hence .iloc
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred)
    
    print(report)

### Final

In [None]:
%%time
# Refit the pipeline on the whole balanced training subset.
clf.fit(X, y)

### Custom benchmark

In [None]:
# Balance the held-out benchmark customers with the same helper used for training.
df_cust_bal = extract_balanced_data(df_cust)

In [None]:
# Benchmark features and labels come from the held-out (excluded) customers;
# the model evaluated on them was fitted on the non-excluded customers only.
y_cust = df_cust_bal['suspicious']
X_cust = df_cust_bal.drop(columns='suspicious')
print(X_cust.shape, y_cust.shape)

In [None]:
# Class probabilities for the benchmark customers (one column per class).
pred_cust_proba = clf.predict_proba(X_cust)

In [None]:
# Hard predictions: index of the most probable column per row.
# NOTE(review): this equals the class label only if the classes are 0 and 1 in
# that order — confirm against clf.classes_.
pred_cust = np.argmax(pred_cust_proba, axis=1)

In [None]:
# Score the benchmark predictions with the `crowdai_score` helper imported from utils.
crowdai_score(y_cust, pred_cust)

In [None]:
# Confusion matrix of the benchmark predictions as a labelled table
# (rows: true class, columns: predicted class).
pd.DataFrame(
    data=confusion_matrix(y_cust, pred_cust),
    index=['norm_true', 'susp_true'],
    columns=['norm_pred', 'susp_pred'],
)

## Test

In [None]:
%%time
# Now fit on a balanced sample of ALL labelled data, including the customers
# previously held out for the custom benchmark.
df_raw_bal = extract_balanced_data(df_raw)
X_raw = df_raw_bal.drop('suspicious', axis=1)
y_raw = df_raw_bal['suspicious']
print(X_raw.shape, y_raw.shape)

# Refits the same pipeline object, replacing the model fitted above.
clf.fit(X_raw, y_raw)

In [None]:
# Shape of the test set about to be scored.
df_test.shape

In [None]:
%%time
# Class probabilities for every test customer.
# NOTE(review): assumes df_test has exactly the feature columns of X_raw — TODO confirm.
predictions = clf.predict_proba(df_test)

In [None]:
# Wrap the probabilities in a frame keyed by customer and sort by
# `suspicious` probability, highest first.
# NOTE(review): the column names assume predict_proba returns the classes in
# the order [normal, suspicious] — confirm against clf.classes_.
df_pred = pd.DataFrame(
    predictions,
    columns=['normal_prob', 'suspicious_prob'],
    index=df_test.index
).sort_values(by='suspicious_prob', ascending=False)
df_pred.head()

In [None]:
# Keep every test customer whose suspicious probability exceeds the threshold
# and write their ids to the submission file.
threshold = 0.3  # roughly the threshold that maximises the f1-score
# index.to_frame() turns the customer ids into a one-column frame, so the
# CSV is written without a separate index column.
fraud_customers = df_pred[df_pred['suspicious_prob'] > threshold].index.to_frame()
fraud_customers.to_csv('fraudulent_customers.txt', index=False)

In [None]:
# Number of customers written to the submission file.
fraud_customers.shape

## Visualizations

### Class probability distribution

In [None]:
# Histogram (no KDE) of the predicted suspicious probabilities on the test set.
# NOTE(review): `sns.distplot` is deprecated since seaborn 0.11 in favour of
# `sns.histplot`; switch once the minimum seaborn version allows it.
sns.distplot(df_pred['suspicious_prob'], kde=False)
plt.ylabel('Count')

### ROC

In [None]:
# Sanity check: the true labels and the predicted probabilities should cover the same rows.
print(y_cust.shape)  # true
print(pred_cust_proba.shape)  # predicted

In [None]:
# Last predict_proba column, used below as the suspicious-class score.
susp_proba = pred_cust_proba[:,-1]

In [None]:
# ROC curve points (false/true positive rates and their thresholds) on the benchmark set.
fpr, tpr, thres = roc_curve(y_cust, susp_proba)

In [None]:
# Plot the ROC curve of the benchmark predictions.
plt.plot(fpr, tpr)

# The title reports the AUC rounded to two decimals.
plt.title(f'ROC (AUC = {round(roc_auc_score(y_cust, susp_proba), 2)})')
plt.xlabel('fpr')
plt.ylabel('tpr')

### Precision/recall curve

In [None]:
# Precision/recall pairs and their thresholds on the benchmark set.
precision, recall, thresholds = precision_recall_curve(y_cust, susp_proba)

In [None]:
# Precision/recall trade-off: precision on the x-axis, recall on the y-axis.
plt.plot(precision, recall)
plt.xlabel('Precision')
plt.ylabel('Recall')

In [None]:
plt.figure(figsize=(8,6))

# The code below assumes `precision` and `recall` hold one more entry than
# `thresholds`, so their final element is dropped to align the arrays.
prec, rec = precision[:-1], recall[:-1]

# F1 = 2PR / (P + R). Where precision and recall are both zero the quotient
# is 0/0; define F1 as 0 there instead of emitting NaN and a RuntimeWarning.
pr_sum = prec + rec
f1 = np.divide(2 * prec * rec, pr_sum, out=np.zeros_like(pr_sum, dtype=float), where=pr_sum > 0)

plt.plot(thresholds, prec, label='precision')
plt.plot(thresholds, rec, label='recall')
plt.plot(thresholds, f1, label='f1-score')

plt.legend(loc='best')
plt.xlabel('Threshold')

In [None]:
# Evaluate the crowdai score at 100 evenly spaced thresholds in [0, 0.1].
# For each threshold the probabilities are turned into 0/1 predictions
# (1 where the suspicious probability is at least the threshold), kept in the
# same dtype as `susp_proba`. Result: a list of (threshold, score) pairs.
data = [
    (t, crowdai_score(y_cust, (susp_proba >= t).astype(susp_proba.dtype)))
    for t in np.linspace(0, .1, 100)
]

In [None]:
# Unzip the (threshold, score) pairs into x and y sequences and plot them.
plt.plot(*zip(*data))

plt.xlabel('Threshold')
plt.ylabel('CS measure')

### Feature importance

In [None]:
# Feature importances of the fitted random forest (the 'clf' pipeline step),
# labelled with the training columns and sorted from most to least important.
# reset_index() moves the feature names into a column called 'index'.
df_featimp = pd.DataFrame(
    clf.named_steps['clf'].feature_importances_,
    index = X_raw.columns, 
    columns=['importance']
).sort_values('importance', ascending=False).reset_index()
df_featimp.head()

In [None]:
# Horizontal bar chart of the importances, one bar per feature, in a single colour.
plt.figure(figsize=(8,6))
sns.barplot(x='importance', y='index', data=df_featimp, orient='h', color=sns.color_palette()[0])

### Embedding

In [None]:
%%time
# Embed the balanced feature matrix with UMAP using its default settings
# (no seed is passed, so the layout may differ between runs).
um = umap.UMAP()
X_trans = um.fit_transform(df_raw_bal.drop('suspicious', axis=1))

In [None]:
# Put the embedding in a frame keyed like df_raw_bal, with one AXIS_<i> column
# per embedding dimension, and attach the label for colouring.
df_trans = pd.DataFrame(
    X_trans, 
    index=df_raw_bal.index, columns=[f'AXIS_{i}' for i in range(X_trans.shape[1])])
df_trans['suspicious'] = df_raw_bal['suspicious']
df_trans.head()

In [None]:
# Scatter of the first two embedding axes, coloured by label; the very low
# alpha lets dense regions stand out.
plt.figure(figsize=(8,6))
sns.scatterplot(x='AXIS_0', y='AXIS_1', hue='suspicious', data=df_trans, alpha=.02)

### Specific variables

In [None]:
# Replace the listed columns with log10(x + 1), all columns in one assignment.
# NOTE: this overwrites df_raw itself, so running the cell again applies the
# transform a second time — restart and re-run from the top instead.
log_columns = [
    'turnover',
    'transaction_count',
    'atm_withdrawal',
    'atm_deposit',
    'inactive_days_average',
    'inactive_days_max',
]
df_raw[log_columns] = np.log10(df_raw[log_columns] + 1)

In [None]:
# Draw a fresh balanced sample from the log-transformed data. This rebinds
# df_raw_bal, so the plots below use the transformed values.
df_raw_bal = extract_balanced_data(df_raw)
df_raw_bal.head(3)

In [None]:
# Long format: one row per (customer, variable) pair, with the label kept as
# an id column, so each variable can get its own facet.
tmp = pd.melt(df_raw_bal, id_vars=['suspicious'])
tmp.head()

In [None]:
# One histogram per variable, four facets per row, each with its own axes.
# NOTE(review): `sns.distplot` is deprecated since seaborn 0.11 in favour of
# `sns.histplot`.
g = sns.FacetGrid(
    data=tmp, col='variable', col_wrap=4,
    sharex=False, sharey=False, height=4)

g.map(sns.distplot, 'value', kde=False)

In [None]:
# One boxplot per variable comparing normal vs. suspicious customers;
# outlier markers are hidden and each facet has its own y-axis.
g = sns.FacetGrid(
    data=tmp, col='variable', col_wrap=4,
    sharey=False, height=4)

g.map_dataframe(sns.boxplot, x='suspicious', y='value', showfliers=False)

In [None]:
# Pairwise relationships between all columns of the balanced sample, coloured
# by label; low alpha reduces overplotting.
sns.pairplot(df_raw_bal, hue='suspicious', plot_kws=dict(alpha=.1))