In [None]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

from utils import crowdai_score

In [None]:
# Apply seaborn's 'talk' plotting context to the figures drawn below.
sns.set_context('talk')

# Load data

## Train data

In [None]:
# Full labelled training set, indexed by the `customer` column.
df_raw = pd.read_csv('data/train.csv').set_index('customer')

In [None]:
# Columns removed from both the train and the test frame before modelling.
del_columns = ['category', 'nationality', 'is_pep']
# Rebind instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
df_raw = df_raw.drop(del_columns, axis=1, errors='ignore')

In [None]:
# Number of suspicious rows: cast the label column to int and sum it.
df_raw['suspicious'].astype(int).sum()

In [None]:
# Peek at the first training rows after the column drop.
df_raw.head()

## Test data

In [None]:
# Test set, indexed by the `customer` column like the training frame.
df_test = pd.read_csv('data/test.csv').set_index('customer')

In [None]:
# Remove the same columns that were removed from the training frame.
# Rebinding with errors='ignore' keeps the cell safe to re-run (an in-place
# drop raises KeyError the second time because the columns are already gone).
df_test = df_test.drop(del_columns, axis=1, errors='ignore')

In [None]:
# (rows, columns) of the test frame after the column drop.
df_test.shape

## Subset for custom benchmark

In [None]:
# Customer ids reserved for the custom benchmark. The file has no header row,
# so its single column is named `customer` on load and pulled out as a Series.
exclude_customers = pd.read_csv(
    'data/validation_customer_ids.csv',
    header=None,
    names=['customer'],
)['customer']
exclude_customers.head()

In [None]:
# Split the labelled data into the training part (`df`) and the held-out
# benchmark part (`df_cust`).
# A boolean mask is used instead of indexing `.loc` with a set: set indexers
# are rejected by newer pandas, and a set has no defined order, so the row
# order of `df` would otherwise vary between runs.
is_excluded = df_raw.index.isin(exclude_customers)
df = df_raw.loc[~is_excluded]
df_cust = df_raw.loc[exclude_customers]
print(df.shape, df_cust.shape)

# Basic statistics

In [None]:
# (rows, columns) of the training subset.
df.shape

In [None]:
# Column dtypes and non-null counts of the training subset.
df.info()

In [None]:
# Summary statistics of the training subset's columns.
df.describe()

In [None]:
# Distribution of customer age. The Series is passed as `x=` explicitly:
# newer seaborn versions treat the first positional argument as `data`, not `x`.
sns.boxplot(x=df['age'])

# Create balanced dataset

In [None]:
def extract_balanced_data(df, random_state=None):
    """Return a class-balanced subset of ``df`` obtained by undersampling.

    Rows are split on the ``suspicious`` column (``== 1`` vs ``== 0``) and the
    larger group is randomly downsampled to the size of the smaller one. The
    shape of each group is printed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``suspicious`` column.
    random_state : int, numpy.random.RandomState or None, optional
        Seed forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        draw unseeded; pass an int for a reproducible subset.

    Returns
    -------
    pandas.DataFrame
        The sampled normal rows followed by the suspicious rows.
    """
    cases_susp = df[df['suspicious'] == 1]
    cases_norm_all = df[df['suspicious'] == 0]

    # Sample down to the smaller class so the call also works when normal rows
    # are the minority (sampling more rows than exist raises a ValueError).
    n_per_class = min(len(cases_susp), len(cases_norm_all))
    cases_norm = cases_norm_all.sample(n=n_per_class, random_state=random_state)
    if len(cases_susp) > n_per_class:
        cases_susp = cases_susp.sample(n=n_per_class, random_state=random_state)

    print('Normal:', cases_norm.shape)
    print('Suspicious:', cases_susp.shape)
    return pd.concat([cases_norm, cases_susp])

In [None]:
# Balanced version of the training subset (equal normal / suspicious counts).
df_bal = extract_balanced_data(df)
df_bal.shape

In [None]:
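# NOTE: disabled -- 'category', 'nationality' and 'is_pep' are dropped right after
# loading, so this one-hot encoding only works if that drop is removed first.
# df_bal = pd.get_dummies(df_bal, columns=['category', 'nationality', 'is_pep'])
# df_bal.shape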

# Model

## Train

In [None]:
# Frame used for training; the commented-out `.sample` is a switch for quick
# experiments on a small subset.
sub = df_bal  #.sample(n=100)

In [None]:
# Separate the label vector from the feature matrix.
y = sub['suspicious']
X = sub.drop(labels=['suspicious'], axis=1)

In [None]:
# (samples, features) of the training matrix.
X.shape

### Create pipeline

In [None]:
# Model pipeline: standardise the features, then classify with a random forest.
# `random_state` pins the forest's own randomness so repeated fits on the same
# data give the same model (the seed value itself is arbitrary).
clf = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=1000, n_jobs=2, verbose=1, random_state=42)),
])

### Splitting

In [None]:
# Two-fold stratified splitter used for the cross-validation loop below.
skf = StratifiedKFold(n_splits=2)

In [None]:
%%time
# Cross-validation: refit the pipeline on each training fold and print a
# classification report for the matching held-out fold.
n_folds = skf.get_n_splits(X, y)
for train_index, test_index in tqdm(skf.split(X, y), total=n_folds):
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(report)

### Final

In [None]:
%%time
# Refit the pipeline on the complete balanced training subset.
clf.fit(X, y)

### Custom benchmark

In [None]:
# Balanced version of the held-out benchmark customers.
df_cust_bal = extract_balanced_data(df_cust)

In [None]:
# Features/labels of the held-out (excluded) customers, used for evaluation only;
# the model itself was fit solely on the non-excluded customers.
X_cust = df_cust_bal.drop('suspicious', axis=1)
y_cust = df_cust_bal['suspicious']
print(X_cust.shape, y_cust.shape)

In [None]:
# Class predictions for the held-out benchmark customers.
pred_cust = clf.predict(X_cust)

In [None]:
# Score the benchmark predictions with the `crowdai_score` helper from `utils`
# (its exact metric is defined outside this notebook).
crowdai_score(y_cust, pred_cust)

In [None]:
# Confusion matrix of the benchmark predictions as a labelled table
# (rows: true class, columns: predicted class).
pd.DataFrame(
    confusion_matrix(y_cust, pred_cust),
    index=['norm_true', 'susp_true'],
    columns=['norm_pred', 'susp_pred'],
)

## Test

In [None]:
%%time
# Final model: rebalance the complete labelled set (benchmark customers
# included) and refit the pipeline on it.
df_raw_bal = extract_balanced_data(df_raw)
y_raw = df_raw_bal['suspicious']
X_raw = df_raw_bal.drop(labels=['suspicious'], axis=1)
print(X_raw.shape, y_raw.shape)

clf.fit(X_raw, y_raw)

In [None]:
%%time
# Per-class probabilities for every test customer.
# NOTE(review): assumes df_test has exactly the feature columns of X_raw -- confirm.
predictions = clf.predict_proba(df_test)

In [None]:
# Wrap the probability array in a frame keyed by customer, then order it so
# the customers with the highest `suspicious` probability come first.
proba_frame = pd.DataFrame(
    predictions,
    index=df_test.index,
    columns=['normal_prob', 'suspicious_prob'],
)
df_pred = proba_frame.sort_values(by='suspicious_prob', ascending=False)
df_pred.head()

In [None]:
# Keep the N customers ranked most suspicious (df_pred is already sorted
# in descending order of suspicious probability).
N = 1000
idx = df_pred.index[:N]
fraud_rows = df_test.loc[idx]

In [None]:
# Write the selected customer ids to disk as a single-column file with a
# header row and no row-index column.
fraud_ids = pd.Series(fraud_rows.index).to_frame()
fraud_ids.to_csv('fraudulent_customers.txt', index=False)

## Visualizations

In [None]:
# Histogram (no KDE curve) of the predicted suspicious-probabilities.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases in favour
# of `histplot` -- confirm against the installed version before re-running.
sns.distplot(df_pred['suspicious_prob'], kde=False)
plt.ylabel('Count')

In [None]:
# Feature importances of the fitted forest, one row per feature, sorted from
# most to least important; the feature names end up in the `index` column.
importances = clf.named_steps['clf'].feature_importances_
df_featimp = (
    pd.DataFrame(importances, index=X_raw.columns, columns=['importance'])
    .sort_values('importance', ascending=False)
    .reset_index()
)
df_featimp.head()

In [None]:
# Horizontal bar chart of the feature importances, drawn in a single colour
# (the first colour of the active seaborn palette).
plt.figure(figsize=(8, 6))
bar_color = sns.color_palette()[0]
sns.barplot(data=df_featimp, x='importance', y='index', orient='h', color=bar_color)