In [None]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

In [None]:
# Apply seaborn's 'talk' plotting context to every figure drawn below.
sns.set_context('talk')

# Load data

In [None]:
# Read the training set and use the `customer` column as the row index.
df = pd.read_csv('data/train.csv', index_col='customer')

In [None]:
# Columns excluded from modelling; the same list is applied to the test set later.
del_columns = ['category', 'nationality', 'is_pep']

# Drop them in a single call and rebind the name instead of mutating the frame
# column-by-column with inplace=True.
df = df.drop(columns=del_columns)

In [None]:
# Sum of the `suspicious` label after casting to int, i.e. the number of
# positive rows if the label is boolean or 0/1.
df['suspicious'].astype(int).sum()

In [None]:
# Preview the first rows of the training frame after the column drop.
df.head()

# Basic statistics

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Descriptive statistics (pandas defaults, so numeric columns when the frame is mixed).
df.describe()

In [None]:
# Distribution of customer age. The Series is passed as `x=` explicitly:
# a bare positional argument is interpreted as `x` by older seaborn releases
# and as `data` by newer ones, which changes the orientation of the plot.
sns.boxplot(x=df['age'])

# Create balanced dataset

In [None]:
# Balance the classes by undersampling: keep every suspicious row and draw
# the same number of non-suspicious rows.
cases_susp = df[df['suspicious'] == 1]

# A fixed random_state makes the drawn subset (and everything trained on it)
# identical across kernel restarts.
cases_norm = df[df['suspicious'] == 0].sample(n=len(cases_susp), random_state=42)
print(cases_susp.shape, cases_norm.shape)

In [None]:
# Stack the sampled non-suspicious rows on top of the suspicious rows to form
# the balanced training frame, then show its (rows, columns).
df_bal = pd.concat([cases_norm, cases_susp])
df_bal.shape

# Model

In [None]:
# Read the test set and use the `customer` column as the row index.
test_features = pd.read_csv('data/test.csv', index_col='customer')

In [None]:
# Remove the same columns that were dropped from the training frame, in one
# non-mutating call instead of a per-column inplace loop.
test_features = test_features.drop(columns=del_columns)

In [None]:
# (rows, columns) of the test frame after the column drop.
test_features.shape

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

### Train

In [None]:
# Train on the whole balanced frame; swap in df_bal.sample(n=100) for a quick smoke run.
sub = df_bal

In [None]:
# Feature matrix: every column except the label.
X = sub.drop(columns=['suspicious'])
# Target vector: the `suspicious` label.
y = sub.loc[:, 'suspicious']

In [None]:
# (rows, features) of the training matrix.
X.shape

#### Create pipeline

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the features, then classify with a 1000-tree random forest.
# random_state is fixed so that the cross-validation reports and the final
# ranking are reproducible from one run to the next.
# NOTE(review): random forests are insensitive to feature scaling, so the
# scaler step is probably redundant; it is kept to preserve the step names.
clf = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(
        n_estimators=1000,
        n_jobs=2,
        verbose=1,
        random_state=42,
    )),
])

#### Splitting

In [None]:
# Two stratified folds. Shuffling (with a fixed seed, for reproducibility)
# stops the folds from simply following the row order of the input frame.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

In [None]:
%%time
for train_index, test_index in tqdm(skf.split(X, y), total=skf.get_n_splits(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred)
    
    print(report)

#### Final

In [None]:
%%time
# Refit the pipeline on all of X / y; this fitted model is the one used to score the test data.
clf.fit(X, y)

### Test

In [None]:
# Score the test frame as-is, i.e. with every column left after the column drop earlier in the notebook.
sub_test = test_features
sub_test.shape

In [None]:
%%time
# Class-membership probabilities for every test row, one column per class in clf.classes_ order.
predictions = clf.predict_proba(sub_test)

In [None]:
# Wrap the predicted probabilities in a frame that shares sub_test's index,
# then rank it so the rows with the highest suspicious probability come first.
# NOTE(review): the column names assume the classifier's classes are ordered
# (normal, suspicious) -- confirm against clf.classes_.
df_pred = pd.DataFrame(
    data=predictions,
    index=sub_test.index,
    columns=['normal_prob', 'suspicious_prob'],
)
df_pred = df_pred.sort_values('suspicious_prob', ascending=False)
df_pred.head()

In [None]:
# Number of top-ranked customers to report.
N = 1000
# df_pred is already sorted by descending suspicious probability, so its
# first N index labels are the N most suspicious customers.
idx = df_pred.index[:N]
fraud_rows = test_features.loc[idx]

In [None]:
# Write the selected index labels as a single-column CSV, without the
# positional row index.
fraud_ids = pd.Series(fraud_rows.index).to_frame()
fraud_ids.to_csv('fraudulent_customers.txt', index=False)

In [None]:
# Histogram of the predicted suspicious probability over the test set.
# sns.distplot is deprecated; histplot (seaborn >= 0.11) is its replacement
# for the kde=False case and draws counts by default.
sns.histplot(df_pred['suspicious_prob'])
plt.ylabel('Count')