In [None]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import StratifiedKFold

In [None]:
# Apply seaborn's 'talk' context preset to the plots drawn below.
sns.set_context('talk')

# Load data

In [None]:
# Read the training set; the path is relative to the kernel's working directory.
df = pd.read_csv('data/train.csv')

In [None]:
# Columns to be stored with pandas' `category` dtype.
cat_vars = ['customer', 'category', 'suspicious', 'nationality', 'is_pep']

# Cast all of them in a single pass by handing `astype` a column -> dtype
# mapping; columns not listed keep their current dtype.
df = df.astype({column: 'category' for column in cat_vars})

In [None]:
# `suspicious` was cast to category dtype above, so it is converted to int
# before summing.
# NOTE(review): presumably the labels are 0/1, making this the number of
# suspicious rows -- confirm against the data.
df['suspicious'].astype(int).sum()

In [None]:
# Preview the first rows of the training frame.
df.head()

# Basic statistics

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# Per-column dtype and non-null count, plus memory usage.
df.info()

In [None]:
# Summary statistics using pandas' default column selection.
df.describe()

In [None]:
# Box plot of the `age` column.
# The vector is passed as `x=` explicitly: seaborn's first positional
# parameter changed from `x` to `data` across releases, so a bare positional
# Series is interpreted differently (or triggers a FutureWarning) depending on
# the installed version. The trailing `;` hides the Axes repr.
sns.boxplot(x=df['age']);

# Model

In [None]:
# Read the test-set features; the path is relative to the kernel's working directory.
test_features = pd.read_csv('data/test.csv')

In [None]:
# (rows, columns) of the test frame.
test_features.shape

## SVM

In [None]:
from sklearn import svm

In [None]:
# Draw a random subset of at most 100,000 rows for the SVM experiments
# (presumably to keep SVC training time manageable -- confirm).
# - `random_state` pins the draw so the subset, and every result derived from
#   it, is identical after Restart & Run All.
# - `min(..., len(df))` avoids the ValueError `sample` raises when asked for
#   more rows than the frame holds.
sub = df.sample(n=min(100_000, len(df)), random_state=42)

# Number of suspicious rows that landed in the subset (labels cast to int
# because the column has category dtype).
sub['suspicious'].astype(int).sum()

### Train

In [None]:
# Target vector: the `suspicious` label of the sampled rows.
y = sub.loc[:, 'suspicious']

# Feature matrix: two columns of the sampled rows.
# NOTE(review): `nationality` was cast to category dtype earlier and SVC
# expects numeric input -- presumably the values are numeric codes; confirm.
X = sub.loc[:, ['nationality', 'atm_withdrawal']]

In [None]:
# (rows, columns) of the feature matrix.
X.shape

In [None]:
# Two-fold splitter that keeps the class proportions of `y` in each fold.
# No shuffling is requested, so folds follow the row order of the data passed in.
skf = StratifiedKFold(n_splits=2)

In [None]:
%%time
for train_index, test_index in tqdm(skf.split(X, y), total=skf.get_n_splits(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    clf = svm.SVC(gamma='scale', class_weight='balanced')
    clf.fit(X_train, y_train)
    
    print(clf.score(X_test, y_test))

In [None]:
%%time
clf = svm.SVC(kernel='rbf', gamma='scale', class_weight='balanced')
clf.fit(X, y)

### Test

In [None]:
# Select the same two feature columns, in the same order, that the model
# was trained on.
sub_test = test_features.loc[:, ['nationality', 'atm_withdrawal']]

# (rows, columns) of the test feature matrix.
sub_test.shape

In [None]:
%%time
# Predict a label for every test row with the SVC fitted on the sampled subset.
predictions = clf.predict(sub_test)

In [None]:
# Number of test rows predicted as suspicious.
# Labels are cast to int before summing, matching how the training labels are
# counted elsewhere in this notebook, so the count does not depend on the
# dtype of the predicted labels.
predictions.astype(int).sum()

### Save results

In [None]:
# TODO: save `predictions` to disk (not implemented yet).