## Basic Initialization

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import graphviz
from feature_engineering import *

In [None]:
# Directory holding the unzipped CSV files. Defined once so the whole notebook
# can be pointed at a different copy of the data by editing a single line.
DATA_DIR = 'data/unzipped'

# Transaction tables: `purchase_date` is parsed to datetime while loading.
hist_trans_df = pd.read_csv(DATA_DIR + '/historical_transactions.csv',
                            parse_dates=['purchase_date'])
merch_trans_df = pd.read_csv(DATA_DIR + '/new_merchant_transactions.csv',
                             parse_dates=['purchase_date'])

# Merchant table, indexed by `merchant_id`.
merchants_df = pd.read_csv(DATA_DIR + '/merchants.csv',
                           index_col='merchant_id')

# Train and test tables, both indexed by `card_id` with `first_active_month`
# parsed to datetime.
train_and_validation_df = pd.read_csv(DATA_DIR + '/train.csv',
                                      index_col='card_id',
                                      parse_dates=['first_active_month'])
test_df = pd.read_csv(DATA_DIR + '/test.csv',
                      index_col='card_id',
                      parse_dates=['first_active_month'])

In [None]:
# Aggregation spec keyed by transaction column name; each value lists the
# aggregation functions requested for that column.
aggregators = {
    'purchase_amount': ['sum', 'mean', 'min', 'max', 'std', 'count'],
    'installments': ['sum', 'mean', 'min', 'max', 'std'],
    'month_lag': ['mean', 'min', 'max'],
}
# For the identifier columns only the number of distinct values is requested.
# Each column gets its own list object, and the insertion order is kept.
aggregators.update({
    id_column: ['nunique']
    for id_column in ('merchant_id',
                      'merchant_category_id',
                      'state_id',
                      'city_id',
                      'subsector_id')
})

In [None]:
# Apply the `aggregators` spec to the historical transactions.
# NOTE(review): the result is not assigned, so the helper presumably adds the
# new columns to train_and_validation_df in place — confirm in feature_engineering.
add_aggregated_numerical_fields(
    train_and_validation_df,
    hist_trans_df,
    aggregators=aggregators,
)

In [None]:
# Aggregate the listed categorical transaction columns for the training frame.
# NOTE(review): the result is not assigned, so the helper presumably mutates
# train_and_validation_df in place — confirm in feature_engineering.
add_aggregated_categorical_fields(
    train_and_validation_df,
    hist_trans_df,
    column_names=[
        'authorized_flag',
        'category_1',
        'category_2',
        'category_3',
    ],
)

In [None]:
# category_2 and category_3 contain NaN values, so they are left out for now.
# NOTE(review): the excluded-column lists in later cells refer to
# 'authorized_flag_top' and 'category_1_top', so this helper presumably adds
# '<column>_top' columns — confirm in feature_engineering.
add_top_categories(
    train_and_validation_df,
    hist_trans_df,
    column_names=[
        'authorized_flag',
        'category_1',
        'subsector_id',
        'city_id',
        'state_id',
    ],
)

## Outlier Separation

Outliers are the rows where the target value is below -30. Most target values are clustered around 0, but there is also a group of rows that all share the same value of about -33. Apparently, predicting those correctly is crucial for good performance in the competition.

In [None]:
# Label every row: 'Outlier' when its target is below -30, 'Normal' otherwise.
train_and_validation_df['outlier'] = np.where(
    train_and_validation_df['target'].lt(-30),
    'Outlier',
    'Normal',
)
train_and_validation_df.head()

Separate the outliers into their own table

In [None]:
# Independent copies of the two groups, so later edits to either subset do not
# touch train_and_validation_df.
normal_df = train_and_validation_df.loc[
    train_and_validation_df['outlier'].eq('Normal')
].copy()
outlier_df = train_and_validation_df.loc[
    train_and_validation_df['outlier'].eq('Outlier')
].copy()

In [None]:
# Preview the first rows of the outlier-only frame.
outlier_df.head()

Trying to see if there are any differences in the correlations between the normal (non-outlier) rows and the outlier rows. Practically everything looks exactly the same.

In [None]:
# Correlation heatmap for the non-outlier rows.
# Only numeric/bool columns are correlated: the frame also holds string columns
# (e.g. 'outlier'), and recent pandas versions raise on those instead of
# silently dropping them.
plt.figure(figsize=(14, 10))
sns.heatmap(normal_df.select_dtypes(include=['number', 'bool']).corr(),
            vmin=-1, vmax=1, cmap='PiYG')

In [None]:
# Correlation heatmap for the outlier rows.
# Only numeric/bool columns are correlated: the frame also holds string columns
# (e.g. 'outlier'), and recent pandas versions raise on those instead of
# silently dropping them.
plt.figure(figsize=(14, 10))
sns.heatmap(outlier_df.select_dtypes(include=['number', 'bool']).corr(),
            vmin=-1, vmax=1, cmap='PiYG')

Since there are no real visible differences, it's a good idea to plot the differences between the correlations. Note that the scale is different (from -0.4 to 0.4) to highlight the differences. There may be something interesting going on with `month_lag`, but otherwise the differences confirm the visual inspection of the previous graphs.

In [None]:
# Difference between the two correlation matrices (normal minus outlier),
# drawn on a narrower colour scale so small differences stay visible.
# Both frames are restricted to numeric/bool columns first: they also hold
# string columns (e.g. 'outlier') that recent pandas versions refuse to correlate.
plt.figure(figsize=(14, 10))
sns.heatmap(normal_df.select_dtypes(include=['number', 'bool']).corr()
            - outlier_df.select_dtypes(include=['number', 'bool']).corr(),
            vmin=-0.4, vmax=0.4, cmap='PiYG')

## Trying a Decision Tree

Checking if a decision tree can be used to separate the outliers from the rest. We measure accuracy (proportion of correct predictions among all predictions), precision (proportion of true positives among predicted positives), and recall (proportion of true positives among actual positives).

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

def draw_decision_tree(clf, feature_names):
    """Build a Graphviz rendering of a fitted decision tree.

    Parameters
    ----------
    clf : fitted tree classifier
        Must expose ``classes_``; those labels are used as the class names.
    feature_names : sequence of str
        Names shown for the features in the split nodes.

    Returns
    -------
    graphviz.Source
        Wraps the DOT text produced by ``tree.export_graphviz``; it is the
        cell output when the call is the last expression in a cell.
    """
    export_options = dict(
        out_file=None,  # return the DOT text instead of writing a file
        feature_names=feature_names,
        class_names=clf.classes_,
        filled=True,
        rounded=True,
        proportion=False,
    )
    return graphviz.Source(tree.export_graphviz(clf, **export_options))

def evaluate_classification_results(clf, X_test, y_test, pos_label='Outlier'):
    """Print accuracy, precision, recall and a labelled confusion matrix.

    Parameters
    ----------
    clf : fitted classifier
        Must provide ``predict`` and ``classes_``.
    X_test : feature matrix to predict on.
    y_test : true labels for ``X_test``.
    pos_label : label treated as the positive class for precision and recall.
        Defaults to ``'Outlier'``, the value that used to be hard-coded.

    Returns
    -------
    None. All results are printed.
    """
    y_pred = clf.predict(X_test)

    print('Accuracy: {:.4f}'.format(accuracy_score(y_test, y_pred)))
    print('Precision: {:.4f}'.format(precision_score(y_test, y_pred, pos_label=pos_label)))
    print('Recall: {:.4f}'.format(recall_score(y_test, y_pred, pos_label=pos_label)))
    print()

    # Pin the matrix layout to the classifier's classes. Without `labels`, the
    # rows/columns come from the labels present in y_test / y_pred, so a split
    # missing one class yields a smaller matrix that no longer matches the
    # row/column names built below.
    class_labels = list(clf.classes_)
    C = confusion_matrix(y_test, y_pred, labels=class_labels)
    cm_row_labels = ['True {}'.format(x) for x in class_labels]
    cm_column_labels = ['Predicted {}'.format(x) for x in class_labels]
    print(pd.DataFrame(C, index=cm_row_labels, columns=cm_column_labels))

In [None]:
# List each column's dtype to see which ones are non-numeric.
train_and_validation_df.dtypes

The decision tree classifier likes only numeric features, so we need to remove all the non-numeric features (including datetime).

In [None]:
# Model inputs: every column except the label columns ('target', 'outlier')
# and the datetime / non-numeric columns listed here.
tree_df = train_and_validation_df[train_and_validation_df.columns.difference([
    'first_active_month',
    'target',
    'authorized_flag_top',
    'category_1_top',
    'outlier',
])]
# Fill missing values with the column mean. Rebinding the name (rather than
# inplace=True) avoids pandas' SettingWithCopyWarning on a frame that was
# selected out of another frame.
tree_df = tree_df.fillna(tree_df.mean())
# The fixed seed makes the split reproducible across kernel restarts, and
# stratifying keeps the same share of the rare 'Outlier' class in both splits.
data_train, data_test, label_train, label_test = train_test_split(
    tree_df,
    train_and_validation_df['outlier'],
    test_size=0.2,
    random_state=281316,
    stratify=train_and_validation_df['outlier'],
)
tree_df.head()


Fiddling with the `max_depth` value gives more or less granularity to the decision tree. It doesn't take very long to build even a deep tree.

In [None]:
# max_depth caps how deep the tree may grow. random_state is fixed so that
# re-running the cell yields the same tree; without it, ties between equally
# good splits may be broken differently on each run.
clf = DecisionTreeClassifier(max_depth=9, random_state=281316)
clf.fit(data_train, label_train)

draw_decision_tree(clf, data_train.columns.values)

In [None]:
# Metrics on the rows the tree was fitted on (optimistic by construction).
evaluate_classification_results(clf, data_train, label_train)

In [None]:
# Metrics on the held-out 20% split.
evaluate_classification_results(clf, data_test, label_test)

The result is that a decision tree is not very good. In general, decision trees are probably a bad idea to predict rare events. By increasing the depth of the tree, we can get good-ish results on the training set, but the tree is horribly overfitted and produces bad results on the validation set.

## Penalizing Outlier Prediction Mistakes

There are [techniques for handling imbalanced classes](https://elitedatascience.com/imbalanced-classes). Let's
try another one: penalizing mistakes made in predicting outliers.

In [None]:
from sklearn.svm import SVC

In [None]:
# (rows, columns) of the full labelled frame, to judge how far it must be sampled down.
train_and_validation_df.shape

In [None]:
# Work on a reproducible 2% sample of the labelled data.
sampled_train_and_validation_df = train_and_validation_df.sample(frac=0.02, random_state=281316)
# Model inputs: every column except the label columns ('target', 'outlier')
# and the datetime / non-numeric columns listed here.
reduced_df = sampled_train_and_validation_df[sampled_train_and_validation_df.columns.difference([
    'first_active_month',
    'target',
    'authorized_flag_top',
    'category_1_top',
    'outlier',
])]
# Fill missing values with the column mean. Rebinding the name (rather than
# inplace=True) avoids pandas' SettingWithCopyWarning on a frame that was
# selected out of another frame.
reduced_df = reduced_df.fillna(reduced_df.mean())
# The fixed seed makes the split reproducible. Stratifying matters here: the
# sample is small, so an unstratified split could leave very few (or no)
# 'Outlier' rows in the test part.
data_train, data_test, label_train, label_test = train_test_split(
    reduced_df,
    sampled_train_and_validation_df['outlier'],
    test_size=0.2,
    random_state=281316,
    stratify=sampled_train_and_validation_df['outlier'],
)
data_train.shape

In [None]:
# class_weight='balanced' is the "penalize mistakes" technique of this section:
# it weights classes inversely to their frequency.
# NOTE(review): only predict() is called on svc_clf in the cells shown here, so
# probability=True may be unnecessary — confirm nothing else uses predict_proba.
svc_clf = SVC(
    kernel='linear',
    class_weight='balanced',
    probability=True,
)
svc_clf.fit(data_train, label_train)

In [None]:
# Metrics on the rows the SVC was fitted on (optimistic by construction).
evaluate_classification_results(svc_clf, data_train, label_train)

In [None]:
# Metrics on the held-out 20% of the sample.
evaluate_classification_results(svc_clf, data_test, label_test)

The problem with this is that, according to the documentation, "The fit time complexity is more than quadratic with the number of samples", meaning that it doesn't seem possible to use a large enough dataset. In any case, this method does not seem to do so well with precision, though recall is much better than with a decision tree.

## Upsampling

One more option is upsampling, that is, artificially inflating the number of outlier samples. Assuming there is any signal in the outlier data, this should enhance it so that it doesn't get lost in the noise.

In [None]:
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression

In [None]:
# Split first, then upsample only the training part, so duplicated outlier
# rows never end up in the validation set. The fixed seed makes the split
# reproducible; stratifying keeps the same share of the rare 'Outlier' class
# in both parts.
train_df, validation_df = train_test_split(
    train_and_validation_df,
    test_size=0.2,
    random_state=281316,
    stratify=train_and_validation_df['outlier'],
)
# NOTE: this rebinds normal_df / outlier_df, which earlier cells defined as
# subsets of the full frame; from here on they hold training rows only.
normal_df = train_df[train_df['outlier'] == 'Normal']
outlier_df = train_df[train_df['outlier'] == 'Outlier']
# Draw outliers with replacement until there are as many as normal rows.
resampled_outlier_df = resample(outlier_df,
                                replace=True,
                                n_samples=len(normal_df),
                                random_state=281316)
resampled_df = pd.concat([normal_df, resampled_outlier_df])
resampled_df.outlier.value_counts()

In [None]:
# Model inputs: every column except the label columns ('target', 'outlier')
# and the datetime / non-numeric columns listed here.
reduced_df = resampled_df[resampled_df.columns.difference([
    'first_active_month',
    'target',
    'authorized_flag_top',
    'category_1_top',
    'outlier',
])]
# Fill missing values with the column mean. Rebinding the name (rather than
# inplace=True) avoids pandas' SettingWithCopyWarning on a frame that was
# selected out of another frame.
reduced_df = reduced_df.fillna(reduced_df.mean())

# The seed keeps the fit reproducible for solvers that shuffle the data.
log_clf = LogisticRegression(random_state=281316)
log_clf.fit(reduced_df, resampled_df['outlier'])

In [None]:
# Metrics on the upsampled training data the model was fitted on.
evaluate_classification_results(log_clf, reduced_df, resampled_df['outlier'])

In [None]:
# Same feature selection as for the training frame.
reduced_validation_df = validation_df[validation_df.columns.difference([
    'first_active_month',
    'target',
    'authorized_flag_top',
    'category_1_top',
    'outlier',
])]
# Impute with the means of reduced_df (the frame log_clf was fitted on) rather
# than the validation set's own means, so validation rows get the same
# preprocessing as training rows and no validation statistics leak in.
# fillna aligns the means to the columns by name.
reduced_validation_df = reduced_validation_df.fillna(reduced_df.mean())
evaluate_classification_results(log_clf, reduced_validation_df, validation_df['outlier'])

Precision again appears to be a problem, but recall is quite good.