In [9]:
# https://www.kaggle.com/joparga3/in-depth-skewed-data-classif-93-recall-acc-now/notebook

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))


**Why do I use this kernel?**

I have a deep interest in finding out whether a transaction is fraudulent or not. 
About the Data set: 
*     It contains transactions made by credit cards in September 2013 by European cardholders.
*     It's highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.
*     It contains only numerical input variables which are the result of a PCA transformation (confidential data).

In [10]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [11]:
# Load the credit-card transactions CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')

In [12]:
# Preview the first rows to check the columns and the general structure of the frame.
df.head()

## As we can see, the data is confidential: the features are anonymized, so we don't know what the columns represent.

In [13]:
# Search for missing values: count the nulls in each column and show the largest
# count. A result of 0 means that no column contains any missing value.
df.isnull().sum().max()

## No missing values

# See if the data is balanced or unbalanced

In [14]:
from matplotlib.legend_handler import HandlerBase
from matplotlib.text import Text
class TextHandler(HandlerBase):
    """Legend handler that draws a ``(text, color)`` tuple as bold, centred text."""

    def create_artists(self, legend, tup ,xdescent, ydescent, width, height, fontsize,trans):
        """Build the legend entry for ``tup``.

        ``tup[0]`` is the text to display and ``tup[1]`` the colour to draw it
        in. The text is centred inside the ``width`` x ``height`` handle box.
        The remaining arguments are accepted but not used here.

        Returns a list holding the single ``Text`` artist.
        """
        caption = tup[0]
        colour = tup[1]
        centre_x = width/2.
        centre_y = height/2
        return [
            Text(centre_x, centre_y, caption, fontsize=fontsize,
                 ha="center", va="center", color=colour, fontweight="bold")
        ]
    
# Apply the seaborn theme and the figure size BEFORE drawing. These settings
# only affect figures created afterwards, so calling sns.set() after the plot
# (as this cell used to) left this chart unchanged unless the cell was run twice.
sns.set(rc = {'figure.figsize':(8,8)})

# Bar chart with one bar per value of the "Class" column.
ax = sns.countplot(x="Class", data=df)
plt.title("Frauds vs Not. Frauds")
plt.xlabel("Target")
plt.ylabel("Count")

# Add value counts to each bar (written just above the top of the bar).
for p in ax.patches:
    ax.annotate('{:.1f}'.format(p.get_height()), (p.get_x()+0.25, p.get_height()+0.01))

# Add right legend: map each x tick label ("0"/"1") to a readable class name.
handltext = ["0", "1"]
labels = ["Not Fraud", "Fraud"]

t = ax.get_xticklabels()
labeldic = dict(zip(handltext, labels))
labels = [labeldic[h.get_text()]  for h in t]
# Each handle is a (tick text, bar face colour) tuple; TextHandler renders it
# as coloured text instead of the default legend patch.
handles = [(h.get_text(),c.get_fc()) for h,c in zip(t,ax.patches)]

ax.legend(handles, labels, handler_map={tuple : TextHandler()}) 



In [15]:
# The classes are heavily skewed; we need to solve this issue later.
# Count the rows of each class once, then report each share as a percentage.
class_counts = df['Class'].value_counts()
n_rows = len(df)
print('No Frauds', round(class_counts[0]/n_rows * 100,2), '% of the dataset')
print('Frauds', round(class_counts[1]/n_rows * 100,2), '% of the dataset')

**Distributions:** Plotting the distributions gives us an idea of how skewed these features are; the same approach can be used to inspect the distributions of the other features. 

In [16]:
fig, ax = plt.subplots(1, 2, figsize=(18,4))

amount_val = df['Amount'].values
time_val = df['Time'].values

# One panel per raw feature: (target axis, values, colour, title).
panels = [
    (ax[0], amount_val, 'r', 'Distribution of Transaction Amount'),
    (ax[1], time_val, 'b', 'Distribution of Transaction Time'),
]

for panel_ax, panel_values, panel_color, panel_title in panels:
    # NOTE(review): distplot is deprecated in recent seaborn releases; consider
    # histplot(..., kde=True, stat="density") once the environment is upgraded.
    sns.distplot(panel_values, ax=panel_ax, color=panel_color)
    panel_ax.set_title(panel_title, fontsize=14)
    # Use the vectorized ndarray min/max rather than the Python builtins, which
    # iterate over the array element by element.
    panel_ax.set_xlim([panel_values.min(), panel_values.max()])

# Unbalanced Data

## What can we do?

* Collect more data.
* Use a correct metric (don't use accuracy):
    * Use the confusion matrix to calculate Precision and Recall
    * F1 score (harmonic mean of precision and recall)
    * Use Kappa - which is a classification accuracy normalized by the imbalance of the classes in the data
    * ROC curves - show the trade-off between sensitivity (true positive rate) and specificity across decision thresholds.
* Resampling the dataset (process the data to have an approximate 50-50 ratio).
    * OVER-sampling, adding copies of the under-represented class (better when you have little data).
    * UNDER-sampling, deletes instances from the over-represented class (better when we have lots of data).


## Scaling and Distributing
It's necessary to scale the Time and Amount columns. The other columns are already scaled. 

In [17]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# IMPORTANT: RobustScaler is less prone to outliers.

# std_scaler is created here but not used in this cell; only rob_scaler is applied.
std_scaler = StandardScaler()
rob_scaler = RobustScaler()

# Re-fit the robust scaler on each raw column in turn (Amount first, then Time)
# and store the scaled values in a new "scaled_*" column.
for raw_col, scaled_col in (('Amount', 'scaled_amount'), ('Time', 'scaled_time')):
    df[scaled_col] = rob_scaler.fit_transform(df[raw_col].values.reshape(-1,1))

# Erase old not scaled columns Time and Amount
df.drop(columns=['Time','Amount'], inplace=True)

In [18]:
# Move the two scaled columns to the front of the frame: take each one out of
# `df` (keeping the Series) and re-insert it at positions 0 and 1.
scaled_amount = df.pop('scaled_amount')
scaled_time = df.pop('scaled_time')

df.insert(0, 'scaled_amount', scaled_amount)
df.insert(1, 'scaled_time', scaled_time)

# Amount and Time are Scaled!

df.head()

## Sub-Sample the data

It will be a dataframe with a 50/50 ratio of fraud and non-fraud transactions. 

## Why do we create a sub-Sample?

Using the original dataframe will cause the following issues:

* **Overfitting:** Our classification models will assume that in most cases there are no frauds! What we want for our model is to be certain when a fraud occurs.
* **Wrong Correlations:** Although we don't know what the "V" features stand for, it is useful to understand how each of these features influences the result (Fraud or No Fraud). With an imbalanced dataframe we are not able to see the true correlations between the class and the features.

# Splitting the Data (Original DataFrame)

Before proceeding with the Random UnderSampling technique we have to split the original dataframe. Why? For testing purposes: although we split the data again when implementing the Random UnderSampling or OverSampling techniques, we want to test our models on the original testing set, not on a testing set created by either of these techniques. The main goal is to fit the model on the undersampled or oversampled dataframes (so that our models can detect the patterns) and to test it on the original testing set.

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold, StratifiedKFold

# Report the share of each class (0 = no fraud, 1 = fraud) as a percentage of all rows.
class_counts = df['Class'].value_counts()
for caption, class_value in (('No Frauds', 0), ('Frauds', 1)):
    print(caption, round(class_counts[class_value]/len(df) * 100,2), '% of the dataset')

# Features are every column except the target; the target is the "Class" column.
X = df.drop('Class', axis=1)
y = df['Class']

# NOTE: despite its name, `sss` is a StratifiedKFold with 5 unshuffled folds.
sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# Print the row positions of every fold. After the loop, `train_index` and
# `test_index` hold the positions of the LAST fold.
for train_index, test_index in sss.split(X, y):
    print("Train:", train_index, "Test:", test_index)

# Only the last fold's split is kept as the "original" train/test set, so the
# frames are sliced once here instead of being re-sliced and overwritten on
# every iteration of the loop.
original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]

# We already have X_train and y_train for undersample data thats why I am using original to distinguish and to not overwrite these variables.
# original_Xtrain, original_Xtest, original_ytrain, original_ytest = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the Distribution of the labels


# Turn into an array
# np.asarray returns the underlying array of a DataFrame/Series and passes an
# existing ndarray through unchanged, so this cell can be re-run safely
# (calling `.values` a second time would fail because ndarrays have no `.values`).
original_Xtrain = np.asarray(original_Xtrain)
original_Xtest = np.asarray(original_Xtest)
original_ytrain = np.asarray(original_ytrain)
original_ytest = np.asarray(original_ytest)

# See if both the train and test label distribution are similarly distributed
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)
print('-' * 100)

# Each printed array holds the fraction of rows per label, in sorted label order.
print('Label Distributions: \n')
print(train_counts_label/ len(original_ytrain))
print(test_counts_label/ len(original_ytest))