# Handling imbalanced datasets

- https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud?resource=download
- https://www.youtube.com/watch?v=pDw_JHHvj-0
- https://github.com/krishnaik06/Handle-Imbalanced-Dataset/blob/master/handling-imbalanced.ipynb

In [1]:
import pandas as pd
import numpy as np
import os

pd.options.display.float_format = '{:,.2f}'.format

## Loading dataset

In [2]:
cwd = os.getcwd()
os.chdir(os.path.dirname(os.path.dirname(cwd)))
data = pd.read_csv('_datasets\creditcard.csv')
os.chdir(cwd)
print(data.shape)
print(data.info())
data.head()

(284807, 31)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 2

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.36,-0.07,2.54,1.38,-0.34,0.46,0.24,0.1,0.36,...,-0.02,0.28,-0.11,0.07,0.13,-0.19,0.13,-0.02,149.62,0
1,0.0,1.19,0.27,0.17,0.45,0.06,-0.08,-0.08,0.09,-0.26,...,-0.23,-0.64,0.1,-0.34,0.17,0.13,-0.01,0.01,2.69,0
2,1.0,-1.36,-1.34,1.77,0.38,-0.5,1.8,0.79,0.25,-1.51,...,0.25,0.77,0.91,-0.69,-0.33,-0.14,-0.06,-0.06,378.66,0
3,1.0,-0.97,-0.19,1.79,-0.86,-0.01,1.25,0.24,0.38,-1.39,...,-0.11,0.01,-0.19,-1.18,0.65,-0.22,0.06,0.06,123.5,0
4,2.0,-1.16,0.88,1.55,0.4,-0.41,0.1,0.59,-0.27,0.82,...,-0.01,0.8,-0.14,0.14,-0.21,0.5,0.22,0.22,69.99,0


## EDA

In [3]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Time,284807.0,94813.86,47488.15,0.0,54201.5,84692.0,139320.5,172792.0
V1,284807.0,0.0,1.96,-56.41,-0.92,0.02,1.32,2.45
V2,284807.0,0.0,1.65,-72.72,-0.6,0.07,0.8,22.06
V3,284807.0,-0.0,1.52,-48.33,-0.89,0.18,1.03,9.38
V4,284807.0,0.0,1.42,-5.68,-0.85,-0.02,0.74,16.88
V5,284807.0,0.0,1.38,-113.74,-0.69,-0.05,0.61,34.8
V6,284807.0,0.0,1.33,-26.16,-0.77,-0.27,0.4,73.3
V7,284807.0,-0.0,1.24,-43.56,-0.55,0.04,0.57,120.59
V8,284807.0,0.0,1.19,-73.22,-0.21,0.02,0.33,20.01
V9,284807.0,-0.0,1.1,-13.43,-0.64,-0.05,0.6,15.59


In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

```python
sns.pairplot(data, aspect=1)
```
![pairplot](pairplot.png)

The data is imbalanced?

In [5]:
target = "Class"

In [6]:
data_distribution = pd.concat([data[target].value_counts(),data[target].value_counts(normalize=True)],axis=1)
data_distribution.columns = ['freq','freq_p']
data_distribution

Unnamed: 0,freq,freq_p
0,284315,1.0
1,492,0.0


In [7]:
print(f"ratio(class=min,class=greater) = 1:{int(round(data_distribution['freq'][0] / data_distribution['freq'][1],0))}")

ratio(class=min,class=greater) = 1:578


## Metrics

In [8]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [9]:
def check_metrics(model,X_test,y_test):
    y_pred=model.predict(X_test)
    metrics = dict()
    metrics['confusion_matrix'] = confusion_matrix(y_test,y_pred)
    metrics['accuracy_score'] = accuracy_score(y_test,y_pred)
    metrics['classification_report'] = classification_report(y_test,y_pred, output_dict=True)
    return metrics

## Preparing data

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
def split_data(data,target):
    X = data.drop(target,axis=1)
    y = data[target]
    print("X.shape=", X.shape)
    print("y.shape=", y.shape)
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.7)
    print(f"X_train={X_train.shape}, X_test={X_test.shape}, y_train={y_train.shape}, y_test={y_test.shape}")
    return X, y, X_train, X_test, y_train, y_test

# Implementacion de modelos

In [12]:
metrics = dict()

In [13]:
X, y, X_train, X_test, y_train, y_test = split_data(data,target)

X.shape= (284807, 30)
y.shape= (284807,)
X_train=(199364, 30), X_test=(85443, 30), y_train=(199364,), y_test=(85443,)


#### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
m_log = LogisticRegression()
m_log.fit(X_train,y_train)
metrics['log_baseline'] = check_metrics(model=m_log, X_test=X_test, y_test=y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
m_rf=RandomForestClassifier()
m_rf.fit(X_train,y_train)
metrics['rf_baseline'] = check_metrics(model=m_rf, X_test=X_test, y_test=y_test)

## Undersampling

### Manual

In [None]:
def undersampling_data(data,target):
    dist_target = dict(data[target].value_counts())
    dist_target = list((k,v) for k,v in dist_target.items())
    print(dist_target)
    k,s = dist_target[1]
    data_u = pd.concat([data[data[target]==k], data[data[target]!=k].sample(s)], axis=0).reset_index(drop=False)
    print(f"Data undersampled = {dict(data_u[target].value_counts())}")
    return data_u

In [None]:
data_u = undersampling_data(data,target)
data_u.shape

In [None]:
X, y, X_train, X_test, y_train, y_test = split_data(data=data_u,target=target)

#### Test LogistictRegresion

In [None]:
m_log_u = LogisticRegression()
m_log_u.fit(X_train,y_train)
metrics['log_undersampling_manual'] = check_metrics(model=m_log_u, X_test=X_test, y_test=y_test)

#### Test RandomForest

In [None]:
m_rf=RandomForestClassifier()
m_rf.fit(X_train,y_train)
metrics['rf_undersampling_manual'] = check_metrics(model=m_rf, X_test=X_test, y_test=y_test)

### imblearn

In [None]:
from imblearn.under_sampling import NearMiss
u_s=NearMiss()

In [None]:
X, y, X_train, X_test, y_train, y_test = split_data(data,target)

In [None]:
X_train_s,y_train_s=u_s.fit_resample(X_train,y_train)
print("X original=",dict(y_train.value_counts()), ", X undersampled=",dict(y_train_s.value_counts()))
X_test_s,y_test_s=u_s.fit_resample(X_test,y_test)
print("y original=",dict(y_test.value_counts()), ", y undersampled=",dict(y_test_s.value_counts()))

#### Test LogistictRegresion

In [None]:
m_log_u = LogisticRegression()
m_log_u.fit(X_train_s,y_train_s)
metrics['log_undersampling_nearmiss'] = check_metrics(model=m_log_u, X_test=X_test_s, y_test=y_test_s)

#### Test RandomForest

In [None]:
m_rf=RandomForestClassifier()
m_rf.fit(X_train_s,y_train_s)
metrics['rf_undersampling_nearmiss'] = check_metrics(model=m_rf, X_test=X_test_s, y_test=y_test_s)

## OverSampling

In [None]:
from imblearn.over_sampling import RandomOverSampler
o_s=RandomOverSampler()

In [None]:
X_train_s,y_train_s=o_s.fit_resample(X_train,y_train)
print("X original=",dict(y_train.value_counts()), ", X overdampled=",dict(y_train_s.value_counts()))
X_test_s,y_test_s=o_s.fit_resample(X_test,y_test)
print("y original=",dict(y_test.value_counts()), ", y overdampled=",dict(y_test_s.value_counts()))

#### Test LogistictRegresion

In [None]:
m_log_o = LogisticRegression()
m_log_o.fit(X_train_s,y_train_s)
metrics['log_oversampling_randomsampler'] = check_metrics(model=m_log_o, X_test=X_test_s, y_test=y_test_s)

#### Test RandomForest

In [None]:
m_rf_o=RandomForestClassifier()
m_rf_o.fit(X_train_s,y_train_s)
metrics['rf_oversampling_randomsampler'] = check_metrics(model=m_rf_o, X_test=X_test_s, y_test=y_test_s)

## SMOTETomek

In [None]:
from imblearn.combine import SMOTETomek
o_s=SMOTETomek()

In [None]:
X_train_s,y_train_s=o_s.fit_resample(X_train,y_train)
print("X original=",dict(y_train.value_counts()), ", X overdampled=",dict(y_train_s.value_counts()))
X_test_s,y_test_s=o_s.fit_resample(X_test,y_test)
print("y original=",dict(y_test.value_counts()), ", y overdampled=",dict(y_test_s.value_counts()))

#### Test LogistictRegresion

In [None]:
m_log_o = LogisticRegression()
m_log_o.fit(X_train_s,y_train_s)
metrics['log_smotetomek'] = check_metrics(model=m_log_o, X_test=X_test_s, y_test=y_test_s)

#### Test RandomForest

In [None]:
m_rf_o=RandomForestClassifier()
m_rf_o.fit(X_train_s,y_train_s)
metrics['rf_smotetomek'] = check_metrics(model=m_rf_o, X_test=X_test_s, y_test=y_test_s)

# Results

In [None]:
metrics['log_baseline']['classification_report']

In [None]:
metrics_df = dict()
for k,v in metrics.items():
    vals = dict()
    vals.update({"accuracy":v['classification_report']['accuracy']})
    vals.update(dict(("Class=0,"+k,v) for k,v in v['classification_report']['0'].items()))
    vals.update(dict(("Class=1,"+k,v) for k,v in v['classification_report']['1'].items()))
    metrics_df[k] = vals
metrics_df = pd.DataFrame.from_dict(metrics_df).T
metrics_df = metrics_df.sort_index()

In [None]:
metrics_df.sort_values(['Class=1,f1-score','Class=1,recall','Class=1,precision'],ascending=[False,False,False])