### Handling Imbalanced Data - Oversampling

In [None]:
import numpy as np
import pandas as pd
import sklearn
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report,accuracy_score
# Seed value; presumably meant to be shared by the randomized steps so runs are repeatable.
RANDOM_SEED = 42
# Display names for the two classes in class-value order (0 -> "Normal", 1 -> "Fraud");
# used as the tick labels of the class-distribution bar chart.
LABELS = ["Normal", "Fraud"]

In [None]:
# Load the transactions table from the working directory. A comma is already
# read_csv's default delimiter, so it is not passed explicitly.
data = pd.read_csv('creditcards.csv')
# Preview the first rows.
data.head()

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts, memory usage).
data.info()

In [None]:
# Split the frame into independent features (X) and the dependent label (Y).
# Name of the column being predicted.
target = "Class"
# Every column except the target is kept as a feature.
columns = [name for name in data.columns.tolist() if name != target]
# Seeded random generator.
state = np.random.RandomState(42)
X = data[columns]
Y = data[target]
# Report the dimensions of X and Y.
print(X.shape)
print(Y.shape)

Exploratory Data Analysis

In [None]:
# True when at least one cell anywhere in the frame is missing.
data.isna().to_numpy().any()

In [None]:
# Count the rows in each class. Series.value_counts is used because the
# top-level pd.value_counts helper is deprecated in pandas. Ordering by class
# value (instead of by frequency) pins bar 0 to class 0 and bar 1 to class 1,
# so the LABELS tick names below always sit under the matching bar.
count_classes = data['Class'].value_counts().sort_index()

count_classes.plot(kind='bar', rot=0)

plt.title("Transaction Class Distribution")

# Replace the raw 0/1 tick text with the readable class names.
plt.xticks(range(len(LABELS)), LABELS)

plt.xlabel("Class")

plt.ylabel("Frequency")

In [None]:
# Partition the transactions by label: Class == 1 rows are fraud, Class == 0 rows are normal.

fraud = data.loc[data['Class'].eq(1)]

normal = data.loc[data['Class'].eq(0)]

In [None]:
# Show the (rows, columns) shape of the fraud subset, then of the normal subset.
print(fraud.shape,normal.shape)

In [None]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss

In [None]:
# Rebalance the imbalanced classes with SMOTETomek (from imblearn.combine).
# The resampler is seeded with 42; fit_resample returns the resampled
# features and labels.
smk = SMOTETomek(random_state=42)
X_res,y_res=smk.fit_resample(X,Y)

In [None]:
# Dimensions of the resampled features and of the resampled labels.
X_res.shape,y_res.shape

In [None]:
from collections import Counter
# Compare the per-class row counts before (Y) and after (y_res) resampling.
print('Original dataset shape {}'.format(Counter(Y)))
print('Resampled dataset shape {}'.format(Counter(y_res)))

### Using SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from collections import Counter

# Build a synthetic two-class dataset: 1000 samples and 20 features, with
# class weights of 0.1 / 0.9 so that the classes are imbalanced.
# NOTE: this rebinds X, which earlier held the credit-card feature columns.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    weights=[0.1, 0.9],
    flip_y=0,
    random_state=10,
)


In [None]:
# Print the per-class sample counts before resampling
print('Original dataset shape %s' % Counter(y))

# Apply SMOTE (seeded with 42) to the synthetic dataset.
# NOTE: this rebinds X_res / y_res, which earlier held the SMOTETomek output.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

# Print the per-class sample counts after applying SMOTE
print('Resampled dataset shape %s' % Counter(y_res))