# **Rare event**

Undersampling and oversampling methods with logistic regression.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns # for intractve graphs

from sklearn.preprocessing import MinMaxScaler # Pour centrer et réduire les données 
from imblearn.over_sampling import SMOTE # Algorithme SMOTE
from imblearn.over_sampling import ADASYN, BorderlineSMOTE # ADASYN and borderline SMOTE variants

from sklearn import linear_model 
from sklearn import metrics
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# Load the data set (comma-separated, header on the first row).
df = pd.read_csv('../input/creditcard.csv', header=0)

# Overview of the data set. Only the last expression of a cell is displayed
# automatically, so intermediate summaries are printed explicitly.
print(df.describe())  # summary statistics
print(df.shape)       # dimensions
print(type(df))       # container type
print(df.dtypes)      # dtype of each column
print(df.columns.tolist()) # column names

# Number of rows per class (1 = fraud, 0 = ok, as labelled by the prints below).
n0 = len(df[df["Class"]==0])
n1 = len(df[df["Class"]==1])
print("n_fraudes = ",n1)
print("n_ok = ",n0)

# Each plot gets its own figure; otherwise the heatmap is drawn on top of the
# count plot because both calls target the current axes.
plt.figure()
sns.countplot(x='Class',data=df, palette='hls')
plt.title("Number of rows per class")
plt.show()

# Check for missing values.
print(df.isnull().sum())

# Pairwise correlation between columns.
plt.figure()
sns.heatmap(df.corr())
plt.title("Correlation matrix")
plt.show()

# Absolute frequency table of the target variable.
my_tab = pd.crosstab(index=df['Class'],  # Make a crosstab
                     columns="count")      # Name the count column
my_tab


In [None]:
# Bar chart of the number of rows for each value of the "Class" column of df.
# NOTE(review): the same call already appears in the first code cell.
sns.countplot(x='Class',data=df, palette='hls')

# Description des méthodes : 

**Oversampling** : la classe minoritaire est rééchantillonnée de sorte qu’elle soit « sur-représentée » en termes de fréquence dans la base d’apprentissage.

**Undersampling** : la classe majoritaire est rééchantillonnée de sorte que sa fréquence dans la base d’apprentissage soit la plus proche possible de celle de la classe minoritaire.

In [None]:
# The class balance of the result is driven by `times`: the number of
# non-target rows drawn is int(n_target * times * 2 / 0.5), i.e. 4 * times
# non-target rows per target row (times=0.25 gives a 50/50 split).
def undersampling(data,target,times=0.75, random_state=None):
    """Build a resampled frame that keeps few non-target rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to resample; it must contain the ``target`` column.
    target : str
        Name of the binary target column (1 = target, 0 = non-target).
    times : float, default 0.75
        Scales the number of non-target rows:
        ``int(n_target * times * 2 / 0.5)`` rows are drawn.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` for reproducible draws.

    Returns
    -------
    pandas.DataFrame
        Non-target rows followed by target rows (original index labels kept).
    """
    # Bootstrap the target rows (same count, drawn with replacement).
    # The rows come from `data`, not from a notebook-level global frame.
    samp1 = data[data[target]==1].sample(frac=1, replace=True, random_state=random_state)
    n1 = samp1.shape[0]
    # Number of non-target rows per target row.
    facteur = times*2/0.5
    # Draw the non-target rows, with replacement.
    samp0 = data[data[target]==0].sample(n=int(n1*facteur), replace=True, random_state=random_state)
    # Stack both samples.
    samp = pd.concat([samp0,samp1])
    return samp

# Build a training base.
# Example run:
# NOTE(review): the frame returned by undersampling() is stored under the name
# df_over; confirm that this naming is intended.
df_over = undersampling(df,"Class",times=0.5)
df_over.shape
# df_over = undersampling(df,"Class",times=0.75)

In [None]:
# Oversampling keeps only a fraction of the non-target rows; the number of
# target rows is then derived from the size of that fraction.
def oversample(data,target,times = 0.75,frac_nontarget = 0.5, random_state=None):
    """Build a resampled frame in which target rows are drawn with replacement.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to resample; it must contain the ``target`` column.
    target : str
        Name of the binary target column (1 = target, 0 = non-target).
    times : float, default 0.75
        Scales the class ratio: ``int(n_nontarget / (times * 2 / 0.5))``
        target rows are drawn, i.e. about ``4 * times`` non-target rows per
        target row. Must be strictly positive.
    frac_nontarget : float, default 0.5
        Fraction of the non-target rows to draw (with replacement).
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` for reproducible draws.

    Returns
    -------
    pandas.DataFrame
        Non-target rows followed by target rows (original index labels kept).

    Raises
    ------
    ValueError
        If ``times`` is not strictly positive.
    """
    if times <= 0:
        # Without this check times=0 fails later with a bare ZeroDivisionError.
        raise ValueError("times must be strictly positive, got %r" % (times,))
    # Draw the non-target rows.
    samp0 = data[data[target]==0].sample(frac=frac_nontarget, replace=True, random_state=random_state)
    # Number of non-target rows per target row.
    facteur = times*2/0.5
    # Draw the target rows, with replacement.
    samp1 = data[data[target]==1].sample(n=int(samp0.shape[0]/facteur), replace=True, random_state=random_state)
    # Stack both samples.
    samp = pd.concat([samp0,samp1])
    return samp

# NOTE(review): the original remark described `times` as the percentage of
# non-fraud rows; in oversample() it scales the ratio of non-target to target
# rows (facteur = times*2/0.5) rather than being a percentage.

# Example run:
# NOTE(review): the frame returned by oversample() is stored under the name
# df_under; confirm that this naming is intended.
df_under = oversample(df,"Class",times = 0.75)
df_under.shape

In [None]:
"""
Preparation des données : centrer et réduire 
"""
from sklearn.preprocessing import StandardScaler # for preprocessing the data
# Standardise "Amount" into "Normalized Amount", then drop the raw "Time" and
# "Amount" columns. The guard keeps the cell re-runnable: once "Amount" has
# been dropped there is nothing left to transform.
if "Amount" in df.columns:
    # pandas Series has no reshape() in current releases, so reshape the
    # underlying NumPy array into the (n_samples, 1) layout the scaler expects,
    # then flatten the result back to one value per row.
    amount_2d = df["Amount"].values.reshape(-1, 1)
    df["Normalized Amount"] = StandardScaler().fit_transform(amount_2d).ravel()
    df = df.drop(["Time", "Amount"], axis=1, errors="ignore")
df.head()


In [None]:
"""
Preparation des données : creation des bases d'apprentissage et  
"""
def dataprep(data, target, times, type_sample = "over"):
    """Resample ``data`` and split the result into train and validation parts.

    ``undersampling`` is used when ``type_sample`` contains the substring
    "under", ``oversample`` otherwise; ``times`` is forwarded to the chosen
    resampler. The resampled frame is separated into the ``target`` column and
    the remaining columns, then split 80/20 with ``random_state=12``.

    Returns the tuple ``(x_train, x_val, y_train, y_val)``.

    NOTE(review): resampling (with replacement) happens before the split, so
    the same source row may land in both parts -- confirm this is intended.
    """
    if "under" in type_sample:
        banner, resampler = "Creation des bases undersampling\n", undersampling
    else:
        banner, resampler = "Creation des bases oversampling\n", oversample
    print(banner)
    resampled = resampler(data, target, times=times)

    # Target vector vs. design matrix.
    labels = resampled[target]
    features = resampled.drop([target], axis=1)

    # Hold out 20% of the resampled rows for validation.
    print("\nSplit en base train et validation ")
    feat_train, feat_val, lab_train, lab_val = train_test_split(
        features, labels, test_size=.2, random_state=12
    )
    return (feat_train, feat_val, lab_train, lab_val)

# Build the train/validation split with the oversampling path and times=0.75.
# These notebook-level names (x_train, x_val, y_train, y_val) are read again
# by the SMOTE cells further down.
x_train, x_val, y_train, y_val = dataprep(df, "Class", type_sample = "over",times=0.75)

In [None]:
def model_accuracy(trained_model, features, targets):
    """Return whatever ``trained_model.score(features, targets)`` yields."""
    return trained_model.score(features, targets)

In [None]:
## first make a model function for modeling with confusion matrix
def _positive_class_scores(estimator, features):
    """Return continuous scores for the positive class of a binary estimator.

    Uses the second column of ``predict_proba`` when available, then
    ``decision_function``, and only falls back to hard ``predict`` labels when
    the estimator exposes neither.
    """
    if hasattr(estimator, "predict_proba"):
        return estimator.predict_proba(features)[:, 1]
    if hasattr(estimator, "decision_function"):
        return estimator.decision_function(features)
    return estimator.predict(features)


def model(model,features_train,features_test,labels_train,labels_test):
    """Fit ``model`` and report AUC, accuracy, recall and the confusion matrix.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``; it is fitted in
        place on the training data.
    features_train, features_test : array-like
        Design matrices for training and evaluation.
    labels_train, labels_test : array-like
        Binary labels matching the design matrices.

    Returns
    -------
    None
        Results are printed and the confusion matrix of the evaluation set is
        shown as a heatmap.
    """
    # np.asarray accepts pandas objects as well as plain arrays/lists.
    model.fit(features_train, np.asarray(labels_train).ravel())
    pred = model.predict(features_test)
    cnf_matrix = confusion_matrix(labels_test, pred)

    print("\nAUC ----------------------------------- ")
    # The ROC curve needs continuous scores: with hard 0/1 predictions it
    # collapses to a single operating point and the AUC is not meaningful.
    fpr, tpr, _ = metrics.roc_curve(labels_train, _positive_class_scores(model, features_train))
    print("Train AUC : ", metrics.auc(fpr, tpr))
    fpr, tpr, _ = metrics.roc_curve(labels_test, _positive_class_scores(model, features_test))
    print("Validation AUC : ",metrics.auc(fpr, tpr))

    print("Accuracy ------------------------------ ")
    train_accuracy = model_accuracy(model, features_train, labels_train)
    print ("Train Accuracy : ", train_accuracy)
    val_accuracy = model_accuracy(model, features_test, labels_test)
    print ("Validation Accuracy : ", val_accuracy)

    print("Recall ------------------------------ ")
    # recall_score equals TP / (TP + FN) for 0/1 labels and does not index
    # into the confusion matrix, which may be 1x1 when only one class occurs.
    print("the recall for this model is :", recall_score(labels_test, pred))

    plt.figure(figsize=(6,3))
    # fmt="d" prints the integer counts instead of scientific notation.
    sns.heatmap(cnf_matrix,cmap="coolwarm_r",annot=True,fmt="d",linewidths=0.5)
    plt.title("Confusion_matrix sur la base de test ")
    plt.xlabel("Predicted_class")
    plt.ylabel("Real class")
    plt.show()

# Exemple de lancement : 
# model(linear_model.LogisticRegression(),x_train,x_val,y_train,y_val)

In [None]:
# Driver générique pour lancer le modèle
def driver(data, target, typemodel, method_sample = "over", times = 0.75):
    """Run the full pipeline for one resampling setting.

    Prints a banner, builds the train/validation split through ``dataprep``
    (forwarding ``method_sample`` as ``type_sample`` and ``times``), then hands
    ``typemodel`` and the four split parts to ``model``.
    """
    print("---------------------------------- \n Lancement pour le modèle avec ", times, "\n ---------------------------------- ")
    print("\n---------- Information concernant le split de la BDD  \n ")
    splits = dataprep(data, target, type_sample=method_sample, times=times)
    print("\n---------- Information concernant le modele ---------- \n ")
    # splits is (x_train, x_val, y_train, y_val), the positional order model() expects.
    model(typemodel, *splits)

# Run the pipeline once with a logistic regression, the oversampling path and
# times=0.85.
typemodel = linear_model.LogisticRegression()
driver(df, "Class", typemodel, method_sample = "over", times = 0.85)
# The string literal below holds further example runs that are not executed.
# NOTE(review): if it is the last expression of the cell, the notebook will
# echo the string itself as the cell output -- confirm whether that is wanted.
"""
driver(df, "Class", typemodel, method_sample = "over", times = 0.75)
driver(df, "Class", typemodel, method_sample = "over", times = 0.65)
driver(df, "Class", typemodel, method_sample = "over", times = 0.50)
"""


In [None]:
"""
ALGO SMOTE avec une régression logistique
"""
# Chargement des modules

"""
TO DO:
- revoir le pgm 
"""
LogReg = linear_model.LogisticRegression()

# SMOTE over-sampler.
sm = SMOTE(random_state=42)

# Target vs. design matrix, taken from the full frame.
y = df["Class"]
X = df.drop(["Class"], axis=1)

# fit_resample is the current imbalanced-learn name of the former fit_sample.
x_res, y_res = sm.fit_resample(X, y)

# Fit the logistic regression on the SMOTE-resampled data.
LogReg.fit(x_res, y_res)

# ROC/AUC is computed from the predicted probability of class 1; hard 0/1
# predictions would reduce the curve to a single operating point.
fpr, tpr, thresholds = metrics.roc_curve(y_res, LogReg.predict_proba(x_res)[:, 1])
print('Train Results\n', metrics.auc(fpr, tpr),'\n', recall_score(y_res, LogReg.predict(x_res)))

# Scores on the validation split built by the earlier dataprep() call.
# NOTE(review): x_val was drawn from df, and the model above is fitted on a
# resampling of all of df -- confirm whether this overlap is acceptable.
fpr, tpr, thresholds = metrics.roc_curve(y_val, LogReg.predict_proba(x_val)[:, 1])
print('\nValidation Results\n', metrics.auc(fpr, tpr),'\n', recall_score(y_val, LogReg.predict(x_val)))

# Scores on the full population.
fpr, tpr, thresholds = metrics.roc_curve(y, LogReg.predict_proba(X)[:, 1])
print('\nBase Results\n', metrics.auc(fpr, tpr),'\n', recall_score(y, LogReg.predict(X)))

In [None]:
"""
ALGO SMOTE - Comparaison des différents algorithmes SMOTE 
- regular
- borderline1 et 2
- SVM
"""
LogReg = linear_model.LogisticRegression()

# SMOTE variants to compare ('svm' is left out because it takes too long).
kind = ['regular', 'borderline1', 'borderline2']

# Current imbalanced-learn releases no longer accept these names through
# SMOTE(kind=...): the borderline variants live in BorderlineSMOTE.
samplers = {
    'regular': SMOTE(random_state=42),
    'borderline1': BorderlineSMOTE(random_state=42, kind='borderline-1'),
    'borderline2': BorderlineSMOTE(random_state=42, kind='borderline-2'),
}

print("Demarrage de l'analyse avec differentes méthodes ", kind)
# Compare the variants on the same train / validation / population data.
for k in kind:
    sm = samplers[k]
    x_res, y_res = sm.fit_resample(x_train, y_train)
    LogReg.fit(x_res, y_res)

    # Hard predictions are computed once per data set and reused below.
    pred_train = LogReg.predict(x_res)
    pred_val = LogReg.predict(x_val)
    pred_pop = LogReg.predict(X)

    # ROC/AUC uses the predicted probability of class 1, not the hard labels.
    fpr_train, tpr_train, thresholds_train = metrics.roc_curve(y_res, LogReg.predict_proba(x_res)[:, 1])
    fpr_val, tpr_val, thresholds_val = metrics.roc_curve(y_val, LogReg.predict_proba(x_val)[:, 1])
    fpr_pop, tpr_pop, thresholds_pop = metrics.roc_curve(y, LogReg.predict_proba(X)[:, 1])
    print("Résultat pour le modèle %s -------------- " % (k))
    print("AUC sur la base d'apprentissage : %f" % (metrics.auc(fpr_train, tpr_train)))
    print("AUC sur la base de validation : %f" % (metrics.auc(fpr_val, tpr_val)))
    print("AUC sur la population : %f" % (metrics.auc(fpr_pop, tpr_pop)))

    print("\nRecall sur la base d'apprentissage" , recall_score(y_res, pred_train))
    print("Recall sur la base de validation " , recall_score(y_val, pred_val))
    print("Recall sur la population " , recall_score(y, pred_pop))

    # Each matrix is labelled with the data set it was computed on.
    confusion_matrix_train = confusion_matrix(y_res, pred_train)
    print("\nMatrice de confusion sur la base de train : \n", confusion_matrix_train)
    confusion_matrix_val = confusion_matrix(y_val, pred_val)
    print("Matrice de confusion sur la base de validation : \n", confusion_matrix_val)
    confusion_matrix_pop = confusion_matrix(y, pred_pop)
    print("Matrice de confusion sur la population : \n", confusion_matrix_pop)
    print("---------------------------------------------\n")


In [None]:
"""
ALGO SMOTE - Comparaison des différents ratio 
- 'minority'
- 'majority'
- 'not minority'
- 'all'
- 'auto'
"""
from imblearn.over_sampling import SMOTE
from sklearn import linear_model
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split

LogReg = linear_model.LogisticRegression()

# Sampling strategies to compare.
ratio = ['minority','not minority','all','auto']

print("Demarrage de l'analyse avec differentes méthodes ", ratio)
# Compare the strategies on the same train / validation / population data.
for r in ratio:
    # Current imbalanced-learn releases name this argument sampling_strategy
    # (formerly ratio) and the resampling method fit_resample (formerly
    # fit_sample).
    sm = SMOTE(random_state=42, sampling_strategy=r)
    x_res, y_res = sm.fit_resample(x_train, y_train)
    LogReg.fit(x_res, y_res)
    # ROC/AUC uses the predicted probability of class 1, not the hard labels.
    fpr_train, tpr_train, thresholds_train = metrics.roc_curve(y_res, LogReg.predict_proba(x_res)[:, 1])
    fpr_val, tpr_val, thresholds_val = metrics.roc_curve(y_val, LogReg.predict_proba(x_val)[:, 1])
    fpr_pop, tpr_pop, thresholds_pop = metrics.roc_curve(y, LogReg.predict_proba(X)[:, 1])
    print("\nRésultat pour le modèle %s -------------- " % (r))
    print("AUC sur la base d'apprentissage : %f" % (metrics.auc(fpr_train, tpr_train)))
    print("AUC sur la base de validation : %f" % (metrics.auc(fpr_val, tpr_val)))
    print("AUC sur la population : %f" % (metrics.auc(fpr_pop, tpr_pop)))

In [None]:
"""
ADASYN
"""
# The original cell referenced names that are never defined in the notebook
# (method, X_resampled, y_resampled, x_train_res, y_train_res). This version
# resamples the full design matrix X / target y with ADASYN, fits the logistic
# regression on the result and reports the AUC on that resampled data.
# NOTE(review): confirm that X / y (rather than x_train / y_train) is the
# intended input.
method = ADASYN(random_state=42)
X_res, y_res = method.fit_resample(X, y)
LogReg.fit(X_res, y_res)
# ROC/AUC uses the predicted probability of class 1, not the hard labels.
fpr, tpr, thresholds = metrics.roc_curve(y_res, LogReg.predict_proba(X_res)[:, 1])
print(metrics.auc(fpr, tpr))
