In [2]:

#https://elitedatascience.com/imbalanced-classes?__s=4wpewqovpywibtz3pssh

import pandas as pd
import numpy as np

# Load the Balance Scale data set from the UCI repository.
# Column names are passed explicitly through `names`.
DATA_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data'
COLUMN_NAMES = ['balance', 'var1', 'var2', 'var3', 'var4']

df = pd.read_csv(DATA_URL, names=COLUMN_NAMES)

In [5]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,balance,var1,var2,var3,var4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns
# that describe() reports.
df.describe()

Unnamed: 0,var1,var2,var3,var4
count,625.0,625.0,625.0,625.0
mean,3.0,3.0,3.0,3.0
std,1.415346,1.415346,1.415346,1.415346
min,1.0,1.0,1.0,1.0
25%,2.0,2.0,2.0,2.0
50%,3.0,3.0,3.0,3.0
75%,4.0,4.0,4.0,4.0
max,5.0,5.0,5.0,5.0


In [3]:
# Class distribution: number of records for each value of 'balance'

df.balance.value_counts()

R    288
L    288
B     49
Name: balance, dtype: int64

In [6]:
# Binary classification problem: encode 'balance' as 1 for class 'B' and 0
# for every other class.
#
# Matching on both 'B' and 1 keeps this cell idempotent. The original
# comparison against 'B' alone turned every label into 0 when the cell was
# executed a second time, because the column already held 0/1 integers.

df['balance'] = df['balance'].isin(['B', 1]).astype('int64')
df['balance'].value_counts()

0    576
1     49
Name: balance, dtype: int64

In [8]:
# Logistic regression baseline on the imbalanced data

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# y: target variable to predict, X: predictor variables (feature matrix)

y = df.balance
X = df.drop('balance', axis=1)

# Train model

clf_0 = LogisticRegression().fit(X, y)

# Predict on training set

pred_y_0 = clf_0.predict(X)

# Accuracy (about 92 %). accuracy_score expects (y_true, y_pred), so the true
# labels are passed first, as in the other cells of this notebook.

print( accuracy_score(y, pred_y_0) )

# Distinct predicted values. In the recorded run only class 0 is predicted,
# so the high accuracy merely mirrors the class imbalance (576 of 625 rows).

print( np.unique( pred_y_0 ) )

0.9216
[0]


In [11]:
# method 1 : up sampling minority class

from sklearn.utils import resample

# Split the data by class

df_majority = df[df.balance==0]
df_minority = df[df.balance==1]

# Up-sample the minority class. The target size is taken from the majority
# class itself instead of a hard-coded count, so the classes stay balanced
# even if the underlying data changes.

df_minority_upsampled = resample(df_minority,
                                 replace=True,                # sample with replacement
                                 n_samples=len(df_majority),  # to match majority class
                                 random_state=123)            # reproducible results

# Build a new frame from the majority class and the up-sampled minority class

df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Check the class distribution of the new frame

print( df_upsampled.balance.value_counts() )

# Logistic regression on the new frame

y = df_upsampled.balance
X = df_upsampled.drop('balance', axis=1)

clf_1 = LogisticRegression().fit(X, y)
pred_y_1 = clf_1.predict(X)

# Distinct predicted classes

print( np.unique( pred_y_1 ) )

# Accuracy on the (up-sampled) training data

print( accuracy_score(y, pred_y_1) )

1    576
0    576
Name: balance, dtype: int64
[0 1]
0.5138888888888888


In [13]:
# method 2 : down-sampling majority class

# Split the data by class

df_majority = df[df.balance==0]
df_minority = df[df.balance==1]

# Down-sample the majority class. The target size is taken from the minority
# class itself instead of a hard-coded count, so the classes stay balanced
# even if the underlying data changes.

df_majority_downsampled = resample( df_majority,
                                    replace=False,               # sample without replacement
                                    n_samples=len(df_minority),  # to match minority class
                                    random_state=123)            # reproducible results

# Build a new frame from the down-sampled majority class and the minority class

df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Check the class distribution of the new frame

print( df_downsampled.balance.value_counts() )

# Logistic regression on the new frame

y = df_downsampled.balance
X = df_downsampled.drop('balance', axis=1)

clf_2 = LogisticRegression().fit(X, y)
pred_y_2 = clf_2.predict(X)

# Distinct predicted classes

print( np.unique( pred_y_2 ) )

# Accuracy on the (down-sampled) training data

print( accuracy_score(y, pred_y_2) )

1    49
0    49
Name: balance, dtype: int64
[0 1]
0.5816326530612245


In [14]:
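# method 3 : change the performance metric
# TODO: this cell is empty - no alternative metric (e.g. AUROC instead of accuracy) is computed yet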

In [15]:
# method 4 : cost_sensitive training ( penalize algorithms )

from sklearn.svm import SVC

# Back to the original (imbalanced) data
y = df.balance
X = df.drop('balance', axis=1)

# Train model
# probability=True makes fitting stochastic, so a seed is fixed to keep the
# fitted model reproducible (same seed as the resampling cells).
clf_3 = SVC(kernel='linear',
            class_weight='balanced', # penalize
            probability=True,
            random_state=123)        # reproducible results

clf_3.fit(X, y)

# Predict on training set
pred_y_3 = clf_3.predict(X)

# Is our model still predicting just one class?
print( np.unique( pred_y_3 ) )

# How's our accuracy?
print( accuracy_score(y, pred_y_3) )

[0 1]
0.688


In [16]:
# method 5 : use tree-based algorithms

from sklearn.ensemble import RandomForestClassifier

# Back to the original (imbalanced) data
y = df.balance
X = df.drop('balance', axis=1)

# Train model
# A fixed seed makes the forest, and therefore the printed accuracy,
# reproducible across runs (same seed as the resampling cells).
clf_4 = RandomForestClassifier(random_state=123)
clf_4.fit(X, y)

# Predict on training set
pred_y_4 = clf_4.predict(X)

# Is our model still predicting just one class?
print( np.unique( pred_y_4 ) )

# How's our accuracy?
print( accuracy_score(y, pred_y_4) )


[0 1]
0.9744
