Reference article (cluster-then-predict for classification tasks): https://towardsdatascience.com/cluster-then-predict-for-classification-tasks-142fdfdc87d6

In [0]:
from sklearn.datasets import make_classification

# Dataset

In [0]:
# Synthetic 4-class problem: 1000 samples, 8 features of which 5 are informative.
# The seed makes the generated data (and every output below) reproducible on
# Restart & Run All; it matches the seed used by the other stochastic steps.
X , y = make_classification( n_samples = 1000 ,
                             n_features = 8 , 
                             n_informative = 5 ,
                             n_classes = 4 ,
                             random_state = 999 )

In [0]:
import pandas as pd

In [4]:
[ 'f{}'.format( i ) for i in range( 8 ) ]

['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7']

In [0]:
# Wrap the feature matrix in a DataFrame with columns f0, f1, ...
# The column count is read from X so it always follows `n_features`.
df = pd.DataFrame( X , columns = [ 'f{}'.format( i ) for i in range( X.shape[ 1 ] ) ] )

In [6]:
df.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7
0,-0.542005,0.04209,-1.506616,-0.79205,0.323103,-0.763647,0.845846,1.060575
1,-1.836127,-0.950482,-0.065574,-1.29839,0.62686,-0.642845,-0.906949,2.956616
2,0.143967,-0.25642,0.982163,0.260823,0.604143,1.204704,-1.248373,0.570111
3,-4.484864,2.620505,2.673152,-1.074445,-0.988431,-0.52111,-2.565923,0.365904
4,-0.38774,-0.663064,1.4454,-0.715071,0.385936,0.054391,-0.463272,-1.235807


# Train / test split

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# Hold out 25 % of the rows for testing; the fixed seed keeps the split stable.
X_train , X_test , y_train , y_test = train_test_split(
    df ,
    y ,
    test_size = 0.25 ,
    random_state = 999 ,
)

In [0]:
import numpy as np

from sklearn.cluster import KMeans

from typing import Tuple

# K-means

In [0]:
def get_clus( X_train : pd.DataFrame , X_test : pd.DataFrame , n_clus : int ) -> Tuple[ pd.DataFrame , pd.DataFrame ] :
    """Fit K-means on the training features and tag both splits with a cluster id.

    The clustering is fitted on `X_train` only; test rows are assigned to the
    nearest fitted centroid through `predict`, so no test information is used
    when fitting.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features used to fit the clustering.
    X_test : pd.DataFrame
        Test features, only assigned to the already-fitted clusters.
    n_clus : int
        Number of clusters.

    Returns
    -------
    (X_train_clus, X_test_clus)
        Copies of the inputs with an extra integer column `clus`.
        The input frames are left untouched.
    """
    # `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0, so passing it
    # raises a TypeError on current versions; it never affected the result.
    # `n_init = 10` pins the historical default (newer releases default to 'auto'),
    # which keeps the clustering identical across scikit-learn versions.
    clus = KMeans( n_clusters = n_clus , random_state = 999 , n_init = 10 )
    clus.fit( X_train )

    X_train_clus = X_train.copy()
    X_train_clus[ 'clus' ] = clus.labels_

    X_test_clus = X_test.copy()
    X_test_clus[ 'clus' ] = clus.predict( X_test )

    return X_train_clus , X_test_clus
    

In [0]:
# Split the data into two clusters and tag both the train and test rows with their cluster id.
X_train_clus , X_test_clus = get_clus( X_train , X_test , n_clus = 2 )

In [12]:
X_train_clus.head( 5 )

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,clus
46,-0.080589,1.08757,1.379187,2.175724,0.610439,-1.178127,-0.387127,-1.088224,0
748,-2.68146,0.093431,1.70345,-0.97445,0.047871,-0.50556,-1.870434,1.578552,0
524,1.585826,-0.516692,-0.686463,0.761094,0.26702,1.036201,0.128366,0.541362,1
568,2.99595,-3.091753,-0.016933,1.520942,0.432967,0.137151,0.403003,0.519155,1
711,2.07723,-0.0204,-2.549757,-0.329206,-0.441495,1.168087,1.817687,-0.528918,1


In [16]:
X_train_clus[ 'clus' ].value_counts()

1    474
0    276
Name: clus, dtype: int64

In [14]:
X_test_clus.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,clus
842,-1.792062,-0.128642,-0.068266,-1.298384,-0.283309,-0.662054,-0.450778,1.540921,0
68,-2.713788,0.593431,0.93633,-2.557917,1.030547,-0.347749,-0.547587,-0.608864,0
308,-0.900324,0.269769,-1.45709,-1.498429,-0.133377,-0.472758,0.854815,0.612812,1
881,0.74921,-0.650619,-0.210405,1.173258,-1.432702,1.12444,-1.227657,2.903169,0
350,2.53249,-2.219729,0.075568,0.036139,0.735937,0.28109,1.276558,-2.274817,1


In [17]:
X_test_clus[ 'clus' ].value_counts()

1    161
0     89
Name: clus, dtype: int64

# Scaling

In [0]:
from sklearn.preprocessing import StandardScaler

In [0]:
def scal( X_train : pd.DataFrame , X_test : pd.DataFrame , col_2_scal = None ) -> Tuple[ pd.DataFrame , pd.DataFrame ] :
    """Standardise the feature columns of both splits using training statistics.

    The scaler is fitted on `X_train` only and then applied to both frames,
    so the test split does not influence the scaling parameters.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training frame; must contain every column listed in `col_2_scal`.
    X_test : pd.DataFrame
        Test frame; must contain every column listed in `col_2_scal`.
    col_2_scal : list of str, optional
        Columns to scale. Defaults to `f0` ... `f7`; any other column
        (for example `clus`) is passed through unchanged.

    Returns
    -------
    (X_train_scal, X_test_scal)
        Scaled copies of the inputs. The input frames are NOT modified, so the
        unscaled data stays available to the caller.
    """
    if col_2_scal is None :
        col_2_scal = [ 'f' + str( i ) for i in range( 8 ) ]
    else :
        col_2_scal = list( col_2_scal )

    # Named `scaler` rather than `scal` to avoid shadowing this function's own name.
    scaler = StandardScaler()
    scaler.fit( X_train[ col_2_scal ] )

    # Work on copies: writing into the arguments would silently overwrite
    # the caller's frames and make the returned objects alias them.
    X_train_scal = X_train.copy()
    X_test_scal = X_test.copy()

    X_train_scal[ col_2_scal ] = scaler.transform( X_train[ col_2_scal ] )
    X_test_scal[ col_2_scal ] = scaler.transform( X_test[ col_2_scal ] )

    return X_train_scal , X_test_scal

In [0]:
# Standardise the feature columns of the cluster-tagged splits.
X_train_clus_scal , X_test_clus_scal = scal( X_train = X_train_clus , X_test = X_test_clus )

In [21]:
X_train_clus_scal.head( 5 )

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,clus
46,-0.038967,0.784119,1.038051,1.433721,0.590009,-0.680567,-0.317154,-1.005254,0
748,-1.445477,0.207443,1.241747,-0.607735,0.030849,-0.215379,-1.31931,0.657687,0
524,0.862205,-0.146475,-0.259552,0.516976,0.248671,0.850996,0.031126,0.010919,1
568,1.624779,-1.640205,0.161034,1.009392,0.413612,0.229159,0.216677,-0.002929,1
711,1.127949,0.141412,-1.430038,-0.189588,-0.455551,0.942216,1.172471,-0.656483,1


# Global model vs. per-cluster models

In [0]:
# Build new frames carrying the target as an extra `y` column,
# leaving the scaled feature frames themselves unchanged.
train_clus_scal = X_train_clus_scal.assign( y = y_train )
test_clus_scal = X_test_clus_scal.assign( y = y_test )

In [23]:
train_clus_scal.head( 5 )

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,clus,y
46,-0.038967,0.784119,1.038051,1.433721,0.590009,-0.680567,-0.317154,-1.005254,0,3
748,-1.445477,0.207443,1.241747,-0.607735,0.030849,-0.215379,-1.31931,0.657687,0,1
524,0.862205,-0.146475,-0.259552,0.516976,0.248671,0.850996,0.031126,0.010919,1,0
568,1.624779,-1.640205,0.161034,1.009392,0.413612,0.229159,0.216677,-0.002929,1,1
711,1.127949,0.141412,-1.430038,-0.189588,-0.455551,0.942216,1.172471,-0.656483,1,3


In [0]:
# Cluster 0: keep only the rows assigned to cluster 0, then separate
# the feature matrix from the target vector for each split.

train_clus_scal_0 = train_clus_scal[ train_clus_scal[ 'clus' ] == 0 ]
test_clus_scal_0  = test_clus_scal[ test_clus_scal[ 'clus' ] == 0 ]

X_train_clus_scal_0 = train_clus_scal_0.drop( [ 'y' , 'clus' ] , axis = 1 )
X_test_clus_scal_0  = test_clus_scal_0.drop( [ 'y' , 'clus' ] , axis = 1 )

y_train_clus_scal_0 = train_clus_scal_0[ 'y' ].values
y_test_clus_scal_0  = test_clus_scal_0[ 'y' ].values

In [25]:
X_train_clus_scal_0.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7
46,-0.038967,0.784119,1.038051,1.433721,0.590009,-0.680567,-0.317154,-1.005254
748,-1.445477,0.207443,1.241747,-0.607735,0.030849,-0.215379,-1.31931,0.657687
560,-0.422499,1.437338,1.769861,1.725315,-0.626695,0.827598,-1.928487,0.038711
35,-1.161952,1.882392,0.23363,-0.10524,0.940483,-0.307441,-0.159923,-0.808304
535,-1.450609,-0.284404,1.194272,-0.923841,-0.34282,-0.881724,-0.849979,0.324026


In [0]:
# Cluster 1: keep only the rows assigned to cluster 1, then separate
# the feature matrix from the target vector for each split.

train_clus_scal_1 = train_clus_scal[ train_clus_scal[ 'clus' ] == 1 ]
test_clus_scal_1  = test_clus_scal[ test_clus_scal[ 'clus' ] == 1 ]

X_train_clus_scal_1 = train_clus_scal_1.drop( [ 'y' , 'clus' ] , axis = 1 )
X_test_clus_scal_1  = test_clus_scal_1.drop( [ 'y' , 'clus' ] , axis = 1 )

y_train_clus_scal_1 = train_clus_scal_1[ 'y' ].values
y_test_clus_scal_1  = test_clus_scal_1[ 'y' ].values

In [27]:
X_train_clus_scal_1.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7
524,0.862205,-0.146475,-0.259552,0.516976,0.248671,0.850996,0.031126,0.010919
568,1.624779,-1.640205,0.161034,1.009392,0.413612,0.229159,0.216677,-0.002929
711,1.127949,0.141412,-1.430038,-0.189588,-0.455551,0.942216,1.172471,-0.656483
738,2.323368,-1.758196,0.725327,0.481944,0.111383,1.39212,0.459849,-2.143872
530,-0.183336,0.537353,-1.200637,0.045359,0.809648,-0.943462,0.943221,0.331559


In [0]:
from sklearn.linear_model import LogisticRegression

In [0]:
# Class-balanced logistic regression (L-BFGS solver, up to 250 iterations), seeded.
mdl = LogisticRegression(
    solver = 'lbfgs' ,
    class_weight = 'balanced' ,
    max_iter = 250 ,
    random_state = 999 ,
)

In [0]:
from sklearn import model_selection

In [0]:
# 5-fold cross-validation splitter; rows are shuffled with a fixed seed.
kfold = model_selection.KFold(
    n_splits = 5 ,
    shuffle = True ,
    random_state = 999 ,
)

In [0]:
# Metrics to compute for every cross-validation fold.
scoring = [
    'accuracy' ,
    'precision_weighted' ,
    'recall_weighted' ,
    'f1_weighted' ,
]

In [0]:
# Cross-validate the logistic regression on the cluster-0 training subset,
# using the 5-fold splitter `kfold` and the metrics listed in `scoring`.
#
# NOTE(review): the two assignments below rebind the notebook-wide names
# `X_train` and `y_train` (originally the full train split) to the cluster-0
# subset. After this cell, re-running any earlier cell that reads them
# (e.g. the `get_clus` call or the cell that attaches `y`) will operate on the
# wrong data or fail; restart the kernel before re-executing earlier cells.
X_train = X_train_clus_scal_0

y_train = y_train_clus_scal_0

# The result is kept in `cv_results`; presumably one score entry per metric in
# `scoring` and per fold — confirm against the scikit-learn version in use.
cv_results = model_selection.cross_validate( mdl , X_train , y_train , cv = kfold , scoring = scoring)