In [1]:
# Let's find out which classifiers from sklearn support class_weight
# as part of the __init__ method, that is, when we set them up

from sklearn.utils.discovery import all_estimators

estimators = all_estimators(type_filter='classifier')

for name, class_ in estimators:
    try:
        # build the classifier with its default arguments; classifiers whose
        # constructor cannot be called without arguments are skipped
        instance = class_()
    except Exception:
        # deliberately best-effort, but KeyboardInterrupt / SystemExit are
        # no longer swallowed as they were with a bare `except:`
        continue

    # the check is made on a default-constructed instance
    if hasattr(instance, 'class_weight'):
        print(name)

DecisionTreeClassifier
ExtraTreeClassifier
ExtraTreesClassifier
HistGradientBoostingClassifier
LinearSVC
LogisticRegression
LogisticRegressionCV
NuSVC
PassiveAggressiveClassifier
Perceptron
RandomForestClassifier
RidgeClassifier
RidgeClassifierCV
SGDClassifier
SVC


In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [4]:
# NOTE: machine-specific absolute path -- point this at your local copy of kdd2004.csv
DATA_PATH = "C:\\Users\\dipsa\\Downloads\\cv new sayanti\\python project\\ML_Jupyter_2024\\kdd2004.csv"

# work on a random subsample of 10,000 rows; the draw is seeded so that the
# split and every score further down are reproducible from one run to the next
data = pd.read_csv(DATA_PATH).sample(10000, random_state=0)

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,target
15125,60.0,28.57,0.29,-19.5,-14.5,783.6,0.37,0.04,6.0,-62.5,...,488.3,1.25,-0.71,2.0,-20.0,-39.2,1.94,0.2,0.05,-1
77877,63.39,30.26,-2.1,-7.0,35.0,593.1,0.53,0.92,8.5,-56.0,...,979.5,-0.96,1.36,4.0,-47.0,573.5,-0.7,0.03,0.26,-1
39564,79.72,31.21,2.08,33.5,-73.5,1737.4,-0.08,5.06,62.5,-145.5,...,906.4,0.75,12.06,80.0,-183.0,227.1,1.6,0.53,0.86,1
115114,48.95,28.77,0.5,22.0,-0.5,1143.5,0.69,-0.75,-13.0,-58.0,...,1110.8,-0.01,0.85,1.0,-15.0,107.9,-2.93,0.3,0.14,-1
65988,43.0,27.27,-0.52,2.5,6.0,894.1,-0.79,0.07,13.5,-47.5,...,636.1,-0.2,-1.48,2.0,-18.0,173.1,0.4,0.02,-0.43,-1


In [5]:
# fraction of observations in each class of the target (heavily imbalanced)

data['target'].value_counts().div(len(data))

target
-1    0.9911
 1    0.0089
Name: count, dtype: float64

In [6]:
# hold out 30% of the rows as a test set, train on the remaining 70%

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['target']),  # predictors: every column but the target
    data['target'],                 # response
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((7000, 74), (3000, 74))

In [7]:
# Logistic Regression with class_weight

# the costs / weights are supplied when the estimator is instantiated

def run_Logit(X_train, X_test, y_train, y_test, class_weight):
    """Fit a class-weighted logistic regression and print its ROC-AUC.

    Parameters
    ----------
    X_train, X_test
        Feature matrices used to fit and to evaluate the model.
    y_train, y_test
        Targets aligned with ``X_train`` / ``X_test``.
    class_weight
        Forwarded unchanged to ``LogisticRegression(class_weight=...)``.

    Returns
    -------
    None
        The train and test ROC-AUC values are printed, not returned.
    """
    # weights introduced here
    logit = LogisticRegression(
        penalty='l2',
        solver='newton-cg',
        random_state=0,
        max_iter=10,
        n_jobs=4,
        class_weight=class_weight,  # weights / cost
    )

    logit.fit(X_train, y_train)

    # same report for both partitions: header line, then the score
    for title, X, y in (('Train set', X_train, y_train),
                        ('Test set', X_test, y_test)):
        print(title)
        pred = logit.predict_proba(X)
        # column 1 of predict_proba is used as the positive-class score
        print(f'Logistic Regression roc-auc: {roc_auc_score(y, pred[:, 1])}')

In [8]:
# baseline: no class weights, model trained on the imbalanced data as-is

run_Logit(X_train, X_test, y_train, y_test, class_weight=None)

Train set
Random Forests roc-auc: 0.8806277368434854
Test set
Random Forests roc-auc: 0.8552964028566838


In [9]:
# cost derived from the imbalance ratio

# 'balanced' asks sklearn to weight each class inversely to its
# frequency in y_train, so both classes contribute equally overall

run_Logit(X_train, X_test, y_train, y_test, class_weight='balanced')

Train set
Random Forests roc-auc: 0.9751160290531506
Test set
Random Forests roc-auc: 0.9564341107914299


In [10]:
# cost supplied explicitly

# if the misclassification costs are already known, they can be
# passed as a {class label: weight} dictionary instead

run_Logit(X_train, X_test, y_train, y_test, class_weight={-1: 1, 1: 10})

Train set
Random Forests roc-auc: 0.9515365655400267
Test set
Random Forests roc-auc: 0.9549882431978503


In [11]:
# Logistic Regression + sample_weight

# the weights / costs are supplied when the algorithm is trained

def run_Logit(X_train, X_test, y_train, y_test, sample_weight):
    """Fit a logistic regression with per-sample weights and print its ROC-AUC.

    Note: this definition reuses the name ``run_Logit`` and therefore
    replaces the earlier ``class_weight`` variant once this cell has run.

    Parameters
    ----------
    X_train, X_test
        Feature matrices used to fit and to evaluate the model.
    y_train, y_test
        Targets aligned with ``X_train`` / ``X_test``.
    sample_weight
        Forwarded unchanged to ``LogisticRegression.fit(sample_weight=...)``;
        ``None`` means no weighting is passed.

    Returns
    -------
    None
        The train and test ROC-AUC values are printed, not returned.
    """
    logit = LogisticRegression(
        penalty='l2',
        solver='newton-cg',
        random_state=0,
        max_iter=10,
        n_jobs=4,
    )

    # costs are passed here
    logit.fit(X_train, y_train, sample_weight=sample_weight)

    # same report for both partitions: header line, then the score
    for title, X, y in (('Train set', X_train, y_train),
                        ('Test set', X_test, y_test)):
        print(title)
        pred = logit.predict_proba(X)
        # column 1 of predict_proba is used as the positive-class score
        print(f'Logistic Regression roc-auc: {roc_auc_score(y, pred[:, 1])}')

In [13]:
# baseline: no sample weights, model trained on the imbalanced data as-is

run_Logit(X_train, X_test, y_train, y_test, sample_weight=None)

Train set
Random Forests roc-auc: 0.8806277368434854
Test set
Random Forests roc-auc: 0.8552964028566838


In [14]:
# cost estimated as the imbalance ratio

# numpy.where builds one weight per training row: 99 for every
# observation of the minority class (target == 1), 1 for all others

run_Logit(
    X_train, X_test, y_train, y_test,
    sample_weight=np.where(y_train == 1, 99, 1),
)

Train set
Random Forests roc-auc: 0.9739360725804337
Test set
Random Forests roc-auc: 0.956711600531612


In [None]:
# As you can see, cost-sensitive learning improved the performance of the model with both approaches.

