# 4. Classification (Campaign response)

In [45]:
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [46]:
path = "https://raw.githubusercontent.com/ekaratnida/Automated_Tools-/keep/Mini-bads3/cust_sensitivity.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,CUST_CODE,BASKET_ID,PROD_CODE,STORE_CODE,price_sensitivity
0,CUST0000001098,2,8,1,1
1,CUST0000001392,1,17,1,0
2,CUST0000001437,1,2,1,0
3,CUST0000002218,1,8,1,0
4,CUST0000002678,1,2,1,0


In [47]:
x = df.iloc[:,1:-1]
print(x.shape)

(1860, 3)


In [48]:
y = df.iloc[:,-1:]
print(y.shape)

(1860, 1)


In [49]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
print(y_train.value_counts())
print(y_test.value_counts())

price_sensitivity
0                    1169
1                     319
dtype: int64
price_sensitivity
0                    292
1                     80
dtype: int64


## Apply negative downsampling

In [50]:
y_train_pos = y_train[y_train['price_sensitivity']==1]
y_train_neg = y_train[y_train['price_sensitivity']==0]
#print("origin ", y_train_neg)
print(y_train_neg.shape)
y_train_neg = y_train_neg.sample(int(y_train_neg.shape[0] * 0.30), random_state=42)
print("down to ", y_train_neg.shape)
y_train_new = pd.concat([y_train_pos, y_train_neg])
x_train_new = x_train.loc[y_train_new.index]

print(y_train_new.value_counts())
print(y_test.value_counts())

(1169, 1)
down to  (350, 1)
price_sensitivity
0                    350
1                    319
dtype: int64
price_sensitivity
0                    292
1                     80
dtype: int64


<H1> Train model

1. Logistic Regression

In [51]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
target_names = ['not_sensitivity (0)', 'sensitivity (1)']

In [52]:
model = LogisticRegression()
train_result = model.fit(x_train_new, y_train_new)

#Evaluation
y_pred = train_result.predict(x_test)
#print(y_pred)

confusion_matrix_train = confusion_matrix(y_test, y_pred)
print(confusion_matrix_train)

TruePositive = confusion_matrix_train[1,1]
print("TP=",TruePositive)

FalseNegative = confusion_matrix_train[1,0]
print("FN=",FalseNegative)

TrueNegative = confusion_matrix_train[0,0]
print("TN=",TrueNegative)

FalsePositive = confusion_matrix_train[0,1]
print("FP=",FalsePositive)

print(classification_report(y_test, y_pred, target_names=target_names))

[[143 149]
 [ 41  39]]
TP= 39
FN= 41
TN= 143
FP= 149
                     precision    recall  f1-score   support

not_sensitivity (0)       0.78      0.49      0.60       292
    sensitivity (1)       0.21      0.49      0.29        80

           accuracy                           0.49       372
          macro avg       0.49      0.49      0.45       372
       weighted avg       0.65      0.49      0.53       372



2. Grid search

In [53]:
from sklearn.model_selection import GridSearchCV

grid = { 'C': [0.0001,0.001,0.01,0.1,1,10,1e2,1e3,1e6,1e9,1e12], 
        "penalty":["l1","l2"], 
        "solver":["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
        }# l1 lasso l2 ridge

logreg=LogisticRegression()

model=GridSearchCV(logreg, grid, cv=5)

train_result = model.fit(x_train_new,y_train_new)

print("tuned hpyerparameters :(best parameters) ",train_result.best_params_)
print("accuracy :",train_result.best_score_)

#Evaluation
y_pred = train_result.predict(x_test)
#print(y_pred)

confusion_matrix_train = confusion_matrix(y_test, y_pred)
print(confusion_matrix_train)

TruePositive = confusion_matrix_train[1,1]
print("TP=",TruePositive)

FalseNegative = confusion_matrix_train[1,0]
print("FN=",FalseNegative)

TrueNegative = confusion_matrix_train[0,0]
print("TN=",TrueNegative)

FalsePositive = confusion_matrix_train[0,1]
print("FP=",FalsePositive)

print(classification_report(y_test, y_pred, target_names=target_names))

tuned hpyerparameters :(best parameters)  {'C': 1, 'penalty': 'l2', 'solver': 'newton-cg'}
accuracy : 0.5710357984513523
[[143 149]
 [ 41  39]]
TP= 39
FN= 41
TN= 143
FP= 149
                     precision    recall  f1-score   support

not_sensitivity (0)       0.78      0.49      0.60       292
    sensitivity (1)       0.21      0.49      0.29        80

           accuracy                           0.49       372
          macro avg       0.49      0.49      0.45       372
       weighted avg       0.65      0.49      0.53       372



Grid search for multiple algorithms

In [54]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Just initialize the pipeline with any estimator you like
pipe = Pipeline(steps=[('estimator', SVC())])

# Add a dict of estimator and estimator related parameters in this list
params_grid = [{
'estimator':[SVC()],
'estimator__C': [1, 10, 100, 1000],
'estimator__gamma': [0.001, 0.0001],
},
{
'estimator': [DecisionTreeClassifier()],
'estimator__max_depth': [1,2,3,4,5],
'estimator__max_features': [None, "auto", "sqrt", "log2"],
}
# {'estimator':[Any_other_estimator_you_want],
# 'estimator__valid_param_of_your_estimator':[valid_values]

 ]

model = GridSearchCV(pipe, params_grid,cv=7)

train_result = model.fit(x_train_new,y_train_new)
print("tuned hpyerparameters :(best parameters) ",train_result.best_params_)
print("accuracy :",train_result.best_score_)

#Evaluation
y_pred = train_result.predict(x_test)
#print(y_pred)

confusion_matrix_train = confusion_matrix(y_test, y_pred)
print(confusion_matrix_train)

TruePositive = confusion_matrix_train[1,1]
print("TP=",TruePositive)

FalseNegative = confusion_matrix_train[1,0]
print("FN=",FalseNegative)

TrueNegative = confusion_matrix_train[0,0]
print("TN=",TrueNegative)

FalsePositive = confusion_matrix_train[0,1]
print("FP=",FalsePositive)

print(classification_report(y_test, y_pred, target_names=target_names))

tuned hpyerparameters :(best parameters)  {'estimator': DecisionTreeClassifier(max_depth=5, max_features='sqrt'), 'estimator__max_depth': 5, 'estimator__max_features': 'sqrt'}
accuracy : 0.5981672932330827
[[208  84]
 [ 49  31]]
TP= 31
FN= 49
TN= 208
FP= 84
                     precision    recall  f1-score   support

not_sensitivity (0)       0.81      0.71      0.76       292
    sensitivity (1)       0.27      0.39      0.32        80

           accuracy                           0.64       372
          macro avg       0.54      0.55      0.54       372
       weighted avg       0.69      0.64      0.66       372



# Exercise and Homework
# Dataset
https://raw.githubusercontent.com/ekaratnida/Applied-machine-learning/master/Dataset/heart.csv

In [55]:
path = "https://raw.githubusercontent.com/ekaratnida/Applied-machine-learning/master/Dataset/heart.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


## Feature selection

In [56]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

estimator = SVR(kernel="linear")
selector = RFE(estimator, n_features_to_select=2, step=1)
selector = selector.fit(x_train_new, y_train_new)
print(selector.get_feature_names_out())
print(selector.ranking_)
print(selector.support_)

['BASKET_ID' 'STORE_CODE']
[1 2 1]
[ True False  True]
