In [52]:
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import re
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

from imblearn.pipeline import Pipeline
from hyperopt import hp

In [53]:

cf = pd.read_csv('clean_features.csv').set_index('PetID')
cf.AdoptionSpeed.value_counts()

4    4197
2    4037
3    3259
1    3090
0     410
Name: AdoptionSpeed, dtype: int64

In [3]:
X = cf.drop(columns = 'AdoptionSpeed')
y= cf['AdoptionSpeed']


X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2, random_state = 42)


# Logistic Regression

In [6]:
from sklearn.model_selection import cross_val_score

logis = LogisticRegression(solver = 'lbfgs',max_iter = 1000)
cross_val_score(logis, X_train, y_train , cv=5)

array([0.34274673, 0.34542212, 0.36385256, 0.35136742, 0.35731272])

In [7]:
from sklearn.model_selection import cross_val_score

logis = LogisticRegression(solver = 'lbfgs',max_iter = 1000)
logis.fit(X_train, y_train)
y_pred = logis.predict(X_test_s)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.5f}'.format(accuracy))

print(classification_report(y_test, y_pred))

Accuracy: 0.31210
              precision    recall  f1-score   support

           0       0.07      0.50      0.13        92
           1       0.30      0.27      0.28       627
           2       0.37      0.13      0.19       806
           3       0.32      0.29      0.30       641
           4       0.46      0.52      0.49       833

    accuracy                           0.31      2999
   macro avg       0.30      0.34      0.28      2999
weighted avg       0.36      0.31      0.31      2999



In [8]:
y_pred_tr = logis.predict(X_train)
accuracy = metrics.accuracy_score(y_train, y_pred_tr)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_train, y_pred_tr))

Accuracy: 0.36641
              precision    recall  f1-score   support

           0       0.40      0.57      0.47      3364
           1       0.33      0.31      0.32      3364
           2       0.30      0.12      0.17      3364
           3       0.36      0.33      0.34      3364
           4       0.38      0.51      0.43      3364

    accuracy                           0.37     16820
   macro avg       0.35      0.37      0.35     16820
weighted avg       0.35      0.37      0.35     16820



In [18]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
clf = GridSearchCV(LogisticRegression(penalty='l2',solver = 'lbfgs',max_iter = 1000), param_grid)
clf.fit(X_train_s, y_train)
y_pred = clf.predict(X_test_s)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_test, y_pred))

Accuracy: 0.44699
              precision    recall  f1-score   support

           0       0.66      0.77      0.71       840
           1       0.34      0.25      0.29       829
           2       0.34      0.34      0.34       828
           3       0.43      0.23      0.30       871
           4       0.41      0.65      0.50       829

    accuracy                           0.45      4197
   macro avg       0.44      0.45      0.43      4197
weighted avg       0.44      0.45      0.43      4197



# Linear SVC

In [20]:
svc = SVC(kernel='linear')
svc.fit(X_train_s, y_train)
y_pred = svc.predict(X_test_s)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_test, y_pred))

Accuracy: 0.43650
              precision    recall  f1-score   support

           0       0.67      0.78      0.72       840
           1       0.31      0.25      0.28       829
           2       0.34      0.36      0.35       828
           3       0.43      0.15      0.22       871
           4       0.40      0.65      0.49       829

    accuracy                           0.44      4197
   macro avg       0.43      0.44      0.41      4197
weighted avg       0.43      0.44      0.41      4197



#  Polynomial Kernel

In [5]:
svcpoly = SVC(kernel='poly', degree=8)
svcpoly.fit(X_train_s, y_train)
y_pred = svcpoly.predict(X_test_s)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_test, y_pred))

Accuracy: 0.28568
              precision    recall  f1-score   support

           0       0.25      0.97      0.39       840
           1       0.41      0.10      0.17       829
           2       0.36      0.10      0.16       828
           3       0.49      0.13      0.20       871
           4       0.48      0.12      0.19       829

    accuracy                           0.29      4197
   macro avg       0.40      0.29      0.22      4197
weighted avg       0.40      0.29      0.22      4197



# Gaussian Kernel

In [6]:
svcrbf = SVC(kernel='rbf',probability=True)
svcrbf.fit(X_train_s, y_train)
y_pred_prob = svcrbf.predict_proba(X_test_s)

from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred_prob, multi_class = 'ovr')

0.7837217269071836

In [11]:
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid)
grid.fit(X_train_s, y_train)
y_pred = grid.predict(X_test_s)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_test, y_pred))

roc_auc_score(y_true, y_scores)

Accuracy: 0.49297
              precision    recall  f1-score   support

           0       0.78      0.90      0.84       840
           1       0.40      0.38      0.39       829
           2       0.34      0.36      0.35       828
           3       0.43      0.31      0.36       871
           4       0.47      0.52      0.49       829

    accuracy                           0.49      4197
   macro avg       0.48      0.49      0.49      4197
weighted avg       0.48      0.49      0.48      4197



In [88]:
y_pred_tr = grid.predict(X_train_s)
accuracy = metrics.accuracy_score(y_train, y_pred_tr)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_train, y_pred_tr))

Accuracy: 0.80754
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      3357
           1       0.79      0.74      0.77      3368
           2       0.74      0.77      0.76      3369
           3       0.85      0.69      0.76      3326
           4       0.79      0.86      0.82      3368

    accuracy                           0.81     16788
   macro avg       0.81      0.81      0.81     16788
weighted avg       0.81      0.81      0.81     16788



# Sigmoid Kernel

In [7]:
svcsig = SVC(kernel='sigmoid')
svcsig.fit(X_train_s, y_train)
y_pred = svcsig.predict(X_test_s)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_test, y_pred))

Accuracy: 0.34977
              precision    recall  f1-score   support

           0       0.60      0.66      0.63       840
           1       0.25      0.27      0.26       829
           2       0.23      0.19      0.21       828
           3       0.26      0.19      0.22       871
           4       0.35      0.44      0.39       829

    accuracy                           0.35      4197
   macro avg       0.34      0.35      0.34      4197
weighted avg       0.34      0.35      0.34      4197



In [54]:
imagef = pd.read_csv('img_features.csv').set_index('Unnamed: 0')

imagef.index.name = 'PetID'
imagef.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
PetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
86e1089a3,0.002,0.1678,0.019715,0.015896,0.068162,0.002216,0.005042,0.004828,0.05076,0.047626,...,0.787699,0.176626,0.575706,1.088628,0.439556,0.52046,1.547071,0.832573,0.599093,0.763348
6296e909a,0.002858,0.10745,0.019916,0.023482,0.174765,0.002297,0.005031,0.006338,0.083378,0.049948,...,0.628259,0.686865,0.564,0.96819,1.070276,1.545742,0.894409,0.838595,0.468238,0.916672
3422e4906,0.002734,0.072015,0.024455,0.018021,0.154207,0.001946,0.004211,0.001576,0.100046,0.039717,...,0.579116,0.557625,1.131405,0.720513,1.496671,0.870955,1.289683,1.184462,0.465114,0.892826
5842f1ff5,0.002106,0.274519,0.054815,0.013727,0.089969,0.00165,0.005506,0.004295,0.118727,0.03479,...,1.295853,0.326143,0.291669,1.608086,1.119176,1.470889,0.591444,0.832755,0.483021,1.134126
850a43f90,0.002185,0.174022,0.044818,0.016244,0.169775,0.002075,0.004421,0.004157,0.099671,0.034441,...,1.092663,0.669894,0.395784,0.886075,1.21973,1.033966,1.065686,0.304054,0.438069,0.676817


In [55]:
nmerge = imagef.merge(cf, on='PetID', how='outer')

In [56]:
nmerge.head()

Unnamed: 0_level_0,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,...,10_y,11_y,12_y,13_y,14_y,15_y,16_y,17_y,18_y,19_y
PetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
86e1089a3,0.002,0.1678,0.019715,0.015896,0.068162,0.002216,0.005042,0.004828,0.05076,0.047626,...,0.009754,-0.002826,-0.018925,-0.012456,0.000584,-0.014139,-0.00755,0.018987,0.009588,0.009349
6296e909a,0.002858,0.10745,0.019916,0.023482,0.174765,0.002297,0.005031,0.006338,0.083378,0.049948,...,-0.010259,0.008178,0.010272,0.003087,-0.001423,-0.005038,-0.003081,0.009825,0.003104,0.006718
3422e4906,0.002734,0.072015,0.024455,0.018021,0.154207,0.001946,0.004211,0.001576,0.100046,0.039717,...,0.044125,-0.050633,0.041106,0.008243,0.024158,-0.030801,0.012797,-0.050066,-0.023363,-0.057766
5842f1ff5,0.002106,0.274519,0.054815,0.013727,0.089969,0.00165,0.005506,0.004295,0.118727,0.03479,...,-0.000498,0.044688,-0.03498,0.002213,-0.094938,0.060292,0.026551,-0.040345,-0.087958,2.2e-05
850a43f90,0.002185,0.174022,0.044818,0.016244,0.169775,0.002075,0.004421,0.004157,0.099671,0.034441,...,-0.009017,0.015145,0.099046,0.026728,0.011302,0.002314,0.000365,0.017789,0.06642,0.038391


In [7]:
nmerge.AdoptionSpeed.value_counts()

4    4197
2    4037
3    3259
1    3090
0     410
Name: AdoptionSpeed, dtype: int64

In [57]:
X = nmerge.drop(columns = 'AdoptionSpeed')
y= nmerge['AdoptionSpeed']


X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2, random_state = 42)



In [58]:
def build_model(input_model, param_dist):

    # define which resampling method and which ML model to use in the pipeline
    scaler = StandardScaler()
    smote = SMOTE(random_state=42)
    model = input_model

    # define the pipeline, tell it to combine ADASYN with the Logistic Regression model
    pipeline = Pipeline([('Scaler', scaler), ('Oversampling', smote), ('Classifier', model)])

    # tune hyperparameter
    pipeline = RandomizedSearchCV(pipeline, param_distributions=param_dist, scoring='f1_weighted', \
                                  cv=5, n_iter=30, n_jobs=-1, random_state=42)

    # fit the data
    pipeline.fit(X_train, y_train)
    pipeline = pipeline.best_estimator_
    
    return pipeline

In [59]:
from sklearn.metrics import f1_score,precision_score,recall_score,roc_auc_score

def score(model):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)
    
    train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    cv_f1 = np.mean(cross_val_score(model, X_train, y_train, cv=5, scoring='f1_weighted', verbose=True, n_jobs=-1))
    test_precision = precision_score(y_test, y_test_pred, average='weighted')
    test_recall = recall_score(y_test, y_test_pred, average='weighted')
    test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    test_rocauc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
    
    print(train_f1, cv_f1,test_precision,test_recall,test_f1, test_rocauc )
    
    

In [14]:
param_grid = {'Classifier__solver': ['lbfgs'],
              'Classifier__max_iter': [1000]}
model0 = build_model(LogisticRegression(), param_grid)



In [28]:
score(model0)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   16.4s remaining:   24.6s


0.4045456750820359 0.33965494919160993 0.38053644716843565 0.34811603867955987 0.35764570711600374 0.6508891241166782


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   17.3s finished


In [30]:
param_grid = {'Classifier__C': [0.15], #0.1: 0.35,0.44
              'Classifier__gamma': [0.005]}
model1 = build_model(SVC(kernel='rbf',probability=True),param_grid )
score(model1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 10.8min remaining: 16.2min


0.4800288385222484 0.3387185837999129 0.35588388705284507 0.3691230410136712 0.3535336075921675 0.6451352827480191


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 10.8min finished


In [60]:
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(random_state = 42)


In [61]:
param_grid = {
    'Classifier__min_data_in_leaf': [10],
    'Classifier__lambda_l1': [0.1],
    'Classifier__num_leaves': [5],
    'Classifier__max_depth':[7],
    'Classifier__feature_fraction': [0.4]
}
model3 = build_model(lgbm,param_grid )
score(model3)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   16.3s remaining:   24.5s


0.4564296538527003 0.3786055511719018 0.4083473829338526 0.41147049016338777 0.395069829230671 0.6803671237065052


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   19.0s finished


In [47]:
model3

Pipeline(memory=None,
         steps=[('Scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('Oversampling',
                 SMOTE(k_neighbors=5, n_jobs=None, random_state=42,
                       sampling_strategy='auto')),
                ('Classifier',
                 LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                colsample_bytree=1.0, feature_fraction=0.4,
                                importance_type='split', lambda_l1=0.1,
                                learning_rate=0.1, max_depth=7,
                                min_child_samples=20, min_child_weight=0.001,
                                min_data_in_leaf=10, min_split_gain=0.0,
                                n_estimators=100, n_jobs=-1, num_leaves=5,
                                objective=None, random_state=42, reg_alpha=0.0,
                                reg_lambda=0.0, silent=True, subsample=1.0,
                         

In [72]:
import shap
import lightgbm as lgb
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

smt = SMOTE(random_state=42, k_neighbors=5)
X_train_sm, y_train_sm = smt.fit_sample(X_train_s, y_train)

lgbm2 =LGBMClassifier(random_state= 42,min_data_in_leaf=10,lambda_l1=0.1, num_leaves=5,max_depth=7,feature_fraction=0.4).fit(X_train_sm, y_train_sm )

%time shap_values = shap.TreeExplainer(lgbm2).shap_values(X_test_s)



Setting feature_perturbation = "tree_path_dependent" because no background data was given.


CPU times: user 2.65 s, sys: 59.8 ms, total: 2.71 s
Wall time: 413 ms
