In [1]:
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import re

Using TensorFlow backend.


In [3]:

cf = pd.read_csv('clean_features.csv').set_index('PetID')
cf.AdoptionSpeed.value_counts()

4    4197
2    4037
3    3259
1    3090
0     410
Name: AdoptionSpeed, dtype: int64

In [3]:
X = cf.drop(columns = 'AdoptionSpeed')
y= cf['AdoptionSpeed']

smt = SMOTE(random_state=42, k_neighbors=5)
X_SMOTE, y_SMOTE = smt.fit_sample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_SMOTE, y_SMOTE , test_size = 0.2, random_state = 42)


In [4]:
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Logistic Regression

In [15]:
logis = LogisticRegression(solver = 'lbfgs',max_iter = 1000)
logis.fit(X_train_s, y_train)
y_pred = logis.predict(X_test_s)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_test, y_pred))

Accuracy: 0.44699
              precision    recall  f1-score   support

           0       0.66      0.77      0.71       840
           1       0.34      0.25      0.29       829
           2       0.34      0.34      0.34       828
           3       0.43      0.23      0.30       871
           4       0.41      0.65      0.50       829

    accuracy                           0.45      4197
   macro avg       0.44      0.45      0.43      4197
weighted avg       0.44      0.45      0.43      4197



In [14]:
y_pred_tr = logis.predict(X_train_s)
accuracy = metrics.accuracy_score(y_train, y_pred_tr)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_train, y_pred_tr))

Accuracy: 0.45133
              precision    recall  f1-score   support

           0       0.68      0.79      0.73      3357
           1       0.35      0.26      0.30      3368
           2       0.34      0.32      0.33      3369
           3       0.40      0.22      0.28      3326
           4       0.42      0.67      0.52      3368

    accuracy                           0.45     16788
   macro avg       0.44      0.45      0.43     16788
weighted avg       0.44      0.45      0.43     16788



In [18]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
clf = GridSearchCV(LogisticRegression(penalty='l2',solver = 'lbfgs',max_iter = 1000), param_grid)
clf.fit(X_train_s, y_train)
y_pred = clf.predict(X_test_s)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_test, y_pred))

Accuracy: 0.44699
              precision    recall  f1-score   support

           0       0.66      0.77      0.71       840
           1       0.34      0.25      0.29       829
           2       0.34      0.34      0.34       828
           3       0.43      0.23      0.30       871
           4       0.41      0.65      0.50       829

    accuracy                           0.45      4197
   macro avg       0.44      0.45      0.43      4197
weighted avg       0.44      0.45      0.43      4197



# Linear SVC

In [20]:
svc = SVC(kernel='linear')
svc.fit(X_train_s, y_train)
y_pred = svc.predict(X_test_s)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_test, y_pred))

Accuracy: 0.43650
              precision    recall  f1-score   support

           0       0.67      0.78      0.72       840
           1       0.31      0.25      0.28       829
           2       0.34      0.36      0.35       828
           3       0.43      0.15      0.22       871
           4       0.40      0.65      0.49       829

    accuracy                           0.44      4197
   macro avg       0.43      0.44      0.41      4197
weighted avg       0.43      0.44      0.41      4197



#  Polynomial Kernel

In [5]:
svcpoly = SVC(kernel='poly', degree=8)
svcpoly.fit(X_train_s, y_train)
y_pred = svcpoly.predict(X_test_s)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_test, y_pred))

Accuracy: 0.28568
              precision    recall  f1-score   support

           0       0.25      0.97      0.39       840
           1       0.41      0.10      0.17       829
           2       0.36      0.10      0.16       828
           3       0.49      0.13      0.20       871
           4       0.48      0.12      0.19       829

    accuracy                           0.29      4197
   macro avg       0.40      0.29      0.22      4197
weighted avg       0.40      0.29      0.22      4197



# Gaussian Kernel

In [87]:
svcrbf = SVC(kernel='rbf',probability=True)
svcrbf.fit(X_train_s, y_train)
y_pred = svcrbf.predict(X_test_s)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_test, y_pred))

KeyboardInterrupt: 

In [6]:
svcrbf = SVC(kernel='rbf',probability=True)
svcrbf.fit(X_train_s, y_train)
y_pred_prob = svcrbf.predict_proba(X_test_s)

from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred_prob, multi_class = 'ovr')

0.7837217269071836

In [11]:
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid)
grid.fit(X_train_s, y_train)
y_pred = grid.predict(X_test_s)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_test, y_pred))

roc_auc_score(y_true, y_scores)

Accuracy: 0.49297
              precision    recall  f1-score   support

           0       0.78      0.90      0.84       840
           1       0.40      0.38      0.39       829
           2       0.34      0.36      0.35       828
           3       0.43      0.31      0.36       871
           4       0.47      0.52      0.49       829

    accuracy                           0.49      4197
   macro avg       0.48      0.49      0.49      4197
weighted avg       0.48      0.49      0.48      4197



In [88]:
y_pred_tr = grid.predict(X_train_s)
accuracy = metrics.accuracy_score(y_train, y_pred_tr)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_train, y_pred_tr))

Accuracy: 0.80754
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      3357
           1       0.79      0.74      0.77      3368
           2       0.74      0.77      0.76      3369
           3       0.85      0.69      0.76      3326
           4       0.79      0.86      0.82      3368

    accuracy                           0.81     16788
   macro avg       0.81      0.81      0.81     16788
weighted avg       0.81      0.81      0.81     16788



# Sigmoid Kernel

In [7]:
svcsig = SVC(kernel='sigmoid')
svcsig.fit(X_train_s, y_train)
y_pred = svcsig.predict(X_test_s)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_test, y_pred))

Accuracy: 0.34977
              precision    recall  f1-score   support

           0       0.60      0.66      0.63       840
           1       0.25      0.27      0.26       829
           2       0.23      0.19      0.21       828
           3       0.26      0.19      0.22       871
           4       0.35      0.44      0.39       829

    accuracy                           0.35      4197
   macro avg       0.34      0.35      0.34      4197
weighted avg       0.34      0.35      0.34      4197



In [90]:
cf.corr(method ='pearson') 

Unnamed: 0,Type,Age,Gender,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,...,10,11,12,13,14,15,16,17,18,19
Type,1.000000,-0.147162,0.060843,-0.171811,0.003036,0.102907,0.025508,0.006737,-0.006864,0.036423,...,0.082922,0.071945,0.050523,0.151931,0.113246,-0.143978,0.013360,0.004071,-0.038479,1.215197e-01
Age,-0.147162,1.000000,-0.123423,0.093673,0.153092,-0.136061,-0.053360,-0.189450,0.103215,-0.113076,...,0.001542,0.087166,-0.045454,-0.031455,-0.112678,0.069132,0.021058,0.082916,-0.013357,4.835250e-02
Gender,0.060843,-0.123423,1.000000,-0.091819,-0.030404,0.078702,0.091431,0.040645,-0.045177,0.494489,...,-0.062768,-0.199294,0.063513,-0.016449,0.052270,-0.182385,-0.056296,-0.119073,-0.072385,4.914900e-02
MaturitySize,-0.171811,0.093673,-0.091819,1.000000,0.095142,-0.087722,-0.067352,-0.066321,-0.012957,-0.042832,...,0.011957,0.025390,-0.033443,-0.013765,-0.017168,0.058451,-0.000356,-0.005636,-0.002092,-5.635050e-02
FurLength,0.003036,0.153092,-0.030404,0.095142,1.000000,-0.006010,0.016463,0.032092,0.028096,-0.038388,...,-0.004947,0.032719,-0.006048,0.052962,-0.023061,0.006839,0.021377,0.067560,0.016085,-2.423558e-02
Vaccinated,0.102907,-0.136061,0.078702,-0.087722,-0.006010,1.000000,0.722596,0.471204,0.078893,0.125287,...,-0.095077,-0.072602,0.022933,0.033177,-0.001962,-0.078480,-0.028372,0.012375,0.007709,-7.003437e-03
Dewormed,0.025508,-0.053360,0.091431,-0.067352,0.016463,0.722596,1.000000,0.426256,0.072188,0.141355,...,-0.110021,-0.080326,0.055516,0.005389,-0.073952,-0.083498,-0.019783,0.001895,0.001648,-3.323037e-03
Sterilized,0.006737,-0.189450,0.040645,-0.066321,0.032092,0.471204,0.426256,1.000000,0.057916,0.101919,...,-0.072746,-0.081121,0.044072,0.070864,-0.010398,-0.064101,-0.018195,-0.010531,0.036004,-3.832732e-02
Health,-0.006864,0.103215,-0.045177,-0.012957,0.028096,0.078893,0.072188,0.057916,1.000000,-0.034567,...,-0.026486,-0.002195,-0.013140,0.001603,-0.051623,-0.002032,-0.017303,0.048682,0.031747,2.169995e-03
Quantity,0.036423,-0.113076,0.494489,-0.042832,-0.038388,0.125287,0.141355,0.101919,-0.034567,1.000000,...,-0.104328,-0.236784,0.071570,-0.024744,0.025704,-0.192270,-0.073438,-0.140456,-0.068715,7.668913e-03


In [2]:
imagef = pd.read_csv('img_features.csv').set_index('Unnamed: 0')

imagef.index.name = 'PetID'
imagef.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
PetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
86e1089a3,0.002,0.1678,0.019715,0.015896,0.068162,0.002216,0.005042,0.004828,0.05076,0.047626,...,0.787699,0.176626,0.575706,1.088628,0.439556,0.52046,1.547071,0.832573,0.599093,0.763348
6296e909a,0.002858,0.10745,0.019916,0.023482,0.174765,0.002297,0.005031,0.006338,0.083378,0.049948,...,0.628259,0.686865,0.564,0.96819,1.070276,1.545742,0.894409,0.838595,0.468238,0.916672
3422e4906,0.002734,0.072015,0.024455,0.018021,0.154207,0.001946,0.004211,0.001576,0.100046,0.039717,...,0.579116,0.557625,1.131405,0.720513,1.496671,0.870955,1.289683,1.184462,0.465114,0.892826
5842f1ff5,0.002106,0.274519,0.054815,0.013727,0.089969,0.00165,0.005506,0.004295,0.118727,0.03479,...,1.295853,0.326143,0.291669,1.608086,1.119176,1.470889,0.591444,0.832755,0.483021,1.134126
850a43f90,0.002185,0.174022,0.044818,0.016244,0.169775,0.002075,0.004421,0.004157,0.099671,0.034441,...,1.092663,0.669894,0.395784,0.886075,1.21973,1.033966,1.065686,0.304054,0.438069,0.676817


In [7]:
nmerge = imagef.merge(cf, on='PetID', how='outer')

In [8]:
nmerge.head()

Unnamed: 0_level_0,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,...,10_y,11_y,12_y,13_y,14_y,15_y,16_y,17_y,18_y,19_y
PetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
86e1089a3,0.002,0.1678,0.019715,0.015896,0.068162,0.002216,0.005042,0.004828,0.05076,0.047626,...,0.009754,-0.002826,-0.018925,-0.012456,0.000584,-0.014139,-0.00755,0.018987,0.009588,0.009349
6296e909a,0.002858,0.10745,0.019916,0.023482,0.174765,0.002297,0.005031,0.006338,0.083378,0.049948,...,-0.010259,0.008178,0.010272,0.003087,-0.001423,-0.005038,-0.003081,0.009825,0.003104,0.006718
3422e4906,0.002734,0.072015,0.024455,0.018021,0.154207,0.001946,0.004211,0.001576,0.100046,0.039717,...,0.044125,-0.050633,0.041106,0.008243,0.024158,-0.030801,0.012797,-0.050066,-0.023363,-0.057766
5842f1ff5,0.002106,0.274519,0.054815,0.013727,0.089969,0.00165,0.005506,0.004295,0.118727,0.03479,...,-0.000498,0.044688,-0.03498,0.002213,-0.094938,0.060292,0.026551,-0.040345,-0.087958,2.2e-05
850a43f90,0.002185,0.174022,0.044818,0.016244,0.169775,0.002075,0.004421,0.004157,0.099671,0.034441,...,-0.009017,0.015145,0.099046,0.026728,0.011302,0.002314,0.000365,0.017789,0.06642,0.038391


In [10]:
nmerge.AdoptionSpeed.value_counts()

4    4197
2    4037
3    3259
1    3090
0     410
Name: AdoptionSpeed, dtype: int64

In [12]:
X = nmerge.drop(columns = 'AdoptionSpeed')
y= nmerge['AdoptionSpeed']

smt = SMOTE(random_state=42, k_neighbors=5)
X_SMOTE, y_SMOTE = smt.fit_sample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_SMOTE, y_SMOTE , test_size = 0.2, random_state = 42)

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

In [14]:
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid)
grid.fit(X_train_s, y_train)
bestg = grid.best_estimator_
y_pred = bestg.predict(X_test_s)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_test, y_pred))


Accuracy: 0.62092
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       840
           1       0.61      0.59      0.60       829
           2       0.42      0.41      0.41       828
           3       0.61      0.50      0.55       871
           4       0.51      0.62      0.56       829

    accuracy                           0.62      4197
   macro avg       0.62      0.62      0.62      4197
weighted avg       0.62      0.62      0.62      4197



NameError: name 'roc_auc_score' is not defined

In [16]:
from sklearn.metrics import roc_auc_score
#roc_auc_score(y_test, y_pred_prob, multi_class = 'ovr')

y_pred_tr = bestg.predict(X_train_s)
accuracy = metrics.accuracy_score(y_train, y_pred_tr)
print('Accuracy: {:.5f}'.format(accuracy))
print(classification_report(y_train, y_pred_tr))

Accuracy: 0.99643
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3357
           1       1.00      0.99      1.00      3368
           2       1.00      0.99      1.00      3369
           3       1.00      1.00      1.00      3326
           4       0.99      1.00      0.99      3368

    accuracy                           1.00     16788
   macro avg       1.00      1.00      1.00     16788
weighted avg       1.00      1.00      1.00     16788

