In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import confusion_matrix, classification_report
import os

data = pd.read_csv('preprocess_done/train_preprocess_done.csv')
data = pd.DataFrame(data)
test_data = pd.read_csv('preprocess_done/test_preprocess_done.csv')
test_data = pd.DataFrame(test_data)

features_col = ['Name','AgeuponOutcome','Breed','Color','Intact Female',
                        'Intact Male','Neutered Male','Spayed Female','Unknown','Cat','Dog']

folder_name = 'Output'
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

In [2]:
# 正式的train test
X = data.loc[:, features_col]
y = data.OutcomeType
X_test = test_data.loc[:, features_col]

# 自己切來調整的train val
from sklearn.model_selection import train_test_split
train, val = train_test_split(data, test_size=0.2, random_state=32)
X_train = train.loc[:, features_col]
y_train = train.OutcomeType
X_val = val.loc[:, features_col]
y_val = val.OutcomeType

In [3]:
# RF
RandomForestModel = RandomForestClassifier(
                            n_estimators=500,
                            random_state=0,
                            max_depth=8,)
                             
RandomForestModel.fit(X_train, y_train)

val_proba = RandomForestModel.predict_proba(X_val)
print('Log Loss', log_loss(y_val,val_proba))

y_valid_pred=RandomForestModel.predict(X_val)
print(classification_report(y_val, y_valid_pred))
print(confusion_matrix(y_val, y_valid_pred))

# -----------------------------
RandomForestModel = RandomForestClassifier(
                            n_estimators=500,
                            random_state=0,
                            max_depth=8,)
RandomForestModel.fit(X, y)
test_proba = RandomForestModel.predict_proba(X_test)
sub = pd.read_csv('data/sample_submission.csv')
sub.iloc[:,1:] = test_proba
sub.to_csv((os.path.join(folder_name, 'submission_rf.csv')), index=False)

Log Loss 0.865436672513836


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.63      0.84      0.72      2089
           1       0.00      0.00      0.00        46
           2       0.68      0.11      0.19       335
           3       0.50      0.50      0.50       965
           4       0.77      0.61      0.68      1911

    accuracy                           0.65      5346
   macro avg       0.52      0.41      0.42      5346
weighted avg       0.65      0.65      0.63      5346

[[1762    0    2  249   76]
 [   7    0    1    4   34]
 [  52    0   36   78  169]
 [ 406    0    2  487   70]
 [ 570    0   12  158 1171]]


In [4]:
# ExtraTrees
ExtraTreesModel = ExtraTreesClassifier(
                            n_estimators=500,
                            random_state=0,
                            max_depth=9,)                      
ExtraTreesModel.fit(X_train, y_train)

val_proba = ExtraTreesModel.predict_proba(X_val)
print('Log Loss', log_loss(y_val,val_proba))

y_valid_pred=ExtraTreesModel.predict(X_val)
print(classification_report(y_val, y_valid_pred))
print(confusion_matrix(y_val, y_valid_pred))  

# -----------------------------
ExtraTreesModel = ExtraTreesClassifier(
                            n_estimators=500,
                            random_state=0,
                            max_depth=9,)
ExtraTreesModel.fit(X, y)
test_proba = ExtraTreesModel.predict_proba(X_test)
sub = pd.read_csv('data/sample_submission.csv')
sub.iloc[:,1:] = test_proba
sub.to_csv((os.path.join(folder_name, 'submission_ExtraTree.csv')), index=False)

Log Loss 0.8882159387876221


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.59      0.91      0.72      2089
           1       0.00      0.00      0.00        46
           2       0.63      0.09      0.16       335
           3       0.53      0.31      0.39       965
           4       0.76      0.61      0.68      1911

    accuracy                           0.64      5346
   macro avg       0.50      0.38      0.39      5346
weighted avg       0.64      0.64      0.60      5346

[[1901    0    2  113   73]
 [   9    0    1    2   34]
 [  74    0   31   55  175]
 [ 582    0    4  299   80]
 [ 642    0   11   94 1164]]


In [5]:
# GradientBoosting
GradientBoostingModel = GradientBoostingClassifier(n_estimators=200, 
                                    learning_rate=0.05,
                                    max_depth=2, 
                                    random_state=0)

GradientBoostingModel.fit(X_train, y_train)

val_proba = GradientBoostingModel.predict_proba(X_val)
print('Log Loss', log_loss(y_val,val_proba))

y_valid_pred=GradientBoostingModel.predict(X_val)
print(classification_report(y_val, y_valid_pred))
print(confusion_matrix(y_val, y_valid_pred))    

# -----------------------------
GradientBoostingModel = GradientBoostingClassifier(n_estimators=200, 
                                    learning_rate=0.05,
                                    max_depth=2, 
                                    random_state=0)
GradientBoostingModel.fit(X, y)
test_proba = GradientBoostingModel.predict_proba(X_test)
sub = pd.read_csv('data/sample_submission.csv')
sub.iloc[:,1:] = test_proba
sub.to_csv((os.path.join(folder_name, 'submission_GB.csv')), index=False)

Log Loss 0.868911177904816
              precision    recall  f1-score   support

           0       0.63      0.84      0.72      2089
           1       0.00      0.00      0.00        46
           2       0.61      0.10      0.17       335
           3       0.50      0.49      0.50       965
           4       0.76      0.62      0.68      1911

    accuracy                           0.65      5346
   macro avg       0.50      0.41      0.42      5346
weighted avg       0.65      0.65      0.63      5346



  _warn_prf(average, modifier, msg_start, len(result))


[[1753    0    3  246   87]
 [   6    0    1    4   35]
 [  51    0   34   79  171]
 [ 410    0    2  477   76]
 [ 562    0   16  147 1186]]


In [6]:
# SVM
SVCModel = SVC(gamma='auto',
                probability=True,
                random_state=0,)

SVCModel.fit(X_train, y_train)      

val_proba = SVCModel.predict_proba(X_val)
print('Log Loss', log_loss(y_val,val_proba))

y_valid_pred=SVCModel.predict(X_val)
print(classification_report(y_val, y_valid_pred))
print(confusion_matrix(y_val, y_valid_pred))  

# -----------------------------
SVCModel = SVC(gamma='auto',
                probability=True,
                random_state=0,)
SVCModel.fit(X, y)
test_proba = SVCModel.predict_proba(X_test)
sub = pd.read_csv('data/sample_submission.csv')
sub.iloc[:,1:] = test_proba
sub.to_csv((os.path.join(folder_name, 'submission_SVM.csv')), index=False)

Log Loss 0.9403330663746119


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.64      0.83      0.72      2089
           1       0.00      0.00      0.00        46
           2       0.60      0.07      0.13       335
           3       0.49      0.51      0.50       965
           4       0.75      0.63      0.68      1911

    accuracy                           0.64      5346
   macro avg       0.50      0.41      0.41      5346
weighted avg       0.64      0.64      0.63      5346

[[1727    0    4  238  120]
 [   7    0    1    4   34]
 [  52    0   24   88  171]
 [ 394    0    3  494   74]
 [ 524    0    8  176 1203]]


In [7]:
# XGBoost
from sklearn.utils import class_weight
xgbModel = xgb.XGBClassifier(objective="multi:softprob", 
                                random_state=0)

xgbModel.fit(X_train, y_train)      

val_proba = xgbModel.predict_proba(X_val)
print('Log Loss', log_loss(y_val,val_proba))

y_valid_pred=xgbModel.predict(X_val)
print(classification_report(y_val, y_valid_pred))
print(confusion_matrix(y_val, y_valid_pred))
# -----------------------------
xgbModel = xgb.XGBClassifier(objective="multi:softprob",
                                random_state=0)
xgbModel.fit(X, y)
test_proba = xgbModel.predict_proba(X_test)
sub = pd.read_csv('data/sample_submission.csv')
sub.iloc[:,1:] = test_proba
sub.to_csv((os.path.join(folder_name, 'submission_xgb.csv')), index=False)

Log Loss 0.8736809389450575
              precision    recall  f1-score   support

           0       0.64      0.83      0.72      2089
           1       0.00      0.00      0.00        46
           2       0.61      0.16      0.25       335
           3       0.50      0.49      0.50       965
           4       0.75      0.62      0.68      1911

    accuracy                           0.65      5346
   macro avg       0.50      0.42      0.43      5346
weighted avg       0.65      0.65      0.63      5346

[[1744    0    4  227  114]
 [   6    0    3    4   33]
 [  52    0   54   76  153]
 [ 400    0    5  471   89]
 [ 537    2   23  159 1190]]


In [8]:
# PyCaret
from pycaret.classification import *
experiment = setup(data, target='OutcomeType')

Unnamed: 0,Description,Value
0,session_id,5040
1,Target,OutcomeType
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(26729, 12)"
5,Missing Values,False
6,Numeric Features,1
7,Categorical Features,10
8,Ordinal Features,False
9,High Cardinality Features,False


In [9]:
best_model = compare_models(fold = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.6331,0.8293,0.3967,0.6333,0.6157,0.4385,0.4465,2.052
lightgbm,Light Gradient Boosting Machine,0.6305,0.8258,0.3934,0.629,0.6113,0.4322,0.4423,9.354
xgboost,Extreme Gradient Boosting,0.6288,0.8253,0.3895,0.6239,0.6095,0.4296,0.4392,2.056
catboost,CatBoost Classifier,0.6272,0.8232,0.3858,0.6211,0.6069,0.4265,0.4363,7.532
lr,Logistic Regression,0.6268,0.8084,0.3694,0.612,0.5944,0.4153,0.4328,6.872
lda,Linear Discriminant Analysis,0.625,0.8045,0.3729,0.6073,0.5942,0.4139,0.4315,0.028
rf,Random Forest Classifier,0.6223,0.8134,0.3859,0.6171,0.6024,0.4193,0.4294,0.382
nb,Naive Bayes,0.622,0.7848,0.361,0.6114,0.5818,0.4015,0.4297,1.434
ridge,Ridge Classifier,0.6219,0.0,0.3444,0.575,0.5736,0.3982,0.424,0.018
ada,Ada Boost Classifier,0.6187,0.771,0.373,0.5935,0.5911,0.4071,0.4178,0.21


In [10]:
experiment

(0        3
 1        2
 2        0
 3        4
 4        4
         ..
 26724    4
 26725    4
 26726    0
 26727    4
 26728    4
 Name: OutcomeType, Length: 26729, dtype: int64,
 False,
 '2fef',
 -1,
        Name  OutcomeType  AgeuponOutcome  Breed  Color  Intact Female  \
 0         1            3           365.0      1      1              0   
 1         1            2           365.0      1      0              0   
 2         1            0           730.0      1      1              0   
 3         0            4            21.0      1      0              0   
 4         0            4           730.0      0      0              0   
 ...     ...          ...             ...    ...    ...            ...   
 26724     0            4            30.0      1      1              0   
 26725     0            4            90.0      1      0              0   
 26726     1            0          1460.0      1      1              0   
 26727     0            4            28.0      1      0  

In [11]:
best_model

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=5040, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [12]:
# PyCaret - Test
predictions = predict_model(best_model, raw_score=True, data=X_test)
predictions.head()

Unnamed: 0,Name,AgeuponOutcome,Breed,Color,Intact Female,Intact Male,Neutered Male,Spayed Female,Unknown,Cat,Dog,Label,Score_0,Score_1,Score_2,Score_3,Score_4
0,1,300.0,1,1,1,0,0,0,0,0,1,4,0.0841,0.0072,0.0447,0.2411,0.6228
1,1,730.0,0,1,0,0,0,1,0,0,1,0,0.5448,0.0022,0.0271,0.2549,0.171
2,1,365.0,1,0,0,0,1,0,0,1,0,0,0.4962,0.0018,0.0166,0.2429,0.2424
3,1,120.0,1,0,0,1,0,0,0,0,1,4,0.1481,0.0103,0.0513,0.1,0.6902
4,1,730.0,1,0,0,0,1,0,0,0,1,0,0.376,0.0025,0.0274,0.3355,0.2585


In [13]:
predictions.to_csv((os.path.join(folder_name, 'PyCaret_BestModel_submission.csv')), index=False)