In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the competition CSV files inside the Kaggle input directory.
train_path = '/kaggle/input/petfinder-adoption-prediction/train/train.csv'
test_path = '/kaggle/input/petfinder-adoption-prediction/test/test.csv'
sub_path = '/kaggle/input/petfinder-adoption-prediction/test/sample_submission.csv'

# Load each split and tag its rows with the split they came from, so the
# origin is still known once the two frames are stacked.
train = pd.read_csv(train_path).assign(dataset_type='train')
test = pd.read_csv(test_path).assign(dataset_type='test')
sub = pd.read_csv(sub_path)

# Stack train on top of test (original row labels are kept, not reset) and
# preview everything except the final row.
all_data = pd.concat([train, test], axis=0)
all_data.head(-1)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Select the numeric columns once and reuse the result for both the column
# count and the correlation matrix.
numeric_train = train.select_dtypes(include='number')
numeric_cols = numeric_train.columns
numeric_cols_length = len(numeric_cols)
print(f'numeric_cols_length = {numeric_cols_length}')

# Correlation of each numeric column with the target, sorted from highest to
# lowest. The first entry of the sorted series is skipped (presumably the
# target's correlation with itself).
corr = numeric_train.corr()
target_corr = corr['AdoptionSpeed'].sort_values(ascending=False).iloc[1:]

plt.figure(figsize=(16, 6))
target_corr.plot(kind='bar')
plt.tight_layout()

In [None]:
# Columns fed to every model. The variable name keeps its original spelling
# because other cells refer to it.
fealtures_to_use = [
    'Breed1',
    'Age',
    'Quantity',
    'Gender',
    'MaturitySize',
    'Health',
    'Color1',
    'Color2',
    'Color3',
    'Vaccinated',
    'Sterilized',
    'Type',
    'FurLength',
]

In [None]:
# Keep the selected feature columns that actually exist in the training frame,
# in the order they are listed (original author note: 14993 rows).
train_feature_cols = [name for name in fealtures_to_use if name in train.columns]
train_dataset = train[train_feature_cols]
train_dataset

In [None]:
# Keep the selected feature columns that actually exist in the test frame,
# in the order they are listed (original author note: 3972 rows).
test_feature_cols = [name for name in fealtures_to_use if name in test.columns]
test_dataset = test[test_feature_cols]
test_dataset

**Evaluation metric: quadratic weighted kappa (`cohen_kappa_score` with `weights='quadratic'`)**

In [None]:
from sklearn.metrics import cohen_kappa_score
def kappa(y_true, y_pred):
    """Return the quadratic-weighted Cohen's kappa between two label sequences.

    Thin wrapper around ``sklearn.metrics.cohen_kappa_score`` that fixes
    ``weights='quadratic'``.

    Args:
        y_true: reference labels.
        y_pred: predicted labels, aligned with ``y_true``.
    """
    score = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return score

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold

# Number of cross-validation folds (original note here: "5 class").
n_fold = 5

# Shuffled, stratified splitter with a fixed seed so every model cell
# iterates over the same fold assignment.
folds = StratifiedKFold(n_fold, shuffle=True, random_state=15)

In [None]:
# Target column of the training frame.
y = train.loc[:, 'AdoptionSpeed']

In [None]:
import gc
import time
# ML algorithms
#from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
#import sklearn.linear_model as linear_model

import lightgbm as lgb
from lightgbm import LGBMRegressor
#from sklearn.svm import SVR

#from sklearn.ensemble import GradientBoostingRegressor
#from xgboost import XGBRegressor
#from mlxtend.regressor import StackingCVRegressor

# LightGBM multiclass model evaluated with the shared stratified folds.
# For every fold: train with early stopping on the validation fold, score the
# validation predictions with quadratic kappa, and accumulate the test-set
# class probabilities. The accumulated probabilities are divided by the fold
# count afterwards and stored in result_dict_lgb['prediction'].
result_dict_lgb = {}
scores = []

X = train_dataset
X_test = test_dataset

params = {'num_leaves': 512,
        #  'min_data_in_leaf': 60,
         'objective': 'multiclass',
         'max_depth': -1,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 3,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
        #  "lambda_l1": 0.1,
         # "lambda_l2": 0.1,
         "random_state": 42,
         "verbosity": -1,
         "num_class": 5}

# One probability column per class; the width follows params['num_class'] so
# the two cannot drift apart.
prediction = np.zeros((len(test_dataset), params['num_class']))

for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    # Split into training and validation sets.
    gc.collect()
    print('Fold', fold_n + 1, 'started at', time.ctime(), 'with (train_index, valid_index):', train_index, valid_index)
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    # Build the LightGBM model.
    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train
    # were removed in LightGBM 4.0, while the callbacks work on 3.3+ as well.
    model = lgb.train(params,
                      train_data,
                      num_boost_round=20000,
                      valid_sets=[train_data, valid_data],
                      callbacks=[lgb.early_stopping(stopping_rounds=200),
                                 lgb.log_evaluation(period=500)])

    # Predict class probabilities for the validation fold and the test set
    # using the best iteration found by early stopping.
    y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # argmax turns the per-class probabilities into hard labels for kappa.
    fold_kappa = kappa(y_valid, y_pred_valid.argmax(1))
    scores.append(fold_kappa)
    print('Fold kappa:', fold_kappa)
    print('')
    prediction += y_pred

# Divide by the number of folds to get the average probabilities.
prediction /= n_fold

print('scores kappa: ', scores)
print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
result_dict_lgb['prediction'] = prediction
        

In [None]:
#
import xgboost as xgb
# XGBoost multiclass model evaluated with the shared stratified folds.
# For every fold: train with early stopping on the validation fold, score the
# validation predictions with quadratic kappa, and accumulate the test-set
# class probabilities. The accumulated probabilities are divided by the fold
# count afterwards and stored in result_dict_xgb['prediction'].
result_dict_xgb = {}
prediction = np.zeros((len(test_dataset), 5))
scores = []

X = train_dataset
X_test = test_dataset

# See more parameters at: https://xgboost.readthedocs.io/en/latest/parameter.html
# 'verbosity': 0 replaces the former 'silent': True, which current XGBoost
# releases no longer recognise.
params = {'eta': 0.01,
          'max_depth': 9,
          'subsample': 0.9,
          'colsample_bytree': 0.9,
          'objective': 'multi:softprob',
          'eval_metric': 'merror',
          'verbosity': 0,
          'nthread': 4,
          'num_class': 5}

for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    gc.collect()
    print('Fold', fold_n + 1, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    train_data = xgb.DMatrix(data=X_train, label=y_train)
    valid_data = xgb.DMatrix(data=X_valid, label=y_valid)

    # The last entry of the watchlist is the one early stopping monitors.
    watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
    model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=500, params=params)

    # Predict with the boosting rounds up to and including the best one.
    # `iteration_range` replaces `ntree_limit=model.best_ntree_limit`, both of
    # which were removed in XGBoost 2.0.
    best_range = (0, model.best_iteration + 1)
    y_pred_valid = model.predict(xgb.DMatrix(X_valid), iteration_range=best_range)
    y_pred = model.predict(xgb.DMatrix(X_test), iteration_range=best_range)

    print('y_valid:', y_valid)
    print('y_pred_valid:', y_pred_valid.argmax(1))

    # argmax turns the per-class probabilities into hard labels for kappa.
    fold_kappa = kappa(y_valid, y_pred_valid.argmax(1))
    scores.append(fold_kappa)
    print('Fold kappa:', fold_kappa)
    print('')
    prediction += y_pred

# Average the accumulated test probabilities over the folds.
prediction /= n_fold

print('scores kappa: ', scores)
print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
result_dict_xgb['prediction'] = prediction

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# scikit-learn GradientBoostingClassifier evaluated with the shared folds.
# Each fold predicts hard class labels; the test labels are summed per fold and
# divided by the fold count afterwards, so the stored result is a float average
# of labels rather than class probabilities.
result_dict_gbc = {}
scores = []
prediction = np.zeros(len(test_dataset))

X, X_test = train_dataset, test_dataset

for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    gc.collect()
    print('Fold', fold_n + 1, 'started at', time.ctime())

    # Slice features and target for this fold.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = X.iloc[valid_index], y.iloc[valid_index]

    clf = GradientBoostingClassifier(random_state=0)
    clf.fit(X_train, y_train)

    y_pred_valid = clf.predict(X_valid)
    y_pred = clf.predict(X_test)

    # Score the validation fold once and reuse the value for the log line.
    fold_kappa = kappa(y_valid, y_pred_valid)
    scores.append(fold_kappa)
    print('Fold kappa:', fold_kappa)
    prediction += y_pred

prediction /= n_fold

print('scores kappa: ', scores)
print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
result_dict_gbc['prediction'] = prediction

In [None]:
# Keras specific
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical 
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# scikit-learn MLPClassifier evaluated with the shared stratified folds.
# Features are standardised per fold. Each fold predicts hard class labels;
# the test labels are summed per fold and divided by the fold count
# afterwards, so the stored result is a float average of labels.
result_dict_MLP = {}
prediction = np.zeros((len(test_dataset)))
scores = []

X = train_dataset
X_test = test_dataset

sc = StandardScaler()

for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    gc.collect()
    print('Fold', fold_n + 1, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # Learn the scaling statistics from the training fold only and apply that
    # same transform to the validation fold and the test set. Re-fitting the
    # scaler on validation/test data would standardise each set with its own
    # mean/std, so the model would see inputs on a different scale than the
    # one it was trained on.
    X_train = sc.fit_transform(X_train)
    X_valid = sc.transform(X_valid)
    # Scale into a separate variable: X_test must stay unscaled so that every
    # fold transforms the original test features, not already-scaled ones.
    X_test_scaled = sc.transform(X_test)

    model = MLPClassifier(max_iter= 500, activation = 'relu',solver='adam',random_state=1)

    # Fit the network on the scaled training fold.
    model.fit(X_train, y_train)

    y_pred_valid = model.predict(X_valid)
    y_pred = model.predict(X_test_scaled)

    fold_kappa = kappa(y_valid, y_pred_valid)
    scores.append(fold_kappa)
    print('Fold kappa:', fold_kappa)

    print('y_pred:', y_pred)
    prediction += y_pred

prediction /= n_fold

print('scores kappa: ', scores)
print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
result_dict_MLP['prediction'] = prediction

In [None]:
# Most probable class per test row from the stored LightGBM probabilities.
submission_prediction_lgb = result_dict_lgb['prediction'].argmax(1)
submission_lgb = pd.DataFrame({
    'PetID': sub.PetID,
    'AdoptionSpeed': list(map(int, submission_prediction_lgb)),
})
submission_lgb.head()

In [None]:
# Most probable class per test row from the stored XGBoost probabilities.
submission_prediction_xgb = result_dict_xgb['prediction'].argmax(1)
submission_xgb = pd.DataFrame({
    'PetID': sub.PetID,
    'AdoptionSpeed': list(map(int, submission_prediction_xgb)),
})
submission_xgb.head()

In [None]:
# The stored gradient-boosting prediction is already one value per test row;
# it is exposed here as plain floats.
submission_prediction_gbc = result_dict_gbc['prediction']
submission_gbc = pd.DataFrame({
    'PetID': sub.PetID,
    'AdoptionSpeed': list(map(float, submission_prediction_gbc)),
})
submission_gbc.head()

In [None]:
# The stored MLP prediction is already one value per test row; it is exposed
# here as plain floats.
submission_prediction_MLP = result_dict_MLP['prediction']
submission_MLP = pd.DataFrame({
    'PetID': sub.PetID,
    'AdoptionSpeed': list(map(float, submission_prediction_MLP)),
})
submission_MLP.head()

In [None]:
# Ensemble: mean of the four per-model label estimates. LightGBM and XGBoost
# contribute their argmax class; the other two contribute their stored
# per-row values directly.
lgb_labels = result_dict_lgb['prediction'].argmax(1)
xgb_labels = result_dict_xgb['prediction'].argmax(1)
label_sum = lgb_labels + xgb_labels + result_dict_gbc['prediction'] + result_dict_MLP['prediction']
submission_prediction_all = label_sum / 4

# One column per model for side-by-side inspection, plus the rounded ensemble
# label in 'AdoptionSpeed'.
submission_all = pd.DataFrame({
    'PetID': sub.PetID,
    'AdoptionSpeed_lgb': list(submission_prediction_lgb),                             # lgb
    'AdoptionSpeed_xgb': list(submission_prediction_xgb),                             # xgboost
    'AdoptionSpeed_gbc': [int(round(value)) for value in submission_prediction_gbc],  # GradientBoostingClassifier
    'AdoptionSpeed_MLP': [int(round(value)) for value in submission_prediction_MLP],  # MLP
    'AdoptionSpeed': [int(round(value)) for value in submission_prediction_all],
})
submission_all.head(50)

In [None]:
# Keep only the pet ID and the ensembled label from the comparison frame and
# write them to submission.csv without the DataFrame index.
submission = submission_all.loc[:, ['PetID', 'AdoptionSpeed']]
submission.to_csv('submission.csv', index=False)
