In [1]:
from tensorflow import set_random_seed
set_random_seed(2019)

In [2]:
import gc
import glob
import os
import json
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
import warnings

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from joblib import Parallel, delayed
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

np.random.seed(seed=1337)
warnings.filterwarnings('ignore')

split_char = '/'

In [3]:
os.listdir('../input')

['petfinder-external-data', 'densenet-keras', 'petfinder-adoption-prediction']

In [4]:
train = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv')
test = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')
sample_submission = pd.read_csv('../input/petfinder-adoption-prediction/test/sample_submission.csv')

## Image features

In [5]:
import cv2
import os
from keras.applications.densenet import preprocess_input, DenseNet121

Using TensorFlow backend.


In [6]:
def resize_to_square(im):
    old_size = im.shape[:2]
    ratio = float(img_size)/max(old_size)
    new_size = tuple([int(x*ratio) for x in old_size])
    im = cv2.resize(im, (new_size[1], new_size[0]))
    delta_w = img_size - new_size[1]
    delta_h = img_size - new_size[0]
    top, bottom = delta_h//2, delta_h-(delta_h//2)
    left, right = delta_w//2, delta_w-(delta_w//2)
    color = [0, 0, 0]
    new_im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT,value=color)
    return new_im

def load_image(path, pet_id):
    image = cv2.imread(f'{path}{pet_id}-1.jpg')
    new_image = resize_to_square(image)
    new_image = preprocess_input(new_image)
    return new_image

In [7]:
img_size = 256
batch_size = 256

In [8]:
from keras.models import Model
from keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D
import keras.backend as K
inp = Input((256,256,3))
backbone = DenseNet121(input_tensor = inp, 
                       weights="../input/densenet-keras/DenseNet-BC-121-32-no-top.h5",
                       include_top = False)
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Lambda(lambda x: K.expand_dims(x,axis = -1))(x)
x = AveragePooling1D(4)(x)
out = Lambda(lambda x: x[:,:,0])(x)

m = Model(inp,out)

Instructions for updating:
Colocations handled automatically by placer.


In [9]:
pet_ids = train['PetID'].values
n_batches = len(pet_ids) // batch_size + (len(pet_ids) % batch_size != 0)

features = {}
for b in tqdm(range(n_batches)):
    start = b*batch_size
    end = (b+1)*batch_size
    batch_pets = pet_ids[start:end]
    batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
    for i,pet_id in enumerate(batch_pets):
        try:
            batch_images[i] = load_image("../input/petfinder-adoption-prediction/train_images/", pet_id)
        except:
            pass
    batch_preds = m.predict(batch_images)
    for i,pet_id in enumerate(batch_pets):
        features[pet_id] = batch_preds[i]

100%|██████████| 59/59 [02:30<00:00,  2.23s/it]


In [10]:
train_feats = pd.DataFrame.from_dict(features, orient='index')
train_feats.columns = [f'pic_{i}' for i in range(train_feats.shape[1])]

In [11]:
pet_ids = test['PetID'].values
n_batches = len(pet_ids) // batch_size + (len(pet_ids) % batch_size != 0)

features = {}
for b in tqdm(range(n_batches)):
    start = b*batch_size
    end = (b+1)*batch_size
    batch_pets = pet_ids[start:end]
    batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
    for i,pet_id in enumerate(batch_pets):
        try:
            batch_images[i] = load_image("../input/petfinder-adoption-prediction/test_images/", pet_id)
        except:
            pass
    batch_preds = m.predict(batch_images)
    for i,pet_id in enumerate(batch_pets):
        features[pet_id] = batch_preds[i]

100%|██████████| 16/16 [00:38<00:00,  2.08s/it]


In [12]:
test_feats = pd.DataFrame.from_dict(features, orient='index')
test_feats.columns = [f'pic_{i}' for i in range(test_feats.shape[1])]

In [13]:
train_feats = train_feats.reset_index()
train_feats.rename({'index': 'PetID'}, axis='columns', inplace=True)

test_feats = test_feats.reset_index()
test_feats.rename({'index': 'PetID'}, axis='columns', inplace=True)

In [14]:
all_ids = pd.concat([train, test], axis=0, ignore_index=True, sort=False)[['PetID']]
all_ids.shape

(18941, 1)

In [15]:
n_components = 24
svd_ = TruncatedSVD(n_components=n_components, random_state=1337)

features_df = pd.concat([train_feats, test_feats], axis=0)
features = features_df[[f'pic_{i}' for i in range(256)]].values

svd_col = svd_.fit_transform(features)
svd_col = pd.DataFrame(svd_col)
svd_col = svd_col.add_prefix('IMG_SVD_')

# Version10
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA, FastICA,NMF,LatentDirichletAllocation,IncrementalPCA,MiniBatchSparsePCA
from sklearn.decomposition import TruncatedSVD,FactorAnalysis,KernelPCA

train_columns = [c for c in train_feats.columns if c not in ['PetID']]


# PCA
n_comp = 12

# ICA
ica = FastICA(n_components=n_comp, random_state=2019)
ica2_results_train = pd.DataFrame(ica.fit_transform(train_feats[train_columns].fillna(-1)))
ica2_results_test = pd.DataFrame(ica.transform(test_feats[train_columns].fillna(-1)))
ica2_results = pd.concat([ica2_results_train,ica2_results_test]).reset_index(drop=True)
ica2_results = ica2_results.add_prefix('IMG_PCA_')

In [16]:
img_features = pd.concat([all_ids, svd_col], axis=1)

#Version10
#img_features = pd.concat([img_features, ica2_results], axis=1)

## About metadata and sentiment

In [17]:
labels_breed = pd.read_csv('../input/petfinder-adoption-prediction/breed_labels.csv')
labels_state = pd.read_csv('../input/petfinder-adoption-prediction/color_labels.csv')
labels_color = pd.read_csv('../input/petfinder-adoption-prediction/state_labels.csv')

In [18]:
train_image_files = sorted(glob.glob('../input/petfinder-adoption-prediction/train_images/*.jpg'))
train_metadata_files = sorted(glob.glob('../input/petfinder-adoption-prediction/train_metadata/*.json'))
train_sentiment_files = sorted(glob.glob('../input/petfinder-adoption-prediction/train_sentiment/*.json'))

print(f'num of train images files: {len(train_image_files)}')
print(f'num of train metadata files: {len(train_metadata_files)}')
print(f'num of train sentiment files: {len(train_sentiment_files)}')


test_image_files = sorted(glob.glob('../input/petfinder-adoption-prediction/test_images/*.jpg'))
test_metadata_files = sorted(glob.glob('../input/petfinder-adoption-prediction/test_metadata/*.json'))
test_sentiment_files = sorted(glob.glob('../input/petfinder-adoption-prediction/test_sentiment/*.json'))

print(f'num of test images files: {len(test_image_files)}')
print(f'num of test metadata files: {len(test_metadata_files)}')
print(f'num of test sentiment files: {len(test_sentiment_files)}')

num of train images files: 58311
num of train metadata files: 58311
num of train sentiment files: 14442
num of test images files: 15040
num of test metadata files: 15040
num of test sentiment files: 3815


### Train

In [19]:
# Images:
train_df_ids = train[['PetID']]
print(train_df_ids.shape)

# Metadata:
train_df_ids = train[['PetID']]
train_df_metadata = pd.DataFrame(train_metadata_files)
train_df_metadata.columns = ['metadata_filename']
train_metadata_pets = train_df_metadata['metadata_filename'].apply(lambda x: x.split(split_char)[-1].split('-')[0])
train_df_metadata = train_df_metadata.assign(PetID=train_metadata_pets)
print(len(train_metadata_pets.unique()))

pets_with_metadatas = len(np.intersect1d(train_metadata_pets.unique(), train_df_ids['PetID'].unique()))
print(f'fraction of pets with metadata: {pets_with_metadatas / train_df_ids.shape[0]:.3f}')

# Sentiment:
train_df_ids = train[['PetID']]
train_df_sentiment = pd.DataFrame(train_sentiment_files)
train_df_sentiment.columns = ['sentiment_filename']
train_sentiment_pets = train_df_sentiment['sentiment_filename'].apply(lambda x: x.split(split_char)[-1].split('.')[0])
train_df_sentiment = train_df_sentiment.assign(PetID=train_sentiment_pets)
print(len(train_sentiment_pets.unique()))

pets_with_sentiments = len(np.intersect1d(train_sentiment_pets.unique(), train_df_ids['PetID'].unique()))
print(f'fraction of pets with sentiment: {pets_with_sentiments / train_df_ids.shape[0]:.3f}')

(14993, 1)
14652
fraction of pets with metadata: 0.977
14442
fraction of pets with sentiment: 0.963


### Test

In [20]:
# Images:
test_df_ids = test[['PetID']]
print(test_df_ids.shape)

# Metadata:
test_df_metadata = pd.DataFrame(test_metadata_files)
test_df_metadata.columns = ['metadata_filename']
test_metadata_pets = test_df_metadata['metadata_filename'].apply(lambda x: x.split(split_char)[-1].split('-')[0])
test_df_metadata = test_df_metadata.assign(PetID=test_metadata_pets)
print(len(test_metadata_pets.unique()))

pets_with_metadatas = len(np.intersect1d(test_metadata_pets.unique(), test_df_ids['PetID'].unique()))
print(f'fraction of pets with metadata: {pets_with_metadatas / test_df_ids.shape[0]:.3f}')

# Sentiment:
test_df_sentiment = pd.DataFrame(test_sentiment_files)
test_df_sentiment.columns = ['sentiment_filename']
test_sentiment_pets = test_df_sentiment['sentiment_filename'].apply(lambda x: x.split(split_char)[-1].split('.')[0])
test_df_sentiment = test_df_sentiment.assign(PetID=test_sentiment_pets)
print(len(test_sentiment_pets.unique()))

pets_with_sentiments = len(np.intersect1d(test_sentiment_pets.unique(), test_df_ids['PetID'].unique()))
print(f'fraction of pets with sentiment: {pets_with_sentiments / test_df_ids.shape[0]:.3f}')

(3948, 1)
3821
fraction of pets with metadata: 0.968
3815
fraction of pets with sentiment: 0.966


## Extract features from json

In [21]:
class PetFinderParser(object):
    
    def __init__(self, debug=False):
        
        self.debug = debug
        self.sentence_sep = ' '
        
        self.extract_sentiment_text = False
    
    def open_json_file(self, filename):
        with open(filename, 'r', encoding='utf-8') as f:
            json_file = json.load(f)
        return json_file
        
    def parse_sentiment_file(self, file):
        """
        Parse sentiment file. Output DF with sentiment features.
        """
        
        file_sentiment = file['documentSentiment']
        file_entities = [x['name'] for x in file['entities']]
        file_entities = self.sentence_sep.join(file_entities)
        
        file_sentences_sentiment = [x['sentiment'] for x in file['sentences']]
        
        file_sentences_sentiment = pd.DataFrame.from_dict(
            file_sentences_sentiment, orient='columns')
        file_sentences_sentiment_df = pd.DataFrame(
            {
                'magnitude_sum': file_sentences_sentiment['magnitude'].sum(axis=0),
                'score_sum': file_sentences_sentiment['score'].sum(axis=0),
                'magnitude_median': file_sentences_sentiment['magnitude'].median(axis=0),
                'score_median': file_sentences_sentiment['score'].median(axis=0),
                #'magnitude_mean': file_sentences_sentiment['magnitude'].mean(axis=0),
                #'score_mean': file_sentences_sentiment['score'].mean(axis=0),
                'magnitude_var': file_sentences_sentiment['magnitude'].var(axis=0),
                'score_var': file_sentences_sentiment['score'].var(axis=0),
                'score_min': file_sentences_sentiment['score'].min(axis=0),
                'score_max': file_sentences_sentiment['score'].max(axis=0),
                'magnitude_min': file_sentences_sentiment['magnitude'].min(axis=0),
                'magnitude_max': file_sentences_sentiment['magnitude'].max(axis=0)
            }, index=[0]
        )
        
        df_sentiment = pd.DataFrame.from_dict(file_sentiment, orient='index').T
        df_sentiment = pd.concat([df_sentiment, file_sentences_sentiment_df], axis=1)
            
        df_sentiment['entities'] = file_entities
        df_sentiment = df_sentiment.add_prefix('sentiment_')
        
        return df_sentiment
    
    def parse_metadata_file(self, file):
        """
        Parse metadata file. Output DF with metadata features.
        """
        
        file_keys = list(file.keys())
        
        if 'labelAnnotations' in file_keys:
            file_annots = file['labelAnnotations']
            file_top_score = np.asarray([x['score'] for x in file_annots]).mean()
            file_top_desc = [x['description'] for x in file_annots]
        else:
            file_top_score = np.nan
            file_top_desc = ['']
        
        file_colors = file['imagePropertiesAnnotation']['dominantColors']['colors']
        file_crops = file['cropHintsAnnotation']['cropHints']

        file_color_score = np.asarray([x['score'] for x in file_colors]).mean()
        file_color_pixelfrac = np.asarray([x['pixelFraction'] for x in file_colors]).mean()

        file_crop_conf = np.asarray([x['confidence'] for x in file_crops]).mean()
        
        if 'importanceFraction' in file_crops[0].keys():
            file_crop_importance = np.asarray([x['importanceFraction'] for x in file_crops]).mean()
        else:
            file_crop_importance = np.nan

        df_metadata = {
            'annots_score': file_top_score,
            'color_score': file_color_score,
            'color_pixelfrac': file_color_pixelfrac,
            'crop_conf': file_crop_conf,
            'crop_importance': file_crop_importance,
            'annots_top_desc': self.sentence_sep.join(file_top_desc)
        }
        
        df_metadata = pd.DataFrame.from_dict(df_metadata, orient='index').T
        df_metadata = df_metadata.add_prefix('metadata_')
        
        return df_metadata
    

def extract_additional_features(pet_id, mode='train'):
    
    sentiment_filename = f'../input/petfinder-adoption-prediction/{mode}_sentiment/{pet_id}.json'
    try:
        sentiment_file = pet_parser.open_json_file(sentiment_filename)
        df_sentiment = pet_parser.parse_sentiment_file(sentiment_file)
        df_sentiment['PetID'] = pet_id
    except FileNotFoundError:
        df_sentiment = []

    dfs_metadata = []
    metadata_filenames = sorted(glob.glob(f'../input/petfinder-adoption-prediction/{mode}_metadata/{pet_id}*.json'))
    if len(metadata_filenames) > 0:
        for f in metadata_filenames:
            metadata_file = pet_parser.open_json_file(f)
            df_metadata = pet_parser.parse_metadata_file(metadata_file)
            df_metadata['PetID'] = pet_id
            dfs_metadata.append(df_metadata)
        dfs_metadata = pd.concat(dfs_metadata, ignore_index=True, sort=False)
    dfs = [df_sentiment, dfs_metadata]
    
    return dfs


pet_parser = PetFinderParser()

In [22]:
debug = False
train_pet_ids = train.PetID.unique()
test_pet_ids = test.PetID.unique()

if debug:
    train_pet_ids = train_pet_ids[:1000]
    test_pet_ids = test_pet_ids[:500]


dfs_train = Parallel(n_jobs=-1, verbose=1)(
    delayed(extract_additional_features)(i, mode='train') for i in train_pet_ids)

train_dfs_sentiment = [x[0] for x in dfs_train if isinstance(x[0], pd.DataFrame)]
train_dfs_metadata = [x[1] for x in dfs_train if isinstance(x[1], pd.DataFrame)]

train_dfs_sentiment = pd.concat(train_dfs_sentiment, ignore_index=True, sort=False)
train_dfs_metadata = pd.concat(train_dfs_metadata, ignore_index=True, sort=False)

print(train_dfs_sentiment.shape, train_dfs_metadata.shape)


dfs_test = Parallel(n_jobs=-1, verbose=1)(
    delayed(extract_additional_features)(i, mode='test') for i in test_pet_ids)

test_dfs_sentiment = [x[0] for x in dfs_test if isinstance(x[0], pd.DataFrame)]
test_dfs_metadata = [x[1] for x in dfs_test if isinstance(x[1], pd.DataFrame)]

test_dfs_sentiment = pd.concat(test_dfs_sentiment, ignore_index=True, sort=False)
test_dfs_metadata = pd.concat(test_dfs_metadata, ignore_index=True, sort=False)

print(test_dfs_sentiment.shape, test_dfs_metadata.shape)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:   38.3s
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 2446 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 4996 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 6046 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 7196 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 8446 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 9796 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done 11246 tasks      |

(14442, 14) (58311, 7)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 172 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 772 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done 1772 tasks      | elapsed:   48.8s
[Parallel(n_jobs=-1)]: Done 3172 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 3948 out of 3948 | elapsed:  1.8min finished


(3815, 14) (15040, 7)


### group extracted features by PetID:

In [23]:
aggregates = ['mean', 'var']
sent_agg = ['sum']


# Train
train_metadata_desc = train_dfs_metadata.groupby(['PetID'])['metadata_annots_top_desc'].unique()
train_metadata_desc = train_metadata_desc.reset_index()
train_metadata_desc[
    'metadata_annots_top_desc'] = train_metadata_desc[
    'metadata_annots_top_desc'].apply(lambda x: ' '.join(x))

prefix = 'metadata'
train_metadata_gr = train_dfs_metadata.drop(['metadata_annots_top_desc'], axis=1)
for i in train_metadata_gr.columns:
    if 'PetID' not in i:
        train_metadata_gr[i] = train_metadata_gr[i].astype(float)
train_metadata_gr = train_metadata_gr.groupby(['PetID']).agg(aggregates)
train_metadata_gr.columns = pd.Index([f'{c[0]}_{c[1].upper()}' for c in train_metadata_gr.columns.tolist()])
train_metadata_gr = train_metadata_gr.reset_index()


train_sentiment_desc = train_dfs_sentiment.groupby(['PetID'])['sentiment_entities'].unique()
train_sentiment_desc = train_sentiment_desc.reset_index()
train_sentiment_desc[
    'sentiment_entities'] = train_sentiment_desc[
    'sentiment_entities'].apply(lambda x: ' '.join(x))

prefix = 'sentiment'
train_sentiment_gr = train_dfs_sentiment.drop(['sentiment_entities'], axis=1)
for i in train_sentiment_gr.columns:
    if 'PetID' not in i:
        train_sentiment_gr[i] = train_sentiment_gr[i].astype(float)
train_sentiment_gr = train_sentiment_gr.groupby(['PetID']).agg(sent_agg)
train_sentiment_gr.columns = pd.Index([f'{c[0]}' for c in train_sentiment_gr.columns.tolist()])
train_sentiment_gr = train_sentiment_gr.reset_index()


# Test
test_metadata_desc = test_dfs_metadata.groupby(['PetID'])['metadata_annots_top_desc'].unique()
test_metadata_desc = test_metadata_desc.reset_index()
test_metadata_desc[
    'metadata_annots_top_desc'] = test_metadata_desc[
    'metadata_annots_top_desc'].apply(lambda x: ' '.join(x))

prefix = 'metadata'
test_metadata_gr = test_dfs_metadata.drop(['metadata_annots_top_desc'], axis=1)
for i in test_metadata_gr.columns:
    if 'PetID' not in i:
        test_metadata_gr[i] = test_metadata_gr[i].astype(float)
test_metadata_gr = test_metadata_gr.groupby(['PetID']).agg(aggregates)
test_metadata_gr.columns = pd.Index([f'{c[0]}_{c[1].upper()}' for c in test_metadata_gr.columns.tolist()])
test_metadata_gr = test_metadata_gr.reset_index()


test_sentiment_desc = test_dfs_sentiment.groupby(['PetID'])['sentiment_entities'].unique()
test_sentiment_desc = test_sentiment_desc.reset_index()
test_sentiment_desc[
    'sentiment_entities'] = test_sentiment_desc[
    'sentiment_entities'].apply(lambda x: ' '.join(x))

prefix = 'sentiment'
test_sentiment_gr = test_dfs_sentiment.drop(['sentiment_entities'], axis=1)
for i in test_sentiment_gr.columns:
    if 'PetID' not in i:
        test_sentiment_gr[i] = test_sentiment_gr[i].astype(float)
test_sentiment_gr = test_sentiment_gr.groupby(['PetID']).agg(sent_agg)
test_sentiment_gr.columns = pd.Index([f'{c[0]}' for c in test_sentiment_gr.columns.tolist()])
test_sentiment_gr = test_sentiment_gr.reset_index()

### merge processed DFs with base train/test DF:

In [24]:
# Train merges:
train_proc = train.copy()
train_proc = train_proc.merge(
    train_sentiment_gr, how='left', on='PetID')
train_proc = train_proc.merge(
    train_metadata_gr, how='left', on='PetID')
train_proc = train_proc.merge(
    train_metadata_desc, how='left', on='PetID')
train_proc = train_proc.merge(
    train_sentiment_desc, how='left', on='PetID')

# Test merges:
test_proc = test.copy()
test_proc = test_proc.merge(
    test_sentiment_gr, how='left', on='PetID')
test_proc = test_proc.merge(
    test_metadata_gr, how='left', on='PetID')
test_proc = test_proc.merge(
    test_metadata_desc, how='left', on='PetID')
test_proc = test_proc.merge(
    test_sentiment_desc, how='left', on='PetID')

print(train_proc.shape, test_proc.shape)
assert train_proc.shape[0] == train.shape[0]
assert test_proc.shape[0] == test.shape[0]

(14993, 48) (3948, 47)


In [25]:
train_breed_main = train_proc[['Breed1']].merge(
    labels_breed, how='left',
    left_on='Breed1', right_on='BreedID',
    suffixes=('', '_main_breed'))

train_breed_main = train_breed_main.iloc[:, 2:]
train_breed_main = train_breed_main.add_prefix('main_breed_')

train_breed_second = train_proc[['Breed2']].merge(
    labels_breed, how='left',
    left_on='Breed2', right_on='BreedID',
    suffixes=('', '_second_breed'))

train_breed_second = train_breed_second.iloc[:, 2:]
train_breed_second = train_breed_second.add_prefix('second_breed_')


train_proc = pd.concat(
    [train_proc, train_breed_main, train_breed_second], axis=1)


test_breed_main = test_proc[['Breed1']].merge(
    labels_breed, how='left',
    left_on='Breed1', right_on='BreedID',
    suffixes=('', '_main_breed'))

test_breed_main = test_breed_main.iloc[:, 2:]
test_breed_main = test_breed_main.add_prefix('main_breed_')

test_breed_second = test_proc[['Breed2']].merge(
    labels_breed, how='left',
    left_on='Breed2', right_on='BreedID',
    suffixes=('', '_second_breed'))

test_breed_second = test_breed_second.iloc[:, 2:]
test_breed_second = test_breed_second.add_prefix('second_breed_')


test_proc = pd.concat(
    [test_proc, test_breed_main, test_breed_second], axis=1)

print(train_proc.shape, test_proc.shape)

(14993, 52) (3948, 51)


In [26]:
X = pd.concat([train_proc, test_proc], ignore_index=True, sort=False)

In [27]:
X_temp = X.copy()

text_columns = ['Description', 'metadata_annots_top_desc', 'sentiment_entities']
categorical_columns = ['main_breed_BreedName', 'second_breed_BreedName']

to_drop_columns = ['PetID', 'Name', 'RescuerID']

In [28]:
rescuer_count = X.groupby(['RescuerID'])['PetID'].count().reset_index()
rescuer_count.columns = ['RescuerID', 'RescuerID_COUNT']

X_temp = X_temp.merge(rescuer_count, how='left', on='RescuerID')

In [29]:
for i in categorical_columns:
    X_temp.loc[:, i] = pd.factorize(X_temp.loc[:, i])[0]

In [30]:
X_text = X_temp[text_columns]

for i in X_text.columns:
    X_text.loc[:, i] = X_text.loc[:, i].fillna('none')

In [31]:
X_temp['Length_Description'] = X_text['Description'].map(len)
X_temp['Length_metadata_annots_top_desc'] = X_text['metadata_annots_top_desc'].map(len)
X_temp['Lengths_sentiment_entities'] = X_text['sentiment_entities'].map(len)

### TFIDF

In [32]:
n_components = 16
text_features = []

# Generate text features:
for i in X_text.columns:
    
    # Initialize decomposition methods:
    print(f'generating features from: {i}')
    tfv = TfidfVectorizer(min_df=2,  max_features=None,
                          strip_accents='unicode', analyzer='word', token_pattern=r'(?u)\b\w+\b',
                          ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1)
    svd_ = TruncatedSVD(
        n_components=n_components, random_state=1337)
    
    tfidf_col = tfv.fit_transform(X_text.loc[:, i].values)
    
    svd_col = svd_.fit_transform(tfidf_col)
    svd_col = pd.DataFrame(svd_col)
    svd_col = svd_col.add_prefix('TFIDF_{}_'.format(i))
    
    text_features.append(svd_col)
    
text_features = pd.concat(text_features, axis=1)

X_temp = pd.concat([X_temp, text_features], axis=1)

for i in X_text.columns:
    X_temp = X_temp.drop(i, axis=1)

generating features from: Description
generating features from: metadata_annots_top_desc
generating features from: sentiment_entities


### Merge image features

In [33]:
X_temp = X_temp.merge(img_features, how='left', on='PetID')

### Add image_size features

In [34]:
from PIL import Image
train_df_ids = train[['PetID']]
test_df_ids = test[['PetID']]

train_df_imgs = pd.DataFrame(train_image_files)
train_df_imgs.columns = ['image_filename']
train_imgs_pets = train_df_imgs['image_filename'].apply(lambda x: x.split(split_char)[-1].split('-')[0])

test_df_imgs = pd.DataFrame(test_image_files)
test_df_imgs.columns = ['image_filename']
test_imgs_pets = test_df_imgs['image_filename'].apply(lambda x: x.split(split_char)[-1].split('-')[0])

train_df_imgs = train_df_imgs.assign(PetID=train_imgs_pets)
test_df_imgs = test_df_imgs.assign(PetID=test_imgs_pets)

def getSize(filename):
    st = os.stat(filename)
    return st.st_size

def getDimensions(filename):
    img_size = Image.open(filename).size
    return img_size 

train_df_imgs['image_size'] = train_df_imgs['image_filename'].apply(getSize)
train_df_imgs['temp_size'] = train_df_imgs['image_filename'].apply(getDimensions)
train_df_imgs['width'] = train_df_imgs['temp_size'].apply(lambda x : x[0])
train_df_imgs['height'] = train_df_imgs['temp_size'].apply(lambda x : x[1])
train_df_imgs = train_df_imgs.drop(['temp_size'], axis=1)

test_df_imgs['image_size'] = test_df_imgs['image_filename'].apply(getSize)
test_df_imgs['temp_size'] = test_df_imgs['image_filename'].apply(getDimensions)
test_df_imgs['width'] = test_df_imgs['temp_size'].apply(lambda x : x[0])
test_df_imgs['height'] = test_df_imgs['temp_size'].apply(lambda x : x[1])
test_df_imgs = test_df_imgs.drop(['temp_size'], axis=1)

aggs = {
    'image_size': ['sum', 'mean', 'var'],
    'width': ['sum', 'mean', 'var'],
    'height': ['sum', 'mean', 'var'],
}

agg_train_imgs = train_df_imgs.groupby('PetID').agg(aggs)
new_columns = [
    k + '_' + agg for k in aggs.keys() for agg in aggs[k]
]
agg_train_imgs.columns = new_columns
agg_train_imgs = agg_train_imgs.reset_index()

agg_test_imgs = test_df_imgs.groupby('PetID').agg(aggs)
new_columns = [
    k + '_' + agg for k in aggs.keys() for agg in aggs[k]
]
agg_test_imgs.columns = new_columns
agg_test_imgs = agg_test_imgs.reset_index()

agg_imgs = pd.concat([agg_train_imgs, agg_test_imgs], axis=0).reset_index(drop=True)

In [35]:
X_temp = X_temp.merge(agg_imgs, how='left', on='PetID')

### Drop ID, name and rescuerID

In [36]:
X_temp = X_temp.drop(to_drop_columns, axis=1)

### External Data
- state_id_count / external state_population

In [37]:
state = pd.read_csv("../input/petfinder-external-data/petfinder_external_state.csv")
state = state[['StateID','Population']]
state.columns = ['State','Population']

In [38]:
x_count = X_temp[['State','Type']]
x_count = x_count.groupby(['State','Type'])['State'].agg({'count'}).reset_index()
x_count = x_count.merge(state,on='State',how='left')
x_count['demand_rate'] = 100*x_count['count']/x_count['Population']
x_count = x_count[['State','Type','demand_rate']]

In [39]:
X_temp = pd.merge(X_temp,x_count,how='left',on=['State','Type'])
X_train = X_temp.loc[np.isfinite(X_temp.AdoptionSpeed), :]
X_test = X_temp.loc[~np.isfinite(X_temp.AdoptionSpeed), :]

X_test = X_test.drop(['AdoptionSpeed'], axis=1)

assert X_train.shape[0] == train.shape[0]
assert X_test.shape[0] == test.shape[0]

train_cols = X_train.columns.tolist()
train_cols.remove('AdoptionSpeed')

test_cols = X_test.columns.tolist()

assert np.all(train_cols == test_cols)

In [40]:
X_train_non_null = X_train.fillna(-1)
X_test_non_null = X_test.fillna(-1)

In [41]:
X_train_dog = X_train[X_train['Type'] == 1].drop('Type', axis=1)
X_train_cat = X_train[X_train['Type'] == 2].drop('Type', axis=1)
X_test_dog = X_test[X_test['Type'] == 1].drop('Type', axis=1)
X_test_cat = X_test[X_test['Type'] == 2].drop('Type', axis=1)

In [42]:
X_train_non_null.isnull().any().any(), X_test_non_null.isnull().any().any()

(False, False)

In [43]:
X_train_non_null.shape, X_test_non_null.shape

((14993, 132), (3948, 131))

In [44]:
import scipy as sp

from collections import Counter
from functools import partial
from math import sqrt

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix


# FROM: https://www.kaggle.com/myltykritik/simple-lgbm-image-features

# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa
    axquadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.
    quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.
    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.
    quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    rater_a = y
    rater_b = y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

### OptimizeRounder from [OptimizedRounder() - Improved](https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved)

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0
    
    def _kappa_loss(self, coef, X, y):
        preds = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4])
        return -cohen_kappa_score(y, preds, weights='quadratic')
    
    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X = X, y = y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')
    
    def predict(self, X, coef):
        preds = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4])
        return preds
    
    def coefficients(self):
        return self.coef_['x']

In [45]:
# put numerical value to one of bins
def to_bins(x, borders):
    for i in range(len(borders)):
        if x <= borders[i]:
            return i
    return len(borders)

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _loss(self, coef, X, y, idx):
        X_p = np.array([to_bins(pred, coef) for pred in X])
        ll = -quadratic_weighted_kappa(y, X_p)
        return ll

    def fit(self, X, y):
        coef = [1.5, 2.0, 2.5, 3.0]
        golden1 = 0.618
        golden2 = 1 - golden1
        ab_start = [(1, 2), (1.5, 2.5), (2, 3), (2.5, 3.5)]
        for it1 in range(10):
            for idx in range(4):
                # golden section search
                a, b = ab_start[idx]
                # calc losses
                coef[idx] = a
                la = self._loss(coef, X, y, idx)
                coef[idx] = b
                lb = self._loss(coef, X, y, idx)
                for it in range(20):
                    # choose value
                    if la > lb:
                        a = b - (b - a) * golden1
                        coef[idx] = a
                        la = self._loss(coef, X, y, idx)
                    else:
                        b = b - (b - a) * golden2
                        coef[idx] = b
                        lb = self._loss(coef, X, y, idx)
        self.coef_ = {'x': coef}

    def predict(self, X, coef):
        X_p = np.array([to_bins(pred, coef) for pred in X])
        return X_p

    def coefficients(self):
        return self.coef_['x']

## Train model

In [46]:
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import StratifiedKFold

xgb_params = {
    'eval_metric': 'rmse',
    'seed': 1337,
    'eta': 0.0123,
    'subsample': 0.8,
    'colsample_bytree': 0.85,
    'tree_method': 'gpu_hist',
    'device': 'gpu',
    'silent': 1,
}

lgb_params = {'application': 'regression',
              'boosting': 'gbdt',
              'metric': 'rmse',
              'num_leaves': 70,
#               'max_depth': 9,
              'learning_rate': 0.0123,
              'bagging_fraction': 0.8,
              'feature_fraction': 0.85,
    #           'min_split_gain': 0.02,
    #           'min_child_samples': 150,
    #           'min_child_weight': 0.02,
              'lambda_l2': 0.0475,
              'verbosity': -1,
              'data_random_seed': 1337}

cat_params = {
          'depth': 8,
          'eta': 0.03,
          'task_type' :"GPU",
          'random_strength': 1.5,
          'loss_function': 'RMSE',
#           'one_hot_max_size': 2,
          'reg_lambda': 6,
          'od_type': 'Iter',
#           'fold_len_multiplier': 2,
          'border_count': 128,
#           # 'od_type': 'IncToDec',
#           # 'od_pval': 10e-5, 
          'bootstrap_type' : "Bayesian",
#           'bagging_temperature': 1,
          'random_seed': 1337,
          'early_stopping_rounds': 100, 
          'num_boost_round': 2500}

In [47]:
def run_lgb(params, X_train, X_test):
    n_splits = 10
    verbose_eval = 1000
    num_rounds = 60000
    early_stop = 500

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1337)

    oof_train = np.zeros((X_train.shape[0]))
    oof_test = np.zeros((X_test.shape[0]))

    for train_idx, valid_idx in kf.split(X_train, X_train['AdoptionSpeed'].values):

        X_tr = X_train.iloc[train_idx, :]
        X_val = X_train.iloc[valid_idx, :]

        y_tr = X_tr['AdoptionSpeed'].values
        X_tr = X_tr.drop(['AdoptionSpeed'], axis=1)

        y_val = X_val['AdoptionSpeed'].values
        X_val = X_val.drop(['AdoptionSpeed'], axis=1)

        d_train = lgb.Dataset(X_tr, label=y_tr)
        d_valid = lgb.Dataset(X_val, label=y_val)
        watchlist = [d_train, d_valid]

        model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval,
                      early_stopping_rounds=early_stop)
    
        valid_pred = model.predict(X_val, num_iteration=model.best_iteration)
        test_pred = model.predict(X_test, num_iteration=model.best_iteration)

        oof_train[valid_idx] = valid_pred
        oof_test += test_pred / n_splits

    return model, oof_train, oof_test

def run_xgb(params, X_train, X_test):
    n_splits = 10
    verbose_eval = 1000
    num_rounds = 60000
    early_stop = 500

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1337)

    oof_train = np.zeros((X_train.shape[0]))
    oof_test = np.zeros((X_test.shape[0]))

    for train_idx, valid_idx in kf.split(X_train, X_train['AdoptionSpeed'].values):

        X_tr = X_train.iloc[train_idx, :]
        X_val = X_train.iloc[valid_idx, :]

        y_tr = X_tr['AdoptionSpeed'].values
        X_tr = X_tr.drop(['AdoptionSpeed'], axis=1)

        y_val = X_val['AdoptionSpeed'].values
        X_val = X_val.drop(['AdoptionSpeed'], axis=1)

        d_train = xgb.DMatrix(data=X_tr, label=y_tr, feature_names=X_tr.columns)
        d_valid = xgb.DMatrix(data=X_val, label=y_val, feature_names=X_val.columns)

        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        model = xgb.train(dtrain=d_train, num_boost_round=num_rounds, evals=watchlist,
                         early_stopping_rounds=early_stop, verbose_eval=verbose_eval, params=params)

        valid_pred = model.predict(xgb.DMatrix(X_val, feature_names=X_val.columns), ntree_limit=model.best_ntree_limit)
        test_pred = model.predict(xgb.DMatrix(X_test, feature_names=X_test.columns), ntree_limit=model.best_ntree_limit)

        oof_train[valid_idx] = valid_pred
        oof_test += test_pred / n_splits

    return model, oof_train, oof_test

def run_cat(params, X_train, X_test):
    n_splits = 5
    verbose_eval = 1000
    num_rounds = 2500
    early_stop = 500

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1337)

    oof_train = np.zeros((X_train.shape[0]))
    oof_test = np.zeros((X_test.shape[0]))

    for train_idx, valid_idx in kf.split(X_train, X_train['AdoptionSpeed'].values):

        X_tr = X_train.iloc[train_idx, :]
        X_val = X_train.iloc[valid_idx, :]

        y_tr = X_tr['AdoptionSpeed'].values
        X_tr = X_tr.drop(['AdoptionSpeed'], axis=1)

        y_val = X_val['AdoptionSpeed'].values
        X_val = X_val.drop(['AdoptionSpeed'], axis=1)

        watchlist = (X_tr, y_tr)
        model = CatBoostRegressor(**params)
        model.fit(X_tr, y_tr, eval_set=watchlist, verbose=verbose_eval, use_best_model=True)
    
        valid_pred = model.predict(X_val)
        test_pred = model.predict(X_test)

        oof_train[valid_idx] = valid_pred
        oof_test += test_pred / n_splits

    return model, oof_train, oof_test

In [48]:
model, oof_train_0, oof_test_0 = run_lgb(lgb_params, X_train_non_null, X_test_non_null)
model, oof_train_1, oof_test_1 = run_xgb(xgb_params, X_train_non_null, X_test_non_null)
model, oof_train_2, oof_test_2 = run_cat(cat_params, X_train_non_null, X_test_non_null)

Training until validation scores don't improve for 500 rounds.
[1000]	training's rmse: 0.536446	valid_1's rmse: 1.01185
[2000]	training's rmse: 0.315899	valid_1's rmse: 1.00969
Early stopping, best iteration is:
[2065]	training's rmse: 0.305203	valid_1's rmse: 1.00933
Training until validation scores don't improve for 500 rounds.
[1000]	training's rmse: 0.540136	valid_1's rmse: 1.03712
[2000]	training's rmse: 0.317503	valid_1's rmse: 1.03359
Early stopping, best iteration is:
[2484]	training's rmse: 0.248068	valid_1's rmse: 1.03306
Training until validation scores don't improve for 500 rounds.
[1000]	training's rmse: 0.54191	valid_1's rmse: 1.008
[2000]	training's rmse: 0.316772	valid_1's rmse: 1.0036
[3000]	training's rmse: 0.189501	valid_1's rmse: 1.00205
[4000]	training's rmse: 0.116645	valid_1's rmse: 1.00144
Early stopping, best iteration is:
[4030]	training's rmse: 0.114889	valid_1's rmse: 1.00137
Training until validation scores don't improve for 500 rounds.
[1000]	training's rm



0:	learn: 2.7103782	test: 2.7103782	best: 2.7103782 (0)	total: 38.5ms	remaining: 1m 36s
1000:	learn: 0.8705682	test: 0.8705682	best: 0.8705682 (1000)	total: 38.4s	remaining: 57.6s
2000:	learn: 0.7648366	test: 0.7648366	best: 0.7648366 (2000)	total: 1m 17s	remaining: 19.4s
2499:	learn: 0.7247285	test: 0.7247285	best: 0.7247285 (2499)	total: 1m 37s	remaining: 0us
bestTest = 0.7247285242
bestIteration = 2499




0:	learn: 2.7106475	test: 2.7106475	best: 2.7106475 (0)	total: 35.6ms	remaining: 1m 29s
1000:	learn: 0.8796152	test: 0.8796152	best: 0.8796152 (1000)	total: 37.9s	remaining: 56.8s
2000:	learn: 0.7776210	test: 0.7776210	best: 0.7776210 (2000)	total: 1m 16s	remaining: 19.2s
2499:	learn: 0.7411707	test: 0.7411707	best: 0.7411707 (2499)	total: 1m 35s	remaining: 0us
bestTest = 0.7411707469
bestIteration = 2499




0:	learn: 2.7107206	test: 2.7107206	best: 2.7107206 (0)	total: 42.6ms	remaining: 1m 46s
1000:	learn: 0.8671816	test: 0.8671816	best: 0.8671816 (1000)	total: 38.3s	remaining: 57.4s
2000:	learn: 0.7568291	test: 0.7568291	best: 0.7568291 (2000)	total: 1m 18s	remaining: 19.6s
2499:	learn: 0.7159199	test: 0.7159199	best: 0.7159199 (2499)	total: 1m 38s	remaining: 0us
bestTest = 0.7159198532
bestIteration = 2499




0:	learn: 2.7108621	test: 2.7108621	best: 2.7108621 (0)	total: 42.7ms	remaining: 1m 46s
1000:	learn: 0.8819077	test: 0.8819078	best: 0.8819078 (1000)	total: 38.4s	remaining: 57.5s
2000:	learn: 0.7694418	test: 0.7694418	best: 0.7694418 (2000)	total: 1m 17s	remaining: 19.3s
2499:	learn: 0.7301338	test: 0.7301338	best: 0.7301338 (2499)	total: 1m 37s	remaining: 0us
bestTest = 0.730133833
bestIteration = 2499




0:	learn: 2.7105460	test: 2.7105461	best: 2.7105461 (0)	total: 43.6ms	remaining: 1m 49s
1000:	learn: 0.8784660	test: 0.8784660	best: 0.8784660 (1000)	total: 38.3s	remaining: 57.4s
2000:	learn: 0.7615072	test: 0.7615072	best: 0.7615072 (2000)	total: 1m 17s	remaining: 19.3s
2499:	learn: 0.7209462	test: 0.7209462	best: 0.7209462 (2499)	total: 1m 37s	remaining: 0us
bestTest = 0.7209462285
bestIteration = 2499


In [49]:
from sklearn.linear_model import LinearRegression

oof_train_0 = oof_train_0[:, np.newaxis]
oof_train_1 = oof_train_1[:, np.newaxis]
oof_train_2 = oof_train_2[:, np.newaxis]
oof_train0 = np.hstack((oof_train_0, oof_train_1, oof_train_2))
y_train0 = X_train_non_null['AdoptionSpeed'].values

oof_test_0 = oof_test_0[:, np.newaxis]
oof_test_1 = oof_test_1[:, np.newaxis]
oof_test_2 = oof_test_2[:, np.newaxis]
oof_test0 = np.hstack((oof_test_0, oof_test_1, oof_test_2))

linear_model = LinearRegression()
linear_model.fit(oof_train0, y_train0)
oof_train0 = linear_model.predict(oof_train0)
oof_test0 = linear_model.predict(oof_test0)

linear_model.coef_

array([0.57317479, 0.30507952, 0.17081552])

In [50]:
train_data0 = pd.DataFrame({'PetID': train['PetID'].values, 'predict0': oof_train0})
test_data0 = pd.DataFrame({'PetID': test['PetID'].values, 'predict0': oof_test0})

In [51]:
model_dog, oof_train_dog_0, oof_test_dog_0 = run_lgb(lgb_params, X_train_dog, X_test_dog)
model_cat, oof_train_cat_0, oof_test_cat_0 = run_lgb(lgb_params, X_train_cat, X_test_cat)

model_dog, oof_train_dog_1, oof_test_dog_1 = run_xgb(xgb_params, X_train_dog, X_test_dog)
model_cat, oof_train_cat_1, oof_test_cat_1 = run_xgb(xgb_params, X_train_cat, X_test_cat)

model_dog, oof_train_dog_2, oof_test_dog_2 = run_cat(cat_params, X_train_dog, X_test_dog)
model_cat, oof_train_cat_2, oof_test_cat_2 = run_cat(cat_params, X_train_cat, X_test_cat)

Training until validation scores don't improve for 500 rounds.
[1000]	training's rmse: 0.323125	valid_1's rmse: 0.960546
Early stopping, best iteration is:
[746]	training's rmse: 0.412353	valid_1's rmse: 0.95923
Training until validation scores don't improve for 500 rounds.
[1000]	training's rmse: 0.319976	valid_1's rmse: 0.981149
[2000]	training's rmse: 0.128238	valid_1's rmse: 0.979968
Early stopping, best iteration is:
[1819]	training's rmse: 0.150628	valid_1's rmse: 0.979566
Training until validation scores don't improve for 500 rounds.
[1000]	training's rmse: 0.320838	valid_1's rmse: 1.01254
Early stopping, best iteration is:
[1061]	training's rmse: 0.302953	valid_1's rmse: 1.01223
Training until validation scores don't improve for 500 rounds.
[1000]	training's rmse: 0.320991	valid_1's rmse: 0.98605
Early stopping, best iteration is:
[1216]	training's rmse: 0.261539	valid_1's rmse: 0.985428
Training until validation scores don't improve for 500 rounds.
[1000]	training's rmse: 0.32



0:	learn: 2.7843075	test: 2.7843074	best: 2.7843074 (0)	total: 29.4ms	remaining: 1m 13s
1000:	learn: 0.7782778	test: 0.7782778	best: 0.7782778 (1000)	total: 31.9s	remaining: 47.8s
2000:	learn: 0.6555355	test: 0.6555355	best: 0.6555355 (2000)	total: 1m 3s	remaining: 15.8s
2499:	learn: 0.6092931	test: 0.6092931	best: 0.6092931 (2499)	total: 1m 19s	remaining: 0us
bestTest = 0.609293111
bestIteration = 2499




0:	learn: 2.7839692	test: 2.7839692	best: 2.7839692 (0)	total: 35ms	remaining: 1m 27s
1000:	learn: 0.7738705	test: 0.7738705	best: 0.7738705 (1000)	total: 32.7s	remaining: 48.9s
2000:	learn: 0.6305990	test: 0.6305989	best: 0.6305989 (2000)	total: 1m 6s	remaining: 16.6s
2499:	learn: 0.5820222	test: 0.5820222	best: 0.5820222 (2499)	total: 1m 23s	remaining: 0us
bestTest = 0.5820221558
bestIteration = 2499




0:	learn: 2.7838195	test: 2.7838195	best: 2.7838195 (0)	total: 29.4ms	remaining: 1m 13s
1000:	learn: 0.7606771	test: 0.7606771	best: 0.7606771 (1000)	total: 32.6s	remaining: 48.8s
2000:	learn: 0.6169326	test: 0.6169325	best: 0.6169325 (2000)	total: 1m 6s	remaining: 16.5s
2499:	learn: 0.5650276	test: 0.5650276	best: 0.5650276 (2499)	total: 1m 23s	remaining: 0us
bestTest = 0.5650275623
bestIteration = 2499




0:	learn: 2.7845262	test: 2.7845261	best: 2.7845261 (0)	total: 30.5ms	remaining: 1m 16s
1000:	learn: 0.7535366	test: 0.7535365	best: 0.7535365 (1000)	total: 33.4s	remaining: 50s
2000:	learn: 0.6118363	test: 0.6118363	best: 0.6118363 (2000)	total: 1m 8s	remaining: 17s
2499:	learn: 0.5648572	test: 0.5648571	best: 0.5648571 (2499)	total: 1m 25s	remaining: 0us
bestTest = 0.5648571265
bestIteration = 2499




0:	learn: 2.7839772	test: 2.7839772	best: 2.7839772 (0)	total: 30ms	remaining: 1m 14s
1000:	learn: 0.7599685	test: 0.7599684	best: 0.7599684 (1000)	total: 33.1s	remaining: 49.5s
2000:	learn: 0.6145549	test: 0.6145549	best: 0.6145549 (2000)	total: 1m 6s	remaining: 16.7s
2499:	learn: 0.5693798	test: 0.5693798	best: 0.5693798 (2499)	total: 1m 23s	remaining: 0us
bestTest = 0.5693798298
bestIteration = 2499




0:	learn: 2.6225053	test: 2.6225051	best: 2.6225051 (0)	total: 34.8ms	remaining: 1m 27s
1000:	learn: 0.8076914	test: 0.8076914	best: 0.8076914 (1000)	total: 32s	remaining: 47.9s
2000:	learn: 0.6367397	test: 0.6367397	best: 0.6367397 (2000)	total: 1m 5s	remaining: 16.3s
2499:	learn: 0.5821296	test: 0.5821296	best: 0.5821296 (2499)	total: 1m 21s	remaining: 0us
bestTest = 0.5821295916
bestIteration = 2499




0:	learn: 2.6220846	test: 2.6220846	best: 2.6220846 (0)	total: 34.9ms	remaining: 1m 27s
1000:	learn: 0.7953235	test: 0.7953235	best: 0.7953235 (1000)	total: 32.1s	remaining: 48s
2000:	learn: 0.6310623	test: 0.6310623	best: 0.6310623 (2000)	total: 1m 4s	remaining: 16.2s
2499:	learn: 0.5788051	test: 0.5788050	best: 0.5788050 (2499)	total: 1m 21s	remaining: 0us
bestTest = 0.5788050427
bestIteration = 2499




0:	learn: 2.6222519	test: 2.6222519	best: 2.6222519 (0)	total: 28.5ms	remaining: 1m 11s
1000:	learn: 0.8083656	test: 0.8083656	best: 0.8083656 (1000)	total: 31.7s	remaining: 47.5s
2000:	learn: 0.6511447	test: 0.6511447	best: 0.6511447 (2000)	total: 1m 4s	remaining: 16.2s
2499:	learn: 0.5969341	test: 0.5969341	best: 0.5969341 (2499)	total: 1m 21s	remaining: 0us
bestTest = 0.5969340964
bestIteration = 2499




0:	learn: 2.6219014	test: 2.6219014	best: 2.6219014 (0)	total: 29.5ms	remaining: 1m 13s
1000:	learn: 0.8328116	test: 0.8328116	best: 0.8328116 (1000)	total: 32.2s	remaining: 48.2s
2000:	learn: 0.6793663	test: 0.6793663	best: 0.6793663 (2000)	total: 1m 5s	remaining: 16.4s
2499:	learn: 0.6256183	test: 0.6256183	best: 0.6256183 (2499)	total: 1m 22s	remaining: 0us
bestTest = 0.6256182905
bestIteration = 2499




0:	learn: 2.6217075	test: 2.6217075	best: 2.6217075 (0)	total: 34.1ms	remaining: 1m 25s
1000:	learn: 0.8152969	test: 0.8152969	best: 0.8152969 (1000)	total: 32.8s	remaining: 49.2s
2000:	learn: 0.6621598	test: 0.6621598	best: 0.6621598 (2000)	total: 1m 6s	remaining: 16.5s
2499:	learn: 0.6134679	test: 0.6134679	best: 0.6134679 (2499)	total: 1m 22s	remaining: 0us
bestTest = 0.6134678622
bestIteration = 2499


In [52]:
oof_train_dog_0 = pd.DataFrame(oof_train_dog_0)
oof_train_dog_1 = pd.DataFrame(oof_train_dog_1)
oof_train_dog_2 = pd.DataFrame(oof_train_dog_2)
oof_train_cat_0 = pd.DataFrame(oof_train_cat_0)
oof_train_cat_1 = pd.DataFrame(oof_train_cat_1)
oof_train_cat_2 = pd.DataFrame(oof_train_cat_2)

oof_test_dog_0 = pd.DataFrame(oof_test_dog_0)
oof_test_dog_1 = pd.DataFrame(oof_test_dog_1)
oof_test_dog_2 = pd.DataFrame(oof_test_dog_2)
oof_test_cat_0 = pd.DataFrame(oof_test_cat_0)
oof_test_cat_1 = pd.DataFrame(oof_test_cat_1)
oof_test_cat_2 = pd.DataFrame(oof_test_cat_2)

oof_train_0 = pd.concat([oof_train_dog_0,oof_train_cat_0])
oof_train_1 = pd.concat([oof_train_dog_1,oof_train_cat_1])
oof_train_2 = pd.concat([oof_train_dog_2,oof_train_cat_2])

oof_test_0 = pd.concat([oof_test_dog_0,oof_test_cat_0])
oof_test_1 = pd.concat([oof_test_dog_1,oof_test_cat_1])
oof_test_2 = pd.concat([oof_test_dog_2,oof_test_cat_2])

In [53]:
from sklearn.linear_model import LinearRegression

oof_train1 = np.hstack((oof_train_0, oof_train_1, oof_train_2))

y_train_0 = X_train_dog['AdoptionSpeed']
y_train_1 = X_train_cat['AdoptionSpeed']
y_train1 = pd.concat([X_train_dog,X_train_cat])
y_train1 = y_train1['AdoptionSpeed'].values

oof_test1 = np.hstack((oof_test_0, oof_test_1, oof_test_2))

linear_model = LinearRegression()
linear_model.fit(oof_train1, y_train1)
oof_train1 = linear_model.predict(oof_train1)
oof_test1 = linear_model.predict(oof_test1)

linear_model.coef_

array([0.41475546, 0.38147882, 0.2577447 ])

In [54]:
#oof_test_dog,cat2 is default values. it is changed in next row.
sub_dog = pd.DataFrame({'PetID': test[test['Type'] == 1]['PetID'].values})
sub_cat = pd.DataFrame({'PetID': test[test['Type'] == 2]['PetID'].values})

test_data1 = pd.concat([sub_dog, sub_cat], ignore_index=True, sort=False)
test_data1['predict1'] = oof_test1

sub_dog = pd.DataFrame({'PetID': train[train['Type'] == 1]['PetID'].values})
sub_cat = pd.DataFrame({'PetID': train[train['Type'] == 2]['PetID'].values})

train_data1 = pd.concat([sub_dog, sub_cat], ignore_index=True, sort=False)
train_data1['predict1'] = oof_train1

In [55]:
train_data2 = pd.merge(train_data0,train_data1,how='left',on='PetID')
test_data2 = pd.merge(test_data0,test_data1,how='left',on='PetID')

In [56]:
best_qwk2 = 0
best_i = 0
for i in [1,0.95,0.90,0.85,0.80,0.75,0.70,0.65,0.60,0.55,0.50,0.45,0.40]:
    optR = OptimizedRounder()
    oof_train2 = i*train_data2['predict0'].values + (1-i)*train_data2['predict1'].values
    oof_test2 = i*test_data2['predict0'].values + (1-i)*test_data2['predict1'].values

    optR.fit(oof_train2, y_train0)
    coefficients = optR.coefficients()
    valid_pred = optR.predict(oof_train2, coefficients)
    qwk2 = quadratic_weighted_kappa(y_train0, valid_pred)
    test_predictions = optR.predict(oof_test2, coefficients).astype(np.int8)
    print("i = ", i)
    print("QWK = ", qwk2)
    if best_qwk2 <= qwk2:
        best_qwk2 = qwk2
        best_i = i

i =  1
QWK =  0.4922427710588714
i =  0.95
QWK =  0.49253041266161224
i =  0.9
QWK =  0.4935889709882164
i =  0.85
QWK =  0.4935306583646355
i =  0.8
QWK =  0.4939128532442213
i =  0.75
QWK =  0.49516725159362807
i =  0.7
QWK =  0.4959755368190373
i =  0.65
QWK =  0.4963340740194796
i =  0.6
QWK =  0.49592925633723495
i =  0.55
QWK =  0.4966364565770379
i =  0.5
QWK =  0.49677761470920767
i =  0.45
QWK =  0.4957228197647565
i =  0.4
QWK =  0.49557339487077967


In [57]:
optR = OptimizedRounder()
oof_train2 = best_i*train_data2['predict0'].values + (1-best_i)*train_data2['predict1'].values
oof_test2 = best_i*test_data2['predict0'].values + (1-best_i)*test_data2['predict1'].values

optR.fit(oof_train2, y_train0)
coefficients = optR.coefficients()
valid_pred = optR.predict(oof_train2, coefficients)
qwk2 = quadratic_weighted_kappa(y_train0, valid_pred)
test_predictions = optR.predict(oof_test2, coefficients).astype(np.int8)

In [58]:
submission = pd.DataFrame({'PetID': test['PetID'].values, 'AdoptionSpeed': test_predictions})
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,PetID,AdoptionSpeed
0,378fcc4fc,2
1,73c10e136,4
2,72000c4c5,3
3,e147a4b9f,3
4,43fbba852,3
