In [1]:
import gc
import os
import json
import re
import glob
from joblib import Parallel, delayed

import scipy as sp
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

# pd.set_option('max_colwidth', 500)
pd.set_option('max_columns', 500)
pd.set_option('max_rows', 500)

from functools import partial
from math import sqrt

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix
from sklearn.model_selection import StratifiedKFold, train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MultiLabelBinarizer

from collections import Counter

import xgboost as xgb
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.ensemble import ExtraTreesRegressor
# from catboost import CatBoostRegressor
import lightgbm as lgb

np.random.seed(1029)

from tqdm import tqdm, tqdm_notebook

import cv2
from keras.applications.densenet import preprocess_input, DenseNet121
from keras.models import Model
from keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D
import keras.backend as K

from scipy import stats

from PIL import Image

Using TensorFlow backend.


# nfolds

In [2]:
N_FOLDS = 4
FOLDS = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# functions

In [3]:
def get_chi2(obs, exp):
    diff = set(exp) - set(obs)
    f_obs = obs.value_counts()
    f_exp = exp.value_counts()
    if diff:
        for i in diff:
            f_obs[i] = 0
    f_obs = f_obs.sort_index()
    f_exp = f_exp.sort_index()
    chi2, _ = stats.chisquare(f_obs.values,f_exp.values)
    return chi2

def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat

def histogram(ratings, min_rating=None, max_rating=None):
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings

def quadratic_weighted_kappa(y_true, y_pred):
    rater_a = y_true
    rater_b = y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

def get_class_bounds(y, y_pred, N=5, class0_fraction=-1):
    ysort = np.sort(y)
    predsort = np.sort(y_pred)
    bounds = []
    for ibound in range(N-1):
        iy = len(ysort[ysort <= ibound])
        if (ibound == 0) and (class0_fraction >= 0.0) :
            iy = int(class0_fraction * iy)
        bounds.append(predsort[iy])
    return bounds

def assign_class(y_pred, boundaries):
    y_classes = np.zeros(len(y_pred))
    for iclass, bound in enumerate(boundaries):
        y_classes[y_pred >= bound] = iclass + 1
    return y_classes.astype(int)

def get_init_coefs(y_test_pred, y_test):
    kappas = []
    coefs = []
    cl0fracs = np.array(np.arange(0.01,30,0.01))
    for cl0frac in cl0fracs:
        coef = get_class_bounds(y_test, y_test_pred, class0_fraction=cl0frac)
        coefs.append(coef)
        y_test_k = assign_class(y_test_pred, coef)
        kappa = cohen_kappa_score(y_test, y_test_k, weights='quadratic')
        kappas.append(kappa)
    ifmax = np.array(kappas).argmax()
    best_frac = cl0fracs[ifmax]
    best_coef = coefs[ifmax]
#     print("Best init coefs: ", best_coef)
#     print("Bset init coefs kappa: ", np.max(kappas))    
    return best_coef

def rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted))

In [4]:
def resize_to_square(im):
    old_size = im.shape[:2]
    ratio = float(img_size)/max(old_size)
    new_size = tuple([int(x*ratio) for x in old_size])
    im = cv2.resize(im, (new_size[1], new_size[0]))
    delta_w = img_size - new_size[1]
    delta_h = img_size - new_size[0]
    top, bottom = delta_h//2, delta_h-(delta_h//2)
    left, right = delta_w//2, delta_w-(delta_w//2)
    color = [0, 0, 0]
    new_im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT,value=color)
    return new_im

def load_image(path, pet_id):
    image = cv2.imread(f'{path}{pet_id}-1.jpg')
    new_image = resize_to_square(image)
    new_image = preprocess_input(new_image)
    return new_image

In [5]:
class OptimizedRounder(object):
    def __init__(self,initial_coefs = None):
        if(initial_coefs == None):
            self.initial_coefs = [1.775, 2.1057, 2.4438, 2.7892]
        else:
            self.initial_coefs = initial_coefs.copy()
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4
        ll = quadratic_weighted_kappa(y, X_p)
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        self.coef_ = sp.optimize.minimize(loss_partial, self.initial_coefs, method='nelder-mead')

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4
        return X_p

    def coefficients(self):
        return self.coef_['x']

class OptimizedRounder_v2(object):
    def __init__(self, initial_coefs = None):
        if(initial_coefs == None):
            self.initial_coefs = [1.775, 2.1057, 2.4438, 2.7892]
        else:
            self.initial_coefs = initial_coefs.copy()
        self.coef_ = 0
    
    def _kappa_loss(self, coef, X, y):
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4])
        ll = cohen_kappa_score(y, X_p, weights = 'quadratic')    
        chi2 =  get_chi2(X_p, y)
        ll = ll - chi2 * (1.0 / 25000)
        return -ll
    
    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X = X, y = y)
        self.coef_ = sp.optimize.minimize(loss_partial, self.initial_coefs, method = 'nelder-mead')
    
    def predict(self, X, coef):
        preds = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4])
        return preds
    
    def coefficients(self):
        return self.coef_['x']


class OptimizedRounder_v3(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4

        ll = quadratic_weighted_kappa(y, X_p)
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef, len_0=410):
        X_p = np.copy(X)
        temp = sorted(list(X_p))
        threshold = temp[int(0.9*len_0)-1]
        for i, pred in enumerate(X_p):
            if pred < threshold:
                X_p[i] = 0
            elif pred >= threshold and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4
        return X_p

    def coefficients(self):
        return self.coef_['x']

# 特征: GZF & ZKR

In [6]:
train = pd.read_csv("../input/petfinder-adoption-prediction/train/train.csv")
test = pd.read_csv("../input/petfinder-adoption-prediction/test/test.csv")
breeds = pd.read_csv("../input/petfinder-adoption-prediction/breed_labels.csv")
colors = pd.read_csv("../input/petfinder-adoption-prediction/color_labels.csv")
states = pd.read_csv("../input/petfinder-adoption-prediction/state_labels.csv")

# train = train.sort_values("RescuerID")
# train.index = range(len(train))

In [7]:
origin_train = train[list(train.columns)]
origin_test = test[list(test.columns)]

In [8]:
breedid_map = dict(zip(breeds['BreedID'], breeds['BreedName'].map(lambda x:x.lower())))
color_map = dict(zip(colors['ColorID'], colors['ColorName'].map(lambda x:x)))
state_map = dict(zip(states['StateID'], states['StateName'].map(lambda x:x)))

In [9]:
train_id = train['PetID']
test_id = test['PetID']

## 0 common feature

In [10]:
def sentiment_feature(data, ids, path):
    doc_sent_mag = []
    doc_sent_score = []
    doc_sent_len = []
    doc_sent_mags = []
    doc_sent_scores = []

    doc_entity_len = []
    doc_entity_sali = []

    nf_count = 0

    for pet in ids:
        try:
            with open('../input/petfinder-adoption-prediction/%s/' % path + pet + '.json', 'r') as f:
                sentiment = json.load(f)
            doc_sent_mag.append(sentiment['documentSentiment']['magnitude'])
            doc_sent_score.append(sentiment['documentSentiment']['score'])
            
            doc_sent_len.append(len(sentiment['sentences']))
            if len(sentiment['sentences']) == 0:
                doc_sent_mags.append([-999])
                doc_sent_scores.append([-999])
            else:
                doc_sent_mags.append([sent['sentiment']['magnitude'] for sent in sentiment['sentences']])
                doc_sent_scores.append([sent['sentiment']['score'] for sent in sentiment['sentences']])
            
            doc_entity_len.append(len(sentiment['entities']))
            if len(sentiment['entities']) == 0:
                doc_entity_sali.append([-999])
            else:
                doc_entity_sali.append([entity['salience'] for entity in sentiment['entities']])
        except FileNotFoundError:
            nf_count += 1
            doc_sent_mag.append(-1)
            doc_sent_score.append(-1)
            doc_sent_len.append(-1)
            doc_sent_mags.append([-1000])
            doc_sent_scores.append([-1000])
            doc_entity_len.append(-1)
            doc_entity_sali.append([-1000])
            
    print('nf count:', nf_count) 

    data.loc[:, 'doc_sent_mag'] = doc_sent_mag
    data.loc[:, 'doc_sent_score'] = doc_sent_score
    
    # train.loc[:, 'doc_sent_len'] = doc_sent_len
    # train.loc[:, 'doc_sent_mags'] = doc_sent_mags
    # train.loc[:, 'doc_sent_scores'] = doc_sent_scores
    # train.loc[:, 'doc_entity_len'] = doc_entity_len
    # train.loc[:, 'doc_entity_sali'] = doc_entity_sali
    return data

train = sentiment_feature(train, train_id, 'train_sentiment')
test = sentiment_feature(test, test_id, 'test_sentiment')

nf count: 551
nf count: 133


In [11]:
def gen_meta_f(df, ids, meta_path):
    vertex_xs = []
    vertex_ys = []
    bounding_confidences = []
    bounding_importance_fracs = []
    dominant_blues = []
    dominant_greens = []
    dominant_reds = []
    dominant_pixel_fracs = []
    dominant_scores = []
    
    dominant_blues1 = []
    dominant_greens1 = []
    dominant_reds1 = []
    dominant_pixel_fracs1 = []
    dominant_scores1 = []

    label_descriptions = []
    label_descriptions1 = []
    label_descriptions2 = []
    label_descriptions3 = []
    
    label_scores = []
    label_scores1 = []
    label_scores2 = []
    label_scores3 = []
    
    nf_count = 0
    nl_count = 0
    label_data = {}
    for idx, pet in enumerate(ids):
        try:
            with open('../input/petfinder-adoption-prediction/%s/' % meta_path + pet + '-1.json', 'r') as f:
                data = json.load(f)
            vertex_x = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['x']
            vertex_xs.append(vertex_x)
            vertex_y = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['y']
            vertex_ys.append(vertex_y)
            bounding_confidence = data['cropHintsAnnotation']['cropHints'][0]['confidence']
            bounding_confidences.append(bounding_confidence)
            bounding_importance_frac = data['cropHintsAnnotation']['cropHints'][0].get('importanceFraction', -1)
            bounding_importance_fracs.append(bounding_importance_frac)
            # 0
            dominant_blue = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['blue']
            dominant_blues.append(dominant_blue)
            dominant_green = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['green']
            dominant_greens.append(dominant_green)
            dominant_red = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['red']
            dominant_reds.append(dominant_red)
            dominant_pixel_frac = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['pixelFraction']
            dominant_pixel_fracs.append(dominant_pixel_frac)
            dominant_score = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['score']
            dominant_scores.append(dominant_score)
            # 1
            if len(data['imagePropertiesAnnotation']['dominantColors']['colors']) > 1 and len(data['imagePropertiesAnnotation']['dominantColors']['colors'][1]['color']) == 3:
                dominant_blue1 = data['imagePropertiesAnnotation']['dominantColors']['colors'][1]['color']['blue']
                dominant_blues1.append(dominant_blue1)
                dominant_green1 = data['imagePropertiesAnnotation']['dominantColors']['colors'][1]['color']['green']
                dominant_greens1.append(dominant_green1)
                dominant_red1 = data['imagePropertiesAnnotation']['dominantColors']['colors'][1]['color']['red']
                dominant_reds1.append(dominant_red1)
                dominant_pixel_frac1 = data['imagePropertiesAnnotation']['dominantColors']['colors'][1]['pixelFraction']
                dominant_pixel_fracs1.append(dominant_pixel_frac1)
                dominant_score1 = data['imagePropertiesAnnotation']['dominantColors']['colors'][1]['score']
                dominant_scores1.append(dominant_score1)
        
            else:
                dominant_blues1.append(-1)
                dominant_greens1.append(-1)
                dominant_reds1.append(-1)
                dominant_pixel_fracs1.append(-1)
                dominant_scores1.append(-1)
                
            if data.get('labelAnnotations'):
                label_description = data['labelAnnotations'][0]['description']
                label_descriptions.append(label_description)
                label_score = data['labelAnnotations'][0]['score']
                label_scores.append(label_score)

                if len(data['labelAnnotations']) > 1:
                    label_description1 = data['labelAnnotations'][1]['description']
                    label_descriptions1.append(label_description1)
                    label_score1 = data['labelAnnotations'][1]['score']
                    label_scores1.append(label_score1)
                else:
                    label_descriptions1.append('nothing')
                    label_scores1.append(-1)
                
                if len(data['labelAnnotations']) > 2:
                    label_description2 = data['labelAnnotations'][2]['description']
                    label_descriptions2.append(label_description2)
                    label_score2 = data['labelAnnotations'][2]['score']
                    label_scores2.append(label_score2)
                else:
                    label_descriptions2.append('nothing')
                    label_scores2.append(-1)

                if len(data['labelAnnotations']) > 3:
                    label_description3 = data['labelAnnotations'][3]['description']
                    label_descriptions3.append(label_description3)
                    label_score3 = data['labelAnnotations'][3]['score']
                    label_scores3.append(label_score3)
                else:
                    label_descriptions3.append('nothing')
                    label_scores3.append(-1)

            else:
                nl_count += 1
                label_descriptions.append('nothing')
                label_descriptions1.append(label_description1)
                label_descriptions2.append(label_description2)
                label_descriptions3.append(label_description3)
                
                label_scores.append(-1)
                label_scores1.append(-1)
                label_scores2.append(-1)
                label_scores3.append(-1)
                                                            
        except FileNotFoundError:
            nf_count += 1
            vertex_xs.append(-1)
            vertex_ys.append(-1)
            bounding_confidences.append(-1)
            bounding_importance_fracs.append(-1)
            dominant_blues.append(-1)
            dominant_greens.append(-1)
            dominant_reds.append(-1)
            dominant_pixel_fracs.append(-1)
            dominant_scores.append(-1)
            
            dominant_blues1.append(-1)
            dominant_greens1.append(-1)
            dominant_reds1.append(-1)
            dominant_pixel_fracs1.append(-1)
            dominant_scores1.append(-1)

            label_descriptions.append('nothing')
            label_descriptions1.append('nothing')
            label_descriptions2.append('nothing')
            label_descriptions3.append('nothing')
            label_scores.append(-1)
            label_scores1.append(-1)
            label_scores2.append(-1)
            label_scores3.append(-1)

    print(nf_count)
    print(nl_count)
    prefix = 'meta_'
    df.loc[:, prefix+'vertex_x'] = vertex_xs
    df.loc[:, prefix+'vertex_y'] = vertex_ys
    df.loc[:, prefix+'bounding_confidence'] = bounding_confidences
    df.loc[:, prefix+'bounding_importance'] = bounding_importance_fracs
    df.loc[:, prefix+'dominant_blue'] = dominant_blues
    df.loc[:, prefix+'dominant_green'] = dominant_greens
    df.loc[:, prefix+'dominant_red'] = dominant_reds
    df.loc[:, prefix+'dominant_pixel_frac'] = dominant_pixel_fracs
    df.loc[:, prefix+'dominant_score'] = dominant_scores
    
    df.loc[:, prefix+'label_description'] = label_descriptions
    df.loc[:, prefix+'label_description1'] = label_descriptions1
    df.loc[:, prefix+'label_description2'] = label_descriptions2
#     df.loc[:, 'label_description3'] = label_descriptions3

    df.loc[:, prefix+'label_score'] = label_scores
    df.loc[:, prefix+'label_score1'] = label_scores1
    df.loc[:, prefix+'label_score2'] = label_scores1
    cate_cols = [prefix+col for col in ['label_description','label_description1','label_description2']]
    df.loc[:, cate_cols] = df[cate_cols].astype('category')
#     df.loc[:, 'label_score3'] = label_scores3

gen_meta_f(train, train_id, 'train_metadata')
gen_meta_f(test, test_id, 'test_metadata')


341
2
128
0


## 1.1 origin feature

In [12]:
def rescue_feature(df):
    rescue_count = df.groupby('RescuerID')['Quantity'].count()
    rescue_count.name = 'rescue_count'
    rescue_num = df.groupby('RescuerID')['Quantity'].sum()
    rescue_num.name = 'rescue_num'
    rescue_unique_type = df.drop_duplicates(['RescuerID', 'Type']).groupby('RescuerID')['RescuerID'].count()
    rescue_unique_type.name = 'rescue_unique_type'
    df = df.join(rescue_count, on='RescuerID')
    df = df.join(rescue_num, on='RescuerID') 
#     df = df.join(rescue_unique_type, on='RescuerID') 
#     df['rescue_rank'] = min_max(df['RescuerID'].rank())
    df['rescue_rank'] = df.RescuerID.map(df.RescuerID.value_counts().rank()/df.RescuerID.unique().shape[0])
    return df

def pure_breed_encode(data):
    data['pure_breed1'] = np.where((data['Breed1'] != 307) , '0', '1')
    data['pure_breed2'] = np.where((data['Breed2'] == 0) , '0', 
                      np.where(data['Breed2'] != 307, '1', '2'))
    data['pure_breed3'] = (data['pure_breed1'] + data['pure_breed2'])
    data['pure_animal_pure_breed4'] = np.where((data['Type'].astype(np.str)=='1') & (data['pure_breed3']=='00'), '100', 
                                          np.where((data['Type'].astype(np.str)=='2') & (data['pure_breed3']=='00'), '200', 
                                          '333'))
    for col in ['pure_breed1', 'pure_breed2', 'pure_breed3', 'pure_animal_pure_breed4']:
        data[col] = data[col].astype('category')
    del data['pure_animal_pure_breed4']
    return data

def call_name_f(data):
    is_call_name = []
    for name, desc in zip(data['Name'], data['Description']):
        clean_desc = str(desc).lower()
        clean_name = str(name).lower()
        if clean_name == 'nan':
            is_call_name.append(0)
        else:
            num = len(clean_desc.split(clean_name))
            is_call_name.append(num)
    data['call_name_num'] = is_call_name
    return data


train = rescue_feature(train)
test = rescue_feature(test)

train = pure_breed_encode(train)
test = pure_breed_encode(test)

# train = call_name_f(train)
# test = call_name_f(test)

## 1.3 description feature

In [13]:
def language_type(desc):
    desc = str(desc)
    if desc=='nan':
        return 0
    zhmodel = re.compile(u'[\u4e00-\u9fa5]')    #检查中文
    enmodel = re.compile(u'[a-zA-Z]')   #检查英文
    zhmatch = zhmodel.search(desc)
    enmatch = enmodel.search(desc)
    if zhmatch and enmatch:
        return 3  # 中英混合
    elif zhmatch:
        return 3  # 纯中文
    elif enmatch:
        return 2  # 纯英文
    else:
        return 1  # 都是字符

def malaiyu_type(desc):
    desc = str(desc)
#     malai = [' ekor ', ' ngan ', ' dia ', ' sy ', ' dan ', ' leh ', ' nak ', ' dr ', ' dari ', ' la x ' , ' nk ',' nie ', ' umur ', ' di ', 'teruk', ' satu ',' dh ', ' ni ',' tp ', ' yg ', 'mmg', 'msj', ' utk ' ,'neh' ]
    malai = [' la x ' , ' nk ',' nie ', ' umur ', ' di ', 'teruk', ' satu ',' dh ', ' ni ',' tp ', ' yg ', 'mmg', 'msj', ' utk ' ,'neh' ]
    for ma_tag in malai:
        if desc.find(ma_tag) > -1:
            return ma_tag,1
    
    return "", 0

lang_prefix = 'lang_'
train[lang_prefix+'language_type'] = train.Description.map(lambda x:language_type(x))
train[lang_prefix+'malaiyu_type'] = train.Description.map(lambda x:malaiyu_type(x)[1])

test[lang_prefix+'language_type'] = test.Description.map(lambda x:language_type(x))
test[lang_prefix+'malaiyu_type'] = test.Description.map(lambda x:malaiyu_type(x)[1])

In [14]:
def obtain_text(df):
    breed1_text = df['Breed1'].map(lambda x:breedid_map.get(x, 'unknown_breed'))
    breed2_text = df['Breed2'].map(lambda x:breedid_map.get(x, 'unknown_breed'))
    color1_text = df['Color1'].map(lambda x:color_map.get(x, 'unknown_color'))
    color2_text = df['Color2'].map(lambda x:color_map.get(x, 'unknown_color'))
    color3_text = df['Color3'].map(lambda x:color_map.get(x, 'unknown_color'))

    text = df['Name'].fillna("none") + " " \
           + breed1_text  + " " \
           + breed2_text + " " \
           + color1_text + " " \
           + color2_text + " " \
            + color3_text + " " \
            + df['Description'].fillna("none")
    
    return text

# train_desc = obtain_text(train)
# test_desc = obtain_text(test)
train_desc = train.Description.fillna("none").values
test_desc = test.Description.fillna("none").values

tfv = TfidfVectorizer(min_df=3,  max_features=10000,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words = 'english')
    
# Fit TFIDF
tfv.fit(list(train_desc))
X =  tfv.transform(train_desc)
X_test = tfv.transform(test_desc)

components = 120
svd = TruncatedSVD(n_components=components)
svd.fit(X)

X = svd.transform(X)
X = pd.DataFrame(X, columns=['svd_{}'.format(i) for i in range(components)])
train = pd.concat((train, X), axis=1)
X_test = svd.transform(X_test)
X_test = pd.DataFrame(X_test, columns=['svd_{}'.format(i) for i in range(components)])
test = pd.concat((test, X_test), axis=1)

## 1.4 NMF LDA

In [15]:
def nmf_lda_feature(train, test, train_text, test_text):
    tfv = TfidfVectorizer(min_df=3,  max_features=10000,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words = 'english')
    # Fit TFIDF
    tfv.fit(list(train_text)+list(test_text))
    X =  tfv.transform(train_text)
    X_test = tfv.transform(test_text)

    # nmf
    components = 20
#     scaler = MinMaxScaler()
    # X = scaler.fit_transform(X)
#     nmf = NMF(n_components=components, random_state=100).fit(np.vstack([X, X_test]))
    nmf = NMF(n_components=components, random_state=100).fit(X)
    nmf_x = nmf.transform(X)
    nmf_x = pd.DataFrame(nmf_x, columns=['nmf_{}'.format(i) for i in range(components)])
    train = pd.concat((train, nmf_x), axis=1)
    nmf_x_test = nmf.transform(X_test)
    nmf_x_test = pd.DataFrame(nmf_x_test, columns=['nmf_{}'.format(i) for i in range(components)])
    test = pd.concat((test, nmf_x_test), axis=1)

    # lda
    components = 12
#     lda = LatentDirichletAllocation(n_components=components, max_iter=10, n_jobs=-1)
    lda = LatentDirichletAllocation(n_components=components, max_iter=120, n_jobs=-1)
    lda.fit(X)
    lda_x = lda.transform(X)
    lda_x = pd.DataFrame(lda_x, columns=['lda_{}'.format(i) for i in range(components)])
    train = pd.concat((train, lda_x), axis=1)
    lda_x_test = lda.transform(X_test)
    lda_x_test = pd.DataFrame(lda_x_test, columns=['lda_{}'.format(i) for i in range(components)])
    test = pd.concat((test, lda_x_test), axis=1)
    
    return train, test

train_text = obtain_text(train)
test_text = obtain_text(test)

train, test = nmf_lda_feature(train, test, train_text, test_text)

## 1.7 image feature

### 1.7.1 image meta_feature

In [16]:
train_df = pd.read_csv("../input/petfinder-adoption-prediction/train/train.csv")
test_df = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')
test_pet_ids = test_df['PetID'].values
train_pet_ids = train_df['PetID'].values
target = train_df['AdoptionSpeed'].values
from keras.models import Model
from keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D,Dense,Dropout
import keras.backend as K
from keras.optimizers import Adam
from keras.applications.densenet import preprocess_input, DenseNet121
from keras.applications.resnet50 import preprocess_input as res_preprocess, ResNet50

In [17]:
batch_size = 128
def BASE_MODEL():
    inp = Input((128,128,3))
    backbone = ResNet50(input_tensor = inp, 
                           weights="../input/resnet50/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5",
                           include_top = False)
    x = backbone.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(512)(x)
    x = Dropout(0.5)(x)
    output = Dense(1,activation='linear')(x)
    return Model(inp,output)

def new_load_image(path, pet_id):
    image = cv2.imread(f'{path}{pet_id}-1.jpg')
    try:
        new_image = cv2.resize(image,(128,128))
    except:
        new_image = np.zeros((128,128,3))
    new_image = res_preprocess(new_image)
    return new_image

#base_model = BASE_MODEL()
#model.summary()
def train_gen(batch_size=128,shuffle=True,pet_list=None,pet_labels=None,use_labels=True):
    images_df = pd.DataFrame({'img_id':pet_list,'label':pet_labels})
    while True:
        if shuffle:
            images_df = images_df.sample(frac=1.0).reset_index(drop=True)
        for start in range(0, len(images_df), batch_size):
            x_batch = []
            y_batch = []
            end = min(start + batch_size,len(images_df))
            for _id in range(start,end):
                image_row = images_df.iloc[_id]
                image_id = image_row['img_id']
                img = new_load_image("../input/petfinder-adoption-prediction/train_images/", image_id)
                if use_labels:
                    img_label = image_row['label']
                    y_batch.append(img_label)
                else:
                    y_batch.append(-1.0)
                x_batch.append(img)
            yield np.array(x_batch),np.array(y_batch)
            
def test_gen(batch_size=128,shuffle=True,pet_list=None,pet_labels=None,use_labels=True):
    images_df = pd.DataFrame({'img_id':pet_list,'label':pet_labels})
    while True:
        if shuffle:
            images_df = images_df.sample(frac=1.0).reset_index(drop=True)
        for start in range(0, len(images_df), batch_size):
            x_batch = []
            y_batch = []
            end = min(start + batch_size,len(images_df))
            for _id in range(start,end):
                image_row = images_df.iloc[_id]
                image_id = image_row['img_id']
                img = new_load_image("../input/petfinder-adoption-prediction/test_images/", image_id)
                if use_labels:
                    img_label = image_row['label']
                    y_batch.append(img_label)
                else:
                    y_batch.append(-1.0)
                x_batch.append(img)
            yield np.array(x_batch),np.array(y_batch)

In [18]:
test_img_prob = np.zeros(shape=(test_df.shape[0],1))
train_img_prob = np.zeros(shape=(train_df.shape[0],1))
for tr_idx,te_idx in FOLDS.split(train_pet_ids,
                           target):
    print(len(tr_idx),len(te_idx))
    gen_tr = train_gen(batch_size=batch_size,
                    shuffle=True,
                    pet_list=train_pet_ids[tr_idx],
                    pet_labels=target[tr_idx])
    
    gen_te = train_gen(batch_size=batch_size,
                    shuffle=False,
                    pet_list=train_pet_ids[te_idx],
                    pet_labels=target[te_idx])
    gen_test = test_gen(batch_size=batch_size,
                    shuffle=False,
                    pet_list=test_pet_ids,
                    pet_labels=None,
                    use_labels=False)
    model = BASE_MODEL()
    model.compile(optimizer='adam',
                  loss='mse')
    model.fit_generator(gen_tr,
                       steps_per_epoch=int(np.ceil(len(tr_idx)*1.0/batch_size)),
                       epochs=3,verbose=1,
                       validation_data=gen_te,
                       validation_steps=int(np.ceil(len(te_idx)*1.0/batch_size)),
                       )
    _test_prob = model.predict_generator(gen_test,
                                         steps=int(np.ceil(len(test_df)*1.0/(batch_size))),
                                        )
    _val_prob = model.predict_generator(gen_te,                                         
                              steps=int(np.ceil(len(te_idx)*1.0/(batch_size))),
                             )
    train_img_prob[te_idx,:] = _val_prob 
    test_img_prob += _test_prob

    
test_img_prob /= N_FOLDS

11242 3751
Epoch 1/3
Epoch 2/3
Epoch 3/3
11244 3749
Epoch 1/3
Epoch 2/3
Epoch 3/3
11246 3747
Epoch 1/3
Epoch 2/3
Epoch 3/3
11247 3746
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [19]:
img_size = 256
batch_size = 16

train_df = pd.read_csv("../input/petfinder-adoption-prediction/train/train.csv")
pet_ids = train_df['PetID'].values
n_batches = len(pet_ids) // batch_size + 1

from keras.models import Model
from keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D
import keras.backend as K
inp = Input((256,256,3))
backbone = DenseNet121(input_tensor = inp, 
                       weights="../input/densenet-keras/DenseNet-BC-121-32-no-top.h5",
                       include_top = False)
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Lambda(lambda x: K.expand_dims(x,axis = -1))(x)
x = AveragePooling1D(4)(x)
out = Lambda(lambda x: x[:,:,0])(x)

m = Model(inp,out)

features = {}
for b in tqdm_notebook(range(n_batches)):
    start = b*batch_size
    end = (b+1)*batch_size
    batch_pets = pet_ids[start:end]
    batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
    for i,pet_id in enumerate(batch_pets):
        try:
            batch_images[i] = load_image("../input/petfinder-adoption-prediction/train_images/", pet_id)
        except:
            pass
    batch_preds = m.predict(batch_images)
    for i,pet_id in enumerate(batch_pets):
        features[pet_id] = batch_preds[i]

train_feats = pd.DataFrame.from_dict(features, orient='index')
train_feats.columns = ['pic_'+str(i) for i in range(train_feats.shape[1])]

HBox(children=(IntProgress(value=0, max=938), HTML(value='')))




In [20]:
test_df = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')

pet_ids = test_df['PetID'].values
n_batches = len(pet_ids) // batch_size + 1

features = {}
for b in tqdm_notebook(range(n_batches)):
    start = b*batch_size
    end = (b+1)*batch_size
    batch_pets = pet_ids[start:end]
    batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
    for i,pet_id in enumerate(batch_pets):
        try:
            batch_images[i] = load_image("../input/petfinder-adoption-prediction/test_images/", pet_id)
        except:
            pass
    batch_preds = m.predict(batch_images)
    for i,pet_id in enumerate(batch_pets):
        features[pet_id] = batch_preds[i]
        
test_feats = pd.DataFrame.from_dict(features, orient='index')
test_feats.columns = ['pic_'+str(i) for i in range(test_feats.shape[1])]

test_feats = test_feats.reset_index()
test_feats.rename({'index': 'PetID'}, axis='columns', inplace=True)

train_feats = train_feats.reset_index()
train_feats.rename({'index': 'PetID'}, axis='columns', inplace=True)

test_feats.head()
train = pd.merge(train, train_feats, how='left', on='PetID')
test = pd.merge(test, test_feats, how='left', on='PetID')

HBox(children=(IntProgress(value=0, max=247), HTML(value='')))




## 第二份特征

In [21]:
breed_id_map = dict(zip(breeds.BreedID.values,breeds.BreedName.values))
breed_type_map = dict(zip(breeds.BreedID.values,breeds.Type.values))
color_id_map = dict(zip(colors.ColorID.values,colors.ColorName.values))

train['Breed1_text'] = train['Breed1'].map(lambda x:breed_id_map.get(x,'UNK_Breed1'))
train['Breed2_text'] = train['Breed2'].map(lambda x:breed_id_map.get(x,'UNK_Breed2'))
train['Color1_text'] = train['Color1'].map(lambda x:color_id_map.get(x,'UNK_Color1'))
train['Color2_text'] = train['Color2'].map(lambda x:color_id_map.get(x,'UNK_Color2'))
train['Color3_text'] = train['Color3'].map(lambda x:color_id_map.get(x,'UNK_Color3'))

test['Breed1_text'] = test['Breed1'].map(lambda x:breed_id_map.get(x,'UNK_Breed1'))
test['Breed2_text'] = test['Breed2'].map(lambda x:breed_id_map.get(x,'UNK_Breed2'))
test['Color1_text'] = test['Color1'].map(lambda x:color_id_map.get(x,'UNK_Color1'))
test['Color2_text'] = test['Color2'].map(lambda x:color_id_map.get(x,'UNK_Color2'))
test['Color3_text'] = test['Color3'].map(lambda x:color_id_map.get(x,'UNK_Color3'))

train['raw_text'] =  train['Name'] + ' ' \
                    + train['Breed1_text'] + ' ' + train['Breed2_text'] + ' ' \
                    + train['Color1_text'] + ' ' + train['Color2_text'] + ' ' \
                    + train['Color3_text'] + ' ' \
                    + train['Description']

test['raw_text'] =  test['Name'] + ' ' \
                    + test['Breed1_text'] + ' ' + test['Breed2_text'] + ' ' \
                    + test['Color1_text'] + ' ' + test['Color2_text'] + ' ' \
                    + test['Color3_text'] + ' ' \
                    + test['Description']

In [22]:
gzf_prefix = 'gzf_'

In [23]:
train[gzf_prefix+'RescureID_rank'] = train.RescuerID.map(train.RescuerID.value_counts().rank()/train.RescuerID.unique().shape[0])
train[gzf_prefix+'Description_len'] = train.Description.map(lambda x:len(x) if type(x)!=float else 0)
train[gzf_prefix+'Description_word_len'] = train.Description.map(lambda x:len(x.strip().split()) if type(x)!=float else 0)
train[gzf_prefix+'Description_distinct_word_len'] = train.Description.map(lambda x:len(set(x.lower().strip().split())) if type(x)!=float else 0)
train[gzf_prefix+'Description_distinct_word_ratio'] = train[gzf_prefix+'Description_distinct_word_len'] / (train[gzf_prefix+'Description_word_len'] + 1.0)

test[gzf_prefix+'RescureID_rank'] = test.RescuerID.map(test.RescuerID.value_counts().rank()/test.RescuerID.unique().shape[0])
test[gzf_prefix+'Description_len'] = test.Description.map(lambda x:len(x) if type(x)!=float else 0)
test[gzf_prefix+'Description_word_len'] = test.Description.map(lambda x:len(x.strip().split()) if type(x)!=float else 0)
test[gzf_prefix+'Description_distinct_word_len'] = test.Description.map(lambda x:len(set(x.lower().strip().split())) if type(x)!=float else 0)
test[gzf_prefix+'Description_distinct_word_ratio'] = test[gzf_prefix+'Description_distinct_word_len'] / (test[gzf_prefix+'Description_word_len'] + 1.0)

In [24]:
X = pd.concat([train,test],axis=0,ignore_index=True)
len_train = len(train)
print(train.shape, test.shape)
print(X.shape)

(14993, 468) (3948, 467)
(18941, 468)


In [25]:
X[gzf_prefix+'is_pure'] = ((X.Breed1!=307) & (X.Breed2!=307) & (X.Breed2!=0)).astype(float)
X[gzf_prefix+'is_pure_breed1'] = (X.Breed1!=307).astype(float)
X[gzf_prefix+'is_pure_breed2'] = ((X.Breed2!=307) & (X.Breed2!=0)).astype(float)

In [26]:
agg_num_feature = ['Age','Health','PhotoAmt','Quantity',
                   'doc_sent_mag', 'doc_sent_score', 
                   'meta_dominant_score', 'meta_label_score',gzf_prefix+'Description_len']
agg_rescureid_1 = X.groupby(['RescuerID'])[agg_num_feature].mean()
agg_rescureid_1.columns = ['Age_id','Health_id','PhotoAmt_id','Quantity_id',
                   'doc_sent_mag_id', 'doc_sent_score_id', 
                   'dominant_score_id', 'label_score_id','Description_len_id']
agg_rescureid_2 = X.groupby(['RescuerID'])['Breed1'].aggregate({'307_ratio':lambda x:(x==307).mean()})
agg_rescureid = pd.concat([agg_rescureid_1,agg_rescureid_2],axis=1)
agg_rescureid.columns = [gzf_prefix+x for x in agg_rescureid.columns ]
X = pd.merge(X,agg_rescureid,left_on='RescuerID',right_index=True,how='left')
print(X.shape)

(18941, 481)


In [27]:
SVD_FEATURES = 120
NMF_FEATURES = 20
LDA_FEATURES = 12

desc = X.raw_text.fillna("none").values
tfidf = TfidfVectorizer(min_df=3,  max_features=10000,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words = 'english')
    
# Fit TFIDF
X_tfidf = tfidf.fit_transform(list(desc))
print("X (tfidf):", X_tfidf.shape)

svd = TruncatedSVD(n_components=SVD_FEATURES)
svd.fit(X_tfidf)
X_svd = svd.fit_transform(X_tfidf)
print("X (svd):", X_svd.shape)

X_svd = pd.DataFrame(X_svd, columns=[gzf_prefix+'sdv_{}'.format(i) for i in range(SVD_FEATURES)])
X = pd.concat((X, X_svd), axis=1)
print("X:", X.shape)

nmf = NMF(n_components=NMF_FEATURES)
nmf.fit(X_tfidf)
X_nmf = nmf.fit_transform(X_tfidf)
print("X (nmf):", X_nmf.shape)

X_nmf = pd.DataFrame(X_nmf, columns=[gzf_prefix+'mnf_{}'.format(i) for i in range(NMF_FEATURES)])
X = pd.concat((X, X_nmf), axis=1)
print("X:", X.shape)

# take a long time here
# lda = LatentDirichletAllocation(n_components=LDA_FEATURES, n_jobs=-1,max_iter=10)
lda = LatentDirichletAllocation(n_components=LDA_FEATURES, n_jobs=-1,max_iter=120)
lda.fit(X_tfidf)
X_lda = lda.fit_transform(X_tfidf)
print("X (lda):", X_lda.shape)

X_lda = pd.DataFrame(X_lda, columns=[gzf_prefix+'lad_{}'.format(i) for i in range(LDA_FEATURES)])
X = pd.concat((X, X_lda), axis=1)
print("X:", X.shape)


X (tfidf): (18941, 10000)
X (svd): (18941, 120)
X: (18941, 601)
X (nmf): (18941, 20)
X: (18941, 621)
X (lda): (18941, 12)
X: (18941, 633)


In [28]:
cat_cols = ['Health',
 'Breed1', 'Breed2',
 'Type', 'Gender',
 'Color3', 'Color2', 'Color1',
 'Vaccinated','Sterilized',  'Dewormed',
 'MaturitySize', 'FurLength',
 'State','meta_label_description','meta_label_description1','meta_label_description2']
X.loc[:, cat_cols] = X[cat_cols].astype('category')

In [29]:
# get the categorical features
foo = train.dtypes
cat_feature_names = foo[foo == "category"].index.values
cat_features = [i for i in range(X.shape[1]) if X.columns[i] in cat_feature_names]

In [30]:
train = X[:len_train]
test = X[len_train:]
train.index = range(len_train)
test.index = range(test.shape[0])

target = train['AdoptionSpeed']
rescue_id = train['RescuerID']

train.shape, target.shape

((14993, 633), (14993,))

## train functions

In [31]:
# def process_category_feature(train, test, intercept=300):
#     cat_cols = [col for col in train.columns if train[col].dtype.name == 'category']   
#     cat_feature_train = pd.get_dummies(train[cat_cols], columns=cat_cols)        
#     cat_feature_test = pd.get_dummies(test[cat_cols], columns=cat_cols)    
    
#     pick_col_df = cat_feature_train.sum(axis=0)
#     pick_col_df = pick_col_df[pick_col_df > intercept]
#     pick_cols = list(pick_col_df.index)
    
#     dummy_train = pd.concat([train.drop(cat_cols, axis=1), cat_feature_train[pick_cols]], axis=1)
#     dummy_test = pd.concat([test.drop(cat_cols, axis=1), cat_feature_test[pick_cols]], axis=1)
#     print ("dummy:", dummy_train.shape, dummy_test.shape)
#     return dummy_train, dummy_test

def obtain_train_mse_and_kappa(train_predictions, target):
    optR = OptimizedRounder()
    optR.fit(train_predictions, target)
    coefficients_ = optR.coefficients()
    rmse_score1 = rmse(target, train_predictions)
    train_predictions = optR.predict(train_predictions, optR.coefficients()).astype(int)
    qwk_score = quadratic_weighted_kappa(target, train_predictions)
    rmse_score2 = rmse(target, train_predictions)
    
    return rmse_score1, rmse_score2, qwk_score

def run_cv_model(train, test, target, weight, model_fn, params={}, eval_fn=None, label='model'):
    kf = FOLDS
    n_splits = N_FOLDS
    
    fold_splits = kf.split(train, target)
    cv_scores = []
    qwk_scores = []
    pred_full_test = 0
    pred_train = np.zeros((train.shape[0], n_splits))
    pred_test = np.zeros((origin_test.shape[0], n_splits))
    
    all_coefficients = np.zeros((n_splits, 4))
    i = 1
    for dev_index, val_index in fold_splits:
        print('Started ' + label + ' fold ' + str(i) + '/{}'.format(n_splits))
        if isinstance(train, pd.DataFrame):
            dev_X, val_X = train.iloc[dev_index], train.iloc[val_index]
            dev_y, val_y = target[dev_index], target[val_index]
            dev_weight, val_weight = weight[dev_index], weight[val_index]
        else:
            dev_X, val_X = train[dev_index], train[val_index]
            dev_y, val_y = target[dev_index], target[val_index]
            dev_weight, val_weight = weight[dev_index], weight[val_index]
            
        params2 = params.copy()
        pred_val_y, pred_test_y, importances, coefficients, qwk = model_fn(dev_X, dev_y, val_X, val_y, dev_weight, val_weight, test, params2)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        pred_test[:, i-1] = pred_test_y.reshape(-1)
        
        all_coefficients[i-1, :] = coefficients
        if eval_fn is not None:
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(cv_score)
            qwk_scores.append(qwk)
            print(label + ' cv score {}: RMSE {} QWK {}'.format(i, cv_score, qwk))
        i += 1
    train_rmse1,  train_rmse2, train_qwk = obtain_train_mse_and_kappa([r[0] for r in pred_train], target)
    print('{} cv RMSE scores : {}'.format(label, cv_scores))
    print('{} cv mean        RMSE score : {}'.format(label, np.mean(cv_scores)))
    print('{} cv recalculate RMSE1 score : {}'.format(label, train_rmse1))
    print('{} cv recalculate RMSE2 score : {}'.format(label, train_rmse2))
    print('{} cv std RMSE score : {}'.format(label, np.std(cv_scores)))
    print('{} cv QWK scores : {}'.format(label, qwk_scores))
    print('{} cv mean        QWK score : {}'.format(label, np.mean(qwk_scores)))
    print('{} cv recalculate QWK score : {}'.format(label, train_qwk))
    print('{} cv std QWK score : {}'.format(label, np.std(qwk_scores)))
    pred_full_test = pred_full_test / float(n_splits)
    results = {'label': label,
               'train': pred_train, 'test': pred_full_test, 'test_value':pred_test,
                'cv': cv_scores, 'qwk': qwk_scores,
               'coefficients': all_coefficients}
    return results

def runLGB(train_X, train_y, test_X, test_y, dev_weight, val_weight, test_X2, params):
    print('Prep LGB')

    d_train = lgb.Dataset(train_X, label=train_y, weight=dev_weight)
    d_valid = lgb.Dataset(test_X, label=test_y, weight=val_weight)
    watchlist = [d_train, d_valid]
    print('Train LGB')
    num_rounds = params.pop('num_rounds')
    verbose_eval = params.pop('verbose_eval')
    early_stop = None
    if params.get('early_stop'):
        early_stop = params.pop('early_stop')
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval,
                      early_stopping_rounds=early_stop)
    print('Predict 1/2')
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)
    importances = model.feature_importance()
    optR = OptimizedRounder_v3()
    len_0 = test_y[test_y==0].shape[0]
    optR.fit(pred_test_y, test_y)
    coefficients = optR.coefficients()
    pred_test_y_k = optR.predict(pred_test_y, coefficients, len_0)
    print("Valid Counts = ", Counter(test_y))
    print("Predicted Counts = ", Counter(pred_test_y_k))
    print("Coefficients = ", coefficients)
    qwk = quadratic_weighted_kappa(test_y, pred_test_y_k)
    print("QWK = ", qwk)
    print('Predict 2/2')
    return np.array(pred_test_y).reshape(-1, 1), np.array(pred_test_y2).reshape(-1, 1), importances, coefficients, qwk

# def runXGB(train_X, train_y, test_X, test_y, dev_weight, val_weight, test_X2, params):
#     print('Prep XGB')
#     d_train = xgb.DMatrix(train_X, label=train_y, weight=dev_weight)
#     d_valid = xgb.DMatrix(test_X, label=test_y, weight=val_weight)
#     d_test = xgb.DMatrix(test_X2)
#     watchlist = [(d_train, 'train'), (d_valid, 'valid')]
#     print('Train XGB')
#     num_rounds = params['boost_num']
#     verbose_eval = params['verbose_eval']
#     early_stop = params['early_stop']
#     obj = None
#     if 'obj' in params.keys():
#         obj = params['obj']
#         model = xgb.train(params,
#                           dtrain=d_train,
#                           num_boost_round=num_rounds,
#                           evals=watchlist,
#                           verbose_eval=verbose_eval,
#                           early_stopping_rounds=early_stop,
#                           obj=obj)
#         print('Predict 1/2')
#         pred_test_y = softmax(model.predict(d_valid)).argmax(axis=1)
#         print (pred_test_y)
#         pred_test_y2 = softmax(model.predict(d_test)).argmax(axis=1)
#     else:
#         model = xgb.train(params,
#                           dtrain=d_train,
#                           num_boost_round=num_rounds,
#                           evals=watchlist,
#                           verbose_eval=verbose_eval,
#                           early_stopping_rounds=early_stop)
#         print('Predict 1/2')
#         pred_test_y = model.predict(d_valid)
#         pred_test_y2 = model.predict(d_test)

#     importances = [(x, model.get_score()[x]) for x in train_X.columns if x in model.get_score().keys()]
#     importances = [[x[0] for x in importances], [x[1] for x in importances]]
#     optR = OptimizedRounder()
#     len_0 = test_y[test_y==0].shape[0]
#     optR.fit(pred_test_y, test_y)
#     coefficients = optR.coefficients()
#     pred_test_y_k = optR.predict(pred_test_y, coefficients, len_0)
#     print("Valid Counts = ", Counter(test_y))
#     print("Predicted Counts = ", Counter(pred_test_y_k))
#     print("Coefficients = ", coefficients)
#     qwk = quadratic_weighted_kappa(test_y, pred_test_y_k)
#     print("QWK = ", qwk)
#     print('Predict 2/2')
#     return np.array(pred_test_y).reshape(-1, 1), np.array(pred_test_y2).reshape(-1, 1), importances, coefficients, qwk

# def runCGB(train_X, train_y, test_X, test_y, dev_weight, val_weight, test_X2, params):
#     print('Prep CGB')
#     global cat_features
#     watchlist = (test_X, test_y)
#     print('Train CGB')
#     verbose_eval = params.pop('verbose_eval')
#     early_stop = None
#     if params.get('early_stop'):
#         early_stop = params.pop('early_stop')
        
#     model = cgb.CatBoostRegressor(cat_features=list(cat_features), **params)
#     model.fit(train_X, train_y, eval_set=watchlist, verbose=verbose_eval)
        
#     print('Predict 1/2')
#     pred_test_y = model.predict(test_X, ntree_start=0, ntree_end=model.get_best_iteration())
#     pred_test_y2 = model.predict(test_X2, ntree_start=0, ntree_end=model.get_best_iteration())
#     importances = model.get_feature_importance()
#     optR = OptimizedRounder()
#     len_0 = test_y[test_y==0].shape[0]
#     optR.fit(pred_test_y, test_y)
#     coefficients = optR.coefficients()
#     pred_test_y_k = optR.predict(pred_test_y, coefficients, len_0)
#     print("Valid Counts = ", Counter(test_y))
#     print("Predicted Counts = ", Counter(pred_test_y_k))
#     print("Coefficients = ", coefficients)
#     qwk = quadratic_weighted_kappa(test_y, pred_test_y_k)
#     print("QWK = ", qwk)
#     print('Predict 2/2')
#     return np.array(pred_test_y).reshape(-1, 1), np.array(pred_test_y2).reshape(-1, 1), importances, coefficients, qwk


In [32]:
def get_cols(totals, prefixs):
    if isinstance(prefixs, list):
        cols = []
        for prefix in prefixs:
            cols += [col for col in totals if col.find(prefix) > -1]
        return cols
    else:
        return [col for col in totals if col.find(prefixs) > -1]

origin_cols = [
    "Type","Age",
    "Breed1","Breed2","Gender",
    "Color1","Color2","Color3",
    "MaturitySize","FurLength",
    "Vaccinated","Dewormed","Sterilized","Health",
    "Quantity","Fee","State",
    "VideoAmt","PhotoAmt"
]

doc_cols = get_cols(train.columns, 'doc_')
meta_cols = get_cols(train.columns, 'meta_')
pure_cols = get_cols(train.columns, 'pure_')
rescue_cols = get_cols(train.columns, 'rescue_')
lang_cols = get_cols(train.columns, 'lang_')
sml_cols = get_cols(train.columns, ['svd_', 'lda_', 'nmf_'])
pic_cols = get_cols(train.columns, 'pic_')

In [33]:
train['ResNet_meta'] = train_img_prob.flatten()
test['ResNet_meta'] = test_img_prob.flatten()

In [34]:
gzf_cols = doc_cols + lang_cols + origin_cols + pic_cols + [
                'meta_dominant_blue', 'meta_dominant_green','meta_dominant_pixel_frac', 
                 'meta_dominant_red', 'meta_dominant_score', 'meta_label_score', 
                'meta_vertex_x', 'meta_vertex_y'] + [
                gzf_prefix+'RescureID_rank',gzf_prefix+'Description_len',
                gzf_prefix+'Description_word_len',gzf_prefix+'Description_distinct_word_len',
                gzf_prefix+'Description_distinct_word_ratio',
                gzf_prefix+'is_pure',gzf_prefix+'is_pure_breed1',gzf_prefix+'is_pure_breed2',
                gzf_prefix+'Quantity_id',gzf_prefix+'307_ratio'
                ] + [gzf_prefix+'sdv_{}'.format(i) for i in range(SVD_FEATURES)] \
                  + [gzf_prefix+'mnf_{}'.format(i) for i in range(NMF_FEATURES)] \
                  + [gzf_prefix+'lad_{}'.format(i) for i in range(LDA_FEATURES)] \
                  + ['ResNet_meta']

In [35]:
zkr_cols = origin_cols+doc_cols+meta_cols+pure_cols+rescue_cols+lang_cols+sml_cols+pic_cols+['ResNet_meta']

## SAVE !!!

In [36]:
train_gzf = train[gzf_cols]
test_gzf = test[gzf_cols]

train_zkr = train[zkr_cols]
test_zkr = test[zkr_cols]

train_gzf.to_csv("train_gzf.csv", index=False)
test_gzf.to_csv("test_gzf.csv", index=False)

train_zkr.to_csv("train_zkr.csv", index=False)
test_zkr.to_csv("test_zkr.csv", index=False)

print(train_gzf.shape, test_gzf.shape, train_zkr.shape, test_zkr.shape)

(14993, 452) (3948, 452) (14993, 457) (3948, 457)


## 这里应该加入 LR ETC 等

**## MLP, not work for ridge stack

In [37]:
cat_cols = ['Health',
 'Breed1', 'Breed2',
 'Type', 'Gender',
 'Color3', 'Color2', 'Color1',
 'Vaccinated','Sterilized',  'Dewormed',
 'MaturitySize', 'FurLength',
 'State','meta_label_description','meta_label_description1','meta_label_description2']
zkr_numerical_cols = [item for item in zkr_cols if item not in cat_cols]
from sklearn.preprocessing import StandardScaler,MinMaxScaler
ss = StandardScaler()
ss.fit(train_zkr[zkr_numerical_cols].astype(float))
train_zkr_numerical_ss = ss.transform(train_zkr[zkr_numerical_cols].astype(float))
test_zkr_numerical_ss = ss.transform(test_zkr[zkr_numerical_cols].astype(float))

In [38]:
from sklearn.preprocessing import LabelBinarizer,LabelEncoder
max_cnt_list = []
train_zkr_cat_cols = []
test_zkr_cat_cols = []
for item in cat_cols:
    max_cnt_list.append(train[item].unique().shape[0] + 1)
    le = LabelEncoder().fit(pd.concat([train_zkr[item],test_zkr[item]]))
    train_zkr_cat_cols.append(le.transform(train_zkr[item]))
    test_zkr_cat_cols.append(le.transform(test_zkr[item]))

In [39]:
from keras import Model
from keras.layers import Dense,Embedding,Conv1D,SpatialDropout1D,Input,GlobalMaxPool1D,GlobalAvgPool1D
from keras.layers import concatenate,BatchNormalization,Dropout,Flatten,GRU
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder,LabelBinarizer,LabelEncoder
from sklearn.preprocessing import StandardScaler

def MLP():
    input_warpper,embed_warpper = [],[]
    for max_cnt,item in zip(max_cnt_list,cat_cols):
        _input = Input(shape=(1,),name=item,dtype='int32')
        _embed = Embedding(max_cnt,16,input_length=1)(_input)
        _embed = Flatten()(_embed)
        input_warpper.append(_input)
        embed_warpper.append(_embed)
    cate_feature = concatenate(embed_warpper)
    _input_numerical = Input(shape=(len(zkr_numerical_cols),),name='numerical')
    input_warpper.append(_input_numerical)
    numerical_feature = Dense(256,activation='relu')(_input_numerical)
    #numerical_feature = Dropout(0.25)(numerical_feature)
    feature_map = concatenate([cate_feature,numerical_feature])
    fc = BatchNormalization()(feature_map)
    fc = Dropout(0.2)(feature_map)
    fc_relu = Dense(256,activation='relu')(fc)
    fc_tanh = Dense(256,activation='tanh')(fc)
    fc = concatenate([fc_relu,fc_tanh])
    #fc = BatchNormalization()(fc)
    fc = Dropout(0.5)(fc)   
    output = Dense(1,activation='linear')(fc)
    
    return Model(input= input_warpper,output=output)

In [40]:
def run_cv_mlp(train_numerical,test_numerical,target,train_cat_list,test_cat_list):
    test_prob = np.zeros(shape=(test_numerical.shape[0],1))
    train_prob = np.zeros(shape=(train_numerical.shape[0],1))
    for tr_idx,te_idx in FOLDS.split(train_numerical,
                               target.values):
        print(len(tr_idx),len(te_idx))
        dtr = ([item[tr_idx] for item in train_cat_list] + [train_numerical[tr_idx]],target.values[tr_idx])
        dval = ([item[te_idx] for item in train_cat_list] + [train_numerical[te_idx]],target.values[te_idx])
        dtest = test_cat_list + [test_numerical]
        model = MLP()
        model.compile(optimizer='adam',
                      loss='mse')
        model.fit(dtr[0],dtr[1],batch_size=128,epochs=18,validation_data=dval,
                  shuffle=True,verbose=1)
        _test_prob = model.predict(dtest,batch_size=512)
        _val_prob = model.predict(dval[0],batch_size=512)
        train_prob[te_idx,:] = _val_prob 
        test_prob += _test_prob
    test_prob /=4.0
    return train_prob,test_prob                                       target,train_zkr_cat_cols,test_zkr_cat_cols)

SyntaxError: invalid syntax (<ipython-input-40-9f0746764082>, line 20)

In [41]:
mlp_zkr_train_pred.shape,mlp_zkr_test_pred.shape
mlp_zkr_train_pred.mean(),mlp_zkr_test_pred.mean()

NameError: name 'mlp_zkr_train_pred' is not defined

## LGB

In [42]:
params = {'application': 'regression',
          'boosting': 'gbdt',
          'metric': 'rmse',
          'num_leaves': 80,
          'max_depth': 9,
          'learning_rate': 0.01,
          'bagging_fraction': 0.9,
          'bagging_freq': 3,
          'feature_fraction': 0.85,
          'min_split_gain': 0.01,
          'min_child_samples': 150,
          'min_child_weight': 0.1,
          'verbosity': -1,
          'data_random_seed': 3,
          'early_stop': 100,
          'verbose_eval': 500,
          'num_rounds': 5000
         }

weight = pd.Series(np.where(train['Type']==2, 1.0, 1.0))
lgb_gzf = run_cv_model(train[gzf_cols], test[gzf_cols], target, weight, runLGB, params, rmse, 'lgb')

Started lgb fold 1/4
Prep LGB
Train LGB
Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 0.833437	valid_1's rmse: 1.04255
[1000]	training's rmse: 0.709138	valid_1's rmse: 1.0341
[1500]	training's rmse: 0.614901	valid_1's rmse: 1.03166
Early stopping, best iteration is:
[1818]	training's rmse: 0.560503	valid_1's rmse: 1.03088
Predict 1/2
Valid Counts =  Counter({4.0: 1050, 2.0: 1010, 3.0: 815, 1.0: 773, 0.0: 103})
Predicted Counts =  Counter({2.0: 1554, 4.0: 1062, 3.0: 763, 1.0: 281, 0.0: 91})
Coefficients =  [0.51784122 1.81391186 2.50643653 2.85904262]
QWK =  0.4590437565524471
Predict 2/2
lgb cv score 1: RMSE 1.0308779567400281 QWK 0.4590437565524471
Started lgb fold 2/4
Prep LGB
Train LGB
Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 0.830509	valid_1's rmse: 1.04224
[1000]	training's rmse: 0.703029	valid_1's rmse: 1.03395
Early stopping, best iteration is:
[1278]	training's rmse: 0.64846	valid_1's rmse: 1.0

In [43]:
params = {'application': 'regression',
          'boosting': 'gbdt',
          'metric': 'rmse',
          'num_leaves': 80,
          'max_depth': 9,
          'learning_rate': 0.01,
          'bagging_fraction': 0.9,
          'bagging_freq': 3,
          'feature_fraction': 0.84,
          'min_split_gain': 0.01,
          'min_child_samples': 150,
          'min_child_weight': 0.1,
          'verbosity': -1,
          'data_random_seed': 3,
#           'early_stop': 100,
          'verbose_eval': 500,
          'num_rounds': 1500,
         }

weight = pd.Series(np.where(train['Type']==2, 1.0, 1.0))
lgb_zkr = run_cv_model(train[zkr_cols], test[zkr_cols], target, weight, runLGB, params, rmse, 'lgb')

Started lgb fold 1/4
Prep LGB
Train LGB
[500]	training's rmse: 0.827229	valid_1's rmse: 1.04238
[1000]	training's rmse: 0.697797	valid_1's rmse: 1.03435
[1500]	training's rmse: 0.594214	valid_1's rmse: 1.03152
Predict 1/2
Valid Counts =  Counter({4.0: 1050, 2.0: 1010, 3.0: 815, 1.0: 773, 0.0: 103})
Predicted Counts =  Counter({2.0: 2046, 4.0: 1095, 3.0: 462, 0.0: 91, 1.0: 57})
Coefficients =  [0.51205368 1.60020659 2.61892869 2.83391988]
QWK =  0.44439251160399384
Predict 2/2
lgb cv score 1: RMSE 1.0315168018516534 QWK 0.44439251160399384
Started lgb fold 2/4
Prep LGB
Train LGB
[500]	training's rmse: 0.825643	valid_1's rmse: 1.0359
[1000]	training's rmse: 0.693558	valid_1's rmse: 1.02647
[1500]	training's rmse: 0.593332	valid_1's rmse: 1.02389
Predict 1/2
Valid Counts =  Counter({4.0: 1049, 2.0: 1009, 3.0: 815, 1.0: 773, 0.0: 103})
Predicted Counts =  Counter({2.0: 1700, 3.0: 944, 4.0: 825, 1.0: 189, 0.0: 91})
Coefficients =  [0.51081985 1.74319598 2.53636869 2.99962395]
QWK =  0.46858

# 特征: ZYL

In [44]:
# 重新导入, 一了百了

del train, test
gc.collect()

train = pd.read_csv("../input/petfinder-adoption-prediction/train/train.csv")
test = pd.read_csv("../input/petfinder-adoption-prediction/test/test.csv")

In [45]:
train['Color'] = train.Color1 * 100 + train.Color2 * 10 + train.Color3
train.drop(['Color1', 'Color2', 'Color3'], axis=1, inplace=True)

test['Color'] = test.Color1 * 100 + test.Color2 * 10 + test.Color3
test.drop(['Color1', 'Color2', 'Color3'], axis=1, inplace=True)

In [46]:
target = train['AdoptionSpeed']
train_id = train['PetID']
test_id = test['PetID']

In [47]:
# sentiment data

doc_sent_mag = []
doc_sent_score = []
nf_count = 0
for pet in train_id:
    try:
        with open('../input/petfinder-adoption-prediction/train_sentiment/' + pet + '.json', 'r') as f:
            sentiment = json.load(f)
        doc_sent_mag.append(sentiment['documentSentiment']['magnitude'])
        doc_sent_score.append(sentiment['documentSentiment']['score'])
    except FileNotFoundError:
        nf_count += 1
        doc_sent_mag.append(-1)
        doc_sent_score.append(-1)

train.loc[:, 'doc_sent_mag'] = doc_sent_mag
train.loc[:, 'doc_sent_score'] = doc_sent_score
train["doc_sentiment"] = train.doc_sent_mag * train.doc_sent_score

doc_sent_mag = []
doc_sent_score = []
nf_count = 0
for pet in test_id:
    try:
        with open('../input/petfinder-adoption-prediction/test_sentiment/' + pet + '.json', 'r') as f:
            sentiment = json.load(f)
        doc_sent_mag.append(sentiment['documentSentiment']['magnitude'])
        doc_sent_score.append(sentiment['documentSentiment']['score'])
    except FileNotFoundError:
        nf_count += 1
        doc_sent_mag.append(-1)
        doc_sent_score.append(-1)

test.loc[:, 'doc_sent_mag'] = doc_sent_mag
test.loc[:, 'doc_sent_score'] = doc_sent_score
test["doc_sentiment"] = test.doc_sent_mag * test.doc_sent_score

In [48]:
# description TF-IDF

n_components = 150

train_desc = train.Description.fillna("none").values
test_desc = test.Description.fillna("none").values

# tfv = TfidfVectorizer(min_df=2,  max_features=None,
#         strip_accents='unicode', analyzer='word', token_pattern=r'(?u)\b\w+\b',
#         ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
#         stop_words='english')

tfv = TfidfVectorizer(min_df=3,  max_features=None,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words='english')

tfv.fit(list(train_desc))
X = tfv.transform(train_desc)
X_test = tfv.transform(test_desc)

svd = TruncatedSVD(n_components=n_components)
svd.fit(X)
X = svd.transform(X)

X = pd.DataFrame(X, columns=['svd_{}'.format(i) for i in range(n_components)])
train = pd.concat((train, X), axis=1)
X_test = svd.transform(X_test)
X_test = pd.DataFrame(X_test, columns=['svd_{}'.format(i) for i in range(n_components)])
test = pd.concat((test, X_test), axis=1)

In [49]:
# image metadata

img_xs = []
img_ys = []
vertex_xs = []
vertex_ys = []
bounding_confidences = []
bounding_importance_fracs = []
dominant_blues = []
dominant_greens = []
dominant_reds = []
dominant_pixel_fracs = []
dominant_scores = []
label_descriptions = []
label_scores = []
nf_count = 0
nl_count = 0
for pet in train_id:
    try:
        im = Image.open('../input/petfinder-adoption-prediction/train_images/%s-1.jpg' % pet)
        width, height = im.size
        img_xs.append(width)
        img_ys.append(height)
        with open('../input/petfinder-adoption-prediction/train_metadata/' + pet + '-1.json', 'r') as f:
            data = json.load(f)
        vertex_x = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['x']
        vertex_xs.append(vertex_x)
        vertex_y = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['y']
        vertex_ys.append(vertex_y)
        bounding_confidence = data['cropHintsAnnotation']['cropHints'][0]['confidence']
        bounding_confidences.append(bounding_confidence)
        bounding_importance_frac = data['cropHintsAnnotation']['cropHints'][0].get('importanceFraction', -1)
        bounding_importance_fracs.append(bounding_importance_frac)
        dominant_blue = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['blue']
        dominant_blues.append(dominant_blue)
        dominant_green = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['green']
        dominant_greens.append(dominant_green)
        dominant_red = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['red']
        dominant_reds.append(dominant_red)
        dominant_pixel_frac = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['pixelFraction']
        dominant_pixel_fracs.append(dominant_pixel_frac)
        dominant_score = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['score']
        dominant_scores.append(dominant_score)
        if data.get('labelAnnotations'):
            label_description = data['labelAnnotations'][0]['description']
            label_descriptions.append(label_description)
            label_score = data['labelAnnotations'][0]['score']
            label_scores.append(label_score)
        else:
            nl_count += 1
            label_descriptions.append('nothing')
            label_scores.append(-1)
    except FileNotFoundError:
        nf_count += 1
        img_xs.append(-1)
        img_ys.append(-1)
        vertex_xs.append(-1)
        vertex_ys.append(-1)
        bounding_confidences.append(-1)
        bounding_importance_fracs.append(-1)
        dominant_blues.append(-1)
        dominant_greens.append(-1)
        dominant_reds.append(-1)
        dominant_pixel_fracs.append(-1)
        dominant_scores.append(-1)
        label_descriptions.append('nothing')
        label_scores.append(-1)

train.loc[:, 'img_x'] = img_xs
train.loc[:, 'img_y'] = img_ys
train.loc[:, 'vertex_x'] = vertex_xs
train.loc[:, 'vertex_y'] = vertex_ys
train.loc[:, 'bounding_confidence'] = bounding_confidences
train.loc[:, 'bounding_importance'] = bounding_importance_fracs
train.loc[:, 'dominant_blue'] = dominant_blues
train.loc[:, 'dominant_green'] = dominant_greens
train.loc[:, 'dominant_red'] = dominant_reds
train.loc[:, 'dominant_pixel_frac'] = dominant_pixel_fracs
train.loc[:, 'dominant_score'] = dominant_scores
train.loc[:, 'label_description'] = label_descriptions
train.loc[:, 'label_score'] = label_scores

img_xs = []
img_ys = []
vertex_xs = []
vertex_ys = []
bounding_confidences = []
bounding_importance_fracs = []
dominant_blues = []
dominant_greens = []
dominant_reds = []
dominant_pixel_fracs = []
dominant_scores = []
label_descriptions = []
label_scores = []
nf_count = 0
nl_count = 0
for pet in test_id:
    try:
        im = Image.open('../input/petfinder-adoption-prediction/test_images/%s-1.jpg' % pet)
        width, height = im.size
        img_xs.append(width)
        img_ys.append(height)
        with open('../input/petfinder-adoption-prediction/test_metadata/' + pet + '-1.json', 'r') as f:
            data = json.load(f)
        vertex_x = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['x']
        vertex_xs.append(vertex_x)
        vertex_y = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['y']
        vertex_ys.append(vertex_y)
        bounding_confidence = data['cropHintsAnnotation']['cropHints'][0]['confidence']
        bounding_confidences.append(bounding_confidence)
        bounding_importance_frac = data['cropHintsAnnotation']['cropHints'][0].get('importanceFraction', -1)
        bounding_importance_fracs.append(bounding_importance_frac)
        dominant_blue = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['blue']
        dominant_blues.append(dominant_blue)
        dominant_green = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['green']
        dominant_greens.append(dominant_green)
        dominant_red = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['red']
        dominant_reds.append(dominant_red)
        dominant_pixel_frac = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['pixelFraction']
        dominant_pixel_fracs.append(dominant_pixel_frac)
        dominant_score = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['score']
        dominant_scores.append(dominant_score)
        if data.get('labelAnnotations'):
            label_description = data['labelAnnotations'][0]['description']
            label_descriptions.append(label_description)
            label_score = data['labelAnnotations'][0]['score']
            label_scores.append(label_score)
        else:
            nl_count += 1
            label_descriptions.append('nothing')
            label_scores.append(-1)
    except FileNotFoundError:
        nf_count += 1
        img_xs.append(-1)
        img_ys.append(-1)
        vertex_xs.append(-1)
        vertex_ys.append(-1)
        bounding_confidences.append(-1)
        bounding_importance_fracs.append(-1)
        dominant_blues.append(-1)
        dominant_greens.append(-1)
        dominant_reds.append(-1)
        dominant_pixel_fracs.append(-1)
        dominant_scores.append(-1)
        label_descriptions.append('nothing')
        label_scores.append(-1)

test.loc[:, 'img_x'] = img_xs
test.loc[:, 'img_y'] = img_ys
test.loc[:, 'vertex_x'] = vertex_xs
test.loc[:, 'vertex_y'] = vertex_ys
test.loc[:, 'bounding_confidence'] = bounding_confidences
test.loc[:, 'bounding_importance'] = bounding_importance_fracs
test.loc[:, 'dominant_blue'] = dominant_blues
test.loc[:, 'dominant_green'] = dominant_greens
test.loc[:, 'dominant_red'] = dominant_reds
test.loc[:, 'dominant_pixel_frac'] = dominant_pixel_fracs
test.loc[:, 'dominant_score'] = dominant_scores
test.loc[:, 'label_description'] = label_descriptions
test.loc[:, 'label_score'] = label_scores

In [50]:
train["vertex_x_ratio"] = train.vertex_x / train.img_x
train["vertex_y_ratio"] = train.vertex_y / train.img_y

test["vertex_x_ratio"] = test.vertex_x / test.img_x
test["vertex_y_ratio"] = test.vertex_y / test.img_y

In [51]:
# name length

train.Name = train.Name.fillna('')
test.Name = test.Name.fillna('')
train["Name"] = train.Name.apply(lambda x: str(x).lower())
test["Name"] = test.Name.apply(lambda x: str(x).lower())

train["name_length"] = train.Name.apply(lambda x: len(str(x)))
test["name_length"] = test.Name.apply(lambda x: len(str(x)))

# no name or not
# train['No_name'] = 0
# train.loc[train.name_length == 0, 'No_name'] = 1
# train.loc[train.Name == 'Unnamed', 'No_name'] = 1
# train.loc[train.Name == 'No Name', 'No_name'] = 1
# train.loc[train.Name == 'No Name Yet', 'No_name'] = 1

# test['No_name'] = 0
# test.loc[test.name_length == 0, 'No_name'] = 1
# test.loc[test.Name == 'Unnamed', 'No_name'] = 1
# test.loc[test.Name == 'No Name', 'No_name'] = 1
# test.loc[test.Name == 'No Name Yet', 'No_name'] = 1

In [52]:
all_data = pd.concat((train, test))

name_idx, name_val = all_data.Name.value_counts().index, all_data.Name.value_counts().values
name_map = dict()
for idx, val in zip(name_idx, name_val):
    name_map.update({idx: val})

train["name_cnt"] = train.Name.map(name_map)
test["name_cnt"] = test.Name.map(name_map)

In [53]:
# description length and words

train['Description'] = train['Description'].fillna('')
test['Description'] = test['Description'].fillna('')

train['desc_length'] = train['Description'].apply(lambda x: len(x))
train['desc_words'] = train['Description'].apply(lambda x: len(x.split()))

test['desc_length'] = test['Description'].apply(lambda x: len(x))
test['desc_words'] = test['Description'].apply(lambda x: len(x.split()))

In [54]:
# description lexical density

puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '^']

def lexical_density(x):
    for punct in puncts:
        x = x.replace(punct, "")
    li = x.split(" ")
    return len(set(li)) / len(li) if len(li) != 0 else 0

train["desc_lexical_density"] = train.Description.apply(lambda x: lexical_density(x))
test["desc_lexical_density"] = test.Description.apply(lambda x: lexical_density(x))

In [55]:
def sentences_count(x):
    return len(re.split(r'[.!?]+', x))

train["sentences_count"] = train.Description.apply(lambda x: sentences_count(x))
test["sentences_count"] = test.Description.apply(lambda x: sentences_count(x))

In [56]:
# description capitals count

def find_capitals(x):
    return len(re.findall('[A-Z]', x))

train["desc_capitals"] = train.Description.apply(lambda x: find_capitals(x))
test["desc_capitals"] = test.Description.apply(lambda x: find_capitals(x))

In [57]:
# number of same rescuer, a very important feature, +0.026 in LB

rescuer_idx, rescuer_val = all_data.RescuerID.value_counts().index, all_data.RescuerID.value_counts().values
rescuer_map = dict()
for idx, val in zip(rescuer_idx, rescuer_val):
    rescuer_map.update({idx: val})

train["rescuer_cnt"] = train.RescuerID.map(rescuer_map)
test["rescuer_cnt"] = test.RescuerID.map(rescuer_map)

In [58]:
# state GDP: https://en.wikipedia.org/wiki/List_of_Malaysian_states_by_GDP
state_gdp = {
    41336: 116.679,
    41325: 40.596,
    41367: 23.02,
    41401: 190.075,
    41415: 5.984,
    41324: 37.274,
    41332: 42.389,
    41335: 52.452,
    41330: 67.629,
    41380: 5.642,
    41327: 81.284,
    41345: 80.167,
    41342: 121.414,
    41326: 280.698,
    41361: 32.270
}

# state population: https://zh.wikipedia.org/wiki/%E9%A9%AC%E6%9D%A5%E8%A5%BF%E4%BA%9A
state_population = {
    41336: 33.48283,
    41325: 19.47651,
    41367: 15.39601,
    41401: 16.74621,
    41415: 0.86908,
    41324: 8.21110,
    41332: 10.21064,
    41335: 15.00817,
    41330: 23.52743,
    41380: 2.31541,
    41327: 15.61383,
    41345: 32.06742,
    41342: 24.71140,
    41326: 54.62141,
    41361: 10.35977
}

# state area
state_area = {
    41336: 19.210,
    41325: 9.500,
    41367: 15.099,
    41401: 0.243,
    41415: 0.091,
    41324: 1.664,
    41332: 6.686,
    41335: 36.137,
    41330: 21.035,
    41380: 2.31541,
    41327: 0.821,
    41345: 73.631,
    41342: 124.450,
    41326: 8.104,
    41361: 13.035
}

train["state_gdp"] = train.State.map(state_gdp)
train["state_population"] = train.State.map(state_population)
train["state_area"] = train.State.map(state_area)
test["state_gdp"] = test.State.map(state_gdp)
test["state_population"] = test.State.map(state_population)
test["state_area"] = test.State.map(state_area)

In [59]:
# Pure breed or not, seem not so important, but +0.010 in LB
# {"Domestic Long Hair": 264, "Domestic Medium Hair": 265, "Domestic Short Hair": 266, "Mixed Breed": 307}

train['Pure_breed'] = 1
train.loc[train['Breed2'] != 0, 'Pure_breed'] = 0
train.loc[train['Breed1'] == 264, 'Pure_breed'] = 0
train.loc[train['Breed1'] == 265, 'Pure_breed'] = 0
train.loc[train['Breed1'] == 266, 'Pure_breed'] = 0
train.loc[train['Breed1'] == 307, 'Pure_breed'] = 0

test['Pure_breed'] = 1
test.loc[test['Breed2'] != 0, 'Pure_breed'] = 0
test.loc[test['Breed1'] == 264, 'Pure_breed'] = 0
test.loc[test['Breed1'] == 265, 'Pure_breed'] = 0
test.loc[test['Breed1'] == 266, 'Pure_breed'] = 0
test.loc[test['Breed1'] == 307, 'Pure_breed'] = 0

In [60]:
# drop some not so impantance features

train.drop(['vertex_x', 'vertex_y', 'bounding_confidence'], axis=1, inplace=True)
test.drop(['vertex_x', 'vertex_y', 'bounding_confidence'], axis=1, inplace=True)

In [61]:
n_img_features = 128

img_size = 256
batch_size = 16

inp = Input((img_size, img_size, 3))
backbone = DenseNet121(input_tensor=inp, 
                       weights="../input/densenet-keras/DenseNet-BC-121-32-no-top.h5",
                       include_top = False)
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Lambda(lambda x: K.expand_dims(x,axis = -1))(x)
x = AveragePooling1D(1024//n_img_features)(x)
out = Lambda(lambda x: x[:,:,0])(x)

m = Model(inp,out)

In [62]:
pet_ids = train_id.values
n_batches = len(pet_ids) // batch_size + 1

features = {}
for b in tqdm_notebook(range(n_batches)):
    start = b*batch_size
    end = (b+1)*batch_size
    batch_pets = pet_ids[start:end]
    batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
    for i,pet_id in enumerate(batch_pets):
        try:
            batch_images[i] = load_image("../input/petfinder-adoption-prediction/train_images/", pet_id)
        except:
            pass
    batch_preds = m.predict(batch_images)
    for i,pet_id in enumerate(batch_pets):
        features[pet_id] = batch_preds[i]
        
train_feats = pd.DataFrame.from_dict(features, orient='index')
# train_feats.to_csv('train_img_features.csv')
train_feats.head()

HBox(children=(IntProgress(value=0, max=938), HTML(value='')))




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127
86e1089a3,0.080903,0.02707,0.029983,0.005923,0.054676,0.047727,0.005912,0.022859,0.009381,0.002341,0.036399,0.238115,0.059121,0.286911,0.043492,0.176318,0.092814,0.177671,0.02611,0.439185,0.040107,0.07231,0.011463,0.009991,0.056529,0.038468,0.117934,0.001337,0.016928,0.190724,0.022009,0.004581,0.196088,0.00271,0.018651,0.036641,0.006995,0.072241,0.195218,0.017664,0.003598,0.034458,0.003159,0.045706,0.096832,0.009699,0.014707,0.275425,0.036237,0.006158,0.005701,0.071875,0.004418,0.229531,0.286014,0.047121,0.206055,0.103146,0.102677,0.095363,0.005054,0.002239,0.022126,0.487992,0.579151,0.303814,0.589608,0.413121,0.665153,1.262865,1.033445,0.407925,0.506853,0.366394,0.77876,0.585112,0.547824,0.357888,0.663029,0.347627,0.190081,0.616218,0.964143,0.867626,1.193148,0.28265,0.673261,0.719893,0.329257,0.914918,1.200007,0.694911,0.519954,0.587572,1.510033,0.250937,0.611939,1.002798,0.356929,0.50864,0.922428,0.893496,0.502684,1.394513,0.476291,0.537682,0.648765,0.446755,0.938161,0.590351,1.045655,0.866024,0.504175,0.698822,0.438378,0.775556,1.428558,0.625741,0.920458,0.335475,1.251274,0.513746,0.790772,0.554214,0.699005,0.563984,1.304191,0.730227
6296e909a,0.036908,0.025865,0.093256,0.007122,0.084101,0.077085,0.006319,0.015811,0.005474,0.001847,0.068952,0.130923,0.280532,0.148266,0.035186,0.065542,0.153735,0.237636,0.0151,0.226996,0.061644,0.143224,0.013348,0.003795,0.061625,0.053013,0.228513,0.001519,0.050926,0.288104,0.02673,0.003126,0.11946,0.013254,0.010652,0.044845,0.054899,0.151147,0.230314,0.061158,0.005482,0.04141,0.002876,0.023397,0.180639,0.012006,0.02244,0.1839,0.06037,0.005367,0.005765,0.11348,0.00244,0.181085,0.185797,0.011512,0.158204,0.104252,0.275904,0.108166,0.007445,0.002295,0.01151,0.178896,0.299209,0.427445,0.431121,0.455004,0.391252,0.978274,0.606587,0.892098,0.711356,0.418251,1.458445,0.750638,0.309308,0.688439,0.349693,0.644987,0.587616,0.431601,1.036617,0.482585,0.983732,0.381916,0.65724,1.05952,0.484432,0.633328,1.057919,0.795388,0.677788,0.822105,0.617686,0.278425,0.999062,0.974949,0.781161,0.594618,1.310527,0.635394,1.679605,0.785813,0.977226,1.197514,1.132065,0.618098,0.709188,1.259585,1.110054,0.752975,0.580067,0.645088,0.660442,0.824307,0.784991,0.873006,0.497908,0.634934,1.31137,0.563046,1.040382,0.619333,0.711116,1.271373,0.702722,0.811153
3422e4906,0.031333,0.019316,0.063033,0.003909,0.062636,0.052466,0.005522,0.008873,0.013176,0.002032,0.056556,0.227165,0.036671,0.269196,0.062176,0.295367,0.062095,0.224414,0.027769,0.464024,0.037842,0.053576,0.011953,0.006924,0.076405,0.02624,0.069317,0.001182,0.014946,0.236115,0.017828,0.002726,0.171526,0.014121,0.005398,0.042197,0.015808,0.059264,0.15836,0.01755,0.003582,0.088531,0.0019,0.025637,0.08127,0.007263,0.019419,0.275068,0.067482,0.004163,0.007121,0.100102,0.002837,0.231722,0.423444,0.072245,0.067363,0.230224,0.085733,0.132242,0.00748,0.001563,0.023581,0.365315,0.624939,0.407349,0.448166,0.405758,0.579463,0.776017,0.381641,0.596929,0.285997,0.493722,0.474389,0.363883,0.70121,0.701126,0.563001,0.549886,0.276724,0.712891,0.885036,0.44272,0.382157,0.699339,0.186581,1.105836,0.556738,0.366696,0.564532,0.482095,0.922556,0.718453,1.40834,0.820676,0.859361,0.906259,0.762644,0.659158,0.916725,0.640755,1.181538,0.9801,0.628372,0.708757,0.4793,0.785339,0.940515,0.845812,0.741837,0.427259,0.313643,0.499163,0.944294,0.627926,0.576165,0.618234,0.800512,0.794122,0.807578,0.729883,0.764788,0.762196,1.056118,1.733463,1.272634,0.690354
5842f1ff5,0.100765,0.058779,0.044435,0.005573,0.056934,0.152086,0.003599,0.01309,0.005545,0.003043,0.040706,0.115147,0.086512,0.187995,0.042647,0.095363,0.049536,0.226077,0.023286,0.148373,0.074957,0.063764,0.010141,0.00741,0.095906,0.055115,0.100356,0.001387,0.056281,0.233078,0.021738,0.003783,0.222727,0.005912,0.019378,0.048208,0.012948,0.073872,0.165543,0.037689,0.004991,0.164023,0.002971,0.063428,0.166494,0.010949,0.021875,0.301821,0.103855,0.005201,0.008208,0.030184,0.003318,0.178647,0.227816,0.026276,0.091511,0.180973,0.150826,0.045812,0.007572,0.003175,0.012983,0.395622,0.475078,0.219794,0.351564,0.441491,0.621629,0.890861,1.104187,0.485312,0.330086,0.737265,1.014804,0.301714,0.498786,0.319726,0.822718,0.683046,0.376709,0.970635,0.65751,1.164067,0.545164,0.525729,0.44182,0.673608,0.84466,1.238436,1.106471,0.526544,0.732316,0.829567,0.576851,0.31388,0.933374,1.216672,0.596536,0.606384,1.112674,1.313025,0.747136,0.93562,1.232728,0.72956,1.369965,0.744446,1.55971,0.504133,0.995418,0.763051,0.530875,1.203119,0.489429,0.598452,0.84759,1.254122,0.853125,1.111104,1.204281,1.003967,0.821134,1.081506,0.771984,1.580613,0.802881,1.027818
850a43f90,0.064575,0.056714,0.129328,0.004659,0.060749,0.198181,0.004649,0.013841,0.009161,0.002527,0.055383,0.227129,0.013477,0.325656,0.083217,0.19412,0.034585,0.387116,0.026539,0.223599,0.112257,0.077782,0.0141,0.010001,0.047016,0.05168,0.205966,0.001437,0.007965,0.362459,0.017941,0.003566,0.290657,0.005715,0.024016,0.065434,0.100374,0.086999,0.149593,0.034557,0.004126,0.138259,0.002639,0.056932,0.060603,0.010333,0.022251,0.333058,0.090267,0.006838,0.00762,0.072204,0.002903,0.277812,0.309486,0.058816,0.074558,0.461126,0.13931,0.048256,0.008033,0.003616,0.019297,0.100398,0.391858,0.327033,0.485347,0.373845,0.600073,0.851016,0.849287,0.48698,0.261849,0.450096,0.922868,0.325289,0.387753,0.192341,0.404544,0.446913,0.302947,0.880942,0.725538,0.797994,0.58942,0.807926,0.270234,1.186765,0.402791,0.398661,0.590872,0.441502,0.662898,0.418006,0.918066,0.82024,0.637656,0.677504,0.509595,1.009955,0.681656,0.998213,0.545514,0.919542,0.798344,0.74978,0.794771,0.212994,1.172289,0.363446,0.811347,0.456034,0.244271,0.801077,0.873605,0.665535,0.960412,0.568228,0.766188,0.464421,0.70381,0.798323,0.535618,0.675487,0.57939,0.648065,0.813327,0.385569


In [63]:
pet_ids = test_id.values
n_batches = len(pet_ids) // batch_size + 1

features = {}
for b in tqdm_notebook(range(n_batches)):
    start = b*batch_size
    end = (b+1)*batch_size
    batch_pets = pet_ids[start:end]
    batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
    for i,pet_id in enumerate(batch_pets):
        try:
            batch_images[i] = load_image("../input/petfinder-adoption-prediction/test_images/", pet_id)
        except:
            pass
    batch_preds = m.predict(batch_images)
    for i,pet_id in enumerate(batch_pets):
        features[pet_id] = batch_preds[i]
        
test_feats = pd.DataFrame.from_dict(features, orient='index')
# test_feats.to_csv('test_img_features.csv')
test_feats.head()

HBox(children=(IntProgress(value=0, max=247), HTML(value='')))




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127
378fcc4fc,0.046349,0.02944,0.044969,0.00582,0.043166,0.084996,0.006374,0.012181,0.012883,0.002154,0.049012,0.113741,0.166774,0.214079,0.117198,0.170769,0.083287,0.369204,0.023654,0.210734,0.075803,0.077567,0.008761,0.006844,0.064534,0.044727,0.163834,0.001316,0.009899,0.404045,0.032993,0.004249,0.16417,0.006376,0.011148,0.047406,0.049476,0.117032,0.199826,0.057105,0.003518,0.08079,0.00262,0.041794,0.098319,0.00909,0.021795,0.31534,0.063945,0.00473,0.00692,0.070421,0.003485,0.202767,0.226186,0.036033,0.109722,0.17698,0.103157,0.074406,0.00451,0.003517,0.017212,0.316111,0.632204,0.34913,0.661242,0.224366,0.411741,1.005475,0.777165,0.706809,0.366492,0.469502,0.788777,0.402848,0.746356,0.456743,0.512674,0.581825,0.606519,0.716501,0.830395,0.817831,0.843343,0.853221,0.477257,1.366606,0.367085,0.759267,0.444994,0.368155,0.84821,0.475449,1.418633,0.340132,0.615128,0.869077,0.535458,0.781738,1.459623,1.05389,1.426533,0.935947,1.259884,0.764008,0.798453,0.469986,1.37681,0.760331,0.792629,0.538545,0.475917,0.986913,0.783445,0.452399,0.940398,1.184496,0.706118,0.780622,1.099451,1.113779,0.588675,1.029418,0.421483,0.68935,0.818461,0.732772
73c10e136,0.045589,0.052937,0.03565,0.00598,0.055054,0.1055,0.006685,0.019189,0.009906,0.003125,0.032303,0.137175,0.037631,0.233158,0.064486,0.159466,0.067773,0.457415,0.028972,0.203293,0.074134,0.066226,0.00993,0.007108,0.085333,0.056994,0.159706,0.001488,0.013014,0.254318,0.030654,0.004489,0.254228,0.007231,0.015752,0.055318,0.038069,0.055355,0.145451,0.055705,0.004485,0.100308,0.002933,0.036124,0.063251,0.012044,0.02293,0.305344,0.100218,0.005209,0.007967,0.063378,0.002757,0.171008,0.277723,0.030621,0.093463,0.151823,0.105426,0.069669,0.006313,0.004545,0.010563,0.315773,0.334602,0.196531,0.458193,0.450918,0.526053,1.278021,0.366633,0.501924,0.526905,0.626809,0.533934,0.399937,0.502697,0.60458,0.302201,0.546833,0.522286,0.368751,1.452713,0.502576,0.687203,0.510752,0.608004,0.632271,0.458742,0.499368,0.964196,0.413989,0.366097,0.554588,1.122979,0.289271,0.597558,0.767396,0.567055,0.346644,0.991542,0.165603,0.715267,0.490068,0.77378,0.434031,0.289216,0.552551,0.915964,0.600612,0.985994,0.306613,0.487538,0.658591,0.379439,0.309266,1.170504,0.392955,0.62161,0.259793,0.914024,0.44533,0.630293,0.640174,0.816106,0.351082,0.833028,0.622346
72000c4c5,0.037502,0.050106,0.034061,0.006931,0.038617,0.052741,0.00786,0.022462,0.017758,0.00318,0.02561,0.156507,0.098218,0.228299,0.040125,0.112434,0.078324,0.399952,0.043204,0.283079,0.060671,0.070839,0.011359,0.010377,0.105774,0.059309,0.150404,0.001649,0.040381,0.278929,0.028854,0.004602,0.274328,0.012994,0.024581,0.047417,0.062418,0.052685,0.226046,0.056199,0.004751,0.102997,0.004224,0.032885,0.049734,0.01241,0.023451,0.276831,0.081434,0.003758,0.006517,0.064746,0.002303,0.171113,0.287124,0.031599,0.158322,0.082809,0.041491,0.067253,0.003629,0.005206,0.023631,0.343334,0.338089,0.324906,0.438208,0.818757,0.47335,1.204175,0.493931,0.496887,0.577731,0.564775,0.721198,0.535011,0.632743,0.652219,0.40981,0.374663,0.378863,0.471195,0.708706,0.767574,0.693229,0.433185,0.296737,0.386405,0.820593,1.000587,0.926746,0.941541,0.312288,0.711103,1.291658,0.45085,0.525968,1.132253,0.558267,0.407827,1.638329,0.571406,0.462825,0.586985,0.715839,0.458237,0.71874,0.55787,0.824893,0.929462,1.217274,0.638145,0.302924,1.059461,0.219597,0.478852,0.848999,0.864279,0.367135,0.854358,0.94337,0.488411,0.724523,0.395695,1.487489,1.181651,0.963292,1.570807
e147a4b9f,0.071273,0.055146,0.063173,0.007101,0.089654,0.157869,0.00646,0.017983,0.007363,0.003201,0.055125,0.182252,0.100993,0.206117,0.077656,0.169949,0.065301,0.297373,0.02992,0.242749,0.110174,0.120874,0.011839,0.009682,0.029369,0.03627,0.238934,0.001332,0.043875,0.326319,0.029494,0.006165,0.217003,0.012426,0.012992,0.065112,0.037583,0.157354,0.189831,0.035991,0.00414,0.107428,0.00369,0.039592,0.13687,0.010182,0.027896,0.392095,0.132668,0.00987,0.00692,0.126272,0.003746,0.292218,0.317352,0.030506,0.098246,0.200996,0.129977,0.112856,0.007267,0.004949,0.018493,0.138991,0.375109,0.300629,0.322883,0.274114,0.569864,0.987781,0.475167,1.095235,0.567781,0.587673,0.512507,0.345763,0.372966,0.688056,0.495267,0.589257,0.909955,0.999841,1.401905,1.12988,0.711181,0.889036,0.675256,0.744008,1.220147,0.89132,0.790061,0.759829,0.879403,1.166887,1.093494,0.985155,1.375417,1.001402,0.876193,1.793556,1.106975,1.083176,1.340941,1.159961,0.85935,1.782988,1.664184,1.163683,1.246102,0.609332,1.405037,1.099618,0.649382,1.225957,0.930254,0.759043,1.003686,1.041095,1.086146,1.181188,1.385663,0.630985,0.680483,0.379867,1.016909,1.292094,0.996778,1.078207
43fbba852,0.063748,0.035317,0.036975,0.003873,0.042042,0.063653,0.006059,0.013797,0.006523,0.002535,0.047355,0.250222,0.007864,0.186313,0.025167,0.313727,0.040675,0.352359,0.036918,0.536727,0.099861,0.047764,0.01252,0.006457,0.073195,0.030105,0.142776,0.001311,0.009923,0.247909,0.013985,0.003276,0.1318,0.005999,0.015766,0.052328,0.026891,0.067437,0.139621,0.015296,0.003765,0.095621,0.002168,0.044547,0.067574,0.00902,0.018976,0.331504,0.06693,0.008027,0.007206,0.03298,0.002943,0.253995,0.423106,0.071004,0.093567,0.418462,0.115994,0.055149,0.00642,0.002672,0.020206,0.229727,0.497733,0.224861,0.253585,0.460645,0.886743,0.863309,0.668654,0.564627,0.25353,0.515981,0.649569,0.216576,0.545469,0.586237,0.660649,0.428468,0.42928,0.624565,0.897288,0.807497,0.44955,0.381019,0.655371,0.621315,0.862708,1.067131,0.896304,0.514717,0.800848,1.439869,0.988998,0.520423,1.207341,1.213966,1.153998,0.195709,0.59115,0.469768,0.756539,0.986116,0.545458,0.462715,0.945431,0.690648,1.044523,0.706745,0.641306,0.709894,0.647253,0.430361,0.732715,0.409872,0.941131,0.596771,1.157225,0.774777,0.958384,0.607934,0.517225,0.730382,0.709576,1.821009,1.047257,0.976913


In [64]:
train_feats.columns = ["img_feat{}".format(i) for i in range(n_img_features)]
test_feats.columns = ["img_feat{}".format(i) for i in range(n_img_features)]

train_feats["PetID"] = train_feats.index
test_feats["PetID"] = test_feats.index

train = pd.merge(train, train_feats, on="PetID")
test = pd.merge(test, test_feats, on="PetID")

print(train.shape, test.shape)

(14993, 327) (3948, 326)


In [65]:
train.drop(['AdoptionSpeed', 'PetID'], axis=1, inplace=True)
test.drop(['PetID'], axis=1, inplace=True)
train['ResNet_meta'] = train_img_prob.flatten()         #ImageMeta
test['ResNet_meta'] = test_img_prob.flatten()           #ImageMeta

## SAVE !!!

In [66]:
train.to_csv("train_zyl.csv", index=False)
test.to_csv("test_zyl.csv", index=False)

## 这里留给 LR ETC 等

In [67]:
# 占位用

## LGB

In [68]:
train.drop(['Name', 'RescuerID', 'Description'], axis=1, inplace=True)
test.drop(['Name', 'RescuerID', 'Description'], axis=1, inplace=True)

# rearrange columns again
c = ['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'VideoAmt', 'PhotoAmt', 'Color'] +  ["img_feat{}".format(i) for i in range(n_img_features)] + ['doc_sent_mag', 'doc_sent_score'] + ['svd_{}'.format(i) for i in range(n_components)] + ['img_x', 'img_y', 'bounding_importance', 'dominant_blue', 'dominant_green', 'dominant_red', 'dominant_pixel_frac', 'dominant_score','label_description', 'label_score', 'vertex_x_ratio', 'vertex_y_ratio', 'name_length', 'name_cnt', 'desc_length', 'desc_words', 'desc_lexical_density', 'sentences_count', 'desc_capitals', 'rescuer_cnt', 'state_gdp', 'state_population', 'Pure_breed']
train = train[c]
test = test[c]

numeric_cols = ['Age', 'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt', 'doc_sent_mag', 'doc_sent_score', 'dominant_score', 'dominant_pixel_frac', 'dominant_red', 'dominant_green', 'dominant_blue', 'bounding_importance', 'img_x', 'img_y', 'vertex_x_ratio', 'vertex_y_ratio', 'label_score', 'desc_length', 'desc_words', 'desc_lexical_density', 'sentences_count', 'desc_capitals', 'rescuer_cnt', 'state_gdp', 'state_population', 'Pure_breed', 'name_length', 'name_cnt'] + ['svd_{}'.format(i) for i in range(n_components)] + ["img_feat{}".format(i) for i in range(n_img_features)]
cat_cols = list(set(train.columns) - set(numeric_cols))

train.loc[:, cat_cols] = train[cat_cols].astype('category')
test.loc[:, cat_cols] = test[cat_cols].astype('category')

foo = train.dtypes
cat_feature_names = foo[foo == "category"]
cat_features = [train.columns.get_loc(c) for c in train.columns if c in cat_feature_names]

In [69]:
del run_cv_model
gc.collect()

def run_cv_model(train, test, target, model_fn, params={}, eval_fn=None, label='model'):
    kf = FOLDS
    n_splits = N_FOLDS
    
    fold_splits = kf.split(train, target)
    cv_scores = []
    qwk_scores = []
    pred_full_test = 0
    pred_train = np.zeros((train.shape[0], n_splits))
    all_coefficients = np.zeros((n_splits, 4))
    feature_importance_df = pd.DataFrame()
    i = 1
    for dev_index, val_index in fold_splits:
        print('Started ' + label + ' fold ' + str(i) + '/' + str(n_splits))
        if isinstance(train, pd.DataFrame):
            dev_X, val_X = train.iloc[dev_index], train.iloc[val_index]
            dev_y, val_y = target[dev_index], target[val_index]
        else:
            dev_X, val_X = train[dev_index], train[val_index]
            dev_y, val_y = target[dev_index], target[val_index]
        params2 = params.copy()
        pred_val_y, pred_test_y, importances, coefficients, qwk = model_fn(dev_X, dev_y, val_X, val_y, test, params2)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        all_coefficients[i-1, :] = coefficients
        if eval_fn is not None:
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(cv_score)
            qwk_scores.append(qwk)
            print(label + ' cv score {}: RMSE {} QWK {}'.format(i, cv_score, qwk))
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = train.columns.values
        fold_importance_df['importance'] = importances
        fold_importance_df['fold'] = i
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        i += 1
    print('{} cv RMSE scores : {}'.format(label, cv_scores))
    print('{} cv mean RMSE score : {}'.format(label, np.mean(cv_scores)))
    print('{} cv std RMSE score : {}'.format(label, np.mean(cv_scores)))
    print('{} cv QWK scores : {}'.format(label, qwk_scores))
    print('{} cv mean QWK score : {}'.format(label, np.mean(qwk_scores)))
    print('{} cv std QWK score : {}'.format(label, np.std(qwk_scores)))
    pred_full_test = pred_full_test / float(n_splits)
    results = {'label': label,
               'train': pred_train, 'test': pred_full_test,
                'cv': cv_scores, 'qwk': qwk_scores,
               'importance': feature_importance_df,
               'coefficients': all_coefficients}
    return results

In [70]:
del runLGB
gc.collect()

def runLGB(train_X, train_y, test_X, test_y, test_X2, params):
    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(test_X, label=test_y)
    watchlist = [d_train, d_valid]
    print('Train LGB')
    try:
        num_rounds = params.pop('num_rounds')
    except:
        pass
    verbose_eval = params.pop('verbose_eval')
    early_stop = None
    if params.get('early_stop'):
        early_stop = params.pop('early_stop')
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=10000,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval,
#                       categorical_feature=list(cat_features),
                      callbacks=[lgb.reset_parameter(learning_rate=[0.005]*1000+[0.003]*1000+[0.001]*8000)],
                      early_stopping_rounds=early_stop)

    print('Predict 1/2')
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    init_coef = get_init_coefs(pred_test_y, test_y)
    optR = OptimizedRounder_v2(initial_coefs=init_coef)
    optR.fit(pred_test_y, test_y)
    coefficients = optR.coefficients()
    pred_test_y_k = optR.predict(pred_test_y, coefficients)
    chi2 = get_chi2(pred_test_y_k, test_y)
    print("Valid Counts = {}".format(Counter(test_y)))
    print("Predicted Counts = {}".format(Counter(pred_test_y_k)))
    print("Coefficients = {}".format(coefficients))
    print("Chi2 = {}".format(chi2))
    qwk = quadratic_weighted_kappa(test_y, pred_test_y_k)
    print("QWK = {}".format(qwk))
    print('Predict 2/2')
    pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)
    return pred_test_y.reshape(-1, 1), pred_test_y2.reshape(-1, 1), model.feature_importance(), coefficients, qwk

In [71]:
param = {'application': 'regression',
         'boosting': 'gbdt', 
         'metric': 'rmse', 
         'num_leaves': 149, 
         'max_depth': 11, 
         'max_bin': 37, 
         'bagging_fraction': 0.975419815153193, 
         'bagging_freq': 1, 
         'feature_fraction': 0.2705570927694394, 
         'min_split_gain': 0.7636472013417633, 
         'min_child_samples': 29, 
         'min_child_weight': 0.13126728393897313, 
         'lambda_l2': 0.841358003322472, 
         'verbosity': -1, 
         'data_random_seed': 1029, 
         'early_stop': 100, 
         'verbose_eval': 2000, 
         'num_rounds': 10000}

In [72]:
lgb_zyl = run_cv_model(train, test, target, runLGB, param, rmse, 'lgb')

Started lgb fold 1/4
Train LGB
Training until validation scores don't improve for 100 rounds.
[2000]	training's rmse: 0.479201	valid_1's rmse: 1.03139
[4000]	training's rmse: 0.468505	valid_1's rmse: 1.02951
[6000]	training's rmse: 0.464839	valid_1's rmse: 1.02874
[8000]	training's rmse: 0.463065	valid_1's rmse: 1.02837
Early stopping, best iteration is:
[8430]	training's rmse: 0.462817	valid_1's rmse: 1.02833
Predict 1/2
Valid Counts = Counter({4: 1050, 2: 1010, 3: 815, 1: 773, 0: 103})
Predicted Counts = Counter({4: 1050, 2: 967, 1: 835, 3: 805, 0: 94})
Coefficients = [1.6580913  2.1756028  2.49596728 2.79969562]
Chi2 = 7.712633340523446
QWK = 0.47771905184309094
Predict 2/2
lgb cv score 1: RMSE 1.0283263588248486 QWK 0.47771905184309094
Started lgb fold 2/4
Train LGB
Training until validation scores don't improve for 100 rounds.
[2000]	training's rmse: 0.477637	valid_1's rmse: 1.02442
[4000]	training's rmse: 0.467012	valid_1's rmse: 1.02288
[6000]	training's rmse: 0.463164	valid_1's

# NN

In [73]:
# 先占位

# 453

https://www.kaggle.com/ranjoranjan/single-xgboost-model

In [74]:
# 重新导入, 一了百了

del train, test
gc.collect()

train = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv')
test = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')

In [75]:
#TODO: 这里可以避免重复提取的, 后面要记得优化!!!

inp = Input((256,256,3))
backbone = DenseNet121(input_tensor = inp, 
                       weights="../input/densenet-keras/DenseNet-BC-121-32-no-top.h5",
                       include_top = False)
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Lambda(lambda x: K.expand_dims(x,axis = -1))(x)
x = AveragePooling1D(4)(x)
out = Lambda(lambda x: x[:,:,0])(x)

m = Model(inp,out)

pet_ids = train['PetID'].values
n_batches = len(pet_ids) // batch_size + 1

features = {}
for b in tqdm(range(n_batches)):
    start = b*batch_size
    end = (b+1)*batch_size
    batch_pets = pet_ids[start:end]
    batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
    for i,pet_id in enumerate(batch_pets):
        try:
            batch_images[i] = load_image("../input/petfinder-adoption-prediction/train_images/", pet_id)
        except:
            pass
    batch_preds = m.predict(batch_images)
    for i,pet_id in enumerate(batch_pets):
        features[pet_id] = batch_preds[i]
        
train_feats = pd.DataFrame.from_dict(features, orient='index')
train_feats.columns = [f'pic_{i}' for i in range(train_feats.shape[1])]

pet_ids = test['PetID'].values
n_batches = len(pet_ids) // batch_size + 1

features = {}
for b in tqdm(range(n_batches)):
    start = b*batch_size
    end = (b+1)*batch_size
    batch_pets = pet_ids[start:end]
    batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
    for i,pet_id in enumerate(batch_pets):
        try:
            batch_images[i] = load_image("../input/petfinder-adoption-prediction/test_images/", pet_id)
        except:
            pass
    batch_preds = m.predict(batch_images)
    for i,pet_id in enumerate(batch_pets):
        features[pet_id] = batch_preds[i]
        
test_feats = pd.DataFrame.from_dict(features, orient='index')
test_feats.columns = [f'pic_{i}' for i in range(test_feats.shape[1])]

100%|██████████| 938/938 [02:24<00:00,  6.51it/s]
100%|██████████| 247/247 [00:34<00:00,  7.72it/s]


In [76]:
train_feats = train_feats.reset_index()
train_feats.rename({'index': 'PetID'}, axis='columns', inplace=True)

test_feats = test_feats.reset_index()
test_feats.rename({'index': 'PetID'}, axis='columns', inplace=True)

In [77]:
all_ids = pd.concat([train, test], axis=0, ignore_index=True, sort=False)[['PetID']]
all_ids.shape

(18941, 1)

In [78]:
n_components = 32
svd_ = TruncatedSVD(n_components=n_components, random_state=1337)

features_df = pd.concat([train_feats, test_feats], axis=0)
features = features_df[[f'pic_{i}' for i in range(256)]].values

svd_col = svd_.fit_transform(features)
svd_col = pd.DataFrame(svd_col)
svd_col = svd_col.add_prefix('IMG_SVD_')

img_features = pd.concat([all_ids, svd_col], axis=1)

In [79]:
labels_breed = pd.read_csv('../input/petfinder-adoption-prediction/breed_labels.csv')

In [80]:
train_image_files = sorted(glob.glob('../input/petfinder-adoption-prediction/train_images/*.jpg'))
train_metadata_files = sorted(glob.glob('../input/petfinder-adoption-prediction/train_metadata/*.json'))
train_sentiment_files = sorted(glob.glob('../input/petfinder-adoption-prediction/train_sentiment/*.json'))

test_image_files = sorted(glob.glob('../input/petfinder-adoption-prediction/test_images/*.jpg'))
test_metadata_files = sorted(glob.glob('../input/petfinder-adoption-prediction/test_metadata/*.json'))
test_sentiment_files = sorted(glob.glob('../input/petfinder-adoption-prediction/test_sentiment/*.json'))

In [81]:
split_char = '/'

In [82]:
train_df_ids = train[['PetID']]

train_df_ids = train[['PetID']]
train_df_metadata = pd.DataFrame(train_metadata_files)
train_df_metadata.columns = ['metadata_filename']
train_metadata_pets = train_df_metadata['metadata_filename'].apply(lambda x: x.split(split_char)[-1].split('-')[0])
train_df_metadata = train_df_metadata.assign(PetID=train_metadata_pets)

train_df_ids = train[['PetID']]
train_df_sentiment = pd.DataFrame(train_sentiment_files)
train_df_sentiment.columns = ['sentiment_filename']
train_sentiment_pets = train_df_sentiment['sentiment_filename'].apply(lambda x: x.split(split_char)[-1].split('.')[0])
train_df_sentiment = train_df_sentiment.assign(PetID=train_sentiment_pets)

In [83]:
test_df_ids = test[['PetID']]

test_df_metadata = pd.DataFrame(test_metadata_files)
test_df_metadata.columns = ['metadata_filename']
test_metadata_pets = test_df_metadata['metadata_filename'].apply(lambda x: x.split(split_char)[-1].split('-')[0])
test_df_metadata = test_df_metadata.assign(PetID=test_metadata_pets)

test_df_sentiment = pd.DataFrame(test_sentiment_files)
test_df_sentiment.columns = ['sentiment_filename']
test_sentiment_pets = test_df_sentiment['sentiment_filename'].apply(lambda x: x.split(split_char)[-1].split('.')[0])
test_df_sentiment = test_df_sentiment.assign(PetID=test_sentiment_pets)

In [84]:
class PetFinderParser(object):
    
    def __init__(self, debug=False):        
        self.debug = debug
        self.sentence_sep = ' '        
        self.extract_sentiment_text = False
    
    def open_json_file(self, filename):
        with open(filename, 'r', encoding='utf-8') as f:
            json_file = json.load(f)
        return json_file
        
    def parse_sentiment_file(self, file):
        file_sentiment = file['documentSentiment']
        file_entities = [x['name'] for x in file['entities']]
        file_entities = self.sentence_sep.join(file_entities)       
        file_sentences_sentiment = [x['sentiment'] for x in file['sentences']]        
        file_sentences_sentiment = pd.DataFrame.from_dict(
            file_sentences_sentiment, orient='columns')
        file_sentences_sentiment_df = pd.DataFrame(
            {
                'magnitude_sum': file_sentences_sentiment['magnitude'].sum(axis=0),
                'score_sum': file_sentences_sentiment['score'].sum(axis=0),
                'magnitude_mean': file_sentences_sentiment['magnitude'].mean(axis=0),
                'score_mean': file_sentences_sentiment['score'].mean(axis=0),
                'magnitude_var': file_sentences_sentiment['magnitude'].var(axis=0),
                'score_var': file_sentences_sentiment['score'].var(axis=0),
            }, index=[0]
        )        
        df_sentiment = pd.DataFrame.from_dict(file_sentiment, orient='index').T
        df_sentiment = pd.concat([df_sentiment, file_sentences_sentiment_df], axis=1)            
        df_sentiment['entities'] = file_entities
        df_sentiment = df_sentiment.add_prefix('sentiment_')        
        return df_sentiment
    
    def parse_metadata_file(self, file):
        file_keys = list(file.keys())        
        if 'labelAnnotations' in file_keys:
            file_annots = file['labelAnnotations']
            file_top_score = np.asarray([x['score'] for x in file_annots]).mean()
            file_top_desc = [x['description'] for x in file_annots]
        else:
            file_top_score = np.nan
            file_top_desc = ['']        
        file_colors = file['imagePropertiesAnnotation']['dominantColors']['colors']
        file_crops = file['cropHintsAnnotation']['cropHints']
        file_color_score = np.asarray([x['score'] for x in file_colors]).mean()
        file_color_pixelfrac = np.asarray([x['pixelFraction'] for x in file_colors]).mean()
        file_crop_conf = np.asarray([x['confidence'] for x in file_crops]).mean()        
        if 'importanceFraction' in file_crops[0].keys():
            file_crop_importance = np.asarray([x['importanceFraction'] for x in file_crops]).mean()
        else:
            file_crop_importance = np.nan
        df_metadata = {
            'annots_score': file_top_score,
            'color_score': file_color_score,
            'color_pixelfrac': file_color_pixelfrac,
            'crop_conf': file_crop_conf,
            'crop_importance': file_crop_importance,
            'annots_top_desc': self.sentence_sep.join(file_top_desc)
        }        
        df_metadata = pd.DataFrame.from_dict(df_metadata, orient='index').T
        df_metadata = df_metadata.add_prefix('metadata_')        
        return df_metadata
    
def extract_additional_features(pet_id, mode='train'):
    sentiment_filename = f'../input/petfinder-adoption-prediction/{mode}_sentiment/{pet_id}.json'
    try:
        sentiment_file = pet_parser.open_json_file(sentiment_filename)
        df_sentiment = pet_parser.parse_sentiment_file(sentiment_file)
        df_sentiment['PetID'] = pet_id
    except FileNotFoundError:
        df_sentiment = []
    dfs_metadata = []
    metadata_filenames = sorted(glob.glob(f'../input/petfinder-adoption-prediction/{mode}_metadata/{pet_id}*.json'))
    if len(metadata_filenames) > 0:
        for f in metadata_filenames:
            metadata_file = pet_parser.open_json_file(f)
            df_metadata = pet_parser.parse_metadata_file(metadata_file)
            df_metadata['PetID'] = pet_id
            dfs_metadata.append(df_metadata)
        dfs_metadata = pd.concat(dfs_metadata, ignore_index=True, sort=False)
    dfs = [df_sentiment, dfs_metadata]    
    return dfs

pet_parser = PetFinderParser()

In [85]:
# take a long time here, about 25 minutes

train_pet_ids = train.PetID.unique()
test_pet_ids = test.PetID.unique()

dfs_train = Parallel(n_jobs=-1, verbose=1)(
    delayed(extract_additional_features)(i, mode='train') for i in train_pet_ids)
train_dfs_sentiment = [x[0] for x in dfs_train if isinstance(x[0], pd.DataFrame)]
train_dfs_metadata = [x[1] for x in dfs_train if isinstance(x[1], pd.DataFrame)]
train_dfs_sentiment = pd.concat(train_dfs_sentiment, ignore_index=True, sort=False)
train_dfs_metadata = pd.concat(train_dfs_metadata, ignore_index=True, sort=False)
print(train_dfs_sentiment.shape, train_dfs_metadata.shape)

dfs_test = Parallel(n_jobs=-1, verbose=1)(
    delayed(extract_additional_features)(i, mode='test') for i in test_pet_ids)
test_dfs_sentiment = [x[0] for x in dfs_test if isinstance(x[0], pd.DataFrame)]
test_dfs_metadata = [x[1] for x in dfs_test if isinstance(x[1], pd.DataFrame)]
test_dfs_sentiment = pd.concat(test_dfs_sentiment, ignore_index=True, sort=False)
test_dfs_metadata = pd.concat(test_dfs_metadata, ignore_index=True, sort=False)
print(test_dfs_sentiment.shape, test_dfs_metadata.shape)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:   39.5s
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 2446 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 4996 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 6046 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 7196 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 8446 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 9796 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done 11246 tasks      |

(14442, 10) (58311, 7)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 172 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 772 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done 1772 tasks      | elapsed:   53.4s
[Parallel(n_jobs=-1)]: Done 3172 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 3948 out of 3948 | elapsed:  1.9min finished


(3815, 10) (15040, 7)


In [86]:
aggregates = ['sum', 'mean', 'var']
sent_agg = ['sum']

train_metadata_desc = train_dfs_metadata.groupby(['PetID'])['metadata_annots_top_desc'].unique()
train_metadata_desc = train_metadata_desc.reset_index()
train_metadata_desc[
    'metadata_annots_top_desc'] = train_metadata_desc[
    'metadata_annots_top_desc'].apply(lambda x: ' '.join(x))

prefix = 'metadata'
train_metadata_gr = train_dfs_metadata.drop(['metadata_annots_top_desc'], axis=1)
for i in train_metadata_gr.columns:
    if 'PetID' not in i:
        train_metadata_gr[i] = train_metadata_gr[i].astype(float)
train_metadata_gr = train_metadata_gr.groupby(['PetID']).agg(aggregates)
train_metadata_gr.columns = pd.Index([f'{c[0]}_{c[1].upper()}' for c in train_metadata_gr.columns.tolist()])
train_metadata_gr = train_metadata_gr.reset_index()

train_sentiment_desc = train_dfs_sentiment.groupby(['PetID'])['sentiment_entities'].unique()
train_sentiment_desc = train_sentiment_desc.reset_index()
train_sentiment_desc[
    'sentiment_entities'] = train_sentiment_desc[
    'sentiment_entities'].apply(lambda x: ' '.join(x))

prefix = 'sentiment'
train_sentiment_gr = train_dfs_sentiment.drop(['sentiment_entities'], axis=1)
for i in train_sentiment_gr.columns:
    if 'PetID' not in i:
        train_sentiment_gr[i] = train_sentiment_gr[i].astype(float)
train_sentiment_gr = train_sentiment_gr.groupby(['PetID']).agg(sent_agg)
train_sentiment_gr.columns = pd.Index([f'{c[0]}' for c in train_sentiment_gr.columns.tolist()])
train_sentiment_gr = train_sentiment_gr.reset_index()


test_metadata_desc = test_dfs_metadata.groupby(['PetID'])['metadata_annots_top_desc'].unique()
test_metadata_desc = test_metadata_desc.reset_index()
test_metadata_desc[
    'metadata_annots_top_desc'] = test_metadata_desc[
    'metadata_annots_top_desc'].apply(lambda x: ' '.join(x))

prefix = 'metadata'
test_metadata_gr = test_dfs_metadata.drop(['metadata_annots_top_desc'], axis=1)
for i in test_metadata_gr.columns:
    if 'PetID' not in i:
        test_metadata_gr[i] = test_metadata_gr[i].astype(float)
test_metadata_gr = test_metadata_gr.groupby(['PetID']).agg(aggregates)
test_metadata_gr.columns = pd.Index([f'{c[0]}_{c[1].upper()}' for c in test_metadata_gr.columns.tolist()])
test_metadata_gr = test_metadata_gr.reset_index()

test_sentiment_desc = test_dfs_sentiment.groupby(['PetID'])['sentiment_entities'].unique()
test_sentiment_desc = test_sentiment_desc.reset_index()
test_sentiment_desc[
    'sentiment_entities'] = test_sentiment_desc[
    'sentiment_entities'].apply(lambda x: ' '.join(x))

prefix = 'sentiment'
test_sentiment_gr = test_dfs_sentiment.drop(['sentiment_entities'], axis=1)
for i in test_sentiment_gr.columns:
    if 'PetID' not in i:
        test_sentiment_gr[i] = test_sentiment_gr[i].astype(float)
test_sentiment_gr = test_sentiment_gr.groupby(['PetID']).agg(sent_agg)
test_sentiment_gr.columns = pd.Index([f'{c[0]}' for c in test_sentiment_gr.columns.tolist()])
test_sentiment_gr = test_sentiment_gr.reset_index()

In [87]:
train_proc = train.copy()
train_proc = train_proc.merge(
    train_sentiment_gr, how='left', on='PetID')
train_proc = train_proc.merge(
    train_metadata_gr, how='left', on='PetID')
train_proc = train_proc.merge(
    train_metadata_desc, how='left', on='PetID')
train_proc = train_proc.merge(
    train_sentiment_desc, how='left', on='PetID')

test_proc = test.copy()
test_proc = test_proc.merge(
    test_sentiment_gr, how='left', on='PetID')
test_proc = test_proc.merge(
    test_metadata_gr, how='left', on='PetID')
test_proc = test_proc.merge(
    test_metadata_desc, how='left', on='PetID')
test_proc = test_proc.merge(
    test_sentiment_desc, how='left', on='PetID')

print(train_proc.shape, test_proc.shape)
assert train_proc.shape[0] == train.shape[0]
assert test_proc.shape[0] == test.shape[0]

(14993, 49) (3948, 48)


In [88]:
train_breed_main = train_proc[['Breed1']].merge(
    labels_breed, how='left',
    left_on='Breed1', right_on='BreedID',
    suffixes=('', '_main_breed'))
train_breed_main = train_breed_main.iloc[:, 2:]
train_breed_main = train_breed_main.add_prefix('main_breed_')
train_breed_second = train_proc[['Breed2']].merge(
    labels_breed, how='left',
    left_on='Breed2', right_on='BreedID',
    suffixes=('', '_second_breed'))
train_breed_second = train_breed_second.iloc[:, 2:]
train_breed_second = train_breed_second.add_prefix('second_breed_')
train_proc = pd.concat(
    [train_proc, train_breed_main, train_breed_second], axis=1)

test_breed_main = test_proc[['Breed1']].merge(
    labels_breed, how='left',
    left_on='Breed1', right_on='BreedID',
    suffixes=('', '_main_breed'))
test_breed_main = test_breed_main.iloc[:, 2:]
test_breed_main = test_breed_main.add_prefix('main_breed_')
test_breed_second = test_proc[['Breed2']].merge(
    labels_breed, how='left',
    left_on='Breed2', right_on='BreedID',
    suffixes=('', '_second_breed'))
test_breed_second = test_breed_second.iloc[:, 2:]
test_breed_second = test_breed_second.add_prefix('second_breed_')
test_proc = pd.concat(
    [test_proc, test_breed_main, test_breed_second], axis=1)

print(train_proc.shape, test_proc.shape)

(14993, 53) (3948, 52)


In [89]:
X = pd.concat([train_proc, test_proc], ignore_index=True, sort=False)

In [90]:
X_temp = X.copy()

text_columns = ['Description', 'metadata_annots_top_desc', 'sentiment_entities']
categorical_columns = ['main_breed_BreedName', 'second_breed_BreedName']

to_drop_columns = ['PetID', 'Name', 'RescuerID']

In [91]:
rescuer_count = X.groupby(['RescuerID'])['PetID'].count().reset_index()
rescuer_count.columns = ['RescuerID', 'RescuerID_COUNT']

X_temp = X_temp.merge(rescuer_count, how='left', on='RescuerID')

In [92]:
for i in categorical_columns:
    X_temp.loc[:, i] = pd.factorize(X_temp.loc[:, i])[0]

In [93]:
X_text = X_temp[text_columns]

for i in X_text.columns:
    X_text.loc[:, i] = X_text.loc[:, i].fillna('none')

In [94]:
X_temp['Length_Description'] = X_text['Description'].map(len)
X_temp['Length_metadata_annots_top_desc'] = X_text['metadata_annots_top_desc'].map(len)
X_temp['Lengths_sentiment_entities'] = X_text['sentiment_entities'].map(len)

In [95]:
n_components = 16
text_features = []

for i in X_text.columns:
    print(f'generating features from: {i}')
    tfv = TfidfVectorizer(min_df=2,  max_features=None,
                          strip_accents='unicode', analyzer='word', token_pattern=r'(?u)\b\w+\b',
                          ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1)
    svd_ = TruncatedSVD(
        n_components=n_components, random_state=1337)
    tfidf_col = tfv.fit_transform(X_text.loc[:, i].values)    
    svd_col = svd_.fit_transform(tfidf_col)
    svd_col = pd.DataFrame(svd_col)
    svd_col = svd_col.add_prefix('TFIDF_{}_'.format(i))    
    text_features.append(svd_col)
    
text_features = pd.concat(text_features, axis=1)

X_temp = pd.concat([X_temp, text_features], axis=1)

for i in X_text.columns:
    X_temp = X_temp.drop(i, axis=1)

generating features from: Description
generating features from: metadata_annots_top_desc
generating features from: sentiment_entities


In [96]:
X_temp = X_temp.merge(img_features, how='left', on='PetID')

In [97]:
train_df_ids = train[['PetID']]
test_df_ids = test[['PetID']]

train_df_imgs = pd.DataFrame(train_image_files)
train_df_imgs.columns = ['image_filename']
train_imgs_pets = train_df_imgs['image_filename'].apply(lambda x: x.split(split_char)[-1].split('-')[0])

test_df_imgs = pd.DataFrame(test_image_files)
test_df_imgs.columns = ['image_filename']
test_imgs_pets = test_df_imgs['image_filename'].apply(lambda x: x.split(split_char)[-1].split('-')[0])

train_df_imgs = train_df_imgs.assign(PetID=train_imgs_pets)
test_df_imgs = test_df_imgs.assign(PetID=test_imgs_pets)

def getSize(filename):
    st = os.stat(filename)
    return st.st_size

def getDimensions(filename):
    img_size = Image.open(filename).size
    return img_size 

train_df_imgs['image_size'] = train_df_imgs['image_filename'].apply(getSize)
train_df_imgs['temp_size'] = train_df_imgs['image_filename'].apply(getDimensions)
train_df_imgs['width'] = train_df_imgs['temp_size'].apply(lambda x : x[0])
train_df_imgs['height'] = train_df_imgs['temp_size'].apply(lambda x : x[1])
train_df_imgs = train_df_imgs.drop(['temp_size'], axis=1)

test_df_imgs['image_size'] = test_df_imgs['image_filename'].apply(getSize)
test_df_imgs['temp_size'] = test_df_imgs['image_filename'].apply(getDimensions)
test_df_imgs['width'] = test_df_imgs['temp_size'].apply(lambda x : x[0])
test_df_imgs['height'] = test_df_imgs['temp_size'].apply(lambda x : x[1])
test_df_imgs = test_df_imgs.drop(['temp_size'], axis=1)

aggs = {
    'image_size': ['sum', 'mean', 'var'],
    'width': ['sum', 'mean', 'var'],
    'height': ['sum', 'mean', 'var'],
}
agg_train_imgs = train_df_imgs.groupby('PetID').agg(aggs)
new_columns = [
    k + '_' + agg for k in aggs.keys() for agg in aggs[k]
]
agg_train_imgs.columns = new_columns
agg_train_imgs = agg_train_imgs.reset_index()

agg_test_imgs = test_df_imgs.groupby('PetID').agg(aggs)
new_columns = [
    k + '_' + agg for k in aggs.keys() for agg in aggs[k]
]
agg_test_imgs.columns = new_columns
agg_test_imgs = agg_test_imgs.reset_index()

agg_imgs = pd.concat([agg_train_imgs, agg_test_imgs], axis=0).reset_index(drop=True)

In [98]:
X_temp = X_temp.merge(agg_imgs, how='left', on='PetID')

X_temp = X_temp.drop(to_drop_columns, axis=1)

In [99]:
X_train = X_temp.loc[np.isfinite(X_temp.AdoptionSpeed), :]
X_test = X_temp.loc[~np.isfinite(X_temp.AdoptionSpeed), :]

X_test = X_test.drop(['AdoptionSpeed'], axis=1)

assert X_train.shape[0] == train.shape[0]
assert X_test.shape[0] == test.shape[0]

train_cols = X_train.columns.tolist()
train_cols.remove('AdoptionSpeed')

test_cols = X_test.columns.tolist()

assert np.all(train_cols == test_cols)

In [100]:
X_train_non_null = X_train.fillna(-1)
X_test_non_null = X_test.fillna(-1)
X_train_non_null['ResNet_meta'] = train_img_prob.flatten()         # ADD IMG ResNet50 metafeature
X_test_non_null['ResNet_meta'] = test_img_prob.flatten()           # ADD IMG ResNet50 metafeature

In [101]:
X_train_non_null.isnull().any().any(), X_test_non_null.isnull().any().any()

(False, False)

In [102]:
X_train_non_null.shape, X_test_non_null.shape

((14993, 141), (3948, 140))

## SAVE !!!

In [103]:
X_train_non_null.to_csv("train_453.csv", index=False)
X_test_non_null.to_csv("test_453.csv", index=False)

## XGB

In [104]:
xgb_params = {
    'eval_metric': 'rmse',
    'seed': 1337,
    'eta': 0.0123,
    'subsample': 0.8,
    'colsample_bytree': 0.85,
    'tree_method': 'gpu_hist',
    'device': 'gpu',
    'silent': 1,
}

In [105]:
def run_xgb(params, X_train, X_test):
    kf = FOLDS
    n_splits = N_FOLDS
    
    verbose_eval = 1000
    num_rounds = 60000
    early_stop = 500

    oof_train = np.zeros((X_train.shape[0]))
    oof_test = np.zeros((X_test.shape[0], n_splits))

    i = 0

    for train_idx, valid_idx in kf.split(X_train, X_train['AdoptionSpeed'].values):

        X_tr = X_train.iloc[train_idx, :]
        X_val = X_train.iloc[valid_idx, :]

        y_tr = X_tr['AdoptionSpeed'].values
        X_tr = X_tr.drop(['AdoptionSpeed'], axis=1)

        y_val = X_val['AdoptionSpeed'].values
        X_val = X_val.drop(['AdoptionSpeed'], axis=1)

        d_train = xgb.DMatrix(data=X_tr, label=y_tr, feature_names=X_tr.columns)
        d_valid = xgb.DMatrix(data=X_val, label=y_val, feature_names=X_val.columns)

        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        model = xgb.train(dtrain=d_train, num_boost_round=num_rounds, evals=watchlist,
                         early_stopping_rounds=early_stop, verbose_eval=verbose_eval, params=params)

        valid_pred = model.predict(xgb.DMatrix(X_val, feature_names=X_val.columns), ntree_limit=model.best_ntree_limit)
        test_pred = model.predict(xgb.DMatrix(X_test, feature_names=X_test.columns), ntree_limit=model.best_ntree_limit)

        oof_train[valid_idx] = valid_pred
        oof_test[:, i] = test_pred

        i += 1
    return model, oof_train, oof_test

In [106]:
model, oof_train, oof_test = run_xgb(xgb_params, X_train_non_null, X_test_non_null)

[0]	train-rmse:2.31218	valid-rmse:2.31227
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
[1000]	train-rmse:0.637177	valid-rmse:1.0308
[2000]	train-rmse:0.416152	valid-rmse:1.02711
[3000]	train-rmse:0.272851	valid-rmse:1.02766
Stopping. Best iteration:
[2573]	train-rmse:0.32597	valid-rmse:1.02696

[0]	train-rmse:2.31234	valid-rmse:2.31232
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
[1000]	train-rmse:0.64406	valid-rmse:1.02729
[2000]	train-rmse:0.428487	valid-rmse:1.02316
Stopping. Best iteration:
[2122]	train-rmse:0.407167	valid-rmse:1.02273

[0]	train-rmse:2.3123	valid-rmse:2.31305
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
[1000]	train-rmse:0.64667	valid-rmse:1.01995
[2000]	train-rmse:0

In [107]:
xgb_453_train_pred = oof_train
xgb_453_test_pred = np.mean(oof_test, axis=1)
xgb_453_train_pred.shape, xgb_453_test_pred.shape

((14993,), (3948,))

WarpperNN

# Corr

In [108]:
gzf_lgb = lgb_gzf["test"].reshape(-1)
zkr_lgb = lgb_zkr["test"].reshape(-1)
zyl_lgb = lgb_zyl["test"].reshape(-1)

dfa = pd.DataFrame({"gzf_lgb":gzf_lgb, 
                    "zkr_lgb":zkr_lgb, 
                    "zyl_lgb":zyl_lgb, 
                    "453_xgb": xgb_453_test_pred})
dfa.corr()

Unnamed: 0,gzf_lgb,zkr_lgb,zyl_lgb,453_xgb
gzf_lgb,1.0,0.971332,0.943132,0.892131
zkr_lgb,0.971332,1.0,0.948968,0.893275
zyl_lgb,0.943132,0.948968,1.0,0.886197
453_xgb,0.892131,0.893275,0.886197,1.0


# Stacking

In [109]:
gzf_lgb_train_pred = np.mean(lgb_gzf['train'], axis=1)
gzf_lgb_test_pred = np.mean(lgb_gzf['test'], axis=1)

zkr_lgb_train_pred = np.mean(lgb_zkr['train'], axis=1)
zkr_lgb_test_pred = np.mean(lgb_zkr['test'], axis=1)

zyl_lgb_train_pred = np.mean(lgb_zyl['train'], axis=1)
zyl_lgb_test_pred = np.mean(lgb_zyl['test'], axis=1)


train_meta = np.concatenate([gzf_lgb_train_pred.reshape(-1,1),
                             zkr_lgb_train_pred.reshape(-1,1),
                             zyl_lgb_train_pred.reshape(-1,1),
                             xgb_453_train_pred.reshape(-1,1),
                             #mlp_zkr_train_pred.reshape(-1,1)
                            ], axis=1)
test_meta = np.concatenate([gzf_lgb_test_pred.reshape(-1,1),
                            zkr_lgb_test_pred.reshape(-1,1),
                            zyl_lgb_test_pred.reshape(-1,1),
                            xgb_453_test_pred.reshape(-1,1),
                            #mlp_zkr_test_pred.reshape(-1,1)
                           ], axis=1)

In [110]:
from sklearn.linear_model import Ridge

In [111]:
clf = Ridge(alpha=0.1)

clf.fit(train_meta, target)
train_pred = clf.predict(train_meta)

In [112]:
print(clf.coef_)

[0.09241487 0.13100189 0.59196831 0.39523388]


In [113]:
init_coef = get_init_coefs(train_pred,  target)
optR = OptimizedRounder_v2(initial_coefs=init_coef)
optR.fit(train_pred, target)
coefficients = optR.coefficients()
print("coefficients: ", coefficients, "\n")

print("True Counter: ", Counter(target))

optR = OptimizedRounder_v2()
train_predictions = optR.predict(train_pred, coefficients).astype(int)
print("Train Counter: ", Counter(train_predictions))

print("\nTrain QWK: ", quadratic_weighted_kappa(target, train_predictions))
print("Train RMSE: ", rmse(target, train_pred))

coefficients:  [1.44182981 2.04854091 2.48911401 2.89681534] 

True Counter:  Counter({4: 4197, 2: 4037, 3: 3259, 1: 3090, 0: 410})
Train Counter:  Counter({2: 4102, 4: 4056, 3: 3281, 1: 3153, 0: 401})

Train QWK:  0.499594060615012
Train RMSE:  1.0061667493691746


In [114]:
predictions = clf.predict(test_meta)
# optR = OptimizedRounder_v2()
# test_predictions = optR.predict(predictions, coefficients).astype(int)
optR = OptimizedRounder_v3()
test_predictions = optR.predict(predictions, coefficients, 110).astype(int)
print("Test Counter: ", Counter(test_predictions), "\n")

print("True Distribution:")
print(pd.value_counts(target, normalize=True).sort_index())
print("Train Predicted Distribution:")
print(pd.value_counts(train_predictions, normalize=True).sort_index())
print("Test Predicted Distribution:")
print(pd.value_counts(test_predictions, normalize=True).sort_index())

Test Counter:  Counter({4: 1105, 2: 1066, 3: 840, 1: 839, 0: 98}) 

True Distribution:
0    0.027346
1    0.206096
2    0.269259
3    0.217368
4    0.279931
Name: AdoptionSpeed, dtype: float64
Train Predicted Distribution:
0    0.026746
1    0.210298
2    0.273594
3    0.218835
4    0.270526
dtype: float64
Test Predicted Distribution:
0    0.024823
1    0.212513
2    0.270010
3    0.212766
4    0.279889
dtype: float64


In [115]:
submission = pd.DataFrame({'PetID': test_id, 'AdoptionSpeed': test_predictions})
submission.head(10)

Unnamed: 0,PetID,AdoptionSpeed
0,378fcc4fc,2
1,73c10e136,4
2,72000c4c5,3
3,e147a4b9f,4
4,43fbba852,4
5,77a490ec9,3
6,28c4b1b13,4
7,d1eada628,3
8,d134dec34,3
9,bcd464bb8,2


In [116]:
submission.to_csv('submission.csv', index=False)

In [117]:
!head submission.csv

PetID,AdoptionSpeed
378fcc4fc,2
73c10e136,4
72000c4c5,3
e147a4b9f,4
43fbba852,4
77a490ec9,3
28c4b1b13,4
d1eada628,3
d134dec34,3
