In [1]:
!pip install lightgbm

Collecting lightgbm
[?25l  Downloading https://files.pythonhosted.org/packages/77/0f/5157e6b153b3d4a70dc5fbe2ab6f209604197590f387f03177b7a249ac60/lightgbm-2.2.3-py2.py3-none-manylinux1_x86_64.whl (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 14.5MB/s ta 0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-2.2.3


In [3]:
import os
import sys
import argparse
import numpy as np
import pandas as pd
import random
import torch
import json
import cv2
from tqdm import tqdm
import pickle
import glob
import os.path as osp
from multiprocessing import Pool
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

sys.path.append("../vrd")
from utils import get_iou, get_image_size

In [6]:
from train_catboost import classes_1, classes_2

In [8]:
import lightgbm as lgb

In [27]:
from sklearn.preprocessing import LabelEncoder

In [7]:
# Directory from which get_train_data() reads 'challenge-2019-train-vrd.csv'.
# Set the VRD_DATA_DIR environment variable to point at a different location;
# the fallback is the original machine-specific path.
DATA_DIR = os.environ.get('VRD_DATA_DIR', '/mnt/chicm/data/open-images/relation')

In [11]:
# CSV that get_train_data() loads as `df_neg` and concatenates with the rows kept
# from the VRD training file.
neg_sample_fn = '../vrd/lb23578/df_neg_small_2.csv'

In [17]:
def parallel_apply(df, func, n_cores=24):
    """Run ``func`` on row-wise chunks of ``df`` in worker processes and re-join the results.

    Args:
        df: DataFrame to process.
        func: Picklable callable that takes a DataFrame chunk and returns a DataFrame.
        n_cores: Upper bound on the number of chunks / worker processes.

    Returns:
        The per-chunk results concatenated in the original row order.
    """
    # Never create more chunks than rows (that would hand empty frames to ``func``),
    # and always create at least one so an empty ``df`` still goes through ``func``.
    n_chunks = max(1, min(int(n_cores), len(df)))

    # Split row *positions* and slice with iloc instead of calling
    # np.array_split(df, ...), which goes through the deprecated DataFrame.swapaxes.
    position_groups = np.array_split(np.arange(len(df)), n_chunks)
    chunks = [df.iloc[positions] for positions in position_groups]

    pool = Pool(n_chunks)
    try:
        results = pool.map(func, chunks)
    finally:
        # Release the worker processes even when ``func`` raised in a worker.
        pool.close()
        pool.join()
    return pd.concat(results)

In [19]:
def add_features(df):
    """Add an ``iou`` column to ``df`` and return the same frame.

    ``get_iou`` is called once per row (``axis=1``) with that row as its only
    argument. The new column is written onto ``df`` itself, not onto a copy.
    """
    iou_per_row = df.apply(get_iou, axis=1)
    df['iou'] = iou_per_row
    return df


In [31]:
# Module-level encoders that get_train_data() fits; they stay available afterwards
# so encoded values can be mapped back to the original strings.
le1 = LabelEncoder()  # encodes the LabelName1 column
le2 = LabelEncoder()  # encodes the LabelName2 column
ley = LabelEncoder()  # encodes the RelationshipLabel target

In [32]:
def get_train_data(test_size=0.1, random_state=None):
    """Load, featurize, encode and split the relationship training data.

    Rows of 'challenge-2019-train-vrd.csv' whose RelationshipLabel is not 'is'
    are combined with the rows of ``neg_sample_fn``, shuffled, and given an
    ``iou`` column via ``add_features``. ``LabelName1``, ``LabelName2`` and the
    target are integer-encoded with the module-level ``le1``, ``le2`` and
    ``ley``, which are (re)fitted on every call.

    Args:
        test_size: Fraction of rows held out for validation.
        random_state: Seed used for both the shuffle and the split. ``None``
            (the default) keeps the run-to-run randomness; pass an int for a
            reproducible split.

    Returns:
        ``(X_train, X_val, y_train, y_val)`` where the X frames hold every
        column except ``ImageID`` and ``RelationshipLabel``.
    """
    df_vrd = pd.read_csv(os.path.join(DATA_DIR, 'challenge-2019-train-vrd.csv'))
    df_pos = df_vrd.loc[df_vrd.RelationshipLabel != 'is'].copy()

    df_neg = pd.read_csv(neg_sample_fn)

    df_train = pd.concat([df_pos, df_neg], axis=0, sort=False)
    print(df_train.shape)

    df_train = shuffle(df_train, random_state=random_state)
    df_train = parallel_apply(df_train, add_features)
    print(df_train.head())

    # drop() returns a new frame, so the encoded columns below do not touch df_train.
    X = df_train.drop(['ImageID', 'RelationshipLabel'], axis=1)
    X['LabelName1'] = le1.fit_transform(X['LabelName1'])
    X['LabelName2'] = le2.fit_transform(X['LabelName2'])
    y = ley.fit_transform(df_train['RelationshipLabel'])

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_val, y_train, y_val

In [33]:
# Build the train/validation split and work out which feature columns are categorical.
X_train, X_val, y_train, y_val = get_train_data()
print(X_train.dtypes)
# Every column that is not float64 is treated as categorical. np.float64 is spelled out
# because the np.float alias was deprecated in NumPy 1.20 and removed in 1.24.
categorical_feature_indices = np.where(X_train.dtypes != np.float64)[0]
print(categorical_feature_indices)

(335187, 12)
                 ImageID  LabelName1 LabelName2     XMin1     XMax1     YMin1  \
235521  6c74a94450420453   /m/01mzpv  /m/04bcr3  0.360000  0.409375  0.504167   
66327   678c2b477ec9e760   /m/01mzpv  /m/01mzpv  0.883125  0.976250  0.461667   
212297  ce449209df75a8c6    /m/04yx4  /m/078n6m  0.168889  0.287778  0.142222   
109589  b1ff4494eb6d654a  /m/03bt1vf  /m/019w40  0.520000  0.761875  0.053471   
71898   5c74aca98a04b456   /m/05r655  /m/0bt9lr  0.710000  0.978125  0.223265   

           YMax1     XMin2     XMax2     YMin2     YMax2 RelationshipLabel  \
235521  0.609167  0.371250  0.423125  0.496667  0.558333                at   
66327   0.580000  0.006250  0.152500  0.578333  0.884167              none   
212297  0.601482  0.110000  0.316667  0.405926  0.814815                at   
109589  0.999062  0.548750  0.980625  0.638837  0.926829              none   
71898   0.999062  0.695625  0.798125  0.393058  0.814259    interacts_with   

             iou  
235521  0.32

In [39]:
# Positional indices of the non-float columns of X_train; these are what gets passed to
# lgb.Dataset as categorical_feature.
categorical_feature_indices

array([0, 1])

In [58]:
# Target label strings in encoded order: integer class i corresponds to ley.classes_[i].
ley.classes_

array(['at', 'hits', 'holds', 'inside_of', 'interacts_with', 'none', 'on',
       'plays', 'under', 'wears'], dtype=object)

In [56]:
# The columns at categorical_feature_indices hold integer label codes; declaring them
# categorical stops LightGBM from treating the codes as ordered numbers.
train_data = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=categorical_feature_indices.tolist(),
)
# The validation set references the training Dataset so that it is constructed with the
# same feature binning and categorical settings.
test_data = lgb.Dataset(X_val, label=y_val, reference=train_data)


In [57]:
# NOTE(review): `parameters` holds binary-objective settings and is NOT passed to the
# lgb.train() call below, which uses `params`. It is kept only so the name still exists.
parameters = {
    'application': 'binary',
    'objective': 'binary',
    #'metric': 'auc',
    'metric': 'accuracy',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

# Multiclass settings that are actually used for training.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    # Derived from the fitted target encoder instead of a hard-coded 10, so it stays
    # correct if the set of relationship labels changes.
    'num_class': len(ley.classes_),
    'metric': 'multi_logloss',
    'learning_rate': 0.05,
    'max_depth': 7,
    'num_leaves': 17,
    'feature_fraction': 0.4,
    'bagging_fraction': 0.6,
    'bagging_freq': 17
}

# Train for up to 5000 rounds, stopping once the validation multi_logloss has not
# improved for 100 rounds. Early stopping is requested through the callback because the
# `early_stopping_rounds` keyword of lgb.train() no longer exists in LightGBM 4.x.
model = lgb.train(
    params,
    train_data,
    num_boost_round=5000,
    valid_sets=[test_data],
    callbacks=[lgb.early_stopping(100)],
)

#
# Create a submission
#



[1]	valid_0's multi_logloss: 1.26276
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's multi_logloss: 1.2232
[3]	valid_0's multi_logloss: 1.17867
[4]	valid_0's multi_logloss: 1.14533
[5]	valid_0's multi_logloss: 1.11683
[6]	valid_0's multi_logloss: 1.08263
[7]	valid_0's multi_logloss: 1.05429
[8]	valid_0's multi_logloss: 1.02079
[9]	valid_0's multi_logloss: 0.993258
[10]	valid_0's multi_logloss: 0.968895
[11]	valid_0's multi_logloss: 0.928751
[12]	valid_0's multi_logloss: 0.901151
[13]	valid_0's multi_logloss: 0.876097
[14]	valid_0's multi_logloss: 0.857213
[15]	valid_0's multi_logloss: 0.835699
[16]	valid_0's multi_logloss: 0.812119
[17]	valid_0's multi_logloss: 0.801389
[18]	valid_0's multi_logloss: 0.793859
[19]	valid_0's multi_logloss: 0.768201
[20]	valid_0's multi_logloss: 0.75272
[21]	valid_0's multi_logloss: 0.732962
[22]	valid_0's multi_logloss: 0.714995
[23]	valid_0's multi_logloss: 0.692842
[24]	valid_0's multi_logloss: 0.674878
[25]	valid_0's multi

[208]	valid_0's multi_logloss: 0.1985
[209]	valid_0's multi_logloss: 0.198944
[210]	valid_0's multi_logloss: 0.198146
[211]	valid_0's multi_logloss: 0.202662
[212]	valid_0's multi_logloss: 0.199697
[213]	valid_0's multi_logloss: 0.198984
[214]	valid_0's multi_logloss: 0.198403
[215]	valid_0's multi_logloss: 0.197571
[216]	valid_0's multi_logloss: 0.199462
[217]	valid_0's multi_logloss: 0.199328
[218]	valid_0's multi_logloss: 0.198904
[219]	valid_0's multi_logloss: 0.199091
[220]	valid_0's multi_logloss: 0.198629
[221]	valid_0's multi_logloss: 0.198117
[222]	valid_0's multi_logloss: 0.19769
[223]	valid_0's multi_logloss: 0.197277
[224]	valid_0's multi_logloss: 0.196233
[225]	valid_0's multi_logloss: 0.195626
[226]	valid_0's multi_logloss: 0.19523
[227]	valid_0's multi_logloss: 0.194873
[228]	valid_0's multi_logloss: 0.1935
[229]	valid_0's multi_logloss: 0.193362
[230]	valid_0's multi_logloss: 0.194078
[231]	valid_0's multi_logloss: 0.197153
[232]	valid_0's multi_logloss: 0.196611
[233]	

[414]	valid_0's multi_logloss: 0.186251
[415]	valid_0's multi_logloss: 0.186082
[416]	valid_0's multi_logloss: 0.184484
[417]	valid_0's multi_logloss: 0.184357
[418]	valid_0's multi_logloss: 0.19379
[419]	valid_0's multi_logloss: 0.192081
[420]	valid_0's multi_logloss: 0.18887
[421]	valid_0's multi_logloss: 0.188702
[422]	valid_0's multi_logloss: 0.188555
[423]	valid_0's multi_logloss: 0.186658
[424]	valid_0's multi_logloss: 0.186536
[425]	valid_0's multi_logloss: 0.185269
[426]	valid_0's multi_logloss: 0.186924
[427]	valid_0's multi_logloss: 0.188444
[428]	valid_0's multi_logloss: 0.187115
[429]	valid_0's multi_logloss: 0.186858
[430]	valid_0's multi_logloss: 0.186533
[431]	valid_0's multi_logloss: 0.186777
[432]	valid_0's multi_logloss: 0.186535
[433]	valid_0's multi_logloss: 0.186406
[434]	valid_0's multi_logloss: 0.186201
[435]	valid_0's multi_logloss: 0.186111
[436]	valid_0's multi_logloss: 0.18601
[437]	valid_0's multi_logloss: 0.189647
[438]	valid_0's multi_logloss: 0.186739
[43

In [36]:
# Training targets as integer codes assigned by `ley` (see ley.classes_ for the mapping).
y_train

array([0, 0, 6, ..., 7, 5, 0])

In [None]:
# NOTE(review): '../input/test.csv' and its 'id' column look like template code. The
# columns fed to model.predict() presumably need the same preprocessing as X_train
# (le1/le2 encoding, iou feature, same column order) -- TODO confirm before relying on this.
submission = pd.read_csv('../input/test.csv')
ids = submission['id'].values
submission = submission.drop('id', axis=1)

x = submission.values
# With the multiclass objective predict() returns one probability per class for each
# row, i.e. an (n_samples, num_class) array, which cannot be stored in a single column.
y_proba = model.predict(x)
# Pick the most probable class and map its integer code back to the label string.
y = ley.inverse_transform(np.argmax(y_proba, axis=1))

output = pd.DataFrame({'id': ids, 'target': y})
output.to_csv("submission.csv", index=False)