In [3]:
# tf-idf preprocess
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

print('reading...')
df = pd.read_csv('./train_merged.csv')
print('extracting...')
creative_id = df.groupby('user_id')['creative_id'].apply(list)
advertiser_id = df.groupby('user_id')['advertiser_id'].apply(list)
ad_id = df.groupby('user_id')['ad_id'].apply(list)
del df
print('building first tfidf...')
v_creat = TfidfVectorizer(lowercase=False,tokenizer=lambda x:x)
X_creat = v_creat.fit_transform(creative_id)
del creative_id
print('building second tfidf...')
v_adver = TfidfVectorizer(lowercase=False,tokenizer=lambda x:x)
X_adver = v_adver.fit_transform(advertiser_id)
del advertiser_id
print('building third tfidf...')
v_adid = TfidfVectorizer(lowercase=False,tokenizer=lambda x:x)
X_adid = v_adid.fit_transform(ad_id)
del ad_id
print('saving train attrubutes...')
train_attr = sparse.hstack((X_creat, X_adver, X_adid), dtype=float)
del X_creat, X_adver, X_adid
sparse.save_npz('./train_attr.npz', train_attr)
del train_attr

print('preprocessing predict data...')
print('reading...')
df = pd.read_csv('./predict_merged.csv')
print('extracting...')
creative_id = df.groupby('user_id')['creative_id'].apply(list)
advertiser_id = df.groupby('user_id')['advertiser_id'].apply(list)
ad_id = df.groupby('user_id')['ad_id'].apply(list)
del df
print('building first tfidf...')
X_creat = v_creat.transform(creative_id)
del creative_id
print('building second tfidf...')
X_adver = v_adver.transform(advertiser_id)
del advertiser_id
print('building third tfidf...')
X_adid = v_adid.transform(ad_id)
del ad_id
print('saving predict attrubutes...')
train_attr = sparse.hstack((X_creat, X_adver, X_adid), dtype=float)
del X_creat, X_adver, X_adid
sparse.save_npz('./predict_attr.npz', train_attr)
del train_attr

building third tfidf...
saving train attrubutes...
preprocessing predict data...
reading...
extracting...
building first tfidf...
building second tfidf...
building third tfidf...
saving train attrubutes...


In [2]:
# predict gender
%reset -f
# !pip install lightgbm
import lightgbm as lgb
import pandas as pd
from scipy import sparse
from sklearn.model_selection import KFold
import numpy as np

print('loading data...')
X_train = sparse.load_npz('./train_attr.npz').tocsr()
X_predict = sparse.load_npz('./predict_attr.npz').tocsr()
y_train = pd.read_csv('./user_info.csv')['gender']
print('data is prepared.')

predictions = np.zeros(X_predict.shape[0], dtype=float)
folds = KFold(n_splits=5, shuffle=False)

print('start training...')
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    display(trn_idx,val_idx)
    trn_data_X_train=X_train[trn_idx,:]
    val_data_X_train=X_train[val_idx,:]
    display(trn_data_X_train,val_data_X_train)
    trn_data = lgb.Dataset(trn_data_X_train, y_train.iloc[trn_idx])
    val_data = lgb.Dataset(val_data_X_train, y_train.iloc[val_idx])
    
    lgb_params = {'num_leaves': 2**6-1,
              'min_data_in_leaf': 25, 
              'objective':'binary',
              'max_depth': -1,
              'learning_rate': 0.1,
              'boosting': 'gbdt',
              'feature_fraction': 0.6,
              'bagging_fraction': 0.9,
              'bagging_seed': 11,
              'metric': 'auc',
              'seed':1024,
              'nthread':8,
             }
    model = lgb.train(lgb_params, 
                    trn_data,
                    num_boost_round=200, 
                    valid_sets = [trn_data, val_data], 
                    verbose_eval = 10, 
                    early_stopping_rounds = 100)
    predictions += model.predict(X_predict, num_iteration=model.best_iteration) / folds.n_splits
    display(predictions)
    display(np.where(predictions>0.5))

display(predictions)

loading data...
data is prepared.
start training...
fold n°1


array([180000, 180001, 180002, ..., 899997, 899998, 899999])

array([     0,      1,      2, ..., 179997, 179998, 179999])

<720000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 61014420 stored elements in Compressed Sparse Row format>

<180000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 15292962 stored elements in Compressed Sparse Row format>

Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.90759	valid_1's auc: 0.907775
[20]	training's auc: 0.923606	valid_1's auc: 0.923668
[30]	training's auc: 0.934325	valid_1's auc: 0.934182
[40]	training's auc: 0.941491	valid_1's auc: 0.940933
[50]	training's auc: 0.946735	valid_1's auc: 0.945849
[60]	training's auc: 0.950365	valid_1's auc: 0.949204
[70]	training's auc: 0.953095	valid_1's auc: 0.951616
[80]	training's auc: 0.9553	valid_1's auc: 0.953608
[90]	training's auc: 0.95708	valid_1's auc: 0.955199
[100]	training's auc: 0.9586	valid_1's auc: 0.956485
[110]	training's auc: 0.959854	valid_1's auc: 0.957572
[120]	training's auc: 0.960932	valid_1's auc: 0.958497
[130]	training's auc: 0.961924	valid_1's auc: 0.959288
[140]	training's auc: 0.962835	valid_1's auc: 0.959941
[150]	training's auc: 0.963644	valid_1's auc: 0.960571
[160]	training's auc: 0.96439	valid_1's auc: 0.961153
[170]	training's auc: 0.965088	valid_1's auc: 0.961661
[180]	training's au

array([0.00309808, 0.19907862, 0.06506072, ..., 0.00135544, 0.00032135,
       0.00025179])

(array([], dtype=int64),)

fold n°2


array([     0,      1,      2, ..., 899997, 899998, 899999])

array([180000, 180001, 180002, ..., 359997, 359998, 359999])

<720000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 61050242 stored elements in Compressed Sparse Row format>

<180000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 15257140 stored elements in Compressed Sparse Row format>

Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.905787	valid_1's auc: 0.904623
[20]	training's auc: 0.923206	valid_1's auc: 0.922039
[30]	training's auc: 0.933985	valid_1's auc: 0.932481
[40]	training's auc: 0.941623	valid_1's auc: 0.939989
[50]	training's auc: 0.946682	valid_1's auc: 0.944762
[60]	training's auc: 0.950385	valid_1's auc: 0.948329
[70]	training's auc: 0.953142	valid_1's auc: 0.950882
[80]	training's auc: 0.955346	valid_1's auc: 0.952875
[90]	training's auc: 0.957166	valid_1's auc: 0.954583
[100]	training's auc: 0.958612	valid_1's auc: 0.955871
[110]	training's auc: 0.959875	valid_1's auc: 0.95697
[120]	training's auc: 0.960992	valid_1's auc: 0.957923
[130]	training's auc: 0.96198	valid_1's auc: 0.958746
[140]	training's auc: 0.962857	valid_1's auc: 0.959455
[150]	training's auc: 0.963662	valid_1's auc: 0.960101
[160]	training's auc: 0.964399	valid_1's auc: 0.960631
[170]	training's auc: 0.965089	valid_1's auc: 0.961133
[180]	training

array([0.00522331, 0.39777132, 0.12928408, ..., 0.00232803, 0.00065041,
       0.001434  ])

(array([], dtype=int64),)

fold n°3


array([     0,      1,      2, ..., 899997, 899998, 899999])

array([360000, 360001, 360002, ..., 539997, 539998, 539999])

<720000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 61069614 stored elements in Compressed Sparse Row format>

<180000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 15237768 stored elements in Compressed Sparse Row format>

Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.908018	valid_1's auc: 0.909099
[20]	training's auc: 0.92376	valid_1's auc: 0.924369
[30]	training's auc: 0.93384	valid_1's auc: 0.934121
[40]	training's auc: 0.941381	valid_1's auc: 0.941242
[50]	training's auc: 0.946446	valid_1's auc: 0.945994
[60]	training's auc: 0.950035	valid_1's auc: 0.949367
[70]	training's auc: 0.952781	valid_1's auc: 0.951809
[80]	training's auc: 0.955047	valid_1's auc: 0.953907
[90]	training's auc: 0.956848	valid_1's auc: 0.955514
[100]	training's auc: 0.958334	valid_1's auc: 0.956815
[110]	training's auc: 0.959617	valid_1's auc: 0.957912
[120]	training's auc: 0.960718	valid_1's auc: 0.958788
[130]	training's auc: 0.961709	valid_1's auc: 0.959601
[140]	training's auc: 0.962586	valid_1's auc: 0.960271
[150]	training's auc: 0.963402	valid_1's auc: 0.96091
[160]	training's auc: 0.964148	valid_1's auc: 0.961448
[170]	training's auc: 0.964884	valid_1's auc: 0.961951
[180]	training'

array([0.0068756 , 0.59707468, 0.20042515, ..., 0.00388923, 0.00105466,
       0.00180134])

(array([     1,      9,     16, ..., 999990, 999992, 999994]),)

fold n°4


array([     0,      1,      2, ..., 899997, 899998, 899999])

array([540000, 540001, 540002, ..., 719997, 719998, 719999])

<720000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 61048607 stored elements in Compressed Sparse Row format>

<180000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 15258775 stored elements in Compressed Sparse Row format>

Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.90753	valid_1's auc: 0.90588
[20]	training's auc: 0.923608	valid_1's auc: 0.922226
[30]	training's auc: 0.934236	valid_1's auc: 0.93271
[40]	training's auc: 0.941585	valid_1's auc: 0.939866
[50]	training's auc: 0.946545	valid_1's auc: 0.944722
[60]	training's auc: 0.950267	valid_1's auc: 0.948282
[70]	training's auc: 0.953052	valid_1's auc: 0.950938
[80]	training's auc: 0.95525	valid_1's auc: 0.952965
[90]	training's auc: 0.957018	valid_1's auc: 0.95462
[100]	training's auc: 0.958495	valid_1's auc: 0.955982
[110]	training's auc: 0.959795	valid_1's auc: 0.957151
[120]	training's auc: 0.960895	valid_1's auc: 0.958102
[130]	training's auc: 0.961885	valid_1's auc: 0.958905
[140]	training's auc: 0.96277	valid_1's auc: 0.959651
[150]	training's auc: 0.963582	valid_1's auc: 0.960299
[160]	training's auc: 0.964328	valid_1's auc: 0.960891
[170]	training's auc: 0.965053	valid_1's auc: 0.961401
[180]	training's a

array([0.00979268, 0.79528353, 0.26104046, ..., 0.00508331, 0.00138788,
       0.00213625])

(array([     1,      9,     13, ..., 999992, 999993, 999994]),)

fold n°5


array([     0,      1,      2, ..., 719997, 719998, 719999])

array([720000, 720001, 720002, ..., 899997, 899998, 899999])

<720000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 61046645 stored elements in Compressed Sparse Row format>

<180000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 15260737 stored elements in Compressed Sparse Row format>

Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.906804	valid_1's auc: 0.906557
[20]	training's auc: 0.92273	valid_1's auc: 0.921658
[30]	training's auc: 0.933957	valid_1's auc: 0.932807
[40]	training's auc: 0.941481	valid_1's auc: 0.940373
[50]	training's auc: 0.946573	valid_1's auc: 0.945301
[60]	training's auc: 0.950208	valid_1's auc: 0.948763
[70]	training's auc: 0.952912	valid_1's auc: 0.951279
[80]	training's auc: 0.955106	valid_1's auc: 0.95333
[90]	training's auc: 0.956925	valid_1's auc: 0.954951
[100]	training's auc: 0.95843	valid_1's auc: 0.956325
[110]	training's auc: 0.959697	valid_1's auc: 0.957433
[120]	training's auc: 0.960791	valid_1's auc: 0.958387
[130]	training's auc: 0.961792	valid_1's auc: 0.959199
[140]	training's auc: 0.962662	valid_1's auc: 0.959953
[150]	training's auc: 0.963461	valid_1's auc: 0.96062
[160]	training's auc: 0.964222	valid_1's auc: 0.961202
[170]	training's auc: 0.964912	valid_1's auc: 0.961733
[180]	training's

array([0.01262285, 0.99449394, 0.32285698, ..., 0.00637901, 0.00173432,
       0.00263036])

(array([     1,      9,     13, ..., 999992, 999993, 999994]),)

array([0.01262285, 0.99449394, 0.32285698, ..., 0.00637901, 0.00173432,
       0.00263036])

In [3]:
submission=pd.DataFrame([i for i in range(3000001,4000001)],columns=["user_id"])
submission["predicted_age"]=[0 for _ in range(1000000)]
submission["predicted_gender"]=(predictions>0.5)*1
submission["predicted_gender"]+=1
display(submission)
submission.describe()
submission.to_csv("submission.csv",index=False)

Unnamed: 0,user_id,predicted_age,predicted_gender
0,3000001,0,1
1,3000002,0,2
2,3000003,0,1
3,3000004,0,1
4,3000005,0,1
...,...,...,...
999995,3999996,0,1
999996,3999997,0,1
999997,3999998,0,1
999998,3999999,0,1


In [None]:
# predict age
%reset -f
# !pip install lightgbm
import lightgbm as lgb
import pandas as pd
from scipy import sparse
from sklearn.model_selection import KFold
import numpy as np

print('loading data...')
X_train = sparse.load_npz('./train_attr.npz').tocsr()
X_predict = sparse.load_npz('./predict_attr.npz').tocsr()
y_train = pd.read_csv('./user_info.csv')['age'] - 1
print('data is prepared.')

predictions = np.zeros((X_predict.shape[0], 10), dtype=float)
folds = KFold(n_splits=5, shuffle=False)

print('start training...')
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    display(trn_idx,val_idx)
    trn_data_X_train=X_train[trn_idx,:]
    val_data_X_train=X_train[val_idx,:]
    display(trn_data_X_train,val_data_X_train)
    trn_data = lgb.Dataset(trn_data_X_train, y_train.iloc[trn_idx])
    val_data = lgb.Dataset(val_data_X_train, y_train.iloc[val_idx])
    
    lgb_params = {'num_leaves': 2**6-1,
              'min_data_in_leaf': 25, 
              'objective':'multiclass',
              'num_class':10,
              'max_depth': -1,
              'learning_rate': 0.1,
              'boosting': 'gbdt',
              'feature_fraction': 0.6,
              'bagging_fraction': 0.9,
              'bagging_seed': 11,
              'metric': 'multi_error',
              'seed':1024,
              'nthread':12,
              'lambda_l1': 0.2,
             }
    model = lgb.train(lgb_params, 
                    trn_data,
                    num_boost_round=200, 
                    valid_sets = [trn_data, val_data], 
                    verbose_eval = 10, 
                    early_stopping_rounds = 100)
    predictions += model.predict(X_predict, num_iteration=model.best_iteration) / folds.n_splits
    display(predictions)

display(predictions)

loading data...
data is prepared.
start training...
fold n°1


array([180000, 180001, 180002, ..., 899997, 899998, 899999])

array([     0,      1,      2, ..., 179997, 179998, 179999])

<720000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 61014420 stored elements in Compressed Sparse Row format>

<180000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 15292962 stored elements in Compressed Sparse Row format>

Training until validation scores don't improve for 100 rounds
[10]	training's multi_error: 0.720953	valid_1's multi_error: 0.725594
[20]	training's multi_error: 0.685557	valid_1's multi_error: 0.693111
[30]	training's multi_error: 0.663637	valid_1's multi_error: 0.675478
[40]	training's multi_error: 0.647797	valid_1's multi_error: 0.66175
[50]	training's multi_error: 0.634796	valid_1's multi_error: 0.650833
[60]	training's multi_error: 0.623832	valid_1's multi_error: 0.642383
[70]	training's multi_error: 0.614972	valid_1's multi_error: 0.635667
[80]	training's multi_error: 0.607169	valid_1's multi_error: 0.630628
[90]	training's multi_error: 0.599944	valid_1's multi_error: 0.626144
[100]	training's multi_error: 0.593662	valid_1's multi_error: 0.621678
[110]	training's multi_error: 0.588228	valid_1's multi_error: 0.618206
[120]	training's multi_error: 0.583179	valid_1's multi_error: 0.614406
[130]	training's multi_error: 0.578194	valid_1's multi_error: 0.611433
[140]	training's multi_er

array([[1.61964929e-03, 5.33569076e-03, 7.12979324e-02, ...,
        5.35718466e-03, 2.27759496e-03, 2.34631760e-04],
       [2.94531430e-03, 3.07799305e-03, 4.24791540e-03, ...,
        7.20740048e-02, 4.18791556e-04, 2.12510557e-04],
       [4.88089524e-03, 2.94176136e-02, 1.49348376e-02, ...,
        1.41454091e-02, 1.98796574e-03, 1.23044563e-03],
       ...,
       [7.81046928e-02, 8.10136730e-02, 3.43335584e-02, ...,
        5.58862986e-04, 4.69146450e-04, 2.59893507e-04],
       [1.02884505e-03, 1.04128631e-01, 8.18476827e-02, ...,
        5.95047184e-05, 4.62485401e-05, 9.16160379e-06],
       [1.56209219e-04, 2.66502369e-03, 4.61068919e-03, ...,
        8.69009367e-04, 2.81591261e-05, 7.44807085e-06]])

fold n°2


array([     0,      1,      2, ..., 899997, 899998, 899999])

array([180000, 180001, 180002, ..., 359997, 359998, 359999])

<720000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 61050242 stored elements in Compressed Sparse Row format>

<180000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 15257140 stored elements in Compressed Sparse Row format>

Training until validation scores don't improve for 100 rounds
[10]	training's multi_error: 0.72294	valid_1's multi_error: 0.726528
[20]	training's multi_error: 0.686492	valid_1's multi_error: 0.69395
[30]	training's multi_error: 0.664543	valid_1's multi_error: 0.674511
[40]	training's multi_error: 0.648351	valid_1's multi_error: 0.660989
[50]	training's multi_error: 0.635294	valid_1's multi_error: 0.650411
[60]	training's multi_error: 0.625044	valid_1's multi_error: 0.642428
[70]	training's multi_error: 0.615656	valid_1's multi_error: 0.635822
[80]	training's multi_error: 0.607751	valid_1's multi_error: 0.629456
[90]	training's multi_error: 0.600643	valid_1's multi_error: 0.625006
[100]	training's multi_error: 0.594561	valid_1's multi_error: 0.6205
[110]	training's multi_error: 0.588878	valid_1's multi_error: 0.617272
[120]	training's multi_error: 0.58346	valid_1's multi_error: 0.614411
[130]	training's multi_error: 0.578642	valid_1's multi_error: 0.611072
[140]	training's multi_error:

array([[2.88334806e-03, 9.47370599e-03, 1.56590153e-01, ...,
        1.10374905e-02, 4.36371394e-03, 4.05384468e-04],
       [5.11403578e-03, 6.32922949e-03, 8.24106012e-03, ...,
        1.21145273e-01, 1.48160749e-03, 6.49679559e-04],
       [1.22527764e-02, 5.95568277e-02, 3.41795492e-02, ...,
        2.53460875e-02, 4.03737942e-03, 2.83775050e-03],
       ...,
       [1.64847904e-01, 1.52999907e-01, 6.86815069e-02, ...,
        1.07123538e-03, 1.06191956e-03, 6.25773989e-04],
       [3.04641598e-03, 1.90909475e-01, 1.79371266e-01, ...,
        1.25604939e-04, 7.88464608e-05, 1.83571677e-05],
       [3.87463151e-04, 5.33014287e-03, 9.32533826e-03, ...,
        1.63671676e-03, 4.90909970e-05, 1.46173196e-05]])

fold n°3


array([     0,      1,      2, ..., 899997, 899998, 899999])

array([360000, 360001, 360002, ..., 539997, 539998, 539999])

<720000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 61069614 stored elements in Compressed Sparse Row format>

<180000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 15237768 stored elements in Compressed Sparse Row format>

Training until validation scores don't improve for 100 rounds
[10]	training's multi_error: 0.721869	valid_1's multi_error: 0.726444
[20]	training's multi_error: 0.686686	valid_1's multi_error: 0.694994
[30]	training's multi_error: 0.665086	valid_1's multi_error: 0.675278
[40]	training's multi_error: 0.648419	valid_1's multi_error: 0.660894
[50]	training's multi_error: 0.635269	valid_1's multi_error: 0.650989
[60]	training's multi_error: 0.624768	valid_1's multi_error: 0.6422
[70]	training's multi_error: 0.615869	valid_1's multi_error: 0.635789
[80]	training's multi_error: 0.607793	valid_1's multi_error: 0.62965
[90]	training's multi_error: 0.601046	valid_1's multi_error: 0.624911
[100]	training's multi_error: 0.594397	valid_1's multi_error: 0.620078
[110]	training's multi_error: 0.588689	valid_1's multi_error: 0.616594
[120]	training's multi_error: 0.58355	valid_1's multi_error: 0.613261
[130]	training's multi_error: 0.57865	valid_1's multi_error: 0.610239
[140]	training's multi_error:

array([[4.73625048e-03, 1.47350661e-02, 2.26546630e-01, ...,
        1.59554274e-02, 6.17168016e-03, 5.40966385e-04],
       [7.56399798e-03, 9.30914223e-03, 1.40620892e-02, ...,
        1.72655344e-01, 2.24619024e-03, 8.07939902e-04],
       [1.65271061e-02, 9.45477262e-02, 5.21384444e-02, ...,
        3.87925650e-02, 5.53402059e-03, 4.76699027e-03],
       ...,
       [2.28513412e-01, 2.45728577e-01, 1.03658642e-01, ...,
        1.73064871e-03, 1.87936851e-03, 1.08633180e-03],
       [5.42703655e-03, 2.79770039e-01, 2.70981969e-01, ...,
        1.93034711e-04, 1.13958854e-04, 2.82685922e-05],
       [5.75137500e-04, 7.23728528e-03, 1.44979671e-02, ...,
        2.24888549e-03, 7.63347742e-05, 2.19591594e-05]])

fold n°4


array([     0,      1,      2, ..., 899997, 899998, 899999])

array([540000, 540001, 540002, ..., 719997, 719998, 719999])

<720000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 61048607 stored elements in Compressed Sparse Row format>

<180000x4797415 sparse matrix of type '<class 'numpy.float64'>'
	with 15258775 stored elements in Compressed Sparse Row format>

Training until validation scores don't improve for 100 rounds
[10]	training's multi_error: 0.722082	valid_1's multi_error: 0.724022
[20]	training's multi_error: 0.687419	valid_1's multi_error: 0.693933
[30]	training's multi_error: 0.664538	valid_1's multi_error: 0.674611
[40]	training's multi_error: 0.647668	valid_1's multi_error: 0.6615
[50]	training's multi_error: 0.634675	valid_1's multi_error: 0.651606
[60]	training's multi_error: 0.624437	valid_1's multi_error: 0.643367
[70]	training's multi_error: 0.615078	valid_1's multi_error: 0.636361
[80]	training's multi_error: 0.607122	valid_1's multi_error: 0.630578
[90]	training's multi_error: 0.600163	valid_1's multi_error: 0.62515
[100]	training's multi_error: 0.593974	valid_1's multi_error: 0.620706
[110]	training's multi_error: 0.588097	valid_1's multi_error: 0.61715
[120]	training's multi_error: 0.582968	valid_1's multi_error: 0.613644
[130]	training's multi_error: 0.578157	valid_1's multi_error: 0.611128
[140]	training's multi_error

In [6]:
display(model.predict(X_predict, num_iteration=model.best_iteration))

array([[8.09824646e-03, 2.66784538e-02, 3.56489662e-01, ...,
        2.67859233e-02, 1.13879748e-02, 1.17315880e-03],
       [1.47265715e-02, 1.53899653e-02, 2.12395770e-02, ...,
        3.60370024e-01, 2.09395778e-03, 1.06255279e-03],
       [2.44044762e-02, 1.47088068e-01, 7.46741880e-02, ...,
        7.07270453e-02, 9.93982870e-03, 6.15222816e-03],
       ...,
       [3.90523464e-01, 4.05068365e-01, 1.71667792e-01, ...,
        2.79431493e-03, 2.34573225e-03, 1.29946753e-03],
       [5.14422526e-03, 5.20643154e-01, 4.09238414e-01, ...,
        2.97523592e-04, 2.31242701e-04, 4.58080190e-05],
       [7.81046096e-04, 1.33251185e-02, 2.30534459e-02, ...,
        4.34504684e-03, 1.40795630e-04, 3.72403542e-05]])

In [None]:
import pandas as pd
import numpy as np

submission = pd.read_csv('./submission.csv')
predictions = np.apply_along_axis(np.argmax, 1, predictions)
submission['predict_age'] = predictions + 1
submission.to_csv('./submission.csv', index=False)

In [7]:
import pandas as pd

df = pd.read_csv('./submission.csv')
df.head(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,user_id,predicted_age,predicted_gender,age
0,0,0,3000001,0,1,3
1,1,1,3000002,0,2,7
2,2,2,3000003,0,1,7
3,3,3,3000004,0,1,3
4,4,4,3000005,0,1,4


In [9]:
submission = df[['user_id', 'age', 'predicted_gender']]
submission.columns = ['user_id', 'predicted_age', 'predicted_gender']
submission.head(5)

Unnamed: 0,user_id,predicted_age,predicted_gender
0,3000001,3,1
1,3000002,7,2
2,3000003,7,1
3,3000004,3,1
4,3000005,4,1


In [10]:
submission.to_csv('./submission.csv', index=False)
pd.read_csv('./submission.csv').head(5)

Unnamed: 0,user_id,predicted_age,predicted_gender
0,3000001,3,1
1,3000002,7,2
2,3000003,7,1
3,3000004,3,1
4,3000005,4,1
