In [1]:
import pandas as pd
import numpy as np
import json
import os
import random
import string
import re

from pathlib import Path
from tqdm import tqdm

import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error,mean_squared_log_error

In [2]:

df_train = pd.read_pickle('../data/train/full.pickle')
df_test = pd.read_pickle('../data/test/full.pickle')


In [3]:
df_train = df_train[df_train['cites'] >= 1].reset_index(drop=True)
df_train.shape

(15117, 303)

In [4]:
df_train.head()

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,...,doi_cites_mean_submitter_label,doi_cites_count_submitter_label,doi_cites_sum_submitter_label,doi_cites_min_submitter_label,doi_cites_max_submitter_label,doi_cites_median_submitter_label,doi_cites_std_submitter_label,doi_cites_q10_submitter_label,doi_cites_q25_submitter_label,doi_cites_q75_submitter_label
0,1403.7138,Aigen Li,"Qi Li, S.L. Liang, Aigen Li (University of Mis...",Spectropolarimetric Constraints on the Nature ...,"5 pages, 2 figures; accepted for publication i...",,10.1093/mnrasl/slu021,,astro-ph.GA,http://arxiv.org/licenses/nonexclusive-distrib...,...,13.348837,43,574,0,59,9.0,12.328603,1.2,6.0,17.5
1,1405.5857,Michael Mortonson,"Michael J. Mortonson, Uro\v{s} Seljak",A joint analysis of Planck and BICEP2 B modes ...,"13 pages, 4 figures; submitted to JCAP; refere...",JCAP10(2014)035,10.1088/1475-7516/2014/10/035,,astro-ph.CO gr-qc hep-ph hep-th,http://arxiv.org/licenses/nonexclusive-distrib...,...,46.4375,16,743,8,122,25.5,36.72051,14.0,18.5,69.25
2,1807.01034,Evangelos Thomas Karamatskos,"Evangelos T. Karamatskos, Sebastian Raabe, Ter...",Molecular movie of ultrafast coherent rotation...,9 Figures,"Nat Commun 10, 3364 (2019)",10.1038/s41467-019-11122-y,,physics.chem-ph physics.atom-ph quant-ph,http://arxiv.org/licenses/nonexclusive-distrib...,...,4.666667,3,14,1,7,6.0,3.21455,2.0,3.5,6.5
3,astro-ph/9908243,Peter Meszaros,"C. Weth (1, 2), P. Meszaros (1,3,4), T. Kallma...",Early X-ray/UV Line Signatures of GRB Progenit...,revisions to ApJ ms first submitted 8/21/99; u...,Astrophys.J. 534 (2000) 581-586,10.1086/308792,,astro-ph,,...,122.23913,46,5623,0,773,37.0,186.787245,3.5,14.75,131.0
4,hep-ph/0103252,Tommy Ohlsson,"Tommy Ohlsson, Hakan Snellman",Neutrino oscillations with three flavors in ma...,"13 pages, 8 figures, RevTeX. Final version to ...","Eur.Phys.J.C20:507-515,2001",10.1007/s100520100687,TUM-HEP-405/01,hep-ph,,...,18.465116,43,794,0,134,15.0,22.145997,1.0,5.5,23.0


In [5]:
target = np.array(np.log1p(df_train['cites'].values))
len(target), target

(15117,
 array([2.07944154, 5.24174702, 2.19722458, ..., 4.58496748, 1.60943791,
        1.38629436]))

In [6]:
#####################################################
### LGBで学習、予測する関数の定義
#####################################################
from sklearn.preprocessing import LabelEncoder

SEED = 777
NFOLDS = 5
def Train_and_Pred(df_train, target, test):
    # --------------------------------------
    # パラメータ定義
    # --------------------------------------
    lgb_params = {
                    'objective': 'root_mean_squared_error',
                    'boosting_type': 'gbdt',
                    'n_estimators': 50000,
                    'colsample_bytree': 0.5,
                    'subsample': 0.5,
                    'subsample_freq': 3,
                    'reg_alpha': 8,
                    'reg_lambda': 2,
                    'random_state': SEED,
                    "bagging_fraction": 0.5520399476847848,
                    "bagging_freq": 1,
                    "feature_fraction": 0.4436319472771827,
                    "lambda_l1": 0.01113869595673112,
                    "lambda_l2": 8.706009358617911e-07,
                    "learning_rate": 0.012307412937706345,
                    "min_child_samples": 18,
                    "num_leaves": 8,        
                  }

    # train
    train = df_train.copy()
    train['doi_cites'] = train['doi_cites'].astype('int')
    train = train.drop(['id', 'authors', 'title', 'comments',
        'journal-ref', 'doi', 'report-no', 'categories', 'license',
        'abstract', 'versions', 'update_date_x', 'authors_parsed', 'pub_publisher',
        'update_date_y', 'first_created_date', 'last_created_date', 'cites']
        , axis=1
    )

    
    le = LabelEncoder()
    #decoded = le.inverse_transform(encoded)
    train['doi_id'] = le.fit_transform(train['doi_id'].values)
    train['submitter'] = le.fit_transform(train['submitter'].values)
    train['author_first'] = le.fit_transform(train['author_first'].values)

    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    lgb_oof = np.zeros(train.shape[0])
    lgb_pred = 0


    for fold, (trn_idx, val_idx) in enumerate(kf.split(train)):
        X_train, y_train = train.iloc[trn_idx], target[trn_idx]
        X_valid, y_valid = train.iloc[val_idx], target[val_idx]

        # LightGBM
        model = lgb.LGBMRegressor(**lgb_params)
        model.fit(X_train, y_train,
                  eval_set=(X_valid, y_valid),
                  eval_metric='rmse',
                  verbose=False,
                  early_stopping_rounds=500,
                  categorical_feature=['doi_id', 'submitter', 'author_first'],
                  )

        lgb_oof[val_idx] = model.predict(X_valid)
        rmsle = mean_squared_error(y_valid, lgb_oof[val_idx], squared=False)
        print(f"fold {fold} lgb score: {rmsle}")

    rmsle = mean_squared_error(target, lgb_oof, squared=False)
    print("+-" * 40)
    print(f"score: {rmsle}")
    print(f"model score: {model.score(train, target)}")

    # ------------------------------------------------------------------------------
    # 提出ファイルの作成
    # ------------------------------------------------------------------------------
    #test_predicted = np.expm1(lgb_pred)

    #submit_df = pd.DataFrame({'id': test_index})
    #submit_df['cites'] = np.where(test_predicted < 0, 0, test_predicted)
    #submit_df.to_csv("submission.csv", index=False)
    return rmsle

In [7]:
Train_and_Pred(df_train,target,df_test)

New categorical_feature is ['author_first', 'doi_id', 'submitter']
fold 0 lgb score: 0.4978870904664446
New categorical_feature is ['author_first', 'doi_id', 'submitter']
fold 1 lgb score: 0.5196688626894895
New categorical_feature is ['author_first', 'doi_id', 'submitter']
fold 2 lgb score: 0.5044811828183194
New categorical_feature is ['author_first', 'doi_id', 'submitter']
fold 3 lgb score: 0.49891999459772707
New categorical_feature is ['author_first', 'doi_id', 'submitter']
fold 4 lgb score: 0.509488373861892
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-
score: 0.5061521414115513
model score: 0.8523123633336293


0.5061521414115513