In [1]:
import pandas as pd
import numpy as np
import json
import os
import random
import string
import re
import matplotlib.pyplot as plt
import pickle

from pathlib import Path
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from catboost import CatBoostRegressor
from catboost import Pool
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error,mean_squared_log_error
import optuna.integration.lightgbm as lgbo
from sklearn.preprocessing import StandardScaler

import umap
pd.set_option('display.max_columns', 2000)

SEED = 777



In [2]:
df_predict = pd.read_pickle('../data/test/full_comp.pickle')
df_predict.shape


(59084, 2001)

In [3]:
dorop_cols = []
for col in df_predict.columns:
    if '_x' in col:
        dorop_cols.append(col)
len(dorop_cols), dorop_cols
df_predict = df_predict.drop(dorop_cols, axis=1)


In [4]:
importance_0_cols = ['econ_y', 'eess_y', 'nlin_y', 'physics_y', 'acc-phys_y',
       'adap-org_y', 'alg-geom_y', 'ao-sci_y', 'astro-ph_y', 'atom-ph_y',
       'bayes-an_y', 'chao-dyn_y', 'chem-ph_y', 'cmp-lg_y', 'comp-gas_y',
       'dg-ga_y', 'funct-an_y', 'gr-qc_y', 'math-ph_y', 'mtrl-th_y',
       'nucl-ex_y', 'patt-sol_y', 'plasm-ph_y', 'q-alg_y', 'q-fin_y',
       'solv-int_y', 'supr-con_y', 'acc_y', 'adap_y', 'alg_y', 'ao_y',
       'astro_y', 'atom_y', 'bayes_y', 'chao_y', 'chem_y', 'cmp_y',
       'comp_y', 'cond_y', 'dg_y', 'econ', 'eess', 'funct_y', 'gr_y',
       'math', 'mtrl_y', 'nlin', 'patt_y', 'physics', 'plasm_y',
       'quant_y', 'solv_y', 'stat', 'supr_y', 'astro-ph.ga',
       'astro-ph.he', 'astro-ph.sr', 'cond-mat.dis-nn',
       'cond-mat.mes-hall', 'cond-mat.other', 'cond-mat.soft',
       'cond-mat.stat-mech', 'cs.ai', 'cs.ar', 'cs.cc', 'cs.ce', 'cs.cg',
       'cs.cl', 'cs.cr', 'cs.cv', 'cs.cy', 'cs.db', 'cs.dc', 'cs.dl',
       'cs.dm', 'cs.et', 'cs.fl', 'cs.gl', 'cs.gr', 'cs.gt', 'cs.hc',
       'cs.ir', 'cs.it', 'cs.lo', 'cs.ma', 'cs.mm', 'cs.ms', 'cs.na',
       'cs.ne', 'cs.ni', 'cs.oh', 'cs.os', 'cs.pf', 'cs.pl', 'cs.ro',
       'cs.sc', 'cs.sd', 'cs.se', 'cs.sy', 'econ.em', 'econ.gn',
       'econ.th', 'eess.as', 'eess.iv', 'eess.sp', 'eess.sy', 'math.ac',
       'math.ap', 'math.at', 'math.ca', 'math.ct', 'math.cv', 'math.dg',
       'math.ds', 'math.fa', 'math.gm', 'math.gn', 'math.gr', 'math.gt',
       'math.ho', 'math.it', 'math.kt', 'math.lo', 'math.mg', 'math.mp',
       'math.na', 'math.nt', 'math.oa', 'math.oc', 'math.qa', 'math.ra',
       'math.rt', 'math.sg', 'math.sp', 'math.st', 'nlin.ao', 'nlin.cd',
       'nlin.cg', 'nlin.ps', 'nlin.si', 'physics.acc-ph', 'physics.ao-ph',
       'physics.app-ph', 'physics.atm-clus', 'physics.bio-ph',
       'physics.chem-ph', 'physics.class-ph', 'physics.comp-ph',
       'physics.data-an', 'physics.ed-ph', 'physics.flu-dyn',
       'physics.gen-ph', 'physics.geo-ph', 'physics.hist-ph',
       'physics.ins-det', 'physics.med-ph', 'physics.optics',
       'physics.plasm-ph', 'physics.pop-ph', 'physics.soc-ph', 'q-bio.bm',
       'q-bio.cb', 'q-bio.gn', 'q-bio.mn', 'q-bio.nc', 'q-bio.ot',
       'q-bio.pe', 'q-bio.qm', 'q-bio.sc', 'q-bio.to', 'q-fin.cp',
       'q-fin.ec', 'q-fin.gn', 'q-fin.mf', 'q-fin.pm', 'q-fin.pr',
       'q-fin.rm', 'q-fin.st', 'q-fin.tr', 'stat.ap', 'stat.co',
       'stat.me', 'stat.ml', 'stat.ot', 'doi_cites_min_doi_id_label',
       'doi_cites_min_pub_publisher_label',
       'doi_cites_median_pub_publisher_label', 'doi_cites_min_update_ym',
       'doi_cites_min_first_created_ym', 'doi_cites_min_license_label',
       'doi_cites_max_license_label', 'doi_cites_q10_license_label',
       'doi_cites_q75_license_label', 'doi_cites_min_category_main_label',
       'doi_cites_q10_category_main_label',
       'doi_cites_q25_category_main_label',
       'doi_cites_min_category_main_detail_label',
       'doi_cites_median_category_main_detail_label',
       'doi_cites_q10_category_main_detail_label',
       'doi_cites_q25_category_main_detail_label',
       'doi_cites_min_category_name_parent_label',
       'doi_cites_q10_category_name_parent_label',
       'doi_cites_min_category_name_parent_main_label',
       'doi_cites_q10_category_name_parent_main_label',
       'doi_cites_min_category_name_label',
       'pred_doi_cites_min_doi_id_label',
       'pred_doi_cites_min_pub_publisher_label',
       'pred_doi_cites_median_pub_publisher_label',
       'pred_doi_cites_q75_pub_publisher_label',
       'pred_doi_cites_min_update_ym', 'pred_doi_cites_q10_update_ym',
       'pred_doi_cites_min_first_created_ym',
       'pred_doi_cites_q10_first_created_ym',
       'pred_doi_cites_mean_license_label',
       'pred_doi_cites_count_license_label',
       'pred_doi_cites_sum_license_label',
       'pred_doi_cites_min_license_label',
       'pred_doi_cites_std_license_label',
       'pred_doi_cites_q10_license_label',
       'pred_doi_cites_q25_license_label',
       'pred_doi_cites_q75_license_label',
       'pred_doi_cites_mean_category_main_label',
       'pred_doi_cites_min_category_main_label',
       'pred_doi_cites_median_category_main_label',
       'pred_doi_cites_q10_category_main_label',
       'pred_doi_cites_q25_category_main_label',
       'pred_doi_cites_mean_category_main_detail_label',
       'pred_doi_cites_sum_category_main_detail_label',
       'pred_doi_cites_min_category_main_detail_label',
       'pred_doi_cites_median_category_main_detail_label',
       'pred_doi_cites_q10_category_main_detail_label',
       'pred_doi_cites_q25_category_main_detail_label',
       'pred_doi_cites_q75_category_main_detail_label',
       'pred_doi_cites_min_category_name_parent_label',
       'pred_doi_cites_median_category_name_parent_label',
       'pred_doi_cites_q10_category_name_parent_label',
       'pred_doi_cites_q25_category_name_parent_label',
       'pred_doi_cites_min_category_name_parent_main_label',
       'pred_doi_cites_median_category_name_parent_main_label',
       'pred_doi_cites_q10_category_name_parent_main_label',
       'pred_doi_cites_q25_category_name_parent_main_label',
       'pred_doi_cites_min_category_name_label',
       'diff_rate_doi_cites_pred_doi_cites',
       'diff_rate_doi_cites_mean_submitter_label_pred_doi_cites_mean_submitter_label',
       'diff_rate_doi_cites_mean_doi_id_label_doi_cites_mean_pub_publisher_label',
       'diff_rate_doi_cites_mean_doi_id_label_pred_doi_cites_mean_doi_id_label',
       'diff_rate_doi_cites_mean_doi_id_label_pred_doi_cites_mean_pub_publisher_label',
       'diff_rate_doi_cites_mean_author_first_label_pred_doi_cites_mean_author_first_label',
       'diff_doi_cites_mean_author_first_label_pred_doi_cites_mean_category_name_parent_main_label',
       'diff_doi_cites_mean_pub_publisher_label_doi_cites_mean_category_main_detail_label',
       'diff_rate_doi_cites_mean_pub_publisher_label_pred_doi_cites_mean_pub_publisher_label',
       'diff_doi_cites_mean_pub_publisher_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_doi_cites_mean_pub_publisher_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_doi_cites_mean_update_ym_pred_doi_cites_mean_update_ym',
       'diff_doi_cites_mean_update_ym_pred_doi_cites_mean_category_main_detail_label',
       'diff_doi_cites_mean_update_ym_pred_doi_cites_mean_category_name_parent_label',
       'diff_rate_doi_cites_mean_first_created_ym_pred_doi_cites_mean_first_created_ym',
       'diff_rate_doi_cites_mean_license_label_doi_cites_mean_category_main_detail_label',
       'diff_doi_cites_mean_license_label_pred_doi_cites_mean_license_label',
       'diff_rate_doi_cites_mean_license_label_pred_doi_cites_mean_license_label',
       'diff_doi_cites_mean_license_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_doi_cites_mean_license_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_doi_cites_mean_license_label_pred_doi_cites_mean_category_name_parent_main_label',
       'diff_doi_cites_mean_category_main_label_doi_cites_mean_category_main_detail_label',
       'diff_rate_doi_cites_mean_category_main_label_doi_cites_mean_category_main_detail_label',
       'diff_rate_doi_cites_mean_category_main_label_pred_doi_cites_mean_category_main_label',
       'diff_rate_doi_cites_mean_category_main_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_doi_cites_mean_category_main_detail_label_doi_cites_mean_category_name_parent_label',
       'diff_doi_cites_mean_category_main_detail_label_pred_doi_cites_mean_doi_id_label',
       'diff_rate_doi_cites_mean_category_main_detail_label_pred_doi_cites_mean_pub_publisher_label',
       'diff_rate_doi_cites_mean_category_main_detail_label_pred_doi_cites_mean_update_ym',
       'diff_rate_doi_cites_mean_category_main_detail_label_pred_doi_cites_mean_license_label',
       'diff_rate_doi_cites_mean_category_main_detail_label_pred_doi_cites_mean_category_main_label',
       'diff_rate_doi_cites_mean_category_main_detail_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_doi_cites_mean_category_main_detail_label_pred_doi_cites_mean_category_name_parent_label',
       'diff_rate_doi_cites_mean_category_main_detail_label_pred_doi_cites_mean_category_name_parent_label',
       'diff_rate_doi_cites_mean_category_main_detail_label_pred_doi_cites_mean_category_name_parent_main_label',
       'diff_doi_cites_mean_category_main_detail_label_pred_doi_cites_mean_category_name_label',
       'diff_rate_doi_cites_mean_category_main_detail_label_pred_doi_cites_mean_category_name_label',
       'diff_rate_doi_cites_mean_category_name_parent_label_pred_doi_cites_mean_update_ym',
       'diff_doi_cites_mean_category_name_parent_label_pred_doi_cites_mean_category_main_label',
       'diff_rate_doi_cites_mean_category_name_parent_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_doi_cites_mean_category_name_parent_label_pred_doi_cites_mean_category_name_parent_label',
       'diff_doi_cites_mean_category_name_parent_main_label_pred_doi_cites_mean_license_label',
       'diff_rate_doi_cites_mean_category_name_parent_main_label_pred_doi_cites_mean_license_label',
       'diff_doi_cites_mean_category_name_parent_main_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_doi_cites_mean_category_name_parent_main_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_doi_cites_mean_category_name_parent_main_label_pred_doi_cites_mean_category_name_parent_main_label',
       'diff_rate_doi_cites_mean_category_name_label_pred_doi_cites_mean_pub_publisher_label',
       'diff_doi_cites_mean_category_name_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_doi_cites_mean_category_name_label_pred_doi_cites_mean_category_name_label',
       'diff_pred_doi_cites_mean_submitter_label_pred_doi_cites_mean_doi_id_label',
       'diff_rate_pred_doi_cites_mean_submitter_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_pred_doi_cites_mean_doi_id_label_pred_doi_cites_mean_author_first_label',
       'diff_rate_pred_doi_cites_mean_doi_id_label_pred_doi_cites_mean_pub_publisher_label',
       'diff_pred_doi_cites_mean_doi_id_label_pred_doi_cites_mean_update_ym',
       'diff_pred_doi_cites_mean_doi_id_label_pred_doi_cites_mean_first_created_ym',
       'diff_rate_pred_doi_cites_mean_doi_id_label_pred_doi_cites_mean_license_label',
       'diff_rate_pred_doi_cites_mean_doi_id_label_pred_doi_cites_mean_category_main_label',
       'diff_pred_doi_cites_mean_doi_id_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_pred_doi_cites_mean_doi_id_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_pred_doi_cites_mean_doi_id_label_pred_doi_cites_mean_category_name_label',
       'diff_pred_doi_cites_mean_author_first_label_pred_doi_cites_mean_pub_publisher_label',
       'diff_rate_pred_doi_cites_mean_author_first_label_pred_doi_cites_mean_update_ym',
       'diff_pred_doi_cites_mean_author_first_label_pred_doi_cites_mean_category_main_label',
       'diff_rate_pred_doi_cites_mean_author_first_label_pred_doi_cites_mean_category_main_label',
       'diff_rate_pred_doi_cites_mean_author_first_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_pred_doi_cites_mean_author_first_label_pred_doi_cites_mean_category_name_parent_label',
       'diff_pred_doi_cites_mean_author_first_label_pred_doi_cites_mean_category_name_parent_main_label',
       'diff_rate_pred_doi_cites_mean_pub_publisher_label_pred_doi_cites_mean_update_ym',
       'diff_pred_doi_cites_mean_pub_publisher_label_pred_doi_cites_mean_first_created_ym',
       'diff_rate_pred_doi_cites_mean_pub_publisher_label_pred_doi_cites_mean_first_created_ym',
       'diff_pred_doi_cites_mean_pub_publisher_label_pred_doi_cites_mean_license_label',
       'diff_rate_pred_doi_cites_mean_pub_publisher_label_pred_doi_cites_mean_license_label',
       'diff_pred_doi_cites_mean_pub_publisher_label_pred_doi_cites_mean_category_main_label',
       'diff_rate_pred_doi_cites_mean_pub_publisher_label_pred_doi_cites_mean_category_main_label',
       'diff_pred_doi_cites_mean_pub_publisher_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_pred_doi_cites_mean_pub_publisher_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_pred_doi_cites_mean_pub_publisher_label_pred_doi_cites_mean_category_name_parent_main_label',
       'diff_pred_doi_cites_mean_pub_publisher_label_pred_doi_cites_mean_category_name_label',
       'diff_rate_pred_doi_cites_mean_update_ym_pred_doi_cites_mean_license_label',
       'diff_pred_doi_cites_mean_update_ym_pred_doi_cites_mean_category_main_label',
       'diff_rate_pred_doi_cites_mean_update_ym_pred_doi_cites_mean_category_main_label',
       'diff_pred_doi_cites_mean_update_ym_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_pred_doi_cites_mean_update_ym_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_pred_doi_cites_mean_update_ym_pred_doi_cites_mean_category_name_parent_label',
       'diff_pred_doi_cites_mean_update_ym_pred_doi_cites_mean_category_name_parent_main_label',
       'diff_pred_doi_cites_mean_first_created_ym_pred_doi_cites_mean_license_label',
       'diff_rate_pred_doi_cites_mean_first_created_ym_pred_doi_cites_mean_category_name_parent_main_label',
       'diff_pred_doi_cites_mean_license_label_pred_doi_cites_mean_category_main_label',
       'diff_rate_pred_doi_cites_mean_license_label_pred_doi_cites_mean_category_main_label',
       'diff_pred_doi_cites_mean_license_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_pred_doi_cites_mean_license_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_pred_doi_cites_mean_license_label_pred_doi_cites_mean_category_name_parent_label',
       'diff_pred_doi_cites_mean_license_label_pred_doi_cites_mean_category_name_parent_main_label',
       'diff_rate_pred_doi_cites_mean_license_label_pred_doi_cites_mean_category_name_parent_main_label',
       'diff_pred_doi_cites_mean_category_main_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_rate_pred_doi_cites_mean_category_main_label_pred_doi_cites_mean_category_main_detail_label',
       'diff_pred_doi_cites_mean_category_main_label_pred_doi_cites_mean_category_name_parent_main_label',
       'diff_rate_pred_doi_cites_mean_category_main_label_pred_doi_cites_mean_category_name_parent_main_label',
       'diff_pred_doi_cites_mean_category_main_detail_label_pred_doi_cites_mean_category_name_parent_label',
       'diff_rate_pred_doi_cites_mean_category_main_detail_label_pred_doi_cites_mean_category_name_parent_label',
       'diff_rate_pred_doi_cites_mean_category_main_detail_label_pred_doi_cites_mean_category_name_parent_main_label',
       'diff_pred_doi_cites_mean_category_main_detail_label_pred_doi_cites_mean_category_name_label',
       'diff_rate_pred_doi_cites_mean_category_main_detail_label_pred_doi_cites_mean_category_name_label',
       'diff_rate_pred_doi_cites_mean_category_name_parent_label_pred_doi_cites_mean_category_name_label',
       'diff_pred_doi_cites_mean_category_name_parent_main_label_pred_doi_cites_mean_category_name_label',
       'diff_rate_pred_doi_cites_mean_category_name_parent_main_label_pred_doi_cites_mean_category_name_label',
       'is_null_comments', 'is_null_journal-ref']


In [5]:
importance_0_cols.extend(['id', 'authors', 'title', 'comments',
    'journal-ref', 'doi', 'report-no', 'categories', 'license',
    'abstract', 'versions', 'authors_parsed', 'pub_publisher',
    'update_date_y', 'first_created_date', 'last_created_date', 'doi_id', 'submitter', 'author_first', 'category_main', 'category_main_detail', 'category_name_parent_main_unique', 'category_name_parent_unique', 'category_name_unique',
    'submitter_label','doi_id_label','author_first_label','pub_publisher_label',
    'license_label','category_main_label','category_name_parent_label','category_name_parent_main_label', 'category_name_label'
    ])

In [6]:
NFOLDS = 5

In [7]:
result_y_lgb = []

predict_data = df_predict.copy()
predict_data = predict_data.drop(importance_0_cols, axis=1)
test_index = []
result = np.zeros([NFOLDS, len(df_predict)])
for fold_no in range(NFOLDS):
    with open(f'../models/lgb_{fold_no}.pickle', mode='rb') as f:
        model = pickle.load(f)
        
    result[fold_no] = model.predict(predict_data)
    print(f"fold {fold_no}")

result.shape
lgb = result.mean(axis=0)
lgb.shape

fold 0
fold 1
fold 2
fold 3
fold 4


(59084,)

In [8]:
result_y_lgb = []

predict_data = df_predict.copy()
predict_data = predict_data.drop(importance_0_cols, axis=1)
test_index = []
result = np.zeros([NFOLDS, len(df_predict)])
for fold_no in range(NFOLDS):
    with open(f'../models/v2_lgb_{fold_no}.pickle', mode='rb') as f:
        model = pickle.load(f)
        
    result[fold_no] = model.predict(predict_data)
    print(f"fold {fold_no}")

result.shape
lgb2 = result.mean(axis=0)
lgb2.shape

fold 0
fold 1
fold 2
fold 3
fold 4


(59084,)

In [9]:
predict_data = df_predict.copy()
predict_data = predict_data.drop(importance_0_cols, axis=1)
result = np.zeros([NFOLDS, len(df_predict)])
for fold_no in range(NFOLDS):
    with open(f'../models/cat_{fold_no}.pickle', mode='rb') as f:
        model = pickle.load(f)
        
    result[fold_no] = model.predict(predict_data)
    print(f"fold {fold_no}")

result.shape
cat = result.mean(axis=0)
cat.shape

fold 0
fold 1
fold 2
fold 3
fold 4


(59084,)

In [10]:
predict_data = df_predict.copy()
predict_data = predict_data.drop(importance_0_cols, axis=1)
result = np.zeros([NFOLDS, len(df_predict)])
for fold_no in range(NFOLDS):
    with open(f'../models/v2_cat_{fold_no}.pickle', mode='rb') as f:
        model = pickle.load(f)
        
    result[fold_no] = model.predict(predict_data)
    print(f"fold {fold_no}")

result.shape
cat2 = result.mean(axis=0)
cat2.shape

fold 0
fold 1
fold 2
fold 3
fold 4


(59084,)

In [11]:
predict_data = df_predict.copy()
predict_data = predict_data.drop(importance_0_cols, axis=1)
predict_data = predict_data.fillna(0)

result = np.zeros([NFOLDS, len(df_predict)])
for fold_no in range(NFOLDS):
    with open(f'../models/rf_{fold_no}.pickle', mode='rb') as f:
        model = pickle.load(f)
        
    result[fold_no] = model.predict(predict_data)
    print(f"fold {fold_no}")

rf = result.mean(axis=0)
rf.shape

fold 0
fold 1
fold 2
fold 3
fold 4


(59084,)

In [12]:
predict_data = df_predict.copy()
predict_data = predict_data.drop(importance_0_cols, axis=1)
predict_data = predict_data.fillna(0)

result = np.zeros([NFOLDS, len(df_predict)])
for fold_no in range(NFOLDS):
    with open(f'../models/v2_rf_{fold_no}.pickle', mode='rb') as f:
        model = pickle.load(f)
        
    result[fold_no] = model.predict(predict_data)
    print(f"fold {fold_no}")

rf2 = result.mean(axis=0)
rf2.shape

fold 0
fold 1
fold 2
fold 3
fold 4


(59084,)

In [13]:
predict_data = df_predict.copy()
predict_data = predict_data.drop(importance_0_cols, axis=1)
result = np.zeros([NFOLDS, len(df_predict)])

scaler = StandardScaler()
predict_data = predict_data.fillna(0)
predict_data = scaler.fit_transform(predict_data)
for fold_no in range(NFOLDS):
    with open(f'../models/reg_{fold_no}.pickle', mode='rb') as f:
        model = pickle.load(f)
        
    result[fold_no] = model.predict(predict_data)
    print(f"fold {fold_no}")

reg = result.mean(axis=0)
reg.shape

fold 0
fold 1
fold 2
fold 3
fold 4


(59084,)

In [14]:
predict_data = df_predict.copy()
predict_data = predict_data.drop(importance_0_cols, axis=1)
result = np.zeros([NFOLDS, len(df_predict)])

scaler = StandardScaler()
predict_data = predict_data.fillna(0)
predict_data = scaler.fit_transform(predict_data)
for fold_no in range(NFOLDS):
    with open(f'../models/v2_reg_{fold_no}.pickle', mode='rb') as f:
        model = pickle.load(f)
        
    result[fold_no] = model.predict(predict_data)
    print(f"fold {fold_no}")

reg2 = result.mean(axis=0)
reg2.shape

fold 0
fold 1
fold 2
fold 3
fold 4


(59084,)

In [15]:
def build_model():
  input = tf.keras.layers.Input(1555, name="input")
  x = tf.keras.layers.Dense(256, activation='relu')(input)
  x = tf.keras.layers.Dropout(0.5)(x)
  x = tf.keras.layers.Dense(128, activation='relu')(x)
  outputs = tf.keras.layers.Dense(1, name='outputs')(x)
  optimizer = tf.keras.optimizers.Adam()

  model = tf.keras.Model(inputs=input, outputs=outputs)
  model.compile(
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
    optimizer=optimizer)
  return model
  
model = build_model()
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 1555)]            0         
_________________________________________________________________
dense (Dense)                (None, 256)               398336    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
outputs (Dense)              (None, 1)                 129       
Total params: 431,361
Trainable params: 431,361
Non-trainable params: 0
_________________________________________________________________


In [16]:
predict_data = df_predict.copy()
predict_data = predict_data.drop(importance_0_cols, axis=1)
result = np.zeros([NFOLDS, len(df_predict)])

scaler = StandardScaler()
predict_data = predict_data.fillna(0)
predict_data = scaler.fit_transform(predict_data)
for fold_no in range(NFOLDS):
    model = build_model()
    checkpoint_filepath = f'../models/nn_{fold_no}_weights.hdf5'
    model.load_weights(checkpoint_filepath)

    result[fold_no] = model.predict(predict_data).flatten()
    print(f"fold {fold_no}")

nn = result.mean(axis=0)
nn.shape

fold 0
fold 1
fold 2
fold 3
fold 4


(59084,)

In [30]:
def build_model2():
  input = tf.keras.layers.Input(1555, name="input")
  x = tf.keras.layers.Dense(512, activation='relu')(input)
  x = tf.keras.layers.Dropout(0.5)(x)
  x = tf.keras.layers.Dense(256, activation='relu')(x)
  outputs = tf.keras.layers.Dense(1, name='outputs')(x)
  optimizer = tf.keras.optimizers.Adam()

  model = tf.keras.Model(inputs=input, outputs=outputs)
  model.compile(
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
    optimizer=optimizer)
  return model
  
model = build_model()
model.summary()

Model: "functional_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 1555)]            0         
_________________________________________________________________
dense_22 (Dense)             (None, 256)               398336    
_________________________________________________________________
dropout_11 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 128)               32896     
_________________________________________________________________
outputs (Dense)              (None, 1)                 129       
Total params: 431,361
Trainable params: 431,361
Non-trainable params: 0
_________________________________________________________________


In [31]:
predict_data = df_predict.copy()
predict_data = predict_data.drop(importance_0_cols, axis=1)
result = np.zeros([NFOLDS, len(df_predict)])

scaler = StandardScaler()
predict_data = predict_data.fillna(0)
predict_data = scaler.fit_transform(predict_data)
for fold_no in range(NFOLDS):
    model = build_model2()
    checkpoint_filepath = f'../models/v2_nn_{fold_no}_weights.hdf5'
    model.load_weights(checkpoint_filepath)

    result[fold_no] = model.predict(predict_data).flatten()
    print(f"fold {fold_no}")

nn2 = result.mean(axis=0)
nn2.shape

fold 0
fold 1
fold 2
fold 3
fold 4


(59084,)

In [33]:
df_result = pd.DataFrame(lgb, columns=['lgb'])
df_result = pd.concat([df_result, pd.DataFrame(cat, columns=['cat'])], axis=1)
df_result = pd.concat([df_result, pd.DataFrame(rf, columns=['rf'])], axis=1)
df_result = pd.concat([df_result, pd.DataFrame(reg, columns=['reg'])], axis=1)
df_result = pd.concat([df_result, pd.DataFrame(nn, columns=['nn'])], axis=1)
df_result = pd.concat([df_result, pd.DataFrame(cat, columns=['lgb2'])], axis=1)
df_result = pd.concat([df_result, pd.DataFrame(cat, columns=['cat2'])], axis=1)
df_result = pd.concat([df_result, pd.DataFrame(rf, columns=['rf2'])], axis=1)
df_result = pd.concat([df_result, pd.DataFrame(reg, columns=['reg2'])], axis=1)
df_result = pd.concat([df_result, pd.DataFrame(nn, columns=['nn2'])], axis=1)
df_result.head(3)

Unnamed: 0,lgb,cat,rf,reg,nn,lgb2,cat2,rf2,reg2,nn2
0,2.538794,2.523266,2.444625,2.590539,2.458217,2.523266,2.523266,2.444625,2.590539,2.458217
1,3.229687,3.254991,3.247383,3.179583,3.034371,3.254991,3.254991,3.247383,3.179583,3.034371
2,4.531537,4.374481,4.453524,4.520423,4.692705,4.374481,4.374481,4.453524,4.520423,4.692705


In [34]:
result = np.zeros([NFOLDS, len(df_result)])
for fold_no in range(NFOLDS):
    with open(f'../models/v2_2nd_svr_{fold_no}.pickle', mode='rb') as f:
        model = pickle.load(f)
    result[fold_no] = model.predict(df_result)
reg1 = result.mean(axis=0)
reg1.shape

(59084,)

In [35]:
result = np.zeros([NFOLDS, len(df_result)])
for fold_no in range(NFOLDS):
    with open(f'../models/v2_2nd_ridge_{fold_no}.pickle', mode='rb') as f:
        model = pickle.load(f)
    result[fold_no] = model.predict(df_result)
reg2 = result.mean(axis=0)
reg2.shape

(59084,)

In [36]:
result = (reg1 * 0.2) + (reg2 * 0.8)
result

array([2.52779012, 3.1704312 , 4.54201973, ..., 2.4440839 , 1.60152347,
       4.00871844])

In [37]:
test_predicted = np.expm1(result)
test_predicted

array([11.52579497, 22.8177524 , 92.88022103, ..., 10.51999125,
        3.96058398, 54.07624159])

In [38]:
submit_df = pd.DataFrame({'id': df_predict['id']})
submit_df['cites'] = np.where(test_predicted < 0, 0, test_predicted)
submit_df.to_csv("submission_20210328_1023.csv", index=False)

In [39]:
df_temp = pd.DataFrame(test_predicted).rename(columns={0:'proba'})
df_temp.head()

Unnamed: 0,proba
0,11.525795
1,22.817752
2,92.880221
3,11.722009
4,10.741093


In [38]:
df_temp = pd.DataFrame(test_predicted).rename(columns={0:'proba'})
df_temp.head()

Unnamed: 0,proba
0,11.4699
1,22.764166
2,94.341634
3,11.828095
4,11.00662


In [None]:
df_temp[df_temp['proba'] <= 1]

In [None]:
df_temp = pd.DataFrame(test_predicted).rename(columns={0:'proba'})
df_temp.head()

In [40]:
pd.read_csv('submission20210328.csv')

Unnamed: 0,id,cites
0,1605.00995,11.968837
1,1206.6911,24.111304
2,cond-mat/0504055,88.449786
3,astro-ph/9907297,11.697223
4,1104.5407,11.513199
...,...,...
59079,1210.4112,20.889329
59080,1701.03465,5.116972
59081,1709.10428,10.521577
59082,gr-qc/9803020,3.669175
