In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import lightgbm as lgb
import optuna.integration.lightgbm as olgb

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Read the train and test splits; both paths are relative to the current
# working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_data.csv', '../data/test_data.csv')
)

In [3]:
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold


def label_encoding(train, test, target_cols):
    """Label-encode the given columns of both frames with a shared encoder.

    For each column a fresh ``LabelEncoder`` is fitted on the train values
    followed by the test values, so both frames share one code mapping.
    The columns are overwritten in place on the frames passed in.

    Args:
        train: Training frame; must contain every column in ``target_cols``.
        test: Test frame; must contain every column in ``target_cols``.
        target_cols: Iterable of column names to encode.

    Returns:
        The same ``(train, test)`` objects that were passed in, now encoded.
    """
    for col in target_cols:
        encoder = preprocessing.LabelEncoder()
        train_values = list(train[col].values)
        test_values = list(test[col].values)
        # Fit on the union so a value present in only one split still gets a code.
        encoder.fit(train_values + test_values)
        train[col] = encoder.transform(train_values)
        test[col] = encoder.transform(test_values)
    return train, test

In [4]:
# Flag rows whose area is Tokyo (東京都) or Osaka (大阪府) as 1, all others as 0,
# applying the same rule to both splits.
for frame in (train, test):
    frame['is_tokyo_osaka'] = frame['area'].isin(['東京都', '大阪府']).astype(int)

In [5]:
# Show the training frame to confirm the new is_tokyo_osaka column.
train

Unnamed: 0,id,position,age,area,sex,partner,num_child,education,service_length,study_time,commute,overtime,salary,is_tokyo_osaka
0,0,1,44,愛知県,2,1,2,1,24,2.0,1.6,9.2,428.074887,0
1,1,2,31,奈良県,1,0,0,0,13,9.0,0.7,12.4,317.930517,0
2,2,2,36,山口県,1,0,0,2,14,4.0,0.4,16.9,357.350316,0
3,3,0,22,東京都,2,0,0,0,4,3.0,0.4,6.1,201.310911,1
4,4,0,25,鹿児島県,2,0,0,1,5,3.0,0.2,4.9,178.067475,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20995,20995,0,27,石川県,2,0,0,1,7,2.0,0.2,15.5,181.735475,0
20996,20996,0,22,福岡県,2,0,0,0,4,3.0,0.2,13.4,201.720711,0
20997,20997,3,36,滋賀県,2,0,0,2,14,4.0,0.8,12.7,364.386736,0
20998,20998,0,21,山梨県,2,1,2,1,0,1.0,1.0,10.9,235.686449,0


In [6]:
train['is_tokyo_osaka_and_partner'] = train['is_tokyo_osaka'].astype(str) + train['partner'].astype(str)
test['is_tokyo_osaka_and_partner'] = test['is_tokyo_osaka'].astype(str) + test['partner'].astype(str)
train

Unnamed: 0,id,position,age,area,sex,partner,num_child,education,service_length,study_time,commute,overtime,salary,is_tokyo_osaka,is_tokyo_osaka_and_partner
0,0,1,44,愛知県,2,1,2,1,24,2.0,1.6,9.2,428.074887,0,01
1,1,2,31,奈良県,1,0,0,0,13,9.0,0.7,12.4,317.930517,0,00
2,2,2,36,山口県,1,0,0,2,14,4.0,0.4,16.9,357.350316,0,00
3,3,0,22,東京都,2,0,0,0,4,3.0,0.4,6.1,201.310911,1,10
4,4,0,25,鹿児島県,2,0,0,1,5,3.0,0.2,4.9,178.067475,0,00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20995,20995,0,27,石川県,2,0,0,1,7,2.0,0.2,15.5,181.735475,0,00
20996,20996,0,22,福岡県,2,0,0,0,4,3.0,0.2,13.4,201.720711,0,00
20997,20997,3,36,滋賀県,2,0,0,2,14,4.0,0.8,12.7,364.386736,0,00
20998,20998,0,21,山梨県,2,1,2,1,0,1.0,1.0,10.9,235.686449,0,01


In [7]:
# Summarise the training frame: column dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          21000 non-null  int64  
 1   position                    21000 non-null  int64  
 2   age                         21000 non-null  int64  
 3   area                        21000 non-null  object 
 4   sex                         21000 non-null  int64  
 5   partner                     21000 non-null  int64  
 6   num_child                   21000 non-null  int64  
 7   education                   21000 non-null  int64  
 8   service_length              21000 non-null  int64  
 9   study_time                  21000 non-null  float64
 10  commute                     21000 non-null  float64
 11  overtime                    21000 non-null  float64
 12  salary                      21000 non-null  float64
 13  is_tokyo_osaka              210

In [8]:
# Turn the combined string key into integer labels, using one encoder fitted
# on train and test together.
encode_cols = ['is_tokyo_osaka_and_partner']
train, test = label_encoding(train, test, encode_cols)

In [9]:
# Show the training frame after label encoding; is_tokyo_osaka_and_partner now
# holds integer codes instead of strings.
train

Unnamed: 0,id,position,age,area,sex,partner,num_child,education,service_length,study_time,commute,overtime,salary,is_tokyo_osaka,is_tokyo_osaka_and_partner
0,0,1,44,愛知県,2,1,2,1,24,2.0,1.6,9.2,428.074887,0,1
1,1,2,31,奈良県,1,0,0,0,13,9.0,0.7,12.4,317.930517,0,0
2,2,2,36,山口県,1,0,0,2,14,4.0,0.4,16.9,357.350316,0,0
3,3,0,22,東京都,2,0,0,0,4,3.0,0.4,6.1,201.310911,1,2
4,4,0,25,鹿児島県,2,0,0,1,5,3.0,0.2,4.9,178.067475,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20995,20995,0,27,石川県,2,0,0,1,7,2.0,0.2,15.5,181.735475,0,0
20996,20996,0,22,福岡県,2,0,0,0,4,3.0,0.2,13.4,201.720711,0,0
20997,20997,3,36,滋賀県,2,0,0,2,14,4.0,0.8,12.7,364.386736,0,0
20998,20998,0,21,山梨県,2,1,2,1,0,1.0,1.0,10.9,235.686449,0,1


In [10]:
# 5-fold stratified splitter; rows are shuffled with a fixed seed so the fold
# assignment is reproducible across runs.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=71)

In [11]:
# Placeholder column for the fold assignment. Initialising with NaN makes the
# column float-typed, so fold ids are later stored as 0.0, 1.0, ...
train['fold_id'] = np.nan

In [12]:
# Show the training frame with the still-empty fold_id column.
train

Unnamed: 0,id,position,age,area,sex,partner,num_child,education,service_length,study_time,commute,overtime,salary,is_tokyo_osaka,is_tokyo_osaka_and_partner,fold_id
0,0,1,44,愛知県,2,1,2,1,24,2.0,1.6,9.2,428.074887,0,1,
1,1,2,31,奈良県,1,0,0,0,13,9.0,0.7,12.4,317.930517,0,0,
2,2,2,36,山口県,1,0,0,2,14,4.0,0.4,16.9,357.350316,0,0,
3,3,0,22,東京都,2,0,0,0,4,3.0,0.4,6.1,201.310911,1,2,
4,4,0,25,鹿児島県,2,0,0,1,5,3.0,0.2,4.9,178.067475,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20995,20995,0,27,石川県,2,0,0,1,7,2.0,0.2,15.5,181.735475,0,0,
20996,20996,0,22,福岡県,2,0,0,0,4,3.0,0.2,13.4,201.720711,0,0,
20997,20997,3,36,滋賀県,2,0,0,2,14,4.0,0.8,12.7,364.386736,0,0,
20998,20998,0,21,山梨県,2,1,2,1,0,1.0,1.0,10.9,235.686449,0,1,


In [13]:
# Assign each row the number of the fold in which it is held out for validation.
# Folds are stratified on the encoded Tokyo/Osaka x partner key so every fold
# keeps the same mix of those groups.
#
# cv.split yields *positional* row indices, so write through .iloc rather than
# label-based .loc: the two only coincide while train has a default RangeIndex
# (for example, not after filtering, sorting or set_index).
fold_col = train.columns.get_loc('fold_id')
for i, (train_index, valid_index) in enumerate(cv.split(train, train['is_tokyo_osaka_and_partner'])):
    train.iloc[valid_index, fold_col] = i



In [14]:
# Show the training frame with the assigned fold ids.
train

Unnamed: 0,id,position,age,area,sex,partner,num_child,education,service_length,study_time,commute,overtime,salary,is_tokyo_osaka,is_tokyo_osaka_and_partner,fold_id
0,0,1,44,愛知県,2,1,2,1,24,2.0,1.6,9.2,428.074887,0,1,1.0
1,1,2,31,奈良県,1,0,0,0,13,9.0,0.7,12.4,317.930517,0,0,3.0
2,2,2,36,山口県,1,0,0,2,14,4.0,0.4,16.9,357.350316,0,0,4.0
3,3,0,22,東京都,2,0,0,0,4,3.0,0.4,6.1,201.310911,1,2,3.0
4,4,0,25,鹿児島県,2,0,0,1,5,3.0,0.2,4.9,178.067475,0,0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20995,20995,0,27,石川県,2,0,0,1,7,2.0,0.2,15.5,181.735475,0,0,1.0
20996,20996,0,22,福岡県,2,0,0,0,4,3.0,0.2,13.4,201.720711,0,0,3.0
20997,20997,3,36,滋賀県,2,0,0,2,14,4.0,0.8,12.7,364.386736,0,0,2.0
20998,20998,0,21,山梨県,2,1,2,1,0,1.0,1.0,10.9,235.686449,0,1,3.0


In [None]:
# Save only the fold_id column, without the index; rows are written in train's
# current order, so consumers must align by row position.
train['fold_id'].to_csv('../outputs/fold_id.csv', index=False)