<a href="https://colab.research.google.com/github/dAn-solution/competition/blob/main/Signate_beginner_008.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 【第19回_Beginner限定コンペ】国勢調査からの収入予測
- 教育年数や職業等の国勢調査データから年収が$50,000ドルを超えるかどうかを予測しよう。
- GBDT（勾配ブースティング木） を実施

### Google Driveのマウント

In [2]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/'My Drive'

Mounted at /content/drive
/content/drive/My Drive


### ライブラリのインストール、インポート

In [3]:
# カレントディレクトリを変更
import os
os.chdir('/content/drive/My Drive/signate/Beginner-19/')
print(os.getcwd())

/content/drive/My Drive/signate/Beginner-19


In [4]:
class Config():
    root_path = './'
    input_path = os.path.join(root_path, 'input')
    output_path = os.path.join(root_path, 'output')
    intermediate_path = os.path.join(root_path, 'intermediate')
    seed = 42
    debug = False

In [5]:
import pandas as pd
import numpy as np
import warnings
import datetime

import scipy.stats as stats

import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

from sklearn.ensemble import GradientBoostingRegressor

### データの読み込み

In [6]:
train_df = pd.read_csv(f'{Config.input_path}/train.csv')
test_df = pd.read_csv(f'{Config.input_path}/test.csv')
sample_df = pd.read_csv(f'{Config.input_path}/sample_submit.csv', header=None)

test_df = test_df.rename(columns={'id': 'index'})

## GBDTの実行前準備

In [7]:
train_df.columns

Index(['index', 'age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'native-country', 'Y'],
      dtype='object')

In [8]:
test_df.columns

Index(['index', 'age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'native-country'],
      dtype='object')

In [9]:
# columnsをターゲット変数とtargetとカテゴリカル変数に分割
target_column = "Y"
train_column = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
category_column = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'native-country']

y = train_df[target_column]
X = train_df[category_column]
test_X = test_df[category_column]
train_index = train_df['index']
test_index = test_df['index']

traintest = pd.concat([X, test_X], ignore_index = True)

In [10]:
# 説明変数をラベルエンコーディング
warnings.simplefilter('ignore')
for column in traintest.columns:
  le = preprocessing.LabelEncoder()
  target_column = traintest[column]
  train_target_column = X[column]
  test_target_column = test_X[column]  
  le.fit(target_column)
  train_label_encoded_column = le.transform(train_target_column)
  test_label_encoded_column = le.transform(test_target_column)
  X[column] = pd.Series(train_label_encoded_column).astype("category")
  test_X[column] = pd.Series(test_label_encoded_column).astype("category")

In [11]:
X['age'] = train_df['age']
# X['fnlwgt'] = train_df['fnlwgt']
X['education_num'] = train_df['education-num']
test_X['age'] = test_df['age']
# test_X['fnlwgt'] = test_df['fnlwgt']
test_X['education_num'] = test_df['education-num']

In [12]:
# 標準化
ss = preprocessing.StandardScaler()
ss.fit(X)
X = pd.DataFrame(ss.transform(X))
test_X = pd.DataFrame(ss.transform(test_X))

In [13]:
# trainデータを訓練用とテスト用に分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = Config.seed)

## GBDTの実行

In [14]:
from scipy.sparse.construct import rand
# 学習/予測

y_oof = np.zeros(len(y_train))
y_preds = []
kf = StratifiedKFold(n_splits=5)
for fold, (tr_idx, vl_idx) in enumerate(kf.split(X_train, y_train)):
    x_tr_fold = X.iloc[tr_idx]
    y_tr_fold = y.iloc[tr_idx]
    x_vl_fold = X.iloc[vl_idx]
    y_vl_fold = y.iloc[vl_idx]

    # データセットを生成する
    # param_grid = {"n_estimators":[100,500,1000], # 2000まではいらない
    #           "max_features": [1, 2, 3, 4, 5, 7, 10],
    #           "max_depth": [3,5,7,10,15,None], #,20,30は過学習を引き起こす
    #           "min_samples_leaf":  [1, 2, 4],
    #           "min_samples_split": [2, 5, 10]
    #          } 
    model = GradientBoostingRegressor(n_estimators=200, max_features=2, max_depth=3, min_samples_leaf=2,random_state=Config.seed)
    model.fit(x_tr_fold, y_tr_fold)
    y_oof[vl_idx] = np.where(model.predict(x_vl_fold) > 0.5, 1, 0)
    y_preds.append(model.predict(test_X))
    print(
        f'fold {fold} score:', accuracy_score(np.where(y_oof[vl_idx]>0.5,1,0), y_vl_fold)
    )

fold 0 score: 0.8780637254901961
fold 1 score: 0.8743872549019608
fold 2 score: 0.8860294117647058
fold 3 score: 0.8811274509803921
fold 4 score: 0.8805147058823529


### 提出データの作成

In [None]:
y_preds

[array([ 0.00456268,  0.05021487,  0.48609943, ..., -0.01939822,
        -0.01103572, -0.02658347]),
 array([ 0.09118492,  0.03278729,  0.54411699, ...,  0.00737089,
        -0.0149267 , -0.02385903]),
 array([ 0.05462345,  0.08922477,  0.50186597, ..., -0.00570989,
        -0.00866761,  0.00533465]),
 array([ 0.05275998,  0.07061959,  0.49793961, ...,  0.0100709 ,
        -0.01463435, -0.03282768]),
 array([ 0.07747491,  0.04193647,  0.48593743, ...,  0.03439174,
        -0.00592243, -0.02040266])]

In [None]:
for i in range(len(y_preds)):
    y_preds[i] = np.where(y_preds[i]>0.5,1,0)
result = stats.mode(y_preds, axis=0)

In [None]:
result = result[0].reshape([6800])

In [None]:
sample_df[1] = result
print(sample_df)

          0  1
0     10200  0
1     10201  0
2     10202  0
3     10203  0
4     10204  1
...     ... ..
6795  16995  0
6796  16996  0
6797  16997  0
6798  16998  0
6799  16999  0

[6800 rows x 2 columns]


In [None]:
# submitファイルの出力(Google Driveに出力）

# sample_df.to_csv(os.path.join(Config.intermediate_path, "submit_008.csv"), index=False)

In [None]:
# submitファイルのファイル名に利用する作成時刻の取得

now = datetime.datetime.now(datetime.timezone(datetime.timedelta(hours=1)))
date_time = "{0:%Y%m%d_%H%M}".format(now)

In [None]:
# submitファイルの出力(Google Driveに出力）

sample_df.to_csv(os.path.join(Config.output_path, f"submit_{date_time}.csv"), index=False)