In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install signate

from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth


# Authenticate the Colab user before building the Drive API client.
auth.authenticate_user()

# Search Drive for the file named exactly 'signate.json'; only ids are requested.
drive_service = build('drive', 'v3')
results = drive_service.files().list(
        q="name = 'signate.json'", fields="files(id)").execute()
signate_api_key = results.get('files', [])
if not signate_api_key:
    # Fail with an actionable message instead of an IndexError on [0] below.
    raise FileNotFoundError("signate.json was not found on Google Drive")

filename = "/root/.signate/signate.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Download the first match chunk by chunk; the context manager guarantees the
# file handle is flushed and closed even if a chunk request fails.
request = drive_service.files().get_media(fileId=signate_api_key[0]['id'])
with io.FileIO(filename, 'wb') as fh:
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))

# Permission bits must be written in octal: decimal 600 equals 0o1130, which is
# not the intended owner-only read/write mode.
os.chmod(filename, 0o600)

Download 100%.


In [None]:
# List SIGNATE competitions (id, title, closing date, prize, submitters).
!signate list

  competitionId  title                                                                             closing     prize        submitters
---------------  --------------------------------------------------------------------------------  ----------  ---------  ------------
              1  【練習問題】銀行の顧客ターゲティング                                              -                              7368
             24  【練習問題】お弁当の需要予測                                                      -                              9032
             27  【練習問題】Jリーグの観客動員数予測                                               -                              2267
            100  【練習問題】手書き文字認識                                                        -           Knowledge           395
            102  【練習問題】タイタニックの生存予測                                                -           Knowledge          3304
            103  【練習問題】音楽ラベリング                                                        -           Knowledge           203
            

In [None]:
# List the files published for competition 1393.
! signate files --competition-id=1393

  fileId  name               title                               size  updated_at
--------  -----------------  --------------------------------  ------  -------------------
    2520  train.csv          学習用データ                      137862  2025-01-29 16:46:50
    2521  test.csv           評価用データ                       88524  2025-01-29 16:47:06
    2522  sample_submit.csv  サンプルサブミッション用ファイル   13006  2025-01-29 16:47:22


In [None]:
# Download the files of competition 1393 (train.csv, test.csv, sample_submit.csv).
!signate download --competition-id=1393

sample_submit.csv

test.csv

train.csv

[32m
Download completed.[0m


### ライブラリ

In [None]:
import pandas as pd
import numpy as np

In [None]:
# train.csv and test.csv are read with their first row as the header, whereas
# sample_submit.csv is read as header-less, so its two columns are named here.
submission = pd.read_csv(
    "/content/sample_submit.csv",
    header=None,
    names=["index", "Outcome"],
)
train_raw = pd.read_csv("/content/train.csv", header=0)
test_raw = pd.read_csv("/content/test.csv")

In [None]:
# (rows, columns) of the training frame.
train_raw.shape

(2861, 10)

In [None]:
# (rows, columns) of the test frame.
test_raw.shape

(1919, 9)

In [None]:
# (rows, columns) of the sample submission.
submission.shape

(1919, 2)

In [None]:
# Peek at the first training row.
train_raw.head(1)

Unnamed: 0,index,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,200,9,125,74,0,0,28.53691,0.444902,45,1


In [None]:
# Peek at the first test row.
test_raw.head(1)

Unnamed: 0,index,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,398,0,126,80,0,0,40.096264,0.822517,21


In [None]:
# Peek at the first row of the sample submission.
submission.head(1)

Unnamed: 0,index,Outcome
0,398,0


In [None]:
# List the column labels of the training frame.
# NOTE(review): `.columns` shows labels only; per-column dtypes would come from `train_raw.dtypes`.
train_raw.columns

Index(['index', 'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
       'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [None]:
# Split the columns into categorical (object dtype) and numeric ones.
# Each column's dtype is inspected: comparing the column *name* to 'object'
# never matches, which sent every column to the numeric list regardless of type.
# The original note says this dataset has no categorical columns (so no
# encoding step follows); re-check the printed result after this fix.
cat_col = []
num_col = []

for col in train_raw.columns:
  if train_raw[col].dtype == 'object':
    cat_col.append(col)
  else:
    num_col.append(col)

In [None]:
# Report how many columns were collected in cat_col, and which ones.
print(f"In these features, there are {len(cat_col)} CATEGORICAL FEATURES: {cat_col}")

In these features, there are 0 CATEGORICAL FEATURES: []


In [None]:
# Count missing values per column.
# (The recorded output shows none, so no imputation step follows.)
train_raw.isna().sum()

Unnamed: 0,0
index,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [None]:
# Columns excluded from the model inputs: the `index` column and the
# `Outcome` target. Every other training column becomes a feature.
RMV = ["index", "Outcome"]
FEATURES = [name for name in train_raw.columns if name not in RMV]

In [None]:
# Stack train and test into one frame with a fresh 0..N-1 index, then cut it
# back apart by position: the first len(train_raw) rows are the training data.
# Test rows carry Outcome = NaN because test.csv has no such column.
combined = pd.concat([train_raw, test_raw], axis=0, ignore_index=True)

train = combined.iloc[:len(train_raw)].copy()
test = combined.iloc[len(train_raw):].reset_index(drop=True).copy()

In [None]:
# Row count of the training split; should equal the row count of train_raw.
len(train)

2861

In [None]:
# Row count of the test split; should equal the row count of test_raw.
len(test)

1919

In [None]:
# Peek at the first test row after the concat/split (Outcome is empty here).
test.head(1)

Unnamed: 0,index,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,398,0,126,80,0,0,40.096264,0.822517,21,


In [None]:
# ベースラインを作成する
from sklearn.model_selection import KFold # K分割
from xgboost import XGBRegressor, XGBClassifier
import xgboost
import lightgbm as lgb

# LightGBM parameters for the binary-classification baseline.
# The number of boosting rounds and the metric are left unset (the earlier
# overrides were commented out), so LightGBM's defaults apply.
lgbm_params = dict(
    task="train",
    objective="binary",
    boosting="gbdt",
    learning_rate=0.1,
    device_type="cpu",
    seed=1234,
)

In [None]:
%%time
FOLDS = 5  # number of cross-validation splits
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for the training rows (used for the CV score)
oof_lgb = np.zeros(len(train))

# Test-set predictions, summed over the folds and averaged after the loop
pred_lgb = np.zeros(len(test))

# The test features are identical for every fold, so build them once.
x_test = test[FEATURES].copy()

for i, (train_index, valid_index) in enumerate(kf.split(train)):

  print("#"*25)
  print(f"### Fold {i+1}")
  print("#"*25)

  # Slice this fold's rows. `train` carries a 0..N-1 index, so the positional
  # indices returned by KFold are also valid labels for .loc.
  x_train = train.loc[train_index, FEATURES].copy()
  y_train = train.loc[train_index, "Outcome"]
  x_valid = train.loc[valid_index, FEATURES].copy()
  y_valid = train.loc[valid_index, "Outcome"]

  # Build the LightGBM datasets; the validation set reuses the training bins.
  lgb_train = lgb.Dataset(x_train, y_train)
  lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train)

  # Train with early stopping on the validation fold.
  # `verbose` is a boolean flag; the former -1 was truthy, so logging stays on.
  model_lgb = lgb.train(lgbm_params,
                        lgb_train,
                        valid_sets=[lgb_eval],
                        callbacks=[
                            lgb.early_stopping(stopping_rounds=10,
                                               verbose=True)])

  # Predict once per target, explicitly at the best iteration. The earlier
  # extra prediction stored in an unused variable is removed.
  best_iter = model_lgb.best_iteration
  oof_lgb[valid_index] = model_lgb.predict(x_valid, num_iteration=best_iter)
  pred_lgb += model_lgb.predict(x_test, num_iteration=best_iter)

# Average the accumulated test predictions over the folds
pred_lgb /= FOLDS

#########################
### Fold 1
#########################
[LightGBM] [Info] Number of positive: 555, number of negative: 1733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000142 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 754
[LightGBM] [Info] Number of data points in the train set: 2288, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242570 -> initscore=-1.138641
[LightGBM] [Info] Start training from score -1.138641
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.383275
#########################
### Fold 2
#########################
[LightGBM] [Info] Number of positive: 532, number of negative: 1757
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] To

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [None]:
# Binarise the out-of-fold probabilities: strictly above 0.5 -> 1, otherwise 0.
y_val = [1 if prob > 0.5 else 0 for prob in oof_lgb]

In [None]:
# Accuracy of the thresholded out-of-fold predictions against the true labels.
y_true = train["Outcome"].tolist()
# scikit-learn's signature is accuracy_score(y_true, y_pred).
m = accuracy_score(y_true, y_val)
# "\n" (not "/n") is the newline escape; the old literal printed "/n" verbatim.
print("\nOverall CV for LightGBM = ", m)

/nOverall CV for LightGBM =  0.8035651869975533


In [None]:
# Binarise the fold-averaged test probabilities: strictly above 0.5 -> 1, otherwise 0.
y_pred = [1 if prob > 0.5 else 0 for prob in pred_lgb]

In [None]:
# Write the predicted labels into the submission frame. Assigning the plain
# list is positional; assigning a DataFrame would align on the index and
# silently leave NaN if the row counts ever differed.
assert len(y_pred) == len(submission), "prediction count must match the submission rows"
submission["Outcome"] = y_pred

In [None]:
# Relabel the two columns ("index", "Outcome") as '0' and '1' in place.
# NOTE(review): the CSV is written with header=False, so these labels never reach the file;
# after this runs, "Outcome" no longer exists as a column label on `submission`.
submission.columns = ['0', '1']

In [None]:
# Write the submission file without a header row and without the DataFrame index.
submission.to_csv("submission.csv", header=False, index=False)