In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-3.1.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.1 (from mlflow)
  Downloading mlflow_skinny-3.1.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading databricks_sdk-0.58.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading opentelemetry_api-1.35.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading opentele

In [7]:
base_dir = "/content/drive/MyDrive/学校/3年/1前期/1月/人工知能システム開発/2025-07-14/spaceship-titanic"

In [8]:
# データ読み込み＆型確認

import pandas as pd
from pathlib import Path

# ディレクトリパス
DATA_DIR = Path(base_dir) / "data"

# CSV読み込み
train = pd.read_csv(DATA_DIR / "train.csv")
test  = pd.read_csv(DATA_DIR / "test.csv")

# 先頭数行と各列の型を表示
print("---- train head ----")
display(train.head())

print("\n---- train.dtypes ----")
print(train.dtypes)

print("\n---- test.dtypes ----")
print(test.dtypes)


---- train head ----


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True



---- train.dtypes ----
PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

---- test.dtypes ----
PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
dtype: object


In [10]:
# データ読み込み＆CryoSleep, VIP 列の確認
import pandas as pd
from pathlib import Path

# 既に base_dir を定義済みとのことなので、そのまま利用
DATA_DIR = Path(base_dir) / "data"

# 再読み込み
train = pd.read_csv(DATA_DIR / "train.csv")
test  = pd.read_csv(DATA_DIR / "test.csv")

# CryoSleep と VIP の型と先頭数行を表示
print("---- train: CryoSleep, VIP の dtype ----")
print(train[["CryoSleep", "VIP"]].dtypes)
print(train[["CryoSleep", "VIP"]].head())

print("\n---- test: CryoSleep, VIP の dtype ----")
print(test[["CryoSleep", "VIP"]].dtypes)
print(test[["CryoSleep", "VIP"]].head())


---- train: CryoSleep, VIP の dtype ----
CryoSleep    object
VIP          object
dtype: object
  CryoSleep    VIP
0     False  False
1     False  False
2     False   True
3     False  False
4     False  False

---- test: CryoSleep, VIP の dtype ----
CryoSleep    object
VIP          object
dtype: object
  CryoSleep    VIP
0      True  False
1     False  False
2      True  False
3     False  False
4     False  False


In [11]:
# CryoSleep, VIP の数値化＆列リスト再確認
import pandas as pd
from pathlib import Path

# 既に読み込まれた train, test をそのまま使います

# 1) マッピング変換
for col in ["CryoSleep", "VIP"]:
    train[col] = train[col].map({"False": 0, "True": 1})
    test[col]  = test[col].map({"False": 0, "True": 1})

# 2) 欠損確認（もしあれば imputer で埋めます）
print("=== 欠損数 ===")
print(train[["CryoSleep","VIP"]].isnull().sum(), "\n")

# 3) dtype 確認
print("=== 変換後 dtype ===")
print(train[["CryoSleep","VIP"]].dtypes, "\n")
print(train[["CryoSleep","VIP"]].head(), "\n")

# 4) 前処理用に数値列・カテゴリ列を再生成
num_cols = train.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = train.select_dtypes(include=["object"]).columns.tolist()

print("=== 数値列候補 ===")
print(num_cols, "\n")
print("=== カテゴリ列候補 ===")
print(cat_cols)


=== 欠損数 ===
CryoSleep    8693
VIP          8693
dtype: int64 

=== 変換後 dtype ===
CryoSleep    float64
VIP          float64
dtype: object 

   CryoSleep  VIP
0        NaN  NaN
1        NaN  NaN
2        NaN  NaN
3        NaN  NaN
4        NaN  NaN 

=== 数値列候補 ===
['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'] 

=== カテゴリ列候補 ===
['PassengerId', 'HomePlanet', 'Cabin', 'Destination', 'Name']


In [12]:
# CryoSleep, VIP を boolean→数値に変換（欠損は False=0）＆列リスト再確認
import pandas as pd
from pathlib import Path

# base_dir は既定なのでそのまま利用
DATA_DIR = Path(base_dir) / "data"

# 再読み込み（安全のため）
train = pd.read_csv(DATA_DIR / "train.csv")
test  = pd.read_csv(DATA_DIR / "test.csv")

# ブール列を 0/1 化（欠損は False とみなす）
for col in ["CryoSleep", "VIP"]:
    train[col] = train[col].fillna(False).map({False: 0, True: 1})
    test[col]  = test[col].fillna(False).map({False: 0, True: 1})

# 変換結果の確認
print("---- 変換後 head ----")
display(train[["CryoSleep","VIP"]].head())

print("\n---- 変換後 dtype ----")
print(train[["CryoSleep","VIP"]].dtypes)

# 改めて前処理用の数値／カテゴリ列リストを取得
num_cols = train.select_dtypes(include=["int64","float64"]).columns.tolist()
cat_cols = train.select_dtypes(include=["object"]).columns.tolist()

print("\n=== 数値列候補 ===")
print(num_cols)
print("\n=== カテゴリ列候補 ===")
print(cat_cols)


---- 変換後 head ----


  train[col] = train[col].fillna(False).map({False: 0, True: 1})
  test[col]  = test[col].fillna(False).map({False: 0, True: 1})


Unnamed: 0,CryoSleep,VIP
0,0,0
1,0,0
2,0,1
3,0,0
4,0,0



---- 変換後 dtype ----
CryoSleep    int64
VIP          int64
dtype: object

=== 数値列候補 ===
['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

=== カテゴリ列候補 ===
['PassengerId', 'HomePlanet', 'Cabin', 'Destination', 'Name']


In [15]:
# OneHotEncoder 引数修正＆パイプライン再テスト
import pandas as pd
from pathlib import Path
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# データ読み込み＆不要列削除
DATA_DIR = Path(base_dir) / "data"
train = pd.read_csv(DATA_DIR / "train.csv")
test  = pd.read_csv(DATA_DIR / "test.csv")

# CryoSleep, VIP を 0/1 化
for col in ["CryoSleep", "VIP"]:
    train[col] = train[col].fillna(False).map({False: 0, True: 1})
    test[col]  = test[col].fillna(False).map({False: 0, True: 1})

# 説明変数準備
X = train.drop(columns=["Transported", "PassengerId", "Name"])

# カラム指定
num_cols = ['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
cat_cols = ['HomePlanet', 'Cabin', 'Destination']

# パイプ定義
num_pipe = Pipeline([("imputer", SimpleImputer(strategy="mean"))])
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols),
])

# 前処理テスト
X_pre = preprocessor.fit_transform(X)
print("前処理後の特徴量行列 shape:", X_pre.shape)
ohe_cols = preprocessor.named_transformers_['cat']['ohe'].get_feature_names_out(cat_cols)
print("合計特徴量数:", len(num_cols) + len(ohe_cols))
print("OHE後の一部カテゴリ列名例:", ohe_cols[:5])


  train[col] = train[col].fillna(False).map({False: 0, True: 1})
  test[col]  = test[col].fillna(False).map({False: 0, True: 1})


前処理後の特徴量行列 shape: (8693, 6577)
合計特徴量数: 6577
OHE後の一部カテゴリ列名例: ['HomePlanet_Earth' 'HomePlanet_Europa' 'HomePlanet_Mars'
 'HomePlanet_missing' 'Cabin_A/0/P']


In [17]:
# StratifiedKFold で LightGBM ベースライン学習テスト

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier

# データ読み込み
DATA_DIR = Path(base_dir) / "data"
train = pd.read_csv(DATA_DIR / "train.csv")
test  = pd.read_csv(DATA_DIR / "test.csv")

# CryoSleep, VIP を 0/1 化
for col in ["CryoSleep", "VIP"]:
    train[col] = train[col].fillna(False).map({False: 0, True: 1})
    test[col]  = test[col].fillna(False).map({False: 0, True: 1})

# 説明変数／目的変数
y = train["Transported"].astype(int)
X = train.drop(columns=["Transported", "PassengerId", "Name"])

# 前処理パイプはセル6で定義したものをそのまま流用
# （もしセル6と同じスコープにない場合は、再度定義してください）
preprocessor  = preprocessor

# モデルパイプライン
model_pipe = Pipeline([
    ("pre", preprocessor),
    ("clf", LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42))
])

# StratifiedKFold で CV
oof_preds = np.zeros(len(X))
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

    model_pipe.fit(X_tr, y_tr)
    fold_preds = model_pipe.predict_proba(X_va)[:, 1]
    oof_preds[va_idx] = fold_preds
    auc = roc_auc_score(y_va, fold_preds)
    print(f"Fold{fold} AUC: {auc:.4f}")

# 全体スコア
overall_auc = roc_auc_score(y, oof_preds)
print(f"Overall CV AUC: {overall_auc:.4f}")


  train[col] = train[col].fillna(False).map({False: 0, True: 1})
  test[col]  = test[col].fillna(False).map({False: 0, True: 1})


[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001456 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1376
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380




Fold1 AUC: 0.8811




[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001250 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1376
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380




Fold2 AUC: 0.8717




[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000984 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1375
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380




Fold3 AUC: 0.8711




[LightGBM] [Info] Number of positive: 3503, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001224 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1377
[LightGBM] [Info] Number of data points in the train set: 6955, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503666 -> initscore=0.014666
[LightGBM] [Info] Start training from score 0.014666




Fold4 AUC: 0.8750




[LightGBM] [Info] Number of positive: 3503, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002019 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1376
[LightGBM] [Info] Number of data points in the train set: 6955, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503666 -> initscore=0.014666
[LightGBM] [Info] Start training from score 0.014666
Fold5 AUC: 0.8660
Overall CV AUC: 0.8726




In [18]:
# CV Accuracy と baseline 提出ファイル作成

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier

# データ読み込み
DATA_DIR = Path(base_dir) / "data"
train = pd.read_csv(DATA_DIR / "train.csv")
test  = pd.read_csv(DATA_DIR / "test.csv")
sub   = pd.read_csv(DATA_DIR / "sample_submission.csv")

# CryoSleep, VIP を 0/1 化
for col in ["CryoSleep", "VIP"]:
    train[col] = train[col].fillna(False).map({False: 0, True: 1})
    test[col]  = test[col].fillna(False).map({False: 0, True: 1})

# 説明変数／目的変数
y = train["Transported"].astype(int)
X = train.drop(columns=["Transported", "PassengerId", "Name"])
X_test = test.drop(columns=["PassengerId", "Name"])

# 前処理＆モデルパイプライン再定義
# （セル6 で定義した preprocessor がそのまま使えます）
model_pipe = Pipeline([
    ("pre", preprocessor),
    ("clf", LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42))
])

# StratifiedKFold で CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_pred = np.zeros(len(X))
test_pred = np.zeros(len(X_test))

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

    model_pipe.fit(X_tr, y_tr)
    pred_va = model_pipe.predict(X_va)
    oof_pred[va_idx] = pred_va
    acc = accuracy_score(y_va, pred_va)
    print(f"Fold{fold} Accuracy: {acc:.4f}")

    # テスト予測も平均
    test_pred += model_pipe.predict(X_test) / skf.n_splits

# Overall CV Accuracy
overall_acc = accuracy_score(y, oof_pred)
print(f"\nOverall CV Accuracy: {overall_acc:.4f}")

# baseline提出ファイル作成（閾値0.5→bool変換）
sub["Transported"] = test_pred > 0.5
out_path = Path(base_dir) / "outputs" / "submissions" / f"{pd.Timestamp.now():%Y%m%d%H%M}" / "sub_baseline.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(out_path, index=False)
print(f"Baseline submission saved to: {out_path}")


  train[col] = train[col].fillna(False).map({False: 0, True: 1})
  test[col]  = test[col].fillna(False).map({False: 0, True: 1})


[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000921 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1376
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380




Fold1 Accuracy: 0.8091




[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001082 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1376
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380




Fold2 Accuracy: 0.7936




[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000900 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1375
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380




Fold3 Accuracy: 0.7987




[LightGBM] [Info] Number of positive: 3503, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1377
[LightGBM] [Info] Number of data points in the train set: 6955, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503666 -> initscore=0.014666
[LightGBM] [Info] Start training from score 0.014666




Fold4 Accuracy: 0.7900




[LightGBM] [Info] Number of positive: 3503, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000880 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1376
[LightGBM] [Info] Number of data points in the train set: 6955, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503666 -> initscore=0.014666
[LightGBM] [Info] Start training from score 0.014666




Fold5 Accuracy: 0.7814

Overall CV Accuracy: 0.7945
Baseline submission saved to: /content/drive/MyDrive/学校/3年/1前期/1月/人工知能システム開発/2025-07-14/spaceship-titanic/outputs/submissions/202507140809/sub_baseline.csv
