In [4]:
import warnings
warnings.filterwarnings('ignore')
!pip install lightgbm
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import lightgbm as lgb

def preprocess_dataset_train(file_path, return_scaler=False):
    """
    Load the training CSV and turn it into a model-ready DataFrame.

    Steps, in order: impute missing values, derive ``Group`` from
    ``PassengerId``, split ``Cabin`` into ``Deck`` / ``CabinNum`` / ``Side``,
    add ``TotalSpend``, one-hot encode the categorical columns and
    standardise the numeric columns with a ``StandardScaler``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the training CSV.
    return_scaler : bool, default False
        When True, return ``(df, scaler)`` so that the scaler fitted here can
        be reused to transform other data with the same statistics.

    Returns
    -------
    pandas.DataFrame
        The processed frame, or ``(frame, fitted_scaler)`` when
        ``return_scaler`` is True.
    """
    # 1. Read the CSV file
    df = pd.read_csv(file_path)
    print(f"✅ Veri yüklendi: {df.shape}")

    # 2. Fill missing values.
    # The result is assigned back to the frame on purpose: the chained form
    # ``df[col].fillna(..., inplace=True)`` operates on a temporary object and
    # leaves ``df`` unchanged when pandas Copy-on-Write is active.
    spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    fill_values = {
        "HomePlanet": "Earth",
        "CryoSleep": False,
        "Destination": "TRAPPIST-1e",
        "Age": df["Age"].median(),
    }
    fill_values.update(dict.fromkeys(spend_cols, 0))
    for column, value in fill_values.items():
        df[column] = df[column].fillna(value)

    # 3. Feature engineering
    # Group id is the part of PassengerId before the underscore.
    df["Group"] = df["PassengerId"].str.split("_", expand=True)[0].astype(int)
    df[["Deck", "CabinNum", "Side"]] = df["Cabin"].str.split("/", expand=True)
    df = df.drop(columns=["Cabin", "Name"])

    # New feature: total amount spent across all amenities
    df["TotalSpend"] = df[spend_cols].sum(axis=1)

    print("✅ Özellik mühendisliği tamamlandı!")

    # 4. One-hot encode the categorical columns
    df = pd.get_dummies(
        df,
        columns=["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"],
        drop_first=True,
    )

    # 5. Standardise the numeric columns
    num_cols = ["Age", *spend_cols, "TotalSpend"]
    scaler = StandardScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])

    if return_scaler:
        return df, scaler
    return df

def train_lightgbm(df):
    """
    Train a LightGBM classifier on a hold-out split and report its accuracy.

    The target is ``Transported`` cast to int. ``Transported``,
    ``PassengerId`` and ``CabinNum`` are removed from the feature matrix;
    every other column of ``df`` is used as a feature. 20% of the rows are
    held out (``random_state=42``) for the accuracy estimate.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed training frame containing the columns named above.

    Returns
    -------
    tuple
        ``(model, feature_columns)`` where ``feature_columns`` is
        ``X.columns``, i.e. the feature names in the order used for fitting.
    """
    y = df["Transported"].astype(int)
    X = df.drop(columns=["Transported", "PassengerId", "CabinNum"])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.05, max_depth=10, random_state=42)

    # Time the fit alone: the figure is printed as "Training Time", so it must
    # not include prediction or accuracy scoring.
    start_time = time.time()
    model.fit(X_train, y_train)
    elapsed_time = time.time() - start_time

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print("\n✅ **LightGBM Modeli Eğitildi!**")
    print(f"📊 **Accuracy:** {accuracy:.5f}")
    print(f"⏳ **Training Time:** {elapsed_time:.5f} saniye")

    return model, X.columns

# Build the model-ready training frame from the raw CSV
train_prepared = preprocess_dataset_train(file_path="train.csv")

# Fit the classifier, print its hold-out accuracy, and keep the feature column order
lightgbm_model, train_columns = train_lightgbm(df=train_prepared)


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------- ----------- 1.0/1.5 MB 4.6 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 5.4 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
✅ Veri yüklendi: (8693, 14)
✅ Özellik mühendisliği tamamlandı!
[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000444 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1890
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 21


In [6]:
def preprocess_dataset_test(file_path, train_columns, scaler=None):
    """
    Load the test CSV and shape it like the training feature matrix.

    Applies the same imputation, feature engineering, one-hot encoding and
    numeric standardisation as ``preprocess_dataset_train``, then aligns the
    columns with ``train_columns``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the test CSV.
    train_columns : sequence of str
        Feature names, in order, that the model was trained on. Columns
        missing from the test data are added and filled with 0; columns not
        listed are dropped.
    scaler : object with ``transform``, optional
        Scaler already fitted on the training numeric columns. When omitted,
        a new ``StandardScaler`` is fitted on this file's own numeric
        columns. That is only an approximation of the training statistics;
        pass the training scaler for exact agreement.

    Returns
    -------
    pandas.DataFrame
        Frame whose columns are exactly ``train_columns``.
    """
    df = pd.read_csv(file_path)
    print(f"✅ Test verisi yüklendi: {df.shape}")

    # Fill missing values.
    # The result is assigned back to the frame on purpose: the chained form
    # ``df[col].fillna(..., inplace=True)`` operates on a temporary object and
    # leaves ``df`` unchanged when pandas Copy-on-Write is active.
    spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    fill_values = {
        "HomePlanet": "Earth",
        "CryoSleep": False,
        "Destination": "TRAPPIST-1e",
        "Age": df["Age"].median(),
    }
    fill_values.update(dict.fromkeys(spend_cols, 0))
    for column, value in fill_values.items():
        df[column] = df[column].fillna(value)

    # Group id from PassengerId, and Cabin split into its three parts
    df["Group"] = df["PassengerId"].str.split("_", expand=True)[0].astype(int)
    df[["Deck", "CabinNum", "Side"]] = df["Cabin"].str.split("/", expand=True)
    df = df.drop(columns=["Cabin"])

    # Total amount spent across all amenities
    df["TotalSpend"] = df[spend_cols].sum(axis=1)

    print("✅ Özellik mühendisliği tamamlandı!")

    # One-hot encoding
    df = pd.get_dummies(
        df,
        columns=["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"],
        drop_first=True,
    )

    # Standardise the numeric columns. The training pipeline does this, so
    # feeding raw values here would put the features on a different scale from
    # the one the model was fitted on.
    num_cols = ["Age", *spend_cols, "TotalSpend"]
    if scaler is None:
        df[num_cols] = StandardScaler().fit_transform(df[num_cols])
    else:
        df[num_cols] = scaler.transform(df[num_cols])

    # Align with the training features: add absent columns as 0, drop extras,
    # and put everything in the training column order.
    df = df.reindex(columns=train_columns, fill_value=0)
    print(f"✅ Test seti train setiyle uyumlu hale getirildi! Yeni şekil: {df.shape}")

    return df

def test_lightgbm(model, test_df, test_csv_path, output_csv="submission_lightgbm.csv"):
    """
    Test veri setiyle tahmin yapar ve Kaggle submission dosyası oluşturur.
    """
    test_raw = pd.read_csv(test_csv_path)
    test_ids = test_raw["PassengerId"]

    predictions = model.predict(test_df)

    submission = pd.DataFrame({"PassengerId": test_ids, "Transported": predictions})
    submission["Transported"] = submission["Transported"].astype(bool)

    submission.to_csv(output_csv, index=False)
    print(f"✅ Tahminler {output_csv} dosyasına kaydedildi!")

# 1. Prepare the test set so its columns match the training features
test_prepared = preprocess_dataset_test(file_path="test.csv", train_columns=train_columns)

# 2. Predict with the trained model and write the submission CSV
test_lightgbm(model=lightgbm_model, test_df=test_prepared, test_csv_path="test.csv")


✅ Test verisi yüklendi: (4277, 13)
✅ Özellik mühendisliği tamamlandı!
✅ Test seti train setiyle uyumlu hale getirildi! Yeni şekil: (4277, 22)
✅ Tahminler submission_lightgbm.csv dosyasına kaydedildi!
