In [3]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# 📌 **1. Veri Ön İşleme Fonksiyonu**
def preprocess_data(file_path, is_train=True):
    """Load a CSV file and split it into features and an optional target.

    Missing values in numeric columns are replaced with the median of that
    column, computed on the file being loaded. Text feature columns are cast
    to the pandas ``category`` dtype so that they can be handed to LightGBM
    directly (LightGBM rejects ``object`` columns).

    Args:
        file_path: Path of the CSV file to read.
        is_train: Kept for backward compatibility; it is not consulted. The
            target is detected purely by the presence of a ``"rainfall"``
            column in the file.

    Returns:
        A tuple ``(X, y, categorical_cols)`` where ``X`` is the feature
        frame without the ``"rainfall"`` column, ``y`` is the ``"rainfall"``
        column (or ``None`` when the file has no such column) and
        ``categorical_cols`` lists the names of the text feature columns
        found in ``X``.
    """
    df = pd.read_csv(file_path)
    print(f"✅ Veri yüklendi: {df.shape}")

    # Only numeric columns have a median, so only those gaps are filled here;
    # missing values in text columns are left as NaN.
    df = df.fillna(df.median(numeric_only=True))
    print("✅ Eksik değerler dolduruldu!")

    # Separate the target from the features.
    has_target = "rainfall" in df.columns
    y = df["rainfall"] if has_target else None
    X = df.drop(columns=["rainfall"]) if has_target else df

    # Detect text columns on the features only: a text-valued target must not
    # be reported as a categorical feature, because it is not part of X.
    # "string" additionally covers pandas' dedicated string dtype.
    categorical_cols = X.select_dtypes(include=["object", "string"]).columns.tolist()
    if categorical_cols:
        X = X.astype({col: "category" for col in categorical_cols})

    print("✅ Veri hazırlandı!")

    return X, y, categorical_cols

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# 📌 **LightGBM Model Eğitme Fonksiyonu**
def train_lgbm(X, y, cat_features):
    """Train a LightGBM classifier with early stopping on a hold-out split.

    The data is split 80/20 with a fixed random seed. The model is fitted on
    the larger part and evaluated on the smaller part with ROC AUC; training
    stops once the validation AUC has not improved for 50 rounds. The
    validation ROC AUC is printed.

    Args:
        X: Feature table.
        y: Target labels aligned with ``X``.
        cat_features: Names of the categorical feature columns in ``X``.
            ``None`` or an empty sequence lets LightGBM detect them itself.

    Returns:
        The fitted ``lgb.LGBMClassifier``.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    model = lgb.LGBMClassifier(
        boosting_type="gbdt",  # Gradient Boosting Decision Tree
        num_leaves=31,  # maximum number of leaves per tree
        learning_rate=0.01,  # small learning rate
        n_estimators=1000,  # upper bound on boosting rounds
        max_depth=-1,  # no depth limit
        metric="auc",
    )

    # Categorical features belong in fit(), not in the constructor: passed to
    # the constructor they travel through the raw params and LightGBM warns
    # "Please use categorical_feature argument of the Dataset constructor".
    cat_features = list(cat_features) if cat_features is not None else []
    categorical_feature = cat_features or "auto"

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="auc",
        categorical_feature=categorical_feature,
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)],
    )

    y_pred = model.predict_proba(X_val)[:, 1]
    auc_score = roc_auc_score(y_val, y_pred)

    print(f"✅ LightGBM Modeli Eğitildi! ROC AUC: {auc_score:.4f}")

    return model


# 📌 **3. Test Verisi Üzerinde Tahmin Yapma Fonksiyonu**
def predict_and_save(model, test_file, output_file):
    """Score the test file with ``model`` and write a submission CSV.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        test_file: Path of the test CSV; it must contain an ``"id"`` column.
        output_file: Path of the CSV to write, with columns ``id`` and
            ``rainfall`` (the second column of ``predict_proba``).
    """
    features, _, _ = preprocess_data(test_file, is_train=False)
    positive_proba = model.predict_proba(features)[:, 1]

    # The ids are taken from a fresh, unprocessed read of the test file.
    ids = pd.read_csv(test_file)["id"]

    # Build the submission table and write it without the index column.
    pd.DataFrame({"id": ids, "rainfall": positive_proba}).to_csv(
        output_file, index=False
    )

    print(f"✅ Tahminler {output_file} dosyasına kaydedildi!")

# 📌 **4. Çalıştırma Kodu**
train_file, test_file, output_file = "train.csv", "test.csv", "lgbm_submission.csv"

# Step 1: load and prepare the training data.
X_train, y_train, categorical_cols = preprocess_data(file_path=train_file, is_train=True)

# Step 2: fit the LightGBM model.
lgbm_model = train_lgbm(X=X_train, y=y_train, cat_features=categorical_cols)

# Step 3: predict on the test data and save the submission file.
predict_and_save(model=lgbm_model, test_file=test_file, output_file=output_file)


✅ Veri yüklendi: (2190, 13)
✅ Eksik değerler dolduruldu!
✅ Veri hazırlandı!
[LightGBM] [Info] Number of positive: 1331, number of negative: 421
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000517 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1808
[LightGBM] [Info] Number of data points in the train set: 1752, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.759703 -> initscore=1.151053
[LightGBM] [Info] Start training from score 1.151053
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[27]	valid_0's auc: 0.858961
✅ LightGBM Modeli Eğitildi! ROC AUC: 0.8590
✅ Veri yüklendi: (730, 12)
✅ Eksik değerler dolduruldu!
✅ Veri hazırlandı!
✅ Tahminler lgbm_submission.csv dosyasına kaydedildi!


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
