In [8]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from scipy.stats import randint

# 📌 1. Veri Ön İşleme Fonksiyonu
def preprocess_data(file_path, is_train=True):
    """Load a CSV file and turn it into a model-ready feature table.

    Steps: read the file, fill missing numeric features with the column
    median, fill missing categorical features with the most frequent value,
    then replace every categorical value with an integer code (values are
    compared as strings and numbered in sorted order, starting at 0).

    The ``id`` column and, for training data, the ``rainfall`` target are
    deliberately kept out of imputation: an identifier must not be altered
    and a missing label must not be invented. Training rows whose target is
    missing are dropped instead. ``id`` is also never integer-encoded.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
            It must contain an ``id`` column, plus a ``rainfall`` column when
            ``is_train`` is true.
        is_train: When true, split off and return the target; otherwise
            return the identifiers.

    Returns:
        ``(X, y)`` when ``is_train`` is true, where ``X`` is the frame without
        ``id``/``rainfall`` and ``y`` is the ``rainfall`` column.
        ``(X, test_ids)`` otherwise, where ``X`` is the frame without ``id``
        and ``test_ids`` is the untouched ``id`` column.

    Raises:
        KeyError: If a required ``id``/``rainfall`` column is absent.

    NOTE(review): medians, modes and integer codes are computed from the file
    being processed, so train and test files are imputed/encoded independently
    of each other - confirm this is acceptable for the data at hand.
    """
    id_col, target_col = "id", "rainfall"

    df = pd.read_csv(file_path)
    print(f"âœ… Veri yÃ¼klendi: {df.shape}")

    if is_train:
        # A row without a label carries no training signal; filling it with
        # the median would fabricate a label, so such rows are removed.
        df = df.dropna(subset=[target_col]).reset_index(drop=True)

    # Column groups by dtype. "string" is listed next to "object" so text
    # columns are still recognised when pandas stores them with its dedicated
    # string dtype. Numeric columns of other widths (e.g. int32, bool) are
    # passed through untouched.
    categorical_cols = df.select_dtypes(include=["object", "string"]).columns.tolist()
    numerical_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()

    print(f"ðŸ”¹ Kategorik SÃ¼tunlar: {categorical_cols}")
    print(f"ðŸ”¹ SayÄ±sal SÃ¼tunlar: {numerical_cols}")

    # Columns that must never be imputed.
    skip_impute = {id_col, target_col} if is_train else {id_col}
    numeric_features = [c for c in numerical_cols if c not in skip_impute]
    categorical_features = [c for c in categorical_cols if c not in skip_impute]

    # Median fill. Using fillna keeps every column (an all-NaN column simply
    # stays NaN) and leaves the dtype of columns without gaps unchanged.
    if numeric_features:
        medians = df[numeric_features].median()
        df[numeric_features] = df[numeric_features].fillna(medians)

    # Most-frequent fill. This has to run on the raw values: converting to
    # str first would turn NaN into the literal text "nan", which then looks
    # like an ordinary category and is never filled.
    for col in categorical_features:
        most_frequent = df[col].mode()  # NaN excluded, ties sorted ascending
        if not most_frequent.empty:
            df[col] = df[col].fillna(most_frequent.iloc[0])

    print("âœ… Eksik deÄŸerler dolduruldu!")

    # Integer-encode categorical columns: codes follow the sorted order of
    # the distinct string values.
    for col in categorical_cols:
        if col == id_col:
            continue  # identifiers are written back out verbatim
        as_text = df[col].astype(str).to_numpy(dtype=str)
        _, codes = np.unique(as_text, return_inverse=True)
        df[col] = codes

    print("âœ… Kategorik deÄŸiÅŸkenler encode edildi!")

    if is_train:
        return df.drop(columns=[target_col, id_col]), df[target_col]
    return df.drop(columns=[id_col]), df[id_col]

# 📌 2. CatBoost Modelini Eğitme
def train_catboost(X_train, y_train):
    """Tune a CatBoost classifier by randomized search and return the best one.

    Fifteen hyper-parameter candidates are drawn from the search space below
    and each is scored by ROC AUC with 3-fold cross-validation, using all
    available workers (``n_jobs=-1``). Both the classifier seed and the search
    seed are fixed at 42.

    Args:
        X_train: Feature table passed to ``RandomizedSearchCV.fit``.
        y_train: Target values passed to ``RandomizedSearchCV.fit``.

    Returns:
        The search's ``best_estimator_``.
    """
    # Candidate values: scipy ``randint`` distributions for the integer
    # parameters, an explicit list for the learning rate.
    search_space = dict(
        iterations=randint(100, 1000),
        learning_rate=[0.01, 0.02, 0.05, 0.1],
        depth=randint(3, 10),
        l2_leaf_reg=randint(1, 10),
        border_count=randint(32, 255),
    )

    base_estimator = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="AUC",
        random_seed=42,
        verbose=100,
        task_type="CPU",  # train on the CPU rather than the GPU
    )

    tuner = RandomizedSearchCV(
        base_estimator,
        search_space,
        n_iter=15,
        scoring="roc_auc",
        cv=3,
        verbose=1,
        n_jobs=-1,
        random_state=42,
    )
    tuner.fit(X_train, y_train)

    print(f"âœ… En iyi CatBoost Modeli bulundu: {tuner.best_params_}")
    return tuner.best_estimator_

# 📌 3. Test Verisinde Tahmin Yapma ve Sonucu Kaydetme
def predict_and_save(model, test_file, output_file="submission_catboost.csv"):
    """Score the test file with ``model`` and write the predictions to CSV.

    The test file is run through ``preprocess_data(..., is_train=False)``,
    and the written file has two columns: ``id`` and ``rainfall``.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        test_file: Path of the test CSV.
        output_file: Destination path of the CSV that is written.
    """
    features, ids = preprocess_data(test_file, is_train=False)

    # Column 1 of predict_proba: the probability of rain.
    rain_probability = model.predict_proba(features)[:, 1]

    pd.DataFrame({"id": ids, "rainfall": rain_probability}).to_csv(
        output_file, index=False
    )
    print(f"âœ… Tahminler {output_file} dosyasÄ±na kaydedildi!")

# 📌 Ana Akış
if __name__ == "__main__":
    # Input CSV paths (relative paths).
    train_file = "train.csv"
    test_file = "test.csv"

    # Load and preprocess the training data.
    X_train, y_train = preprocess_data(train_file, is_train=True)

    # Tune and fit the model.
    best_catboost_model = train_catboost(X_train, y_train)

    # Predict on the test data and save the result.
    predict_and_save(best_catboost_model, test_file)


âœ… Veri yÃ¼klendi: (2190, 13)
ðŸ”¹ Kategorik SÃ¼tunlar: []
ðŸ”¹ SayÄ±sal SÃ¼tunlar: ['id', 'day', 'pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed', 'rainfall']
âœ… Eksik deÄŸerler dolduruldu!
âœ… Kategorik deÄŸiÅŸkenler encode edildi!
Fitting 3 folds for each of 15 candidates, totalling 45 fits
0:	total: 148ms	remaining: 51.9s
100:	total: 583ms	remaining: 1.45s
200:	total: 1.03s	remaining: 774ms
300:	total: 1.47s	remaining: 248ms
351:	total: 1.69s	remaining: 0us
âœ… En iyi CatBoost Modeli bulundu: {'border_count': 89, 'depth': 8, 'iterations': 352, 'l2_leaf_reg': 9, 'learning_rate': 0.01}
âœ… Veri yÃ¼klendi: (730, 12)
ðŸ”¹ Kategorik SÃ¼tunlar: []
ðŸ”¹ SayÄ±sal SÃ¼tunlar: ['id', 'day', 'pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed']
âœ… Eksik deÄŸerler dolduruldu!
âœ… Kategorik deÄŸiÅŸkenler encode edildi!
âœ… Tahminler submission_catboos