In [5]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from scipy.stats import randint
import joblib  # Modeli ve Ã¶zellikleri kaydetmek iÃ§in

# 📌 1. Prepare the training/test data
def preprocess_data(file_path, is_train=True):
    """Load a CSV file and convert it into a model-ready feature matrix.

    In training mode the preprocessing steps (imputers, one-hot encoder and
    the final feature list) are fitted and persisted. In inference mode the
    persisted steps are reloaded and only applied, so new data is filled in
    and encoded with what was learned from the training set instead of being
    re-fitted on itself.

    Processing steps:
      1. (training only) drop rows whose label is missing;
      2. impute numeric features with the median and categorical features
         with the most frequent value;
      3. one-hot encode the categorical features;
      4. (inference only) align the columns with the training feature list.

    The ``id`` and ``rainfall`` columns are never passed through the imputers:
    ids keep the dtype they were read with and labels are never fabricated.

    Side effects:
        Training writes ``imputers.pkl``, ``encoder.pkl`` and
        ``train_features.pkl`` into the current working directory; inference
        reads the same three files. They are pickles, so only load artifacts
        produced by a trusted training run.

    Args:
        file_path: Path of the CSV file to load.
        is_train: True for the labelled training file, False for data to
            predict on.

    Returns:
        ``(X, y)`` when ``is_train`` is true, where ``y`` is the ``rainfall``
        column; otherwise ``(X, test_ids)``, where ``test_ids`` is the ``id``
        column.

    Raises:
        KeyError: If ``rainfall`` (training) or ``id`` (inference) is missing,
            or if an inference file lacks a column seen during training.
        FileNotFoundError: In inference mode, if the artifacts written by a
            training run are not present.
    """
    id_col, target_col = "id", "rainfall"

    def _stringify_keep_missing(frame):
        # Give the encoder uniformly typed (str) categories, but keep missing
        # entries as NaN; a plain astype(str) would turn them into the literal
        # string "nan" and hide them from the imputer.
        return frame.astype(str).where(frame.notna())

    df = pd.read_csv(file_path)
    print(f"âœ… Veri yÃ¼klendi: {df.shape}")

    if is_train:
        # A row without a label cannot be learned from; drop it rather than
        # inventing a label by imputation.
        df = df.dropna(subset=[target_col]).reset_index(drop=True)

    # Identifier and label are not features and must not be imputed/encoded.
    features = df.drop(columns=[id_col, target_col], errors="ignore")

    if is_train:
        categorical_cols = features.select_dtypes(include=["object"]).columns.tolist()
        numerical_cols = features.select_dtypes(include=["int64", "float64"]).columns.tolist()

        num_imputer = None
        if numerical_cols:
            num_imputer = SimpleImputer(strategy="median").fit(features[numerical_cols])

        cat_imputer = None
        if categorical_cols:
            cat_imputer = SimpleImputer(strategy="most_frequent").fit(
                _stringify_keep_missing(features[categorical_cols])
            )

        # Persist the column split together with the fitted imputers so that
        # inference does not depend on dtype inference of the new file.
        joblib.dump(
            {
                "numerical_cols": numerical_cols,
                "categorical_cols": categorical_cols,
                "num_imputer": num_imputer,
                "cat_imputer": cat_imputer,
            },
            "imputers.pkl",
        )
    else:
        imputer_state = joblib.load("imputers.pkl")
        numerical_cols = imputer_state["numerical_cols"]
        categorical_cols = imputer_state["categorical_cols"]
        num_imputer = imputer_state["num_imputer"]
        cat_imputer = imputer_state["cat_imputer"]

    # Fill missing values using the statistics learned from the training set.
    if numerical_cols:
        features[numerical_cols] = num_imputer.transform(features[numerical_cols])

    if categorical_cols:
        features[categorical_cols] = cat_imputer.transform(
            _stringify_keep_missing(features[categorical_cols])
        )

    print("âœ… Eksik deÄŸerler dolduruldu!")

    # One-hot encoding: fit and persist on training data, reuse afterwards.
    if is_train:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        encoded = encoder.fit_transform(features[categorical_cols])
        joblib.dump(encoder, "encoder.pkl")
    else:
        encoder = joblib.load("encoder.pkl")
        encoded = encoder.transform(features[categorical_cols])

    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=features.index,
    )
    X = pd.concat([features.drop(columns=categorical_cols), encoded_df], axis=1)

    print("âœ… One-Hot Encoding tamamlandÄ±!")

    if is_train:
        y = df[target_col]
        # Remember the exact feature set and order the model is trained on.
        joblib.dump(X.columns, "train_features.pkl")
        return X, y

    test_ids = df[id_col]

    # Match the training layout: columns missing here are added as 0, extra
    # columns are dropped, and the order follows the training feature list.
    train_features = joblib.load("train_features.pkl")
    X = X.reindex(columns=train_features, fill_value=0)

    print("âœ… Test seti train ile uyumlu hale getirildi!")
    return X, test_ids

# 📌 2. Train the Random Forest model
def train_optimized_random_forest(X_train, y_train):
    """Tune a random forest with a randomized search and persist the winner.

    Fifteen hyper-parameter candidates are sampled and each one is scored by
    ROC AUC with 3-fold cross-validation. The best estimator found by the
    search is written to ``random_forest_model.pkl`` and returned.

    Args:
        X_train: Feature matrix used for the search.
        y_train: Labels matching ``X_train``.

    Returns:
        The ``best_estimator_`` of the fitted search.
    """
    # Distributions the candidates are drawn from.
    search_space = dict(
        n_estimators=randint(200, 500),
        max_depth=randint(10, 20),
        min_samples_split=randint(2, 10),
        min_samples_leaf=randint(1, 5),
        max_features=["sqrt", "log2"],
    )

    # Fixed seeds on both the forest and the sampler keep runs repeatable.
    tuner = RandomizedSearchCV(
        estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
        param_distributions=search_space,
        n_iter=15,
        scoring="roc_auc",
        cv=3,
        verbose=1,
        n_jobs=-1,
        random_state=42,
    )
    tuner.fit(X_train, y_train)

    winner = tuner.best_estimator_
    joblib.dump(winner, "random_forest_model.pkl")  # persist the tuned model
    print(f"âœ… En iyi Random Forest Modeli bulundu: {tuner.best_params_}")
    return winner

# 📌 3. Predict on the test data and save the result
def predict_and_save(model, test_file, output_file="submission_rf_fixed.csv"):
    """Score a test file with ``model`` and write the predictions as CSV.

    The test file is run through ``preprocess_data`` in inference mode, the
    second column of ``model.predict_proba`` is taken as the prediction
    (presumably the probability of rain -- confirm against the label
    encoding), and an ``id`` / ``rainfall`` table is written without an index.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        test_file: Path of the CSV file to predict on.
        output_file: Destination path of the submission CSV.
    """
    features, ids = preprocess_data(test_file, is_train=False)

    # Column 1 of predict_proba is the score written to the submission.
    scores = model.predict_proba(features)[:, 1]

    pd.DataFrame({"id": ids, "rainfall": scores}).to_csv(output_file, index=False)
    print(f"âœ… Tahminler {output_file} dosyasÄ±na kaydedildi!")

# 📌 Main flow
if __name__ == "__main__":
    # Input CSV files, given relative to the current working directory.
    train_file = "train.csv"
    test_file = "test.csv"

    # Load and preprocess the training data. This must run before prediction:
    # it writes the preprocessing artifacts that inference mode loads back.
    X_train, y_train = preprocess_data(train_file, is_train=True)

    # Train the model.
    best_rf_model = train_optimized_random_forest(X_train, y_train)

    # Predict on the test data and save the result.
    predict_and_save(best_rf_model, test_file)


✅ Veri yüklendi: (2190, 13)
✅ Eksik değerler dolduruldu!
✅ One-Hot Encoding tamamlandı!
Fitting 3 folds for each of 15 candidates, totalling 45 fits
✅ En iyi Random Forest Modeli bulundu: {'max_depth': 11, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 360}
✅ Veri yüklendi: (730, 12)
✅ Eksik değerler dolduruldu!
✅ One-Hot Encoding tamamlandı!
✅ Test seti train ile uyumlu hale getirildi!
✅ Tahminler submission_rf_fixed.csv dosyasına kaydedildi!
