In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# 📌 1. Veri Hazırlama
def preprocess(file_path, is_train=True):
    df = pd.read_csv(file_path)
    cat_cols = df.select_dtypes(include='object').columns.tolist()
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # Eksik doldurma
    for col in cat_cols:
        df[col] = df[col].astype(str)
        df[col].fillna(df[col].mode()[0], inplace=True)
        df[col] = LabelEncoder().fit_transform(df[col])

    for col in num_cols:
        df[col].fillna(df[col].median(), inplace=True)

    if is_train:
        X = df.drop(columns=["rainfall", "id"])
        y = df["rainfall"]
        return X, y
    else:
        ids = df["id"]
        X = df.drop(columns=["id"])
        return X, ids

# 📌 2. Random Forest
def train_rf(X, y):
    rf = RandomForestRegressor(
        n_estimators=500,
        max_depth=16,
        min_samples_split=4,
        min_samples_leaf=2,
        n_jobs=-1,
        random_state=42
    )
    rf.fit(X, y)
    return rf

# 📌 3. XGBoost
def train_xgb(X, y):
    xgb = XGBRegressor(
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )
    xgb.fit(X, y)
    return xgb

# 📌 4. Ensemble Tahmin ve Submission
def ensemble_predict_and_save(rf, xgb, test_file, output="submission_ensemble.csv"):
    X_test, ids = preprocess(test_file, is_train=False)
    
    rf_pred = rf.predict(X_test)
    xgb_pred = xgb.predict(X_test)

    # Normalize ederek average al
    rf_scaled = (rf_pred - rf_pred.mean()) / rf_pred.std()
    xgb_scaled = (xgb_pred - xgb_pred.mean()) / xgb_pred.std()

    final_pred = (rf_scaled + xgb_scaled) / 2

    # Skoru tekrar orijinal aralığa çek
    final_pred = final_pred * 0.01  # rainfall değeri küçük aralıkta: [-0.05, 0.05]

    submission = pd.DataFrame({"id": ids, "rainfall": final_pred})
    submission.to_csv(output, index=False)
    print(f"✅ Tahminler {output} dosyasına kaydedildi!")

# 📌 Ana Akış
if __name__ == "__main__":
    train_file = "train.csv"
    test_file = "test.csv"

    X, y = preprocess(train_file, is_train=True)

    # Train / Validation split kontrolü
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)

    rf_model = train_rf(X_train, y_train)
    xgb_model = train_xgb(X_train, y_train)

    # Val set kontrolü
    val_preds = (rf_model.predict(X_val) + xgb_model.predict(X_val)) / 2
    val_mse = mean_squared_error(y_val, val_preds)
    print(f"📉 Validation MSE: {val_mse:.7f}")

    # Final test tahmini
    ensemble_predict_and_save(rf_model, xgb_model, test_file)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


📉 Validation MSE: 0.1236981
✅ Tahminler submission_ensemble.csv dosyasına kaydedildi!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
