In [2]:
import numpy as np
import pandas as pd
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

def preprocess_dataset_train(file_path):
    """
    Train veri setini işler: Eksik değerleri doldurur, kategorik değişkenleri işler,
    özellik mühendisliği yapar, ölçekleme ve encoding uygular.
    """
    # 📌 1. CSV dosyasını oku
    df = pd.read_csv(file_path)
    print(f"✅ Veri yüklendi: {df.shape}")

    # 🟢 2. Eksik Veri Doldurma
    df["HomePlanet"].fillna("Earth", inplace=True)
    df["CryoSleep"].fillna(False, inplace=True)
    df["Destination"].fillna("TRAPPIST-1e", inplace=True)
    df["Age"].fillna(df["Age"].median(), inplace=True)
    df["RoomService"].fillna(0, inplace=True)
    df["FoodCourt"].fillna(0, inplace=True)
    df["ShoppingMall"].fillna(0, inplace=True)
    df["Spa"].fillna(0, inplace=True)
    df["VRDeck"].fillna(0, inplace=True)

    # 🟢 3. Özellik Mühendisliği
    df["Group"] = df["PassengerId"].apply(lambda x: int(x.split("_")[0]))
    df[["Deck", "CabinNum", "Side"]] = df["Cabin"].str.split("/", expand=True)
    df.drop(columns=["Cabin", "Name"], inplace=True)

    # Yeni değişken: Toplam harcama
    df["TotalSpend"] = df["RoomService"] + df["FoodCourt"] + df["ShoppingMall"] + df["Spa"] + df["VRDeck"]

    print("✅ Özellik mühendisliği tamamlandı!")

    # 🟢 4. Kategorik Değişkenleri One-Hot Encoding
    df = pd.get_dummies(df, columns=["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"], drop_first=True)

    # 🟢 5. Sayısal Değişkenleri Ölçeklendirme
    num_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "TotalSpend"]
    scaler = StandardScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])

    return df

def train_random_forest(df):
    """
    Random Forest modeli ile eğitim yapar ve doğruluk skorunu hesaplar.
    """
    y = df["Transported"].astype(int)
    X = df.drop(columns=["Transported", "PassengerId", "CabinNum"])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.02, random_state=42)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    start_time = time.time()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    elapsed_time = time.time() - start_time

    print("\n✅ **Random Forest Modeli Eğitildi!**")
    print(f"📊 **Accuracy:** {accuracy:.5f}")
    print(f"⏳ **Training Time:** {elapsed_time:.5f} saniye")

    return model, X.columns

# 📌 Train veri setini işle
train_prepared = preprocess_dataset_train("train.csv")

# 📌 Modeli eğit ve doğruluk değerini al
random_forest_model, train_columns = train_random_forest(train_prepared)


✅ Veri yüklendi: (8693, 14)
✅ Özellik mühendisliği tamamlandı!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["HomePlanet"].fillna("Earth", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["CryoSleep"].fillna(False, inplace=True)
  df["CryoSleep"].fillna(False, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interme


✅ **Random Forest Modeli Eğitildi!**
📊 **Accuracy:** 0.80104
⏳ **Training Time:** 1.04672 saniye


In [3]:
def preprocess_dataset_test(file_path, train_columns):
    """
    Test veri setini işler ve train setindeki sütunlarla uyumlu hale getirir.
    """
    df = pd.read_csv(file_path)
    print(f"✅ Test verisi yüklendi: {df.shape}")

    # Eksik veri doldurma
    df["HomePlanet"].fillna("Earth", inplace=True)
    df["CryoSleep"].fillna(False, inplace=True)
    df["Destination"].fillna("TRAPPIST-1e", inplace=True)
    df["Age"].fillna(df["Age"].median(), inplace=True)
    df["RoomService"].fillna(0, inplace=True)
    df["FoodCourt"].fillna(0, inplace=True)
    df["ShoppingMall"].fillna(0, inplace=True)
    df["Spa"].fillna(0, inplace=True)
    df["VRDeck"].fillna(0, inplace=True)

    # Cabin bilgisini böl
    df["Group"] = df["PassengerId"].apply(lambda x: int(x.split("_")[0]))
    df[["Deck", "CabinNum", "Side"]] = df["Cabin"].str.split("/", expand=True)
    df.drop(columns=["Cabin"], inplace=True)

    # Toplam harcama sütunu ekle
    df["TotalSpend"] = df["RoomService"] + df["FoodCourt"] + df["ShoppingMall"] + df["Spa"] + df["VRDeck"]

    print("✅ Özellik mühendisliği tamamlandı!")

    # One-Hot Encoding
    df = pd.get_dummies(df, columns=["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"], drop_first=True)

    # Train setinde olup test setinde olmayan sütunları ekle
    missing_cols = set(train_columns) - set(df.columns)
    for col in missing_cols:
        df[col] = 0

    df = df[train_columns]
    print(f"✅ Test seti train setiyle uyumlu hale getirildi! Yeni şekil: {df.shape}")

    return df

def test_random_forest(model, test_df, test_csv_path, output_csv="submission.csv"):
    """
    Test veri setiyle tahmin yapar ve Kaggle submission dosyası oluşturur.
    """
    test_raw = pd.read_csv(test_csv_path)
    test_ids = test_raw["PassengerId"]

    predictions = model.predict(test_df)

    submission = pd.DataFrame({"PassengerId": test_ids, "Transported": predictions})
    submission["Transported"] = submission["Transported"].astype(bool)

    submission.to_csv(output_csv, index=False)
    print(f"✅ Tahminler {output_csv} dosyasına kaydedildi!")

# 📌 1. Test veri setini işle (train ile uyumlu hale getir)
test_prepared = preprocess_dataset_test("test.csv", train_columns)

# 📌 2. Model ile tahmin yap ve submission.csv oluştur
test_random_forest(random_forest_model, test_prepared, "test.csv")


✅ Test verisi yüklendi: (4277, 13)
✅ Özellik mühendisliği tamamlandı!
✅ Test seti train setiyle uyumlu hale getirildi! Yeni şekil: (4277, 22)
✅ Tahminler submission.csv dosyasına kaydedildi!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["HomePlanet"].fillna("Earth", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["CryoSleep"].fillna(False, inplace=True)
  df["CryoSleep"].fillna(False, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interme