In [2]:
import numpy as np
import pandas as pd
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

def preprocess_dataset_train(file_path):
    """
    Train veri setini i≈üler: Eksik deƒüerleri doldurur, kategorik deƒüi≈ükenleri i≈üler,
    √∂zellik m√ºhendisliƒüi yapar, √∂l√ßekleme ve encoding uygular.
    """
    # üìå 1. CSV dosyasƒ±nƒ± oku
    df = pd.read_csv(file_path)
    print(f"‚úÖ Veri y√ºklendi: {df.shape}")

    # üü¢ 2. Eksik Veri Doldurma
    df["HomePlanet"].fillna("Earth", inplace=True)
    df["CryoSleep"].fillna(False, inplace=True)
    df["Destination"].fillna("TRAPPIST-1e", inplace=True)
    df["Age"].fillna(df["Age"].median(), inplace=True)
    df["RoomService"].fillna(0, inplace=True)
    df["FoodCourt"].fillna(0, inplace=True)
    df["ShoppingMall"].fillna(0, inplace=True)
    df["Spa"].fillna(0, inplace=True)
    df["VRDeck"].fillna(0, inplace=True)

    # üü¢ 3. √ñzellik M√ºhendisliƒüi
    df["Group"] = df["PassengerId"].apply(lambda x: int(x.split("_")[0]))
    df[["Deck", "CabinNum", "Side"]] = df["Cabin"].str.split("/", expand=True)
    df.drop(columns=["Cabin", "Name"], inplace=True)

    # Yeni deƒüi≈üken: Toplam harcama
    df["TotalSpend"] = df["RoomService"] + df["FoodCourt"] + df["ShoppingMall"] + df["Spa"] + df["VRDeck"]

    print("‚úÖ √ñzellik m√ºhendisliƒüi tamamlandƒ±!")

    # üü¢ 4. Kategorik Deƒüi≈ükenleri One-Hot Encoding
    df = pd.get_dummies(df, columns=["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"], drop_first=True)

    # üü¢ 5. Sayƒ±sal Deƒüi≈ükenleri √ñl√ßeklendirme
    num_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "TotalSpend"]
    scaler = StandardScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])

    return df

def train_random_forest(df):
    """
    Random Forest modeli ile eƒüitim yapar ve doƒüruluk skorunu hesaplar.
    """
    y = df["Transported"].astype(int)
    X = df.drop(columns=["Transported", "PassengerId", "CabinNum"])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.02, random_state=42)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    start_time = time.time()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    elapsed_time = time.time() - start_time

    print("\n‚úÖ **Random Forest Modeli Eƒüitildi!**")
    print(f"üìä **Accuracy:** {accuracy:.5f}")
    print(f"‚è≥ **Training Time:** {elapsed_time:.5f} saniye")

    return model, X.columns

# üìå Train veri setini i≈üle
train_prepared = preprocess_dataset_train("train.csv")

# üìå Modeli eƒüit ve doƒüruluk deƒüerini al
random_forest_model, train_columns = train_random_forest(train_prepared)


‚úÖ Veri y√ºklendi: (8693, 14)
‚úÖ √ñzellik m√ºhendisliƒüi tamamlandƒ±!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["HomePlanet"].fillna("Earth", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["CryoSleep"].fillna(False, inplace=True)
  df["CryoSleep"].fillna(False, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interme


‚úÖ **Random Forest Modeli Eƒüitildi!**
üìä **Accuracy:** 0.80104
‚è≥ **Training Time:** 1.04672 saniye


In [3]:
def preprocess_dataset_test(file_path, train_columns):
    """
    Test veri setini i≈üler ve train setindeki s√ºtunlarla uyumlu hale getirir.
    """
    df = pd.read_csv(file_path)
    print(f"‚úÖ Test verisi y√ºklendi: {df.shape}")

    # Eksik veri doldurma
    df["HomePlanet"].fillna("Earth", inplace=True)
    df["CryoSleep"].fillna(False, inplace=True)
    df["Destination"].fillna("TRAPPIST-1e", inplace=True)
    df["Age"].fillna(df["Age"].median(), inplace=True)
    df["RoomService"].fillna(0, inplace=True)
    df["FoodCourt"].fillna(0, inplace=True)
    df["ShoppingMall"].fillna(0, inplace=True)
    df["Spa"].fillna(0, inplace=True)
    df["VRDeck"].fillna(0, inplace=True)

    # Cabin bilgisini b√∂l
    df["Group"] = df["PassengerId"].apply(lambda x: int(x.split("_")[0]))
    df[["Deck", "CabinNum", "Side"]] = df["Cabin"].str.split("/", expand=True)
    df.drop(columns=["Cabin"], inplace=True)

    # Toplam harcama s√ºtunu ekle
    df["TotalSpend"] = df["RoomService"] + df["FoodCourt"] + df["ShoppingMall"] + df["Spa"] + df["VRDeck"]

    print("‚úÖ √ñzellik m√ºhendisliƒüi tamamlandƒ±!")

    # One-Hot Encoding
    df = pd.get_dummies(df, columns=["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"], drop_first=True)

    # Train setinde olup test setinde olmayan s√ºtunlarƒ± ekle
    missing_cols = set(train_columns) - set(df.columns)
    for col in missing_cols:
        df[col] = 0

    df = df[train_columns]
    print(f"‚úÖ Test seti train setiyle uyumlu hale getirildi! Yeni ≈üekil: {df.shape}")

    return df

def test_random_forest(model, test_df, test_csv_path, output_csv="submission.csv"):
    """
    Test veri setiyle tahmin yapar ve Kaggle submission dosyasƒ± olu≈üturur.
    """
    test_raw = pd.read_csv(test_csv_path)
    test_ids = test_raw["PassengerId"]

    predictions = model.predict(test_df)

    submission = pd.DataFrame({"PassengerId": test_ids, "Transported": predictions})
    submission["Transported"] = submission["Transported"].astype(bool)

    submission.to_csv(output_csv, index=False)
    print(f"‚úÖ Tahminler {output_csv} dosyasƒ±na kaydedildi!")

# üìå 1. Test veri setini i≈üle (train ile uyumlu hale getir)
test_prepared = preprocess_dataset_test("test.csv", train_columns)

# üìå 2. Model ile tahmin yap ve submission.csv olu≈ütur
test_random_forest(random_forest_model, test_prepared, "test.csv")


‚úÖ Test verisi y√ºklendi: (4277, 13)
‚úÖ √ñzellik m√ºhendisliƒüi tamamlandƒ±!
‚úÖ Test seti train setiyle uyumlu hale getirildi! Yeni ≈üekil: (4277, 22)
‚úÖ Tahminler submission.csv dosyasƒ±na kaydedildi!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["HomePlanet"].fillna("Earth", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["CryoSleep"].fillna(False, inplace=True)
  df["CryoSleep"].fillna(False, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interme