In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# ============================================================
# 1) IMPORTS
# ============================================================
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# ============================================================
# 2) SETTINGS → CHANGE ONLY THIS
# ============================================================
TARGET_COLUMN = "target"     # <---- CHANGE ONLY THIS

# ============================================================
# 3) LOAD DATA
# ============================================================
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# Auto-detect ID column
id_col = train.columns[0] if "id" in train.columns[0].lower() else None

# ============================================================
# 4) SPLIT TRAIN INTO X, y
# ============================================================
y = train[TARGET_COLUMN]
X = train.drop(columns=[TARGET_COLUMN])

if id_col and id_col in X.columns:
    X = X.drop(columns=[id_col])

if id_col and id_col in test.columns:
    test_ids = test[id_col]
    test = test.drop(columns=[id_col])
else:
    test_ids = np.arange(len(test))

# ============================================================
# 5) AUTO-DETECT COLUMN TYPES
# ============================================================
num_cols = X.select_dtypes(include=['int64','float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

# ============================================================
# 6) MISSING VALUE IMPUTATION
# ============================================================
imp_num = SimpleImputer(strategy="median")
imp_cat = SimpleImputer(strategy="most_frequent")

X[num_cols] = imp_num.fit_transform(X[num_cols])
test[num_cols] = imp_num.transform(test[num_cols])

X[cat_cols] = imp_cat.fit_transform(X[cat_cols])
test[cat_cols] = imp_cat.transform(test[cat_cols])

# ============================================================
# 7) ENCODING
# ============================================================
X = pd.get_dummies(X, drop_first=True)
test = pd.get_dummies(test, drop_first=True)

# Ensure test matches train
test = test.reindex(columns=X.columns, fill_value=0)

# ============================================================
# 8) SCALING
# ============================================================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test)

# ============================================================
# 9) TRAIN–TEST SPLIT
# ============================================================
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# ============================================================
# 10) RANDOM FOREST REGRESSOR BASE MODEL
# ============================================================
rf = RandomForestRegressor(n_estimators=300, max_depth=20, random_state=42)
rf.fit(X_train, y_train)

pred_val = rf.predict(X_val)
print("Base RMSE:", np.sqrt(mean_squared_error(y_val, pred_val)))
print("Base R²:", r2_score(y_val, pred_val))

# ============================================================
# 11) GRID SEARCH (HYPERPARAMETER TUNING)
# ============================================================
params = {
    "n_estimators": [200, 300, 500],
    "max_depth": [10, 20, None]
}

grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    params, cv=3, n_jobs=-1
)

grid.fit(X_train, y_train)
print("Best Params:", grid.best_params_)

best_model = grid.best_estimator_

# ============================================================
# 12) FINAL EVALUATION
# ============================================================
final_pred = best_model.predict(X_val)

print("\nFinal RMSE:", np.sqrt(mean_squared_error(y_val, final_pred)))
print("Final R²:", r2_score(y_val, final_pred))

# ============================================================
# 13) FINAL PREDICTIONS FOR SUBMISSION
# ============================================================
test_pred = best_model.predict(test_scaled)

submission = pd.DataFrame({
    "id": test_ids,
    TARGET_COLUMN: test_pred
})

submission.to_csv("submission.csv", index=False)
print("submission.csv saved!")

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'