# In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# In [None]:
# ===============================
# 1) IMPORTS
# ===============================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# ===============================
# 2) SETTINGS (CHANGE ONLY THESE)
# ===============================
# Name of the label column. It is read from the training table and is also
# the name of the prediction column in a "class" submission.
TARGET = "NObeyesdad"        # target column
# Name of the row-identifier column. It is read from the test table and
# copied into the submission.
ID_COL = "id"            # id column
# Submission format: "class" writes predicted labels, "prob" writes
# per-class probabilities.
OUTPUT_TYPE = "class"     # "class" or "prob"

# CLASS_NAMES = ["Status_C", "Status_CL", "Status_D"]  
# ↑ only needed when OUTPUT_TYPE = "prob"

# ===============================
# 3) LOAD DATA
# ===============================
# Read the training and test tables.
train = pd.read_csv("/kaggle/input/ai-201-b-mse-2-ai-a/train.csv")
test  = pd.read_csv("/kaggle/input/ai-201-b-mse-2-ai-a/test.csv")

# Labels are kept apart from the features.
y = train[TARGET]

# The identifier column is removed from the test features below, so it has to
# be removed from the training features too. Otherwise the two frames disagree
# on their columns (the later per-column steps index X_test with X's column
# names) and the model would be trained on a row id.
# errors="ignore" keeps this working when the training table has no
# identifier column; a missing TARGET is still reported by the lookup above.
X = train.drop(columns=[TARGET, ID_COL], errors="ignore")

# Identifiers are kept aside so they can be written into the submission.
test_ids = test[ID_COL]
X_test = test.drop(columns=[ID_COL])

# ===============================
# 4) COLUMN TYPES
# ===============================
# Partition the training features by dtype: numeric columns on one side,
# every other column on the other.
numeric_part = X.select_dtypes(include=[np.number])
non_numeric_part = X.select_dtypes(exclude=[np.number])

num_cols = numeric_part.columns
cat_cols = non_numeric_part.columns


# ===============================
# 5) IMPUTATION
# ===============================
# Numeric gaps are filled with the column median, all other gaps with the
# most frequent value of the column.
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

# Each imputer is fitted on the training columns and then applied, without
# refitting, to the same columns of the test features. A column group with
# no members is skipped.
for cols, imputer in ((num_cols, num_imputer), (cat_cols, cat_imputer)):
    if cols.empty:
        continue
    X[cols] = imputer.fit_transform(X[cols])
    X_test[cols] = imputer.transform(X_test[cols])


# ===============================
# 6) ENCODING (SAFE FOR ALL DATA)
# ===============================
# One-hot encode the training features; the resulting columns define the
# layout the model is trained on.
X = pd.get_dummies(X)

# Encode the test features the same way, then force them onto the training
# layout: columns absent from training are dropped and columns missing from
# the test encoding are added and filled with 0.
X_test = pd.get_dummies(X_test).reindex(columns=X.columns, fill_value=0)

# ===============================
# 7) TARGET ENCODING
# ===============================
# Fit the encoder on the labels, then convert them to integer codes.
# The fitted encoder is kept so predictions can be decoded later.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

# ===============================
# 8) TRAIN–VALIDATION SPLIT
# ===============================
# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable between runs.
split_parts = train_test_split(X, y_enc, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

# ===============================
# 9) MODEL
# ===============================
# Hyperparameters are collected in one place so they are easy to adjust.
rf_params = {
    "n_estimators": 300,
    "max_depth": 15,
    "random_state": 42,
    "class_weight": "balanced",
}

# Build the forest and fit it on the training part of the split.
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

# ===============================
# 10) VALIDATION
# ===============================
# Score the fitted model on the held-out rows and report the accuracy.
val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_pred)
print("Accuracy:", val_accuracy)

# ===============================
# 11) FINAL PREDICTION
# ===============================
# Build the submission table in the format selected by OUTPUT_TYPE.
if OUTPUT_TYPE == "class":
    # Predict encoded labels, then map them back to the original label values.
    preds = model.predict(X_test)
    preds = le.inverse_transform(preds)

    submission = pd.DataFrame({
        ID_COL: test_ids,
        TARGET: preds
    })

elif OUTPUT_TYPE == "prob":
    probs = model.predict_proba(X_test)

    # The probability columns follow the order of model.classes_, which holds
    # the encoded labels the model was fitted on. Decoding them gives column
    # names that always match the probabilities, so no hand-maintained list
    # of class names is needed.
    class_names = le.inverse_transform(model.classes_)

    submission = pd.DataFrame(
        probs,
        columns=class_names
    )

    # The identifier goes first, ahead of the per-class probability columns.
    submission.insert(0, ID_COL, test_ids)

else:
    # Fail loudly on a mistyped setting instead of silently picking a format.
    raise ValueError(
        f'OUTPUT_TYPE must be "class" or "prob", got {OUTPUT_TYPE!r}'
    )

# ===============================
# 12) SAVE CSV
# ===============================
# Write the submission to the working directory without the DataFrame index.
# The file name was misspelled ("submissssion.csv") and is corrected to
# "submission.csv".
submission.to_csv("submission.csv", index=False)