In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# === 1. Load test data ===
# Read the feature file, then split the "id" column off into test_ids.
# DataFrame.pop returns the column and removes it from test_df in one step,
# so the ids stay available for the submission while the features lose them.
test_df = pd.read_csv("test_kaggle_features.csv")
test_ids = test_df.pop("id")

In [3]:

# === 2. Preprocess test data (same as training) ===
# Derive age_years by truncating age / 365 to an integer (presumably the raw
# column is in days -- confirm against the training notebook), then drop "age".
test_df = test_df.assign(age_years=(test_df["age"] / 365).astype(int)).drop(columns="age")

# Map every 'Yes'/'No' value in the frame to 1/0. A single pass is enough:
# the one-hot encoding below only adds indicator columns, so it cannot
# introduce new 'Yes'/'No' values that would need a second replacement.
test_df = test_df.replace({'Yes': 1, 'No': 0})

# One-hot encoding (same columns as training)
categorical_cols = ["gender", "cholesterol", "gluc"]
test_df = pd.get_dummies(test_df, columns=categorical_cols, prefix=categorical_cols, drop_first=False)


In [4]:
# === 3. Align test data to model features ===
# Load the saved feature names, flatten whatever container they were stored
# in, keep only string entries, and de-duplicate while preserving order
# (dict.fromkeys keeps the first occurrence of each name).
raw_features = joblib.load("model_features.pkl")
flat_features = np.array(raw_features).flatten()
model_features = list(dict.fromkeys(str(f) for f in flat_features if isinstance(f, str)))
print(f"Loaded {len(model_features)} model features:", model_features)

# Surface features the test data does not provide instead of zero-filling
# them silently: a non-empty list here usually means a preprocessing mismatch
# (or a category level that never occurs in the test set).
missing_features = [col for col in model_features if col not in test_df.columns]
if missing_features:
    print(f"WARNING: {len(missing_features)} model feature(s) missing from test data, filled with 0:", missing_features)

# Select the model's columns in the model's order; absent columns become 0.
test_df = test_df.reindex(columns=model_features, fill_value=0)

print(test_df)

Loaded 10 model features: ['cholesterol_3', 'gluc_3', 'age_years', 'weight', 'smoke', 'active', 'ap_lo', 'alco', 'gender_2', 'ap_hi']
       cholesterol_3  gluc_3  age_years  weight  smoke  active  ap_lo  alco  \
0              False   False         53    59.5      0       1     85     0   
1              False   False         57    59.0      0       1     90     0   
2              False   False         41    88.0      0       1     80     0   
3              False   False         51    62.0      0       1     90     0   
4              False   False         49    81.0      0       1     80     0   
...              ...     ...        ...     ...    ...     ...    ...   ...   
13995          False    True         57    88.0      0       1    100     0   
13996          False   False         61   105.0      0       1     90     0   
13997          False   False         59    87.0      0       1   1100     0   
13998          False   False         50    73.0      0       1     80     0 

In [5]:
# === 4. Scale test data ===
# The model was trained on features standardised with the TRAINING set's
# mean/std, so the test set must be transformed with that same fitted scaler.
# Fitting a new StandardScaler on the test data gives different statistics
# and therefore inputs the model was not trained on.
# NOTE(review): "scaler.pkl" is an assumed filename -- point SCALER_PATH at
# wherever the training notebook persisted its scaler (joblib.dump(scaler, ...)).
# joblib.load unpickles the file, so only load artifacts you trust.
SCALER_PATH = "scaler.pkl"

if os.path.exists(SCALER_PATH):
    scaler = joblib.load(SCALER_PATH)
    test_scaled = scaler.transform(test_df)
else:
    # Fallback keeps the previous behaviour so the notebook still runs, but
    # the predictions should be treated as unreliable until the training
    # scaler is available.
    print(f"WARNING: '{SCALER_PATH}' not found; fitting a new StandardScaler on the test data, "
          "which will not match the scaling used during training.")
    scaler = StandardScaler()
    test_scaled = scaler.fit_transform(test_df)

In [6]:
# === 5. Load model and predict ===
# NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
model = joblib.load("best_model_tuned_Gradient_Boosting.pkl")
predictions = model.predict(test_scaled)

# Translate each predicted class into the label used in the submission:
# class 1 becomes "Yes", anything else becomes "No".
pred_labels = []
for predicted_class in predictions:
    pred_labels.append("Yes" if predicted_class == 1 else "No")

In [7]:
# === 6. Save submission ===
# Pair each test id with its predicted label and write the two columns to
# CSV without the DataFrame index.
submission_columns = {"id": test_ids, "cardio": pred_labels}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv("submission_final.csv", index=False)
print("Submission file saved as 'submission_final.csv'")

Submission file saved as 'submission_final.csv'
