<a href="https://colab.research.google.com/github/devps814/Summer-Analytics-W-1/blob/main/Summer_hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#📦 Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, f1_score


In [None]:

# 📥 Step 2: Load Data
# Read the training set, the test set and the sample submission (in that
# order) from the Colab /content directory.
train, test, sample_submission = map(
    pd.read_csv,
    (
        '/content/Train_Data.csv',
        '/content/Test_Data.csv',
        '/content/Sample_Submission.csv',
    ),
)


In [None]:
# 🔍 Clean age_group column
# Cast the labels to text first so that .str.strip() can remove stray
# leading/trailing whitespace from every entry.
age_group_text = train['age_group'].astype(str)
train['age_group'] = age_group_text.str.strip()

In [None]:
# 🧹 Handle categorical columns with 7/9 (assumed invalid/missing)
# Codes 7 and 9 are turned into NaN, then every gap is filled with the most
# frequent value of the *training* column (the same value is reused for the
# test set), and the column is finally cast to int.
categorical_cols = ['RIAGENDR', 'PAQ605', 'DIQ010']
for col in categorical_cols:
    train_col = train[col].replace([7, 9], np.nan)
    test_col = test[col].replace([7, 9], np.nan)
    # Computed once from the training data only, before any filling.
    fill_value = train_col.mode()[0]
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the chained inplace form works on a temporary copy under pandas
    # copy-on-write and would leave the NaNs in place.
    train[col] = train_col.fillna(fill_value).astype(int)
    test[col] = test_col.fillna(fill_value).astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(train[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

In [None]:
# 🔄 Encode target variable: Adult=0, Senior=1
label_codes = {'Adult': 0, 'Senior': 1}
raw_labels = train['age_group'].astype(str).str.strip()
# Map the known class names; values that are already numeric (e.g. when this
# cell is re-run after encoding) are kept as they are.
encoded = raw_labels.map(label_codes).fillna(pd.to_numeric(raw_labels, errors='coerce'))
# Rows whose label is missing or unrecognised are dropped rather than being
# silently relabelled as Adult (0), which would inject wrong targets into
# the training data.
has_label = encoded.notna()
if not has_label.all():
    print(f"Dropping {int((~has_label).sum())} training rows with missing/unknown age_group")
train = train.loc[has_label].copy()
train['age_group'] = encoded.loc[has_label].astype(int)



In [None]:
# 🔍 Fill numeric missing values with mean
# Collect the names of all numeric training columns, then take the target
# out of the list so that it is never imputed.
numeric_frame = train.select_dtypes(include=np.number)
numeric_cols = list(numeric_frame.columns)
del numeric_cols[numeric_cols.index('age_group')]  # exclude target (ValueError if absent)

In [None]:
# Impute missing numeric values with the training-set mean of each column;
# the same training mean is applied to the test set.
for col in numeric_cols:
    col_mean = train[col].mean()
    # Assign back rather than using `df[col].fillna(..., inplace=True)`,
    # which acts on a temporary copy under pandas copy-on-write.
    train[col] = train[col].fillna(col_mean)
    test[col] = test[col].fillna(col_mean)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(train[col].mean(), inplace=True)


In [None]:
# 🧪 Feature Engineering
# The same three derived columns are added to both data sets:
#   glucose_bmi   - LBXGLU divided by (BMXBMI + 1)
#   insulin_ratio - LBXIN divided by (LBXGLU + 1)
#   is_active     - 1 where PAQ605 equals 1, otherwise 0
for frame in (train, test):
    frame['glucose_bmi'] = frame['LBXGLU'] / (frame['BMXBMI'] + 1)
    frame['insulin_ratio'] = frame['LBXIN'] / (frame['LBXGLU'] + 1)
    frame['is_active'] = (frame['PAQ605'] == 1).astype(int)

In [None]:
# 🔢 Extract test IDs and remove from features
# SEQN is taken out of both frames so it is not used as a model feature;
# the test-set values are kept in `test_ids`.
if 'SEQN' in test.columns:
    test_ids = test.pop('SEQN')
if 'SEQN' in train.columns:
    del train['SEQN']

In [None]:
# 🔢 Features and Target
X = train.drop('age_group', axis=1)
y = train['age_group']

In [None]:
# 🔄 Scale Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test)



In [None]:
# Hold out 20% of the rows for validation, stratified on the target, with a
# fixed seed so the split is reproducible.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, **split_options)



In [None]:
# 🚀 Train XGBoost Model
model = XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=4, random_state=42, use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.



In [None]:
# 📊 Evaluate
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
print("F1 Score:", f1_score(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.85      0.98      0.91       331
           1       0.47      0.11      0.18        63

    accuracy                           0.84       394
   macro avg       0.66      0.54      0.54       394
weighted avg       0.79      0.84      0.79       394

F1 Score: 0.1794871794871795


In [None]:
# Predict labels for the scaled test set with the trained model.
test_preds = model.predict(test_scaled)

In [None]:
# 📄 Save Final Submission in /content/
# The submission holds a single `age_group` column with the test
# predictions and is written without the DataFrame index.
submission_path = '/content/final_submission.csv'
final_submission = pd.DataFrame({'age_group': test_preds})
final_submission.to_csv(submission_path, index=False)
print("✅ Final submission file saved as 'final_submission.csv'")



✅ Final submission file saved as 'final_submission.csv'
