In [1]:
import xgboost as xgb

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import randint, uniform
from imblearn.over_sampling import SMOTE


In [3]:
# Load the training data from a CSV file in the working directory
df = pd.read_csv('Train_Data.csv')
# Print (rows, columns) as a quick sanity check on the load
print("Initial dataset shape:", df.shape)


Initial dataset shape: (1966, 9)


In [4]:
# Count the missing entries in every column of the freshly loaded frame
print(df.isna().sum())


SEQN         12
RIAGENDR     18
PAQ605       13
BMXBMI       18
LBXGLU       13
DIQ010       18
LBXGLT       11
LBXIN         9
age_group    14
dtype: int64


In [5]:
# Keep only rows whose target (age_group) is present, then discard the
# SEQN sequence-number column
df = df.dropna(subset=['age_group']).drop(columns='SEQN')



In [6]:
# Median imputation: fill missing values in each numeric column with that
# column's median (note: median, not mean).
# NOTE: the medians are computed on the whole frame, before the train/test
# split performed later in the notebook, so hold-out rows contribute to the
# fill values.
df = df.fillna(df.median(numeric_only=True))
# Confirm that no missing values remain
df.isnull().sum()


RIAGENDR     0
PAQ605       0
BMXBMI       0
LBXGLU       0
DIQ010       0
LBXGLT       0
LBXIN        0
age_group    0
dtype: int64

In [7]:
# Encode the target into integer codes WITHOUT writing them back into `df`.
# Overwriting df['age_group'] would make this cell non-idempotent: on a second
# run the encoder would be fitted on the codes themselves, and
# label_encoder.classes_ would no longer hold the original class names that
# the classification report below relies on.
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(df['age_group']),
    index=df.index,
    name='age_group',
)

# Features are every column except the target
X = df.drop(columns=['age_group'])

# Hold out 20% of the rows, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# SMOTE oversampling is fitted on the training part only; the hold-out
# split is left untouched
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# XGBOOST
# ============================================
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=127)
model.fit(X_train_bal, y_train_bal)

# Evaluate on the hold-out split
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

# One explicit label list shared by the report and the confusion matrix, so
# both always cover the same classes in the same order
class_ids = list(range(len(label_encoder.classes_)))

# Classification report with all original class labels
print(classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=label_encoder.classes_
))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=class_ids))
# ====================================================

# RANDOM FOREST
# ============================================
# model = RandomForestClassifier(class_weight="balanced", random_state=42)

# model.fit(X_train_bal, y_train_bal)

# # Predict on test set
# y_pred = model.predict(X_test)

# accuracy = accuracy_score(y_test, y_pred)
# print(f"\nAccuracy: {accuracy:.4f}")

# # Classification report with all original class labels
# print(classification_report(
#     y_test,
#     y_pred,
#     labels=list(range(len(label_encoder.classes_))),
#     target_names=label_encoder.classes_
# ))
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# ====================================================




# KNC
# ====================================================
# model = KNeighborsClassifier(n_neighbors=5)

# model.fit(X_train_bal, y_train_bal)

# # Predict on test set
# y_pred = model.predict(X_test)

# accuracy = accuracy_score(y_test, y_pred)
# print(f"\nAccuracy: {accuracy:.4f}")

# # Classification report with all original class labels
# print(classification_report(
#     y_test,
#     y_pred,
#     labels=list(range(len(label_encoder.classes_))),
#     target_names=label_encoder.classes_
# ))
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# ===================================



Accuracy: 0.6240
              precision    recall  f1-score   support

       Adult       0.87      0.65      0.74       328
      Senior       0.22      0.51      0.30        63

    accuracy                           0.62       391
   macro avg       0.54      0.58      0.52       391
weighted avg       0.77      0.62      0.67       391

Confusion Matrix:
 [[212 116]
 [ 31  32]]


In [8]:
# Load the test data from a CSV file in the working directory
test_data = pd.read_csv("Test_Data.csv")
# Preview the first rows
test_data.head()

Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN
0,77017.0,1.0,1.0,32.2,96.0,2.0,135.0,15.11
1,75580.0,2.0,2.0,26.3,100.0,2.0,141.0,15.26
2,73820.0,1.0,2.0,28.6,107.0,2.0,136.0,8.82
3,80489.0,2.0,1.0,22.1,93.0,2.0,111.0,12.13
4,82047.0,1.0,1.0,24.7,91.0,2.0,105.0,3.12


In [9]:
# Mirror the training preprocessing: remove the SEQN column, then fill any
# gaps using the medians of the training frame `df`
test_data = (
    test_data
    .drop(columns='SEQN')
    .fillna(df.median(numeric_only=True))
)
# Per-column count of values that are still missing
test_data.isna().sum()

RIAGENDR    0
PAQ605      0
BMXBMI      0
LBXGLU      0
DIQ010      0
LBXGLT      0
LBXIN       0
dtype: int64

In [10]:
# Predict class codes for the test set. The test frame has no age_group
# column, so this is inference, not evaluation.
# NOTE(review): this rebinds `y_test`, which until now held the hold-out
# labels produced by train_test_split. A distinct name (e.g. `test_pred`)
# would avoid the shadowing, but later cells read `y_test`, so the name is
# kept here.
y_test = model.predict(test_data)


In [11]:
# Display the predicted class codes for the test set
y_test


array([1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,

In [12]:
# Wrap the predictions in a single-column frame named 'age_group'
result = pd.Series(y_test, name='age_group').to_frame()

In [13]:
# Display the frame that will be written to disk
result

Unnamed: 0,age_group
0,1
1,0
2,1
3,0
4,1
...,...
307,0
308,0
309,0
310,1


In [14]:
# Write the predictions to xg.csv; index=False omits the row-index column
result.to_csv("xg.csv", index=False)