In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
data=pd.read_csv("/content/NHANES_age_prediction.csv")
data.drop(columns=['RIDAGEYR'], inplace=True)
print(data.describe())
count_missing = data.isnull().sum()
gender_frequencies=[data['RIAGENDR'].value_counts()]
age_frequencies=[data['age_group'].value_counts()]
activity_frequencies=[data['PAQ605'].value_counts()]
diabetes_frequencies=[data['DIQ010'].value_counts()]
print(gender_frequencies, age_frequencies, activity_frequencies, diabetes_frequencies)
sns.histplot(data['BMXBMI'])
plt.show()
sns.histplot(data['LBXGLU'])
plt.show()
sns.histplot(data['LBXGLT'])
plt.show()
sns.histplot(data['LBXIN'])
plt.show()
correlation_matrix= data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()

[2.0    1165
1.0    1113
Name: RIAGENDR, dtype: int64] [Adult     1914
Senior     364
Name: age_group, dtype: int64] [2.0    1868
1.0     409
7.0       1
Name: PAQ605, dtype: int64] [2.0    2199
3.0      58
1.0      21
Name: DIQ010, dtype: int64]


In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import joblib
stratify_column = data['age_group']
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=13)
for train_index, test_index in stratified_split.split(data, stratify_column):
    X_train = data.iloc[train_index]
    X_test = data.iloc[test_index]
X=X_train.drop(columns=['age_group'])
Y=X_train['age_group']

LR=LogisticRegression(random_state=13, max_iter=1000, class_weight={'Adult':1, 'Senior':6}).fit(X, Y)
x_test=X_test.drop(columns=['age_group'])
y_test=X_test['age_group']
y_pred=LR.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:\n', report)

joblib.dump(LR, 'LR.pkl')

Accuracy: 0.6667
Classification Report:
               precision    recall  f1-score   support

       Adult       0.92      0.66      0.77       575
      Senior       0.28      0.68      0.39       109

    accuracy                           0.67       684
   macro avg       0.60      0.67      0.58       684
weighted avg       0.81      0.67      0.71       684



['LR.pkl']

In [54]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
stratify_column = data['age_group']
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=13)
for train_index, test_index in stratified_split.split(data, stratify_column):
    X_train = data.iloc[train_index]
    X_test = data.iloc[test_index]
X=X_train.drop(columns=['age_group'])
Y=X_train['age_group']
svm = SVC(kernel='linear', class_weight={'Adult':1, 'Senior':5})
svm=svm.fit(X, Y)
x_test=X_test.drop(columns=['age_group'])
y_test=X_test['age_group']
y_pred=svm.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:\n', report)
joblib.dump(svm, 'SVM.pkl')

Accuracy: 0.6140
Classification Report:
               precision    recall  f1-score   support

       Adult       0.95      0.57      0.71       575
      Senior       0.27      0.83      0.41       109

    accuracy                           0.61       684
   macro avg       0.61      0.70      0.56       684
weighted avg       0.84      0.61      0.67       684



['SVM.pkl']

In [35]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [49]:

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


X=data.drop(columns=['age_group'])
y=data['age_group']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
catboost_model = CatBoostClassifier(iterations=100, depth=5, learning_rate=0.1, loss_function='Logloss', class_weights={'Adult':1, 'Senior':3})
catboost_model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=10, verbose=10)
y_pred = catboost_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:\n', report)
joblib.dump(catboost_model, 'catboost_model.pkl')

0:	learn: 0.6756666	test: 0.6782376	best: 0.6782376 (0)	total: 1.26ms	remaining: 125ms
10:	learn: 0.5866451	test: 0.6044223	best: 0.6044223 (10)	total: 25.3ms	remaining: 204ms
20:	learn: 0.5491844	test: 0.5792663	best: 0.5792663 (20)	total: 52.6ms	remaining: 198ms
30:	learn: 0.5310532	test: 0.5699985	best: 0.5697043 (29)	total: 80.3ms	remaining: 179ms
40:	learn: 0.5188900	test: 0.5652095	best: 0.5649954 (39)	total: 108ms	remaining: 156ms
50:	learn: 0.5107706	test: 0.5605976	best: 0.5605976 (50)	total: 135ms	remaining: 130ms
60:	learn: 0.5012702	test: 0.5577687	best: 0.5574179 (59)	total: 160ms	remaining: 102ms
70:	learn: 0.4929686	test: 0.5573479	best: 0.5560251 (69)	total: 189ms	remaining: 77.4ms
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.5560250709
bestIteration = 69

Shrink model to first 70 iterations.
Accuracy: 0.8114
Classification Report:
               precision    recall  f1-score   support

       Adult       0.89      0.88      0.89       382
      S

['catboost_model.pkl']