In [1]:
#importing the required libraries

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Using pandas to read the train dataset

df=pd.read_csv('Train_Data.csv')
df.head()

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,61.0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,Adult
1,73568.0,26.0,2.0,2.0,20.3,89.0,2.0,80.0,3.85,Adult
2,73576.0,16.0,1.0,2.0,23.2,89.0,2.0,68.0,6.14,Adult
3,73577.0,32.0,1.0,2.0,28.9,104.0,,84.0,16.15,Adult
4,73580.0,38.0,2.0,1.0,35.9,103.0,2.0,81.0,10.92,Adult


In [3]:
#counting the number of null values in each feature
df.isnull().sum()

SEQN         12
RIDAGEYR      9
RIAGENDR     18
PAQ605       13
BMXBMI       18
LBXGLU       13
DIQ010       18
LBXGLT       11
LBXIN         9
age_group    14
dtype: int64

In [4]:
#Splitting the data into features and target variable
feature_cols = [col for col in df.columns if col not in ['age_group']]
target_cols = ['age_group']


#cont_cols = [col for col in feature_cols if col not in cat_cols]
sns.heatmap(df.isnull(), cmap='Blues', cbar=False, yticklabels=False, xticklabels=df.columns);

In [5]:
#Importing the machine learning libraries 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.impute import MissingIndicator

In [6]:
#Using simple imputer to replace the null values with most frequent values of the feature which contains only string values

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')

df[target_cols] = imputer.fit_transform(df[target_cols])



In [7]:
df

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,61.0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,Adult
1,73568.0,26.0,2.0,2.0,20.3,89.0,2.0,80.0,3.85,Adult
2,73576.0,16.0,1.0,2.0,23.2,89.0,2.0,68.0,6.14,Adult
3,73577.0,32.0,1.0,2.0,28.9,104.0,,84.0,16.15,Adult
4,73580.0,38.0,2.0,1.0,35.9,103.0,2.0,81.0,10.92,Adult
...,...,...,...,...,...,...,...,...,...,...
1961,83711.0,38.0,2.0,2.0,33.5,100.0,2.0,73.0,6.53,Adult
1962,83712.0,61.0,1.0,2.0,30.0,93.0,2.0,208.0,13.02,Adult
1963,83713.0,34.0,1.0,2.0,23.7,103.0,2.0,124.0,21.41,Adult
1964,83718.0,60.0,2.0,2.0,27.4,90.0,2.0,108.0,4.99,Adult


In [8]:
# Using simple imputer to replace the null values with mean of the feature which contains only int values

imputer = SimpleImputer(strategy='mean')

df[feature_cols] = imputer.fit_transform(df[feature_cols])

In [9]:
df

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,61.0,2.0,2.0,35.7,110.000000,2.000000,150.0,14.91,Adult
1,73568.0,26.0,2.0,2.0,20.3,89.000000,2.000000,80.0,3.85,Adult
2,73576.0,16.0,1.0,2.0,23.2,89.000000,2.000000,68.0,6.14,Adult
3,73577.0,32.0,1.0,2.0,28.9,104.000000,2.015914,84.0,16.15,Adult
4,73580.0,38.0,2.0,1.0,35.9,103.000000,2.000000,81.0,10.92,Adult
...,...,...,...,...,...,...,...,...,...,...
1961,83711.0,38.0,2.0,2.0,33.5,100.000000,2.000000,73.0,6.53,Adult
1962,83712.0,61.0,1.0,2.0,30.0,93.000000,2.000000,208.0,13.02,Adult
1963,83713.0,34.0,1.0,2.0,23.7,103.000000,2.000000,124.0,21.41,Adult
1964,83718.0,60.0,2.0,2.0,27.4,90.000000,2.000000,108.0,4.99,Adult


In [10]:
sns.heatmap(df.isnull(), cmap='Blues', cbar=False, yticklabels=False, xticklabels=df.columns);

In [11]:
#encode the target(age_group) values to int, eg:0,1

label_encoder = LabelEncoder()
df['age_group'] = label_encoder.fit_transform(df['age_group'])
df

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,61.0,2.0,2.0,35.7,110.000000,2.000000,150.0,14.91,0
1,73568.0,26.0,2.0,2.0,20.3,89.000000,2.000000,80.0,3.85,0
2,73576.0,16.0,1.0,2.0,23.2,89.000000,2.000000,68.0,6.14,0
3,73577.0,32.0,1.0,2.0,28.9,104.000000,2.015914,84.0,16.15,0
4,73580.0,38.0,2.0,1.0,35.9,103.000000,2.000000,81.0,10.92,0
...,...,...,...,...,...,...,...,...,...,...
1961,83711.0,38.0,2.0,2.0,33.5,100.000000,2.000000,73.0,6.53,0
1962,83712.0,61.0,1.0,2.0,30.0,93.000000,2.000000,208.0,13.02,0
1963,83713.0,34.0,1.0,2.0,23.7,103.000000,2.000000,124.0,21.41,0
1964,83718.0,60.0,2.0,2.0,27.4,90.000000,2.000000,108.0,4.99,0


In [12]:
#Importing the machine learning models for selecting the best model

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [13]:
#Splitting the model into test and train data

from sklearn.model_selection import train_test_split
X=df.drop(columns=['age_group'])
y=df['age_group']
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
#Scaling the features using StandardScaler() function


from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.fit_transform(X_test)
X_scaled=scaler.fit_transform(X)

In [15]:
#Using the logistic regression model to check the accuracy

lr=LogisticRegression()
lr.fit(X_train_scaled,y_train)
lr.score(X_test_scaled,y_test)

0.9873096446700508

In [16]:
#Using support vector machines to check the accuracy

svm=SVC()
svm.fit(X_train_scaled,y_train)
svm.score(X_test_scaled,y_test)

0.9695431472081218

In [17]:
#Using random forest classifier to check the accuracy

rf=RandomForestClassifier(n_estimators=30)
rf.fit(X_train_scaled,y_train)
rf.score(X_test_scaled,y_test)

1.0

In [18]:
#This function is used for checking the accuracy of each model

def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [19]:
# We use three fold cross validation for choosing the best model

from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

scores_logistic = []
scores_svm = []
scores_rf = []


for train_index, test_index in folds.split(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_scaled=scaler.fit_transform(X_train)
    X_test_scaled=scaler.fit_transform(X_test)
    scores_logistic.append(get_score(LogisticRegression(), X_train_scaled, X_test_scaled, y_train, y_test))  
    scores_svm.append(get_score(SVC(), X_train_scaled, X_test_scaled, y_train, y_test))
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=30), X_train_scaled, X_test_scaled, y_train, y_test))

In [20]:
scores_logistic

[0.9873096446700508, 0.9873096446700508, 0.9873096446700508]

In [21]:
scores_svm

[0.9695431472081218, 0.9695431472081218, 0.9695431472081218]

In [22]:
scores_rf

[0.9974619289340102, 1.0, 1.0]

In [23]:
#Alternatively, we can use cross_val_score to check the accuracy of each model

from sklearn.model_selection import cross_val_score

lr_score=cross_val_score(lr,X_scaled,y)
svm_score=cross_val_score(svm,X_scaled,y)
rf_score=cross_val_score(rf,X_scaled,y)
print(lr_score)
print(svm_score)
print(rf_score)


[0.96954315 0.99236641 0.99236641 0.98982188 0.98982188]
[0.92639594 0.97709924 0.97709924 0.96437659 0.95928753]
[0.99746193 0.99745547 0.99745547 1.         0.99491094]


In [24]:
#SO RANDOM FOREST IS USED FOR CLASSIFICATION AS IT HAS HIGHEST NUMBER OF ACCURACY


In [25]:
model=RandomForestClassifier(n_estimators=30)
model.fit(X_train_scaled,y_train)
model.score(X_test_scaled,y_test)

0.9974619289340102

In [26]:
y_predict=model.predict(X_test_scaled)
y_predict

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [27]:
#Confusion matrix is used to check the number of true and false values between actual and predicted test values

cm = confusion_matrix(y_test, y_predict)
cm

array([[338,   0],
       [  1,  55]], dtype=int64)

In [28]:
# Classification report mentions the accuracy report of actual test and predicted test values

print(classification_report(
    y_test,
    y_predict,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_
))

              precision    recall  f1-score   support

       Adult       1.00      1.00      1.00       338
      Senior       1.00      0.98      0.99        56

    accuracy                           1.00       394
   macro avg       1.00      0.99      0.99       394
weighted avg       1.00      1.00      1.00       394



In [29]:
# The model is tested using another data

test_data=pd.read_csv('Test_Data.csv')
test_data

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN
0,77017.0,34.0,1.0,1.0,32.2,96.0,2.0,135.0,15.11
1,75580.0,12.0,2.0,2.0,26.3,100.0,2.0,141.0,15.26
2,73820.0,56.0,1.0,2.0,28.6,107.0,2.0,136.0,8.82
3,80489.0,20.0,2.0,1.0,22.1,93.0,2.0,111.0,12.13
4,82047.0,64.0,1.0,1.0,24.7,91.0,2.0,105.0,3.12
...,...,...,...,...,...,...,...,...,...
307,74150.0,20.0,2.0,2.0,21.9,82.0,2.0,82.0,2.54
308,82550.0,34.0,2.0,1.0,33.3,95.0,2.0,77.0,6.36
309,77835.0,64.0,2.0,2.0,41.5,91.0,2.0,149.0,15.52
310,79281.0,23.0,2.0,2.0,22.5,82.0,2.0,93.0,1.39


In [30]:
# The test data is scaled using StandardScaler()
test_scaled=scaler.fit_transform(test_data)
test_scaled

array([[-0.58611158, -0.32458196, -1.04622875, ..., -0.12116886,
         0.39711946,  0.36038962],
       [-1.08140539, -1.41153181,  0.95581392, ..., -0.12116886,
         0.51190457,  0.37608814],
       [-1.6880283 ,  0.76236788, -1.04622875, ..., -0.12116886,
         0.41625031, -0.29790176],
       ...,
       [-0.3041698 ,  1.15762237,  0.95581392, ..., -0.12116886,
         0.66495138,  0.40329892],
       [ 0.19422607, -0.86805688,  0.95581392, ..., -0.12116886,
        -0.40637632, -1.07550192],
       [-0.51821117, -1.41153181, -1.04622875, ..., -0.12116886,
         1.58323227, -0.32406597]])

In [31]:
y_test=model.predict(test_scaled)
y_test

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,

In [32]:
# The result is converted into dataframe

result = pd.DataFrame({
    'age_group': y_test
})
result

Unnamed: 0,age_group
0,0
1,0
2,0
3,0
4,1
...,...
307,0
308,0
309,1
310,0


In [33]:
#The result is submitted to another file

result.to_csv("submission.csv", index=False) 