In [425]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [427]:
#upload and merge data

pd.set_option('display.max_columns', None)
df1 = pd.read_csv('personal_info_train.csv')
df2 = pd.read_csv('measurements_results_train.csv')

df = pd.merge(df1, df2, on="patient_id")

In [429]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64788 entries, 0 to 64787
Data columns (total 39 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   patient_id   64788 non-null  object 
 1   created_at   64788 non-null  object 
 2   birth_date   64788 non-null  object 
 3   gender       64788 non-null  object 
 4   country      64788 non-null  object 
 5   region       0 non-null      float64
 6   HMO          38811 non-null  object 
 7   height       64788 non-null  float64
 8   bmi          64660 non-null  float64
 9   heart_rate   64788 non-null  float64
 10  steps_day_1  64788 non-null  int64  
 11  steps_day_2  61515 non-null  float64
 12  steps_day_3  64788 non-null  int64  
 13  steps_day_4  64788 non-null  int64  
 14  steps_day_5  64788 non-null  int64  
 15  city         64788 non-null  object 
 16  employment   64788 non-null  int64  
 17  weight       64788 non-null  float64
 18  label        64788 non-null  int64  
 19  test

In [431]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
region,0.0,,,,,,,
height,64788.0,166.707699,14.11075,1.49992,159.854853,167.278849,174.94224,197.9218
bmi,64660.0,7716.384287,1167427.0,14.004048,23.397359,25.005442,26.622584,213364800.0
heart_rate,64788.0,77.69251,14.14733,50.473672,65.439226,77.741777,89.910432,104.5811
steps_day_1,64788.0,451.610298,371.8757,10.0,174.0,343.0,625.0,2324.0
steps_day_2,61515.0,446.852004,367.6409,10.0,173.0,339.0,618.0,2325.0
steps_day_3,64788.0,442.450979,364.2989,10.0,173.0,336.0,609.0,2314.0
steps_day_4,64788.0,439.57665,362.4054,10.0,171.0,335.0,605.0,2326.0
steps_day_5,64788.0,436.866611,364.7725,10.0,169.0,327.0,602.25,2345.0
employment,64788.0,1.990847,1.415629,0.0,1.0,2.0,3.0,4.0


In [419]:
df.columns

Index(['gender', 'HMO', 'height', 'bmi', 'heart_rate', 'steps_day_1',
       'steps_day_2', 'steps_day_3', 'steps_day_4', 'steps_day_5', 'city',
       'employment', 'weight', 'label', 'test_0', 'test_1', 'test_2', 'test_3',
       'test_4', 'test_5', 'test_6', 'test_7', 'test_8', 'test_9', 'test_10',
       'test_11', 'test_12', 'test_13', 'test_14', 'test_15', 'test_16',
       'test_17', 'test_18', 'test_19', 'age', 'year_created', 'steps_median',
       'corr_bmi', 'steps_mean'],
      dtype='object')

In [404]:
#fill missing values in the measurements file

columns_to_fill = ['test_2','test_6', 'test_8', 'test_10', 'test_12', 'test_15']
mean_values = df[columns_to_fill].mean()
df.fillna(mean_values, inplace=True)

#create age column

from datetime import datetime
current_year = datetime.now().year

df['birth_date'] = pd.to_datetime(df['birth_date'])
df['age'] = current_year - df['birth_date'].dt.year

# Map gender to integer values
df['gender'] = df['gender'].map({'M': 0, 'F': 1})

# make the created at a time colum

df['created_at'] = pd.to_datetime(df['created_at'])

df['year_created'] = df['created_at'].dt.year

#fix steps

df['steps_median'] = df[['steps_day_1', 'steps_day_2', 'steps_day_3', 'steps_day_4', 'steps_day_5']].median(axis=1)
df.loc[:, 'steps_day_2'] = df['steps_day_2'].fillna(df['steps_median'])

#make missing HMO category

df['HMO'] = df['HMO'].fillna('missingHMO')

#fix weight column

df['weight'] = df['weight'].apply(lambda x: x/1000 if x > 500 else df['weight'].mean() if x<10 else x)

# fix height colum

df['height'] = df['height'].apply(lambda x: x/100 if x > 100 else df['height'].mean() if x<10 else x)

# new BMI column

df['corr_bmi'] = df['weight'] / (df['height'] ** 2)

#make steps mean column

df['steps_mean'] = df[['steps_day_1', 'steps_day_2', 'steps_day_3', 'steps_day_4', 'steps_day_5']].mean(axis=1)

# Categorize city
label_encoder = LabelEncoder()
label_encoder.fit(df['city'])
df['city'] = label_encoder.transform(df['city'])

# Categorize HMO
label_encoder = LabelEncoder()
label_encoder.fit(df['HMO'])
df['HMO'] = label_encoder.transform(df['HMO'])

# Drop irrelevant columns
df.drop(['created_at', 'country', 'region', 'birth_date', 'patient_id'], axis=1,inplace=True)
df.head()

Unnamed: 0,gender,HMO,height,bmi,heart_rate,steps_day_1,steps_day_2,steps_day_3,steps_day_4,steps_day_5,city,employment,weight,label,test_0,test_1,test_2,test_3,test_4,test_5,test_6,test_7,test_8,test_9,test_10,test_11,test_12,test_13,test_14,test_15,test_16,test_17,test_18,test_19,age,year_created,steps_median,corr_bmi,steps_mean
0,0,3,1.692972,25.948374,61.600474,96,81.0,49,62,63,14,3,74.372008,0,-0.304823,2.526928,1.695177,0.883312,-0.304823,-0.467827,0.060159,2.967887,1.695177,0.492788,0.695177,2.007762,1.695177,-0.736405,0.695177,100.305745,2.592215,98.746159,0.074492,101.669656,59,2019,63.0,25.948374,70.2
1,0,2,1.71637,23.463067,57.429018,50,22.0,25,28,23,0,2,69.120473,0,-2.409156,-1.430361,-0.409156,-0.995013,-2.409156,-2.298425,-0.409156,-0.611722,-3.409156,0.471265,-0.409156,0.36937,-3.409156,-2.01552,-3.409156,97.467493,-2.048522,96.767409,-1.840644,102.044307,56,2020,25.0,23.463067,29.6
2,1,3,1.594097,23.600005,80.609092,280,86.0,196,272,305,2,0,59.971076,0,-0.23768,-2.109256,-2.23768,1.387978,1.76232,0.377945,-0.23768,-0.757449,0.76232,0.092166,-0.23768,-1.533615,-1.23768,-1.140273,-0.23768,99.619753,-0.744591,100.9202,-0.954887,99.51866,78,2018,272.0,23.600005,227.8
3,1,3,1.59415,24.070889,89.218106,277,354.0,423,678,654,15,4,61.171685,0,0.303403,-4.001235,-3.696597,-0.070918,3.303403,-0.08375,-1.696597,-2.559142,0.303403,-0.705679,-1.696597,-3.215954,0.002483,-0.788122,-1.696597,99.184532,-2.482011,102.026019,-0.461391,97.200491,86,2018,423.0,24.070889,477.2
4,0,2,1.808763,25.322567,68.951826,531,441.0,346,273,374,6,3,82.845908,0,-2.361192,1.176278,0.638808,-0.064485,-1.361192,0.561864,0.638808,2.294657,-0.361192,-0.244779,0.638808,1.350373,0.638808,-2.392055,-0.361192,98.793925,1.40142,97.168459,-1.38358,101.809453,60,2018,374.0,25.322567,393.0


In [405]:
#training xgboost model

X = df.drop(['label'], axis=1)
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'binary:logistic',  # For binary classification
    'max_depth': 6,
    'eta': 0.3,  # Learning rate
    'eval_metric': 'logloss'
}

num_round = 100  # Number of boosting rounds
model = xgb.train(params, dtrain, num_round)

predictions = model.predict(dtest)
# Convert probabilities to binary predictions
predictions = [1 if pred > 0.5 else 0 for pred in predictions]

accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.99


In [406]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99     12422
           1       0.95      0.73      0.83       536

    accuracy                           0.99     12958
   macro avg       0.97      0.86      0.91     12958
weighted avg       0.99      0.99      0.99     12958



In [407]:
print(confusion_matrix(y_test, predictions))

[[12403    19]
 [  145   391]]
