# Diabetes Health Prediction

In [None]:
#import library
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import math
from sklearn import metrics

## Data Extraction

In [None]:
# Location of the BRFSS 2015 binary diabetes indicators CSV, relative to the notebook
DATA_PATH = '../input/diabetes-health-indicators-dataset/diabetes_binary_health_indicators_BRFSS2015.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Preview the first rows as loaded, before any type conversion
df.head()

In [None]:
#transform data
# Cast each listed column to integer dtype, one column at a time and in the
# same order as before, then preview the result.
int_columns = [
    'Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
    'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
    'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income',
]
for column in int_columns:
    df[column] = df[column].astype('int')
df.head()

In [None]:
# Summary statistics for every numeric column
df.describe()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
#heatmap correlation
# Pairwise correlation of all columns, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize = (10,6))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, vmax = 0.9, square = True, ax = ax)
ax.set_title("Pearson Correlation")
plt.show()

In [None]:
#split data
# Target is Diabetes_binary, features are every other column;
# 30% of the rows are held out as the test set with a fixed seed.
target = 'Diabetes_binary'
y = df[target]
X = df.drop(target, axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

## Random Forest Model

In [None]:
#build model
rf = RandomForestClassifier(random_state = 1, max_features = 'sqrt', n_jobs = 1, verbose = 1)
%time rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
#prediction
# Predict a label for every row of the held-out test features and print the array
y_pred = rf.predict(X_test)
print(y_pred)

In [None]:
#check MSE & RMSE 
# MSE is computed once; RMSE is its square root
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
print('Mean Squared Error : '+ str(mse))
print('Root Mean Squared Error : '+ str(rmse))

In [None]:
#confusion matrix
# Counts of actual vs. predicted labels on the test split
matrix = metrics.confusion_matrix(y_test, y_pred)
print(matrix)

#heatmap matrix
# Same matrix as an annotated heatmap; cell values are shown as whole numbers
fig, ax = plt.subplots(figsize = (8,6))
sns.heatmap(matrix, annot = True, fmt = ".0f", cmap = 'viridis', ax = ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")
plt.show()

In [None]:
#classification report
# Text report comparing the test labels with the model's predictions
report = metrics.classification_report(y_true = y_test, y_pred = y_pred)
print(report)

## Check Feature Importance

In [None]:
#defining of feature
# Importance of each training column as reported by the fitted forest, largest first
feature = pd.Series(
    data = rf.feature_importances_,
    index = X_train.columns,
).sort_values(ascending = False)
print(feature)

In [None]:
#visualize feature
# Horizontal bars: one per feature, length = importance score
fig, ax = plt.subplots(figsize = (10,6))
sns.barplot(x = feature, y = feature.index, ax = ax)
ax.set_title("Feature Importance")
ax.set_xlabel('Score')
ax.set_ylabel('Features')
plt.show()

## Visualization

In [None]:
#transform data
# Replace the integer survey codes with readable labels for the plots below.
#
# The previous version used chained assignment (df.col[mask] = value). That
# writes through an intermediate Series, raises SettingWithCopyWarning, and
# leaves df unchanged when pandas Copy-on-Write is active. A single
# DataFrame.replace with a nested {column: {code: label}} mapping updates the
# frame reliably. Values not listed in a column's mapping are left as they are,
# so running this cell a second time changes nothing.
yes_no = {0: 'No', 1: 'Yes'}
code_labels = {
    'Diabetes_binary': {0: 'No Diabetes', 1: 'Diabetes'},
    'HighBP': {0: 'No High', 1: 'High BP'},
    'HighChol': {0: 'No High Cholesterol', 1: 'High Cholesterol'},
    'CholCheck': {0: 'No Cholesterol Check in 5 Years', 1: 'Cholesterol Check in 5 Years'},
    'Smoker': yes_no,
    'Stroke': yes_no,
    'HeartDiseaseorAttack': yes_no,
    'PhysActivity': yes_no,
    'Fruits': yes_no,
    'Veggies': yes_no,
    'HvyAlcoholConsump': yes_no,
    'AnyHealthcare': yes_no,
    'NoDocbcCost': yes_no,
    'GenHlth': {1: 'Excellent', 2: 'Very Good', 3: 'Good', 4: 'Fair', 5: 'Poor'},
    'DiffWalk': yes_no,
    'Sex': {0: 'Female', 1: 'Male'},
    # NOTE(review): labels kept as originally written; codes 5 and 6 may not
    # match the survey codebook wording for EDUCA - confirm.
    'Education': {
        1: 'Never Attended School',
        2: 'Elementary',
        3: 'Junior High School',
        4: 'Senior High School',
        5: 'Undergraduate Degree',
        6: 'Magister',
    },
    # The three-band grouping (codes 1-4, 5-7, 8) is unchanged, but the band
    # labels were wrong: codes 1-4 were all called "Less Than $10,000" and
    # codes 5-7 "Less Than $35,000", leaving $35,000-$75,000 unlabelled.
    # NOTE(review): new band limits assume the BRFSS 2015 INCOME2 coding
    # (4 = below $25,000, 7 = below $75,000) - confirm against the codebook.
    'Income': {
        1: 'Less Than $25,000',
        2: 'Less Than $25,000',
        3: 'Less Than $25,000',
        4: 'Less Than $25,000',
        5: '$25,000 to Less Than $75,000',
        6: '$25,000 to Less Than $75,000',
        7: '$25,000 to Less Than $75,000',
        8: '$75,000 or More',
    },
}
df = df.replace(code_labels)
df.head()

In [None]:
#visualize diabetes status
# Number of rows per diabetes status. The column is passed by keyword:
# in recent seaborn releases the first positional parameter of countplot is
# `data`, so a positionally passed Series is no longer treated as `x`.
plt.figure(figsize = (8,6))
sns.countplot(x = 'Diabetes_binary', data = df)
plt.title("Diabetes Status")
plt.show()

In [None]:
#group diabetes status & BP
# Row count for every (Diabetes_binary, HighBP) combination, as a flat table
diabetes_bp = (
    df.groupby(['Diabetes_binary', 'HighBP'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_bp)

In [None]:
#visualize diabetes status ~ BP
# Grouped bars: row count per diabetes status, split by HighBP (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'HighBP', data = diabetes_bp, palette = 'Set1')
plt.title("Diabetes Status ~ BP")
plt.show()

In [None]:
#group diabetes status & cholesterol status
# Row count for every (Diabetes_binary, HighChol) combination, as a flat table
diabetes_chol = (
    df.groupby(['Diabetes_binary', 'HighChol'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_chol)

In [None]:
#visualize diabetes status ~ cholesterol status
# Grouped bars: row count per diabetes status, split by HighChol (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'HighChol', data = diabetes_chol, palette = 'Set2')
plt.title("Diabetes Status ~ Cholesterol Status")
plt.show()

In [None]:
#group diabetes status & cholesterol check
# Row count for every (Diabetes_binary, CholCheck) combination, as a flat table
diabetes_check = (
    df.groupby(['Diabetes_binary', 'CholCheck'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_check)

In [None]:
#visualize diabetes status ~ cholesterol check
# Grouped bars: row count per diabetes status, split by CholCheck (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'CholCheck', data = diabetes_check)
plt.title("Diabetes Status ~ Cholesterol Check")
plt.show()

In [None]:
#visualize diabetes status ~ BMI
# Box plot of BMI for each diabetes status (title typo fixed)
plt.figure(figsize = (8,6))
sns.boxplot(data = df, x = 'Diabetes_binary', y = 'BMI', palette = 'Set1')
plt.title("Diabetes Status ~ BMI")
plt.show()

In [None]:
#group diabetes status & smoker status
# Row count for every (Diabetes_binary, Smoker) combination, as a flat table
diabetes_smoker = (
    df.groupby(['Diabetes_binary', 'Smoker'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_smoker)

In [None]:
#visualize diabetes status ~ smoker status
# Grouped bars: row count per diabetes status, split by Smoker (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'Smoker', data = diabetes_smoker, palette = 'Set2')
plt.title("Diabetes Status ~ Smoker Status")
plt.show()

In [None]:
#group diabetes status & stroke status
# Row count for every (Diabetes_binary, Stroke) combination, as a flat table
diabetes_stroke = (
    df.groupby(['Diabetes_binary', 'Stroke'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_stroke)

In [None]:
#visualize diabetes status ~ stroke status
# Grouped bars: row count per diabetes status, split by Stroke (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'Stroke', data = diabetes_stroke, palette = 'Set1')
plt.title("Diabetes Status ~ Stroke Status")
plt.show()

In [None]:
#group diabetes status & heart diseaseor attack
# Row count for every (Diabetes_binary, HeartDiseaseorAttack) combination, as a flat table
diabetes_heart = (
    df.groupby(['Diabetes_binary', 'HeartDiseaseorAttack'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_heart)

In [None]:
#visualize diabetes status ~ heart diseaseor attack
# Grouped bars: row count per diabetes status, split by HeartDiseaseorAttack (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'HeartDiseaseorAttack', data = diabetes_heart, palette = 'Set2')
plt.title("Diabetes Status ~ Heart Diseaseor Attack")
plt.show()

In [None]:
#group diabetes status & physical activity
# Row count for every (Diabetes_binary, PhysActivity) combination, as a flat table
diabetes_physical = (
    df.groupby(['Diabetes_binary', 'PhysActivity'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_physical)

In [None]:
#visualize diabetes status ~ physical activity
# Grouped bars: row count per diabetes status, split by PhysActivity (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'PhysActivity', data = diabetes_physical)
plt.title("Diabetes Status ~ Physical Activity")
plt.show()

In [None]:
#group diabetes status & fruits
# Row count for every (Diabetes_binary, Fruits) combination, as a flat table
diabetes_fruit = (
    df.groupby(['Diabetes_binary', 'Fruits'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_fruit)

In [None]:
#visualize diabetes status ~ fruits
# Grouped bars: row count per diabetes status, split by Fruits (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'Fruits', data = diabetes_fruit, palette = 'Set1')
plt.title("Diabetes Status ~ Fruits")
plt.show()

In [None]:
#group diabetes status & veggies
# Row count for every (Diabetes_binary, Veggies) combination, as a flat table
diabetes_veggies = (
    df.groupby(['Diabetes_binary', 'Veggies'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_veggies)

In [None]:
#visualize diabetes status ~ veggies
# Grouped bars: row count per diabetes status, split by Veggies (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'Veggies', data = diabetes_veggies, palette = 'Set2')
plt.title("Diabetes Status ~ Veggies")
plt.show()

In [None]:
#group diabetes status & HvyAlcoholConsump
# Row count for every (Diabetes_binary, HvyAlcoholConsump) combination, as a flat table
diabetes_alcohol = (
    df.groupby(['Diabetes_binary', 'HvyAlcoholConsump'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_alcohol)

In [None]:
#visualize diabetes status ~ HvyAlcoholConsump
# Grouped bars: row count per diabetes status, split by HvyAlcoholConsump (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'HvyAlcoholConsump', data = diabetes_alcohol)
plt.title("Diabetes Status ~ Alcohol Consumption")
plt.show()

In [None]:
#group diabetes status & AnyHealthcare
# Row count for every (Diabetes_binary, AnyHealthcare) combination, as a flat table
diabetes_healthcare = (
    df.groupby(['Diabetes_binary', 'AnyHealthcare'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_healthcare)

In [None]:
#visualize diabetes status ~ AnyHealthcare
# Grouped bars: row count per diabetes status, split by AnyHealthcare (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'AnyHealthcare', data = diabetes_healthcare, palette = 'Set1')
plt.title("Diabetes Status ~ Healthcare")
plt.show()

In [None]:
#group diabetes status & doctor cost
# Row count for every (Diabetes_binary, NoDocbcCost) combination, as a flat table
diabetes_NoDocbcCost = (
    df.groupby(['Diabetes_binary', 'NoDocbcCost'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_NoDocbcCost)

In [None]:
#visualize diabetes status ~ doctor cost
# Grouped bars: row count per diabetes status, split by NoDocbcCost (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'NoDocbcCost', data = diabetes_NoDocbcCost, palette = 'Set2')
plt.title("Diabetes Status ~ Doctor Cost")
plt.show()

In [None]:
#group diabetes status & general health
# Row count for every (Diabetes_binary, GenHlth) combination, as a flat table
diabetes_general = (
    df.groupby(['Diabetes_binary', 'GenHlth'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_general)

In [None]:
#visualize diabetes status ~ general health
# Grouped bars: row count per diabetes status, split by GenHlth (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'GenHlth', data = diabetes_general)
plt.title("Diabetes Status ~ General Health")
plt.show()

In [None]:
#visualize diabetes status ~ mental health
# Box plot of MentHlth for each diabetes status (title typo fixed)
plt.figure(figsize = (8,6))
sns.boxplot(data = df, x = 'Diabetes_binary', y = 'MentHlth', palette = 'Set1')
plt.title("Diabetes Status ~ Mental Health")
plt.show()

In [None]:
#visualize diabetes status ~ physical health
# Box plot of PhysHlth for each diabetes status (title typo fixed)
plt.figure(figsize = (8,6))
sns.boxplot(data = df, x = 'Diabetes_binary', y = 'PhysHlth', palette = 'Set2')
plt.title("Diabetes Status ~ Physical Health")
plt.show()

In [None]:
#group diabetes status & difficulty walking
# Row count for every (Diabetes_binary, DiffWalk) combination, as a flat table
diabetes_walk = (
    df.groupby(['Diabetes_binary', 'DiffWalk'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_walk)

In [None]:
#visualize diabetes status ~ difficulty walking
# Grouped bars: row count per diabetes status, split by DiffWalk (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'DiffWalk', data = diabetes_walk)
plt.title("Diabetes Status ~ Difficulty Walking")
plt.show()

In [None]:
#group diabetes status & gender
# Row count for every (Diabetes_binary, Sex) combination, as a flat table
diabetes_sex = (
    df.groupby(['Diabetes_binary', 'Sex'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_sex)

In [None]:
#visualize diabetes status ~ gender
# Grouped bars: row count per diabetes status, split by Sex (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'Sex', data = diabetes_sex, palette = 'Set1')
plt.title("Diabetes Status ~ Gender")
plt.show()

In [None]:
#visualize diabetes status ~ age
# Box plot of Age for each diabetes status (title typo fixed)
plt.figure(figsize = (8,6))
sns.boxplot(data = df, x = 'Diabetes_binary', y = 'Age', palette = 'Set2')
plt.title("Diabetes Status ~ Age")
plt.show()

In [None]:
#group diabetes status & education
# Row count for every (Diabetes_binary, Education) combination, as a flat table
diabetes_education = (
    df.groupby(['Diabetes_binary', 'Education'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_education)

In [None]:
#visualize diabetes status ~ education
# Grouped bars: row count per diabetes status, split by Education (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'Education', data = diabetes_education)
plt.title("Diabetes Status ~ Education")
plt.show()

In [None]:
#group diabetes status & income
# Row count for every (Diabetes_binary, Income) combination, as a flat table
diabetes_income = (
    df.groupby(['Diabetes_binary', 'Income'])
    .size()
    .rename('Count')
    .reset_index()
)
print(diabetes_income)

In [None]:
#visualize diabetes status ~ income
# Grouped bars: row count per diabetes status, split by Income (title typo fixed)
plt.figure(figsize = (8,6))
sns.barplot(x = 'Diabetes_binary', y = 'Count', hue = 'Income', data = diabetes_income, palette = 'Set1')
plt.title("Diabetes Status ~ Income")
plt.show()