In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [None]:
import io

import pandas as pd

# Name of the dataset file this notebook expects.
DATASET_NAME = 'diabetes_prediction_dataset.csv'

try:
    from google.colab import files
except ImportError:
    # Not running on Colab: there is no upload widget, so read the CSV from
    # the working directory instead of failing the whole notebook here.
    df = pd.read_csv(DATASET_NAME)
else:
    # files.upload() opens a file picker and returns {filename: raw bytes}.
    uploaded = files.upload()
    if not uploaded:
        raise FileNotFoundError(f"No file was uploaded; expected '{DATASET_NAME}'.")
    # The uploaded file may not carry exactly the expected name (e.g. Colab
    # may rename a repeated upload), so fall back to the first uploaded file
    # instead of raising KeyError.
    upload_key = DATASET_NAME if DATASET_NAME in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[upload_key]))

# Show a preview only; printing the full frame floods the cell output.
df.head()

In [None]:
# (Re)load the dataset from the CSV file in the working directory.
dataset_path = 'diabetes_prediction_dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Class balance of the target column.
diabetes_counts = df['diabetes'].value_counts()
diabetes_counts

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Number of missing values per column in the raw data.
df.isna().sum()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Distinct smoking_history categories (used to build the encoding below).
smoking_levels = df['smoking_history'].unique()
smoking_levels

In [None]:
# Distinct gender categories (used to build the encoding below).
gender_levels = df['gender'].unique()
gender_levels

In [None]:
# Integer codes for the two categorical columns.
# NOTE(review): "ever" and "never" share code 1, so the two categories are
# merged by this encoding — confirm that this is intended.
SMOKING_HISTORY_CODES = {"never": 1, "No Info": 2, "current": 3, "former": 4, "ever": 1, "not current": 5}
GENDER_CODES = {"Female": 1, "Male": 0, "Other": 2}


def encode_categories(column, codes):
    """Replace the category labels in ``column`` with the integers in ``codes``.

    Parameters
    ----------
    column : pandas.Series
        Column holding raw category labels (or already-encoded integers).
    codes : dict
        Mapping from raw label to integer code.

    Returns
    -------
    pandas.Series
        The encoded column. If every non-null value is already one of the
        codes, the column is returned unchanged so that re-running the cell
        is a no-op rather than turning every value into NaN.

    Raises
    ------
    ValueError
        If a non-null value has no entry in ``codes``; ``Series.map`` would
        otherwise silently turn it into NaN.
    """
    if column.dropna().isin(list(codes.values())).all():
        return column
    encoded = column.map(codes)
    unmapped = column[encoded.isna() & column.notna()].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped categories in '{column.name}': {list(unmapped)}")
    return encoded


df['smoking_history'] = encode_categories(df['smoking_history'], SMOKING_HISTORY_CODES)
df['gender'] = encode_categories(df['gender'], GENDER_CODES)

In [None]:
# Re-check missing values: a category absent from the mapping would show up as NaN here.
df.isna().sum()

In [None]:
# Column dtypes after the categorical columns have been mapped to integer codes.
print(df.dtypes)

In [None]:
# Separate the label column from the feature columns.
target = df['diabetes']
data = df.drop(columns=['diabetes'])


In [None]:
# 70/30 split, stratified on the target so both parts keep the same class
# ratio; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    stratify=target,
    random_state=2,
)

In [None]:
# Shapes of the full feature frame, the training part and the test part.
print(*(frame.shape for frame in (data, X_train, X_test)))

In [None]:
import seaborn as sns

# Pairwise correlations between all columns of df, computed on the unscaled values.
before_scale = df.corr()
sns.heatmap(data=before_scale, cmap='YlGnBu')

In [None]:
# Preprocessing: rescale every feature to the 0-1 range.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# The scaler is fitted on the training data only and then applied to both
# parts, so the test data does not influence the fitted min/max values.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier with scikit-learn's default settings.
knn = KNeighborsClassifier()
knn.fit(X=X_train_scaled, y=Y_train)

In [None]:
# Accuracy of the KNN model on the training and the test split.
knn_train_accuracy = knn.score(X_train_scaled, Y_train)
knn_test_accuracy = knn.score(X_test_scaled, Y_test)
print(f"Training accuracy of the model is {knn_train_accuracy:.2f}")
print(f"Testing accuracy of the model is {knn_test_accuracy:.2f}")

In [None]:
# KNN class predictions for the scaled test split.
predictions = knn.predict(X=X_test_scaled)
print(predictions)

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted classes. The arguments used to be reversed, which
# transposed the matrix (false positives and false negatives swapped places).
mat = confusion_matrix(Y_test, predictions)
print(mat)

In [None]:
from seaborn import heatmap

# Annotated heatmap of the confusion matrix currently stored in `mat`.
class_names = ['Negative', 'Positive']
heatmap(
    mat,
    annot=True,
    cmap="Pastel1_r",
    xticklabels=class_names,
    yticklabels=class_names,
)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# the value printed as "Precision" was actually the recall and vice versa.
precision = precision_score(Y_test, predictions)
recall = recall_score(Y_test, predictions)

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')

# f1_score is the same harmonic mean 2*P*R/(P+R), but it also handles the
# case where precision and recall are both 0 (the manual formula divides by zero).
F1Score = f1_score(Y_test, predictions)
print(f'F1 Score: {F1Score:.4f}')

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier with scikit-learn's default settings.
gnb = GaussianNB()
gnb.fit(X=X_train_scaled, y=Y_train)

In [None]:
# Accuracy of the Naive Bayes model on the training and the test split.
gnb_train_accuracy = gnb.score(X_train_scaled, Y_train)
gnb_test_accuracy = gnb.score(X_test_scaled, Y_test)
print(f"Training accuracy of the model is {gnb_train_accuracy:.2f}")
print(f"Testing accuracy of the model is {gnb_test_accuracy:.2f}")

In [None]:
# Naive Bayes class predictions for the scaled test split.
predictions = gnb.predict(X=X_test_scaled)
print(predictions)

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted classes. The arguments used to be reversed, which
# transposed the matrix (false positives and false negatives swapped places).
mat = confusion_matrix(Y_test, predictions)
print(mat)

In [None]:
from seaborn import heatmap

# Annotated heatmap of the confusion matrix currently stored in `mat`.
class_names = ['Negative', 'Positive']
heatmap(
    mat,
    annot=True,
    cmap="Pastel1_r",
    xticklabels=class_names,
    yticklabels=class_names,
)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# the value printed as "Precision" was actually the recall and vice versa.
precision = precision_score(Y_test, predictions)
recall = recall_score(Y_test, predictions)

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')

# f1_score is the same harmonic mean 2*P*R/(P+R), but it also handles the
# case where precision and recall are both 0 (the manual formula divides by zero).
F1Score = f1_score(Y_test, predictions)
print(f'F1 Score: {F1Score:.4f}')

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression; max_iter is raised to 10000 iterations for the solver.
log = LogisticRegression(max_iter=10000)
log.fit(X=X_train_scaled, y=Y_train)

In [None]:
# Accuracy of the logistic regression model on the training and the test split.
log_train_accuracy = log.score(X_train_scaled, Y_train)
log_test_accuracy = log.score(X_test_scaled, Y_test)
print(f"The Training accuracy of the model is {log_train_accuracy:.2f}")
print(f"The Testing accuracy of the model is {log_test_accuracy:.2f}")

In [None]:
# Logistic regression class predictions for the scaled test split.
predictions = log.predict(X=X_test_scaled)

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted classes. The arguments used to be reversed, which
# transposed the matrix (false positives and false negatives swapped places).
mat = confusion_matrix(Y_test, predictions)
print(mat)

In [None]:
from seaborn import heatmap

# Annotated heatmap of the confusion matrix currently stored in `mat`.
class_names = ['Negative', 'Positive']
heatmap(
    mat,
    annot=True,
    cmap="Pastel1_r",
    xticklabels=class_names,
    yticklabels=class_names,
)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# the value printed as "Precision" was actually the recall and vice versa.
precision = precision_score(Y_test, predictions)
recall = recall_score(Y_test, predictions)

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')

# f1_score is the same harmonic mean 2*P*R/(P+R), but it also handles the
# case where precision and recall are both 0 (the manual formula divides by zero).
F1Score = f1_score(Y_test, predictions)
print(f'F1 Score: {F1Score:.4f}')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 50 trees. The seed is fixed (same value as the train/test
# split) so the fitted forest, and every metric derived from it, is the same
# on each run; without it the scores change from run to run.
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
rfc.fit(X_train_scaled, Y_train)

In [None]:
# Accuracy of the random forest model on the training and the test split.
rfc_train_accuracy = rfc.score(X_train_scaled, Y_train)
rfc_test_accuracy = rfc.score(X_test_scaled, Y_test)
print(f"The Training accuracy of the model is {rfc_train_accuracy:.2f}")
print(f"The Testing accuracy of the model is {rfc_test_accuracy:.2f}")

In [None]:
# Random forest class predictions for the scaled test split.
predictions = rfc.predict(X=X_test_scaled)

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted classes. The arguments used to be reversed, which
# transposed the matrix (false positives and false negatives swapped places).
mat = confusion_matrix(Y_test, predictions)
print(mat)

In [None]:
from seaborn import heatmap

# Annotated heatmap of the confusion matrix currently stored in `mat`.
class_names = ['Negative', 'Positive']
heatmap(
    mat,
    annot=True,
    cmap="Pastel1_r",
    xticklabels=class_names,
    yticklabels=class_names,
)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# the value printed as "Precision" was actually the recall and vice versa.
precision = precision_score(Y_test, predictions)
recall = recall_score(Y_test, predictions)

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')

# f1_score is the same harmonic mean 2*P*R/(P+R), but it also handles the
# case where precision and recall are both 0 (the manual formula divides by zero).
F1Score = f1_score(Y_test, predictions)
print(f'F1 Score: {F1Score:.4f}')

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel.
svc = SVC(kernel="linear")
svc.fit(X=X_train_scaled, y=Y_train)

In [None]:
# Accuracy of the support-vector model on the training and the test split.
svc_train_accuracy = svc.score(X_train_scaled, Y_train)
svc_test_accuracy = svc.score(X_test_scaled, Y_test)
print(f"The Training accuracy of the model is {svc_train_accuracy:.2f}")
print(f"The Testing accuracy of the model is {svc_test_accuracy:.2f}")

In [None]:
# Support-vector class predictions for the scaled test split.
predictions = svc.predict(X=X_test_scaled)

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted classes. The arguments used to be reversed, which
# transposed the matrix (false positives and false negatives swapped places).
mat = confusion_matrix(Y_test, predictions)
print(mat)

In [None]:
from seaborn import heatmap

# Annotated heatmap of the confusion matrix currently stored in `mat`.
class_names = ['Negative', 'Positive']
heatmap(
    mat,
    annot=True,
    cmap="Pastel1_r",
    xticklabels=class_names,
    yticklabels=class_names,
)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# the value printed as "Precision" was actually the recall and vice versa.
precision = precision_score(Y_test, predictions)
recall = recall_score(Y_test, predictions)

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')

# f1_score is the same harmonic mean 2*P*R/(P+R), but it also handles the
# case where precision and recall are both 0 (the manual formula divides by zero).
F1Score = f1_score(Y_test, predictions)
print(f'F1 Score: {F1Score:.4f}')

In [None]:
import seaborn as sns

# Test-set accuracy is read from the fitted models instead of being typed in
# by hand, so the chart cannot drift out of sync with the results above.
fitted_models = {
    "K-Nearest Neighbour": knn,
    "Naive Bayes": gnb,
    "Logistic Regression": log,
    "Random Forest": rfc,
    "Support Vector Classifier": svc,
}
ACCURACY = {
    "Models": list(fitted_models),
    "Accuracy": [model.score(X_test_scaled, Y_test) for model in fitted_models.values()],
}

df_accuracy = pd.DataFrame(ACCURACY)

# Bar chart comparing the test accuracy of the five classifiers.
plt.figure(figsize=(8, 6))
sns.barplot(data=df_accuracy, x="Models", y="Accuracy", palette="Pastel1")
plt.title('Accuracy Comparison')
plt.xticks(rotation=45, ha="right")
plt.show()
