In [None]:
import numpy as np  # noqa
import pandas as pd  # noqa
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Read the diabetes dataset from a relative path and preview its first rows.
csv_path = "../data/diabetes.csv"
df = pd.read_csv(csv_path)
df.head()  # head() returns the first five rows by default

In [None]:
# Per-column flag: True if that column contains at least one missing value.
df.isna().any()

In [None]:
# Dimensions of the dataset as (rows, columns).
df.shape

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Print a concise summary of the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
f, ax = plt.subplots(nrows=1, figsize=(8, 6))
sns.heatmap(data=corr_matrix, annot=True, ax=ax)

In [None]:
# Class balance: number of rows for each Outcome value.
sns.countplot(x=df["Outcome"])

In [None]:
# For every column except Outcome, print its mean within each Outcome group.
column_names = df.columns.drop("Outcome")
by_outcome = df.groupby(["Outcome"])  # grouping is the same for every column, so build it once
for name in column_names:
    print(f"{name}\n")
    print(by_outcome[name].mean())
    print("*" * 50, end="\n\n")  # separator line followed by one blank line

In [None]:
# Draw one histogram per column to inspect each distribution, then render the figure.
df.hist()
plt.show()

In [None]:
# Split the frame into the feature matrix X and the target vector y.
# Columns are selected by name rather than by position (previously 0:8 and 8),
# so the split cannot silently pick the wrong columns if the CSV's column
# order changes.
# NOTE(review): assumes "Outcome" is the only non-feature column — confirm.
X = df.drop(columns="Outcome")
y = df["Outcome"]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
splits = train_test_split(X, y, test_size=0.10, random_state=0)
X_train, X_test, y_train, y_test = splits

# LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split, allowing the
# solver up to 1000 iterations. fit() returns the estimator itself, so the
# bare `reg` below displays the same fitted model as the cell output.
reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
reg

In [None]:
# Predicted class labels for the held-out test rows.
y_pred = reg.predict(X_test)

In [None]:
from sklearn import metrics

# Confusion matrix of the test-set predictions. scikit-learn puts the actual
# classes on the rows and the predicted classes on the columns, which is what
# the axis labels below state.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)

# NOTE(review): assumes Outcome is coded 0 = no diabetes, 1 = diabetes — confirm.
class_labels = ["No Diabetes", "Diabetes"]

# Draw on an explicitly created figure so it can be released by handle below.
fig, ax = plt.subplots()
sns.heatmap(
    cnf_matrix,
    annot=True,
    xticklabels=class_labels,
    yticklabels=class_labels,
    cbar=False,
    cmap="Blues",
    fmt=".6g",
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")
plt.show()
# Close this specific figure instead of calling plt.clf(): once show() has
# closed the figure (as the notebook inline backend does), clf() would create
# a new, empty current figure that then appears as a stray "0 Axes" output.
plt.close(fig)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy : ", accuracy)

In [None]:
from pathlib import Path
import pickle

# Persist the fitted classifier under ../models, creating the directory if
# it does not exist yet.
models_dir = Path("../models")
models_dir.mkdir(exist_ok=True)

# The context manager guarantees the file is flushed and closed even if
# pickling fails; the previous bare open() call never closed its handle.
with open(models_dir / "logistic_reg_1000.sav", "wb") as model_file:
    pickle.dump(reg, model_file)

In [None]:
# Two hand-written example rows; values are listed in the same order as the
# column names below.
data = [[6, 148, 72, 35, 0, 33.6, 0.627, 50], [1, 85, 66, 29, 0, 26.6, 0.351, 31]]

feature_names = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]

# Use a dedicated name for the sample frame. Rebinding `df` here would replace
# the loaded dataset with this 2-row frame (which has no "Outcome" column), so
# re-running any earlier cell afterwards would fail or use the wrong data.
samples_df = pd.DataFrame(data, columns=feature_names)

# Predict a class label for each example row with the fitted model.
pred = reg.predict(samples_df)
print('data', data, '\npred', pred)