In [1]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB

In [2]:
# Read the iris measurements from disk and preview the first two rows.
iris_path = "iris.csv"
df = pd.read_csv(iris_path)
df.head(n=2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [3]:
# Binary target: 1 when the row's species is setosa, otherwise 0.
df["is_setosa"] = df["Species"].eq("setosa").astype(int)
df.head(n=2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species,is_setosa
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1


In [4]:
# Number of rows in each class of the binary target.
df["is_setosa"].value_counts()

0    100
1     50
Name: is_setosa, dtype: int64

In [5]:
# Same tally as a share of all rows instead of a raw count.
df["is_setosa"].value_counts(normalize=True)

0    0.666667
1    0.333333
Name: is_setosa, dtype: float64

In [6]:
# Fit a Gaussian naive Bayes classifier on the four measurement columns.
# The features are selected by name rather than by position so the feature
# set cannot silently change if the column order of the frame changes.
iris_features = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]
model = GaussianNB().fit(X=df[iris_features], y=df["is_setosa"])
model

In [7]:
# Prior probability of each class as estimated by the fitted model.
model.class_prior_

array([0.66666667, 0.33333333])

In [8]:
# Per-class feature means learned by the model
# (one row per class, one column per feature).
model.theta_

array([[6.262, 2.872, 4.906, 1.676],
       [5.006, 3.428, 1.462, 0.246]])

In [9]:
# Score the rows with the fitted model. The features are selected by name
# rather than by position so the columns handed to the model do not depend
# on the column order of the frame.
iris_features = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]
pred = model.predict_proba(df[iris_features])
pred = pred[:, 1]  # keep only the probability of class 1 (is_setosa == 1)
pred[:4]

array([1., 1., 1., 1.])

In [10]:
from sklearn.metrics import accuracy_score

In [11]:
# Label a row as class 1 only when its predicted probability exceeds 0.99.
pred_class = (pred > 0.99).astype(int)

In [12]:
# Share of rows whose thresholded prediction equals the true label.
accuracy_score(df["is_setosa"], pred_class)

1.0

## 1번

In [13]:
# Load the diabetes data and preview the first two rows.
diabetes_path = "diabetes.csv"
df = pd.read_csv(diabetes_path)
df.head(n=2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [14]:
# Keep only the rows whose BMI is strictly positive.
has_bmi = df["BMI"] > 0
df_sub = df[has_bmi]

In [15]:
# Relative frequency of each Outcome value within the filtered rows.
df_sub["Outcome"].value_counts(normalize=True)

0    0.648613
1    0.351387
Name: Outcome, dtype: float64

## 2번

In [None]:
# Reload the diabetes data so this section starts from the unfiltered file.
diabetes_path = "diabetes.csv"
df = pd.read_csv(diabetes_path)
df.head(n=2)

In [None]:
# Train a Gaussian naive Bayes model on three predictors and score the
# same rows it was trained on.
nb_features = ["Glucose", "BloodPressure", "Age"]
X_all = df[nb_features]
model = GaussianNB().fit(X=X_all, y=df["Outcome"])
pred = model.predict_proba(X_all)

In [None]:
# Turn the class-1 probabilities into 0/1 labels using a 0.5 cut-off.
pred_class = (pred[:, 1] > 0.5).astype(int)
pred_class[:4]

In [None]:
# Fraction of rows where the predicted label matches Outcome.
accuracy_score(df["Outcome"], pred_class)

## 3번

In [8]:
# Reload the diabetes data so this section starts from the unfiltered file.
diabetes_path = "diabetes.csv"
df = pd.read_csv(diabetes_path)
df.head(n=2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [9]:
# Keep rows with a positive BMI and derive two extra predictors.
# The columns are added with .assign on the filtered frame, which builds a
# new DataFrame; assigning columns directly onto the result of a boolean
# filter is chained assignment on a slice and raises SettingWithCopyWarning.
df = (
    df.loc[df["BMI"] > 0]
    .assign(
        # Age rounded down to the start of its decade (e.g. 31 -> 30).
        Age_g=lambda d: (d["Age"] // 10) * 10,
        # 1 when the row reports at least one pregnancy, otherwise 0.
        is_preg=lambda d: (d["Pregnancies"] > 0).astype(int),
    )
)
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Age_g,is_preg
0,6,148,72,35,0,33.6,0.627,50,1,50,1
1,1,85,66,29,0,26.6,0.351,31,0,30,1


In [10]:
from sklearn.model_selection import train_test_split
# Put 80% of the rows in the training set; the fixed seed keeps the
# split repeatable between runs.
split_parts = train_test_split(df, train_size=0.8, random_state=123)
df_train, df_test = split_parts
df_train.head(n=2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Age_g,is_preg
247,0,165,90,33,680,52.3,0.427,23,0,20,0
659,3,80,82,31,70,34.2,1.292,27,1,20,1


In [11]:
# Fit Gaussian naive Bayes on the training rows, then get class
# probabilities for the held-out rows.
nb_features = ["is_preg", "Age_g", "BMI", "Glucose"]
model = GaussianNB().fit(X=df_train[nb_features], y=df_train["Outcome"])
pred = model.predict_proba(df_test[nb_features])
pred[:4]

array([[0.09436402, 0.90563598],
       [0.74783283, 0.25216717],
       [0.11042961, 0.88957039],
       [0.57991266, 0.42008734]])

In [12]:
# Test-set accuracy of the naive Bayes model at a 0.5 probability cut-off.
nb_test_labels = (pred[:, 1] > 0.5).astype(int)
accuracy_score(df_test["Outcome"], nb_test_labels)

0.8026315789473685

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
# Fit a logistic regression on the same predictors used for naive Bayes.
# The fit call stays as the last expression so the cell displays the estimator.
lr_features = ["is_preg", "Age_g", "BMI", "Glucose"]
model_lr = LogisticRegression()
model_lr.fit(X=df_train[lr_features], y=df_train["Outcome"])

LogisticRegression()

In [15]:
# Class-1 probabilities for the held-out rows, then 0/1 labels at a 0.5 cut-off.
lr_features = ["is_preg", "Age_g", "BMI", "Glucose"]
pred_lr = model_lr.predict_proba(df_test[lr_features])[:, 1]
pred_lr_class = (pred_lr > 0.5).astype(int)

In [16]:
# Test-set accuracy of the logistic regression labels.
accuracy_score(df_test["Outcome"], pred_lr_class)

0.8289473684210527