In [1]:
from sklearn.naive_bayes import GaussianNB
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np

In [2]:
def process_admit(x, threshold=0.5):
    """Binarise an admission score into a 0/1 class label.

    Parameters
    ----------
    x : float
        Score to binarise.
    threshold : float, default 0.5
        Scores greater than or equal to this value map to class 1.

    Returns
    -------
    int or None
        1 if ``x >= threshold``, 0 if ``x < threshold``.
        None when ``x`` is NaN, so missing values stay missing instead of
        being silently assigned to a class.
    """
    if x != x:  # NaN is the only float that compares unequal to itself
        return None
    return 1 if x >= threshold else 0

In [3]:
# Both inputs are parsed as CSV text even though the files carry an .xls extension.
ad_train, ad_test = (
    pd.read_csv(csv_path)
    for csv_path in ('Admission_train.xls', 'Admission_test.xls')
)

In [4]:
# Stack the train and test files row-wise into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the
# rows coming from ad_test would repeat index labels already used by ad_train.
df = pd.concat([ad_train, ad_test], axis=0, ignore_index=True)
len(ad_train), len(ad_test), len(df)

(400, 100, 500)

In [5]:
# Discard the 'Unnamed: 0' and 'Serial No.' columns in a single call.
df = df.drop(columns=['Unnamed: 0', 'Serial No.'])
df.head(2)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,318,109,3,3.5,4.0,9.22,1,0.68
1,336,118,5,4.5,4.0,9.19,1,0.92


In [6]:
# Replace the continuous admission chance with the label returned by process_admit.
df['Chance of Admit '] = df['Chance of Admit '].map(process_admit)

In [7]:
# Preview the first two rows to confirm 'Chance of Admit ' now holds class labels.
df.head(2)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,318,109,3,3.5,4.0,9.22,1,1
1,336,118,5,4.5,4.0,9.19,1,1


In [8]:
#le = LabelEncoder()

In [9]:
#for i in range(len(df.columns)):
#    df.iloc[:, i] = le.fit_transform(df.iloc[:, i])

In [10]:
# Feature matrix: every column except the target.
dfx = df.drop(columns=['Chance of Admit '])
dfx.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,318,109,3,3.5,4.0,9.22,1
1,336,118,5,4.5,4.0,9.19,1
2,324,110,3,3.5,3.0,9.22,1
3,334,120,5,4.0,5.0,9.87,1
4,312,103,3,3.5,4.0,8.78,0


In [11]:
# Target variable, kept as a one-column DataFrame.
dfy = df.loc[:, ['Chance of Admit ']]
dfy.head()

Unnamed: 0,Chance of Admit
0,1
1,1
2,1
3,1
4,1


In [12]:
# Split the data into train and test parts (30% held out for testing).
# random_state is pinned to 180 so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dfx,
    dfy,
    test_size=0.3,
    random_state=180,
)
X_train.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
67,331,117,4,4.5,5.0,9.42,1
82,313,98,3,2.5,4.5,8.3,1
109,321,109,3,3.0,4.0,8.2,1
101,316,101,2,2.5,2.0,8.32,1
298,324,111,3,2.5,2.0,8.8,1


In [13]:
# Gaussian Naive Bayes classifier constructed with its default arguments.
gnb = GaussianNB()

In [14]:
# Train the model; the one-column target frame is flattened to a 1-D array first.
gnb.fit(X_train, y_train.to_numpy().ravel())

GaussianNB()

In [15]:
# Predict class labels for the test split.
y_pred = gnb.predict(X_test)
y_pred

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1])

In [16]:
# See where the predictions were right and wrong
# (rows = true class, columns = predicted class).
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

array([[  6,   0],
       [ 27, 117]])

In [17]:
# "Intuitive" class weights: each class is weighted by the share of the
# other class, so w = [share of label 1, share of label 0].
share = y_train['Chance of Admit '].value_counts()
w0 = share.loc[1] / share.loc[[0, 1]].sum()
w = np.array([w0, 1 - w0])
w

array([0.91142857, 0.08857143])

In [18]:
# "Balanced" class weights: n_samples / (n_classes * samples_per_class).
# np.bincount counts the occurrences of labels 0 and 1 in
# y_train['Chance of Admit ']; 2 is the number of classes.
w_b = y_train.shape[0] / (2 * np.bincount(y_train['Chance of Admit ']))
w_b

array([5.64516129, 0.54858934])

In [19]:
# Check that both weighting schemes give the same class-0 / class-1 weight ratio.
# The larger class (label 1, about 91% of y_train) gets the smaller weight.
print('отношение интуитивных весов: ', w[0]/w[1])
print('отношение balanced весов: ', w_b[0]/w_b[1])

отношение интуитивных весов:  10.290322580645167
отношение balanced весов:  10.290322580645162


In [20]:
# Unpack the 2x2 confusion matrix (rows = true class, columns = predicted
# class); ravel() flattens it row by row into TN, FP, FN, TP.
TN, FP, FN, TP = cnf_matrix.ravel()

Ac = gnb.score(X_test, y_test)  # accuracy on the test split
Sens = TP / (TP + FN)           # sensitivity (recall, true positive rate)
Sp = TN / (TN + FP)             # specificity (true negative rate)
P = TP / (TP + FP)              # precision
typeI = FP / (FP + TN)          # type I error rate  = 1 - specificity
typeII = FN / (FN + TP)         # type II error rate = 1 - sensitivity

print('Accuracy: ', Ac)
print('Sensitivity: ', Sens)
print('Specificity: ', Sp)
print('Precision: ', P)
print('Type I error rate: ', typeI)
print('Type II error rate: ', typeII)

Accuracy:  0.82
Sensitivity:  0.8125
Specificity:  1.0
Pricision:  1.0
Type I error rate:  0.0
Type II error rate:  0.1875
