## Importando as bibliotecas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score
# mean/std summarise the k-fold cross-validation scores of the Gaussian Naive Bayes models
from numpy import mean
from numpy import std

## Importando o dataset limpo

In [2]:
# Location of the cleaned dataset, relative to the notebook's working directory.
MAT_DIR = './clean_db.csv'
# Comma is read_csv's default delimiter, so it does not need to be spelled out.
df = pd.read_csv(MAT_DIR)

In [3]:
# Preview the first five rows to check which columns were loaded.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,famsup,studytime,famrel,absences,internet,G1,G2,G3,Mean
0,0,0,0,0,4,4,0,1,0,2,4,6,0,0,1,1,1
1,1,0,0,1,1,1,0,0,1,2,5,4,1,0,0,1,1
2,2,0,1,1,1,1,0,0,0,2,4,10,1,1,1,1,1
3,3,0,0,1,4,2,0,0,1,3,3,2,1,2,2,2,2
4,4,0,0,1,3,3,0,0,1,2,4,4,0,1,1,1,1


In [4]:
# Predictor columns used by the first round of models.
X = df[[
    'address', 'famsize', 'Pstatus',
    'Medu', 'Fedu',
    'Mjob', 'Fjob',
    'famsup', 'studytime', 'famrel', 'absences',
]]

In [5]:
# Target for this section: the G1 column.
Y1 = df['G1']
# Column-wise correlation of every predictor with G1 (column-wise is the default axis).
X.corrwith(Y1)

address     -0.070344
famsize      0.075199
Pstatus     -0.020198
Medu         0.170044
Fedu         0.165489
Mjob         0.047195
Fjob         0.163136
famsup      -0.098737
studytime    0.114695
famrel      -0.003887
absences    -0.034170
dtype: float64

## Escolhendo o melhor número de splits para o KFold

In [6]:
# Sweep the number of KFold splits (2..29); for each candidate store the
# (mean, std) of the cross-validated accuracy of a fresh GaussianNB on X -> Y1.
# NOTE(review): the fold count is picked by maximising this same CV accuracy,
# which presumably yields an optimistic estimate — confirm this is intended.
scr = []
for n_splits in range(2, 30):
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_acc = cross_val_score(GaussianNB(), X, Y1, scoring='accuracy', cv=folds)
    scr.append((fold_acc.mean(), fold_acc.std()))

# Tuples compare element by element, so the maximum is the entry with the
# highest mean accuracy (a tie on the mean is broken by the larger std).
best = max(scr)
print(best)
# scr[0] belongs to 2 splits, hence the +2 offset from list position to fold count.
fold_Y1 = scr.index(best) + 2
print(fold_Y1)

(0.26315182279649285, 0.055029995385325325)
2


## Criando um modelo com os parâmetros selecionados

In [7]:
# Cross-validate GaussianNB on X -> Y1 using the fold count selected above;
# the splitter settings (shuffle, seed 42) match the ones used in the sweep.
bayes_Y1 = GaussianNB()
kf_Y1 = KFold(n_splits=fold_Y1, shuffle=True, random_state=42)
score_y1 = cross_val_score(bayes_Y1, X, Y1, cv=kf_Y1, scoring='accuracy')
# Report mean accuracy with the standard deviation across folds in parentheses.
print('Accuracy: %.3f (%.3f)' % (score_y1.mean(), score_y1.std()))

Accuracy: 0.263 (0.055)


## Fazendo o mesmo para Y2

In [8]:
# Target for this section: the G2 column.
Y2 = df['G2']
# Column-wise correlation of every predictor with G2 (column-wise is the default axis).
X.corrwith(Y2)

address     -0.098215
famsize      0.083806
Pstatus     -0.072038
Medu         0.218669
Fedu         0.158422
Mjob         0.043885
Fjob         0.139701
famsup      -0.040300
studytime    0.119371
famrel      -0.039439
absences    -0.040821
dtype: float64

In [9]:
# Sweep the number of KFold splits (2..29); for each candidate store the
# (mean, std) of the cross-validated accuracy of a fresh GaussianNB on X -> Y2.
# NOTE(review): the fold count is picked by maximising this same CV accuracy,
# which presumably yields an optimistic estimate — confirm this is intended.
scr = []
for n_splits in range(2, 30):
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_acc = cross_val_score(GaussianNB(), X, Y2, scoring='accuracy', cv=folds)
    scr.append((fold_acc.mean(), fold_acc.std()))

# Tuples compare element by element, so the maximum is the entry with the
# highest mean accuracy (a tie on the mean is broken by the larger std).
best = max(scr)
print(best)
# scr[0] belongs to 2 splits, hence the +2 offset from list position to fold count.
fold_Y2 = scr.index(best) + 2
print(fold_Y2)

(0.4406850459482038, 0.11033579790127106)
21


In [10]:
# Cross-validate GaussianNB on X -> Y2 using the fold count selected above;
# the splitter settings (shuffle, seed 42) match the ones used in the sweep.
bayes_Y2 = GaussianNB()
kf_Y2 = KFold(n_splits=fold_Y2, shuffle=True, random_state=42)
score_y2 = cross_val_score(bayes_Y2, X, Y2, cv=kf_Y2, scoring='accuracy')
# Report mean accuracy with the standard deviation across folds in parentheses.
print('Accuracy: %.3f (%.3f)' % (score_y2.mean(), score_y2.std()))

Accuracy: 0.441 (0.110)


## E para Y3

In [11]:
# Target for this section: the G3 column.
Y3 = df['G3']
# Column-wise correlation of every predictor with G3 (column-wise is the default axis).
X.corrwith(Y3)

address     -0.104551
famsize      0.077029
Pstatus     -0.079507
Medu         0.206651
Fedu         0.157953
Mjob         0.044866
Fjob         0.130110
famsup      -0.021155
studytime    0.108751
famrel       0.031782
absences     0.005778
dtype: float64

In [40]:
# Sweep the number of KFold splits (2..29); for each candidate store the
# (mean, std) of the cross-validated accuracy of a fresh GaussianNB on X -> Y3.
# NOTE(review): the fold count is picked by maximising this same CV accuracy,
# which presumably yields an optimistic estimate — confirm this is intended.
scr = []
for n_splits in range(2, 30):
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_acc = cross_val_score(GaussianNB(), X, Y3, scoring='accuracy', cv=folds)
    scr.append((fold_acc.mean(), fold_acc.std()))

# Tuples compare element by element, so the maximum is the entry with the
# highest mean accuracy (a tie on the mean is broken by the larger std).
best = max(scr)
print(best)
# scr[0] belongs to 2 splits, hence the +2 offset from list position to fold count.
fold_Y3 = scr.index(best) + 2
print(fold_Y3)

(0.3977667493796526, 0.07012744308854979)
13


In [41]:
# Cross-validate GaussianNB on X -> Y3 using the fold count selected above;
# the splitter settings (shuffle, seed 42) match the ones used in the sweep.
bayes_Y3 = GaussianNB()
kf_Y3 = KFold(n_splits=fold_Y3, shuffle=True, random_state=42)
score_y3 = cross_val_score(bayes_Y3, X, Y3, cv=kf_Y3, scoring='accuracy')
# Report mean accuracy with the standard deviation across folds in parentheses.
print('Accuracy: %.3f (%.3f)' % (score_y3.mean(), score_y3.std()))

Accuracy: 0.398 (0.070)


## E, finalmente, Y médio

In [14]:
# Target for this section: the 'Mean' column
# (presumably the average of G1-G3 — verify against the cleaning step).
Y_mean = df['Mean']
# Column-wise correlation of every predictor with it (column-wise is the default axis).
X.corrwith(Y_mean)

address     -0.091153
famsize      0.082624
Pstatus     -0.071170
Medu         0.231293
Fedu         0.180191
Mjob         0.079075
Fjob         0.145241
famsup      -0.063743
studytime    0.124417
famrel      -0.010751
absences     0.000731
dtype: float64

In [15]:
# Sweep the number of KFold splits (2..29); for each candidate store the
# (mean, std) of the cross-validated accuracy of a fresh GaussianNB on X -> Y_mean.
# NOTE(review): the fold count is picked by maximising this same CV accuracy,
# which presumably yields an optimistic estimate — confirm this is intended.
scr = []
for n_splits in range(2, 30):
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_acc = cross_val_score(GaussianNB(), X, Y_mean, scoring='accuracy', cv=folds)
    scr.append((fold_acc.mean(), fold_acc.std()))

# Tuples compare element by element, so the maximum is the entry with the
# highest mean accuracy (a tie on the mean is broken by the larger std).
best = max(scr)
print(best)
# scr[0] belongs to 2 splits, hence the +2 offset from list position to fold count.
fold_Y_mean = scr.index(best) + 2
print(fold_Y_mean)

(0.4151412603189253, 0.01920217402450905)
2


In [16]:
# Cross-validate GaussianNB on X -> Y_mean using the fold count selected above;
# the splitter settings (shuffle, seed 42) match the ones used in the sweep.
bayes_Y_mean = GaussianNB()
kf_Y_mean = KFold(n_splits=fold_Y_mean, shuffle=True, random_state=42)
score_Y_mean = cross_val_score(bayes_Y_mean, X, Y_mean, cv=kf_Y_mean, scoring='accuracy')
# Report mean accuracy with the standard deviation across folds in parentheses.
print('Accuracy: %.3f (%.3f)' % (score_Y_mean.mean(), score_Y_mean.std()))

Accuracy: 0.415 (0.019)


## Removendo as colunas referentes à profissão dos pais.

In [17]:
# Second feature set: the same predictors as X minus the Mjob and Fjob columns.
X_2 = df[[
    'address', 'famsize', 'Pstatus',
    'Medu', 'Fedu',
    'famsup', 'studytime', 'famrel', 'absences',
]]

## Para Y1

In [18]:
# Column-wise correlation of the reduced feature set with Y1 (column-wise is the default axis).
X_2.corrwith(Y1)

address     -0.070344
famsize      0.075199
Pstatus     -0.020198
Medu         0.170044
Fedu         0.165489
famsup      -0.098737
studytime    0.114695
famrel      -0.003887
absences    -0.034170
dtype: float64

In [19]:
# Sweep the number of KFold splits (2..29); for each candidate store the
# (mean, std) of the cross-validated accuracy of a fresh GaussianNB on X_2 -> Y1.
# NOTE(review): the fold count is picked by maximising this same CV accuracy,
# which presumably yields an optimistic estimate — confirm this is intended.
scr = []
for n_splits in range(2, 30):
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_acc = cross_val_score(GaussianNB(), X_2, Y1, scoring='accuracy', cv=folds)
    scr.append((fold_acc.mean(), fold_acc.std()))

# Tuples compare element by element, so the maximum is the entry with the
# highest mean accuracy (a tie on the mean is broken by the larger std).
best = max(scr)
print(best)
# scr[0] belongs to 2 splits, hence the +2 offset from list position to fold count.
fold_Y1_2 = scr.index(best) + 2
print(fold_Y1_2)

(0.4934640522875818, 0.08959472997711634)
22


In [20]:
# Cross-validate GaussianNB on X_2 -> Y1 using the fold count selected above;
# the splitter settings (shuffle, seed 42) match the ones used in the sweep.
bayes_Y1_2 = GaussianNB()
kf_Y1_2 = KFold(n_splits=fold_Y1_2, shuffle=True, random_state=42)
score_y1_2 = cross_val_score(bayes_Y1_2, X_2, Y1, cv=kf_Y1_2, scoring='accuracy')
# Report mean accuracy with the standard deviation across folds in parentheses.
print('Accuracy: %.3f (%.3f)' % (score_y1_2.mean(), score_y1_2.std()))

Accuracy: 0.493 (0.090)


## Y2

In [21]:
# Column-wise correlation of the reduced feature set with Y2 (column-wise is the default axis).
X_2.corrwith(Y2)

address     -0.098215
famsize      0.083806
Pstatus     -0.072038
Medu         0.218669
Fedu         0.158422
famsup      -0.040300
studytime    0.119371
famrel      -0.039439
absences    -0.040821
dtype: float64

In [22]:
# Sweep the number of KFold splits (2..29); for each candidate store the
# (mean, std) of the cross-validated accuracy of a fresh GaussianNB on X_2 -> Y2.
# NOTE(review): the fold count is picked by maximising this same CV accuracy,
# which presumably yields an optimistic estimate — confirm this is intended.
scr = []
for n_splits in range(2, 30):
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_acc = cross_val_score(GaussianNB(), X_2, Y2, scoring='accuracy', cv=folds)
    scr.append((fold_acc.mean(), fold_acc.std()))

# Tuples compare element by element, so the maximum is the entry with the
# highest mean accuracy (a tie on the mean is broken by the larger std).
best = max(scr)
print(best)
# scr[0] belongs to 2 splits, hence the +2 offset from list position to fold count.
fold_Y2_2 = scr.index(best) + 2
print(fold_Y2_2)

(0.45880893300248143, 0.05957563226190421)
13


In [23]:
# Cross-validate GaussianNB on X_2 -> Y2 using the fold count selected above;
# the splitter settings (shuffle, seed 42) match the ones used in the sweep.
bayes_Y2_2 = GaussianNB()
kf_Y2_2 = KFold(n_splits=fold_Y2_2, shuffle=True, random_state=42)
score_y2_2 = cross_val_score(bayes_Y2_2, X_2, Y2, cv=kf_Y2_2, scoring='accuracy')
# Report mean accuracy with the standard deviation across folds in parentheses.
print('Accuracy: %.3f (%.3f)' % (score_y2_2.mean(), score_y2_2.std()))

Accuracy: 0.459 (0.060)


## Y3

In [24]:
# Column-wise correlation of the reduced feature set with Y3 (column-wise is the default axis).
X_2.corrwith(Y3)

address     -0.104551
famsize      0.077029
Pstatus     -0.079507
Medu         0.206651
Fedu         0.157953
famsup      -0.021155
studytime    0.108751
famrel       0.031782
absences     0.005778
dtype: float64

In [25]:
# Sweep the number of KFold splits (2..29); for each candidate store the
# (mean, std) of the cross-validated accuracy of a fresh GaussianNB on X_2 -> Y3.
# NOTE(review): the fold count is picked by maximising this same CV accuracy,
# which presumably yields an optimistic estimate — confirm this is intended.
scr = []
for n_splits in range(2, 30):
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_acc = cross_val_score(GaussianNB(), X_2, Y3, scoring='accuracy', cv=folds)
    scr.append((fold_acc.mean(), fold_acc.std()))

# Tuples compare element by element, so the maximum is the entry with the
# highest mean accuracy (a tie on the mean is broken by the larger std).
best = max(scr)
print(best)
# scr[0] belongs to 2 splits, hence the +2 offset from list position to fold count.
fold_Y3_2 = scr.index(best) + 2
print(fold_Y3_2)

(0.4126211090496805, 0.023405111454462935)
4


In [26]:
# Cross-validate GaussianNB on X_2 -> Y3 using the fold count selected above;
# the splitter settings (shuffle, seed 42) match the ones used in the sweep.
bayes_Y3_2 = GaussianNB()
kf_Y3_2 = KFold(n_splits=fold_Y3_2, shuffle=True, random_state=42)
score_y3_2 = cross_val_score(bayes_Y3_2, X_2, Y3, cv=kf_Y3_2, scoring='accuracy')
# Report mean accuracy with the standard deviation across folds in parentheses.
print('Accuracy: %.3f (%.3f)' % (score_y3_2.mean(), score_y3_2.std()))

Accuracy: 0.413 (0.023)


## Y médio

In [27]:
# Column-wise correlation of the reduced feature set with Y_mean (column-wise is the default axis).
X_2.corrwith(Y_mean)

address     -0.091153
famsize      0.082624
Pstatus     -0.071170
Medu         0.231293
Fedu         0.180191
famsup      -0.063743
studytime    0.124417
famrel      -0.010751
absences     0.000731
dtype: float64

In [28]:
# Sweep the number of KFold splits (2..29); for each candidate store the
# (mean, std) of the cross-validated accuracy of a fresh GaussianNB on X_2 -> Y_mean.
# NOTE(review): the fold count is picked by maximising this same CV accuracy,
# which presumably yields an optimistic estimate — confirm this is intended.
scr = []
for n_splits in range(2, 30):
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_acc = cross_val_score(GaussianNB(), X_2, Y_mean, scoring='accuracy', cv=folds)
    scr.append((fold_acc.mean(), fold_acc.std()))

# Tuples compare element by element, so the maximum is the entry with the
# highest mean accuracy (a tie on the mean is broken by the larger std).
best = max(scr)
print(best)
# scr[0] belongs to 2 splits, hence the +2 offset from list position to fold count.
fold_Y_mean_2 = scr.index(best) + 2
print(fold_Y_mean_2)

(0.41012921089063226, 0.0010383017997231125)
2


In [29]:
# Cross-validate GaussianNB on X_2 -> Y_mean using the fold count selected above;
# the splitter settings (shuffle, seed 42) match the ones used in the sweep.
bayes_Y_mean_2 = GaussianNB()
kf_Y_mean_2 = KFold(n_splits=fold_Y_mean_2, shuffle=True, random_state=42)
score_y_mean_2 = cross_val_score(bayes_Y_mean_2, X_2, Y_mean, cv=kf_Y_mean_2, scoring='accuracy')
# Report mean accuracy with the standard deviation across folds in parentheses.
print('Accuracy: %.3f (%.3f)' % (score_y_mean_2.mean(), score_y_mean_2.std()))

Accuracy: 0.410 (0.001)


## Removendo as colunas referentes à educação dos pais

In [30]:
# Third feature set: additionally leaves out the Medu and Fedu columns.
X_3 = df[[
    'address', 'famsize', 'Pstatus',
    'famsup', 'studytime', 'famrel', 'absences',
]]


## Y1

In [31]:
# Correlation of each X_3 predictor with Y1. It is printed explicitly because a
# bare expression is only rendered when it is the last statement of a cell;
# previously this result was computed and silently discarded.
print(X_3.corrwith(Y1, axis=0))

# Sweep the number of KFold splits (2..29); for each candidate store the
# (mean, std) of the cross-validated accuracy of a fresh GaussianNB on X_3 -> Y1.
# NOTE(review): the fold count is picked by maximising this same CV accuracy,
# which presumably yields an optimistic estimate — confirm this is intended.
scr = []
for i in range(2, 30):
    kf = KFold(n_splits=i, random_state=42, shuffle=True)
    model_tmp = GaussianNB()
    score_tmp = cross_val_score(model_tmp, X_3, Y1, scoring='accuracy', cv=kf)
    scr.append((mean(score_tmp), std(score_tmp)))

# Tuples compare element by element, so the maximum is the entry with the
# highest mean accuracy (a tie on the mean is broken by the larger std).
best = max(scr)
print(best)
# scr[0] belongs to 2 splits, hence the +2 offset from list position to fold count.
fold_Y1_3 = scr.index(best) + 2
print(fold_Y1_3)

# Cross-validate once more with the selected fold count and the same splitter
# settings, keeping the per-fold scores for the comparison section.
kf_Y1_3 = KFold(n_splits=fold_Y1_3, random_state=42, shuffle=True)
bayes_Y1_3 = GaussianNB()
score_y1_3 = cross_val_score(bayes_Y1_3, X_3, Y1, scoring='accuracy', cv=kf_Y1_3)
print('Accuracy: %.3f (%.3f)' % (mean(score_y1_3), std(score_y1_3)))

(0.46067821067821063, 0.060144777911594324)
11
Accuracy: 0.461 (0.060)


## Y2

In [32]:
# Correlation of each X_3 predictor with Y2. It is printed explicitly because a
# bare expression is only rendered when it is the last statement of a cell;
# previously this result was computed and silently discarded.
print(X_3.corrwith(Y2, axis=0))

# Sweep the number of KFold splits (2..29); for each candidate store the
# (mean, std) of the cross-validated accuracy of a fresh GaussianNB on X_3 -> Y2.
# NOTE(review): the fold count is picked by maximising this same CV accuracy,
# which presumably yields an optimistic estimate — confirm this is intended.
scr = []
for i in range(2, 30):
    kf = KFold(n_splits=i, random_state=42, shuffle=True)
    model_tmp = GaussianNB()
    score_tmp = cross_val_score(model_tmp, X_3, Y2, scoring='accuracy', cv=kf)
    scr.append((mean(score_tmp), std(score_tmp)))

# Tuples compare element by element, so the maximum is the entry with the
# highest mean accuracy (a tie on the mean is broken by the larger std).
best = max(scr)
print(best)
# scr[0] belongs to 2 splits, hence the +2 offset from list position to fold count.
fold_Y2_3 = scr.index(best) + 2
print(fold_Y2_3)

# Cross-validate once more with the selected fold count and the same splitter
# settings, keeping the per-fold scores for the comparison section.
kf_Y2_3 = KFold(n_splits=fold_Y2_3, random_state=42, shuffle=True)
bayes_Y2_3 = GaussianNB()
score_y2_3 = cross_val_score(bayes_Y2_3, X_3, Y2, scoring='accuracy', cv=kf_Y2_3)
print('Accuracy: %.3f (%.3f)' % (mean(score_y2_3), std(score_y2_3)))

(0.42830882352941174, 0.10979256878460833)
24
Accuracy: 0.428 (0.110)


## Y3

In [33]:
# Correlation of each X_3 predictor with Y3. It is printed explicitly because a
# bare expression is only rendered when it is the last statement of a cell;
# previously this result was computed and silently discarded.
print(X_3.corrwith(Y3, axis=0))

# Sweep the number of KFold splits (2..29); for each candidate store the
# (mean, std) of the cross-validated accuracy of a fresh GaussianNB on X_3 -> Y3.
# NOTE(review): the fold count is picked by maximising this same CV accuracy,
# which presumably yields an optimistic estimate — confirm this is intended.
scr = []
for i in range(2, 30):
    kf = KFold(n_splits=i, random_state=42, shuffle=True)
    model_tmp = GaussianNB()
    score_tmp = cross_val_score(model_tmp, X_3, Y3, scoring='accuracy', cv=kf)
    scr.append((mean(score_tmp), std(score_tmp)))

# Tuples compare element by element, so the maximum is the entry with the
# highest mean accuracy (a tie on the mean is broken by the larger std).
best = max(scr)
print(best)
# scr[0] belongs to 2 splits, hence the +2 offset from list position to fold count.
fold_Y3_3 = scr.index(best) + 2
print(fold_Y3_3)

# Cross-validate once more with the selected fold count and the same splitter
# settings, keeping the per-fold scores for the comparison section.
kf_Y3_3 = KFold(n_splits=fold_Y3_3, random_state=42, shuffle=True)
bayes_Y3_3 = GaussianNB()
score_y3_3 = cross_val_score(bayes_Y3_3, X_3, Y3, scoring='accuracy', cv=kf_Y3_3)
print('Accuracy: %.3f (%.3f)' % (mean(score_y3_3), std(score_y3_3)))

(0.41531685137823243, 0.12376319252510067)
23
Accuracy: 0.415 (0.124)


## Y médio

In [34]:
# Correlation of each X_3 predictor with Y_mean. It is printed explicitly because
# a bare expression is only rendered when it is the last statement of a cell;
# previously this result was computed and silently discarded.
print(X_3.corrwith(Y_mean, axis=0))

# Sweep the number of KFold splits (2..29); for each candidate store the
# (mean, std) of the cross-validated accuracy of a fresh GaussianNB on X_3 -> Y_mean.
# NOTE(review): the fold count is picked by maximising this same CV accuracy,
# which presumably yields an optimistic estimate — confirm this is intended.
scr = []
for i in range(2, 30):
    kf = KFold(n_splits=i, random_state=42, shuffle=True)
    model_tmp = GaussianNB()
    score_tmp = cross_val_score(model_tmp, X_3, Y_mean, scoring='accuracy', cv=kf)
    scr.append((mean(score_tmp), std(score_tmp)))

# Tuples compare element by element, so the maximum is the entry with the
# highest mean accuracy (a tie on the mean is broken by the larger std).
best = max(scr)
print(best)
# scr[0] belongs to 2 splits, hence the +2 offset from list position to fold count.
fold_Y_mean_3 = scr.index(best) + 2
print(fold_Y_mean_3)

# Cross-validate once more with the selected fold count and the same splitter
# settings, keeping the per-fold scores for the comparison section.
kf_Y_mean_3 = KFold(n_splits=fold_Y_mean_3, random_state=42, shuffle=True)
bayes_Y_mean_3 = GaussianNB()
score_y_mean_3 = cross_val_score(bayes_Y_mean_3, X_3, Y_mean, scoring='accuracy', cv=kf_Y_mean_3)
print('Accuracy: %.3f (%.3f)' % (mean(score_y_mean_3), std(score_y_mean_3)))

(0.3758169934640523, 0.12728145763495138)
23
Accuracy: 0.376 (0.127)


## Comparando as acurácias

## Y1

In [35]:
# Y1 accuracy (mean +/- std across folds) for the three feature sets:
# all predictors, without Mjob/Fjob, and additionally without Medu/Fedu.
for template, fold_scores in (
    ('Accuracy normal: %.3f +/- %.3f', score_y1),
    ('Accuracy s/ profissão: %.3f +/- %.3f', score_y1_2),
    ('Accuracy s/ educação: %.3f +/- %.3f', score_y1_3),
):
    print(template % (mean(fold_scores), std(fold_scores)))

Accuracy normal: 0.263 +/- 0.055
Accuracy s/ profissão: 0.493 +/- 0.090
Accuracy s/ educação: 0.461 +/- 0.060


## Y2

In [36]:
# Y2 accuracy (mean +/- std across folds) for the three feature sets:
# all predictors, without Mjob/Fjob, and additionally without Medu/Fedu.
for template, fold_scores in (
    ('Accuracy normal: %.3f +/- %.3f', score_y2),
    ('Accuracy s/ profissão: %.3f +/- %.3f', score_y2_2),
    ('Accuracy s/ educação: %.3f +/- %.3f', score_y2_3),
):
    print(template % (mean(fold_scores), std(fold_scores)))

Accuracy normal: 0.441 +/- 0.110
Accuracy s/ profissão: 0.459 +/- 0.060
Accuracy s/ educação: 0.428 +/- 0.110


## Y3

In [37]:
# Y3 accuracy (mean +/- std across folds) for the three feature sets:
# all predictors, without Mjob/Fjob, and additionally without Medu/Fedu.
for template, fold_scores in (
    ('Accuracy normal: %.3f +/- %.3f', score_y3),
    ('Accuracy s/ profissão: %.3f +/- %.3f', score_y3_2),
    ('Accuracy s/ educação: %.3f +/- %.3f', score_y3_3),
):
    print(template % (mean(fold_scores), std(fold_scores)))

Accuracy normal: 0.355 +/- 0.093
Accuracy s/ profissão: 0.413 +/- 0.023
Accuracy s/ educação: 0.415 +/- 0.124


## Y médio

In [38]:
# Y_mean accuracy (mean +/- std across folds) for the three feature sets:
# all predictors, without Mjob/Fjob, and additionally without Medu/Fedu.
for template, fold_scores in (
    ('Accuracy média normal: %.3f +/- %.3f', score_Y_mean),
    ('Accuracy média s/ profissão: %.3f +/- %.3f', score_y_mean_2),
    ('Accuracy média s/ educação: %.3f +/- %.3f', score_y_mean_3),
):
    print(template % (mean(fold_scores), std(fold_scores)))

Accuracy média normal: 0.415 +/- 0.019
Accuracy média s/ profissão: 0.410 +/- 0.001
Accuracy média s/ educação: 0.376 +/- 0.127
