In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [2]:
dados = pd.read_csv('sample_data/student-mat.csv', sep= ';')
dados.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [3]:
# remover as instâncias com valores faltantes do DataFrame
dados = dados.dropna()
print(dados)

    school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   18       U     GT3       A     4     4   at_home   teacher   
1       GP   F   17       U     GT3       T     1     1   at_home     other   
2       GP   F   15       U     LE3       T     1     1   at_home     other   
3       GP   F   15       U     GT3       T     4     2    health  services   
4       GP   F   16       U     GT3       T     3     3     other     other   
..     ...  ..  ...     ...     ...     ...   ...   ...       ...       ...   
390     MS   M   20       U     LE3       A     2     2  services  services   
391     MS   M   17       U     LE3       T     3     1  services  services   
392     MS   M   21       R     GT3       T     1     1     other     other   
393     MS   M   18       R     LE3       T     3     2  services     other   
394     MS   M   19       U     LE3       T     1     1     other   at_home   

     ... famrel freetime  goout  Dalc  Walc health 

In [4]:
#remover instâncias duplicadas
dados = dados.drop_duplicates()
print(dados)

    school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   18       U     GT3       A     4     4   at_home   teacher   
1       GP   F   17       U     GT3       T     1     1   at_home     other   
2       GP   F   15       U     LE3       T     1     1   at_home     other   
3       GP   F   15       U     GT3       T     4     2    health  services   
4       GP   F   16       U     GT3       T     3     3     other     other   
..     ...  ..  ...     ...     ...     ...   ...   ...       ...       ...   
390     MS   M   20       U     LE3       A     2     2  services  services   
391     MS   M   17       U     LE3       T     3     1  services  services   
392     MS   M   21       R     GT3       T     1     1     other     other   
393     MS   M   18       R     LE3       T     3     2  services     other   
394     MS   M   19       U     LE3       T     1     1     other   at_home   

     ... famrel freetime  goout  Dalc  Walc health 

In [5]:
X = dados[['sex', 'age', 'Pstatus', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']]

In [6]:
X.head(10)

Unnamed: 0,sex,age,Pstatus,Medu,Fedu,traveltime,studytime,failures,schoolsup,famsup,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,F,18,A,4,4,2,2,0,yes,no,...,4,3,4,1,1,3,6,5,6,6
1,F,17,T,1,1,1,2,0,no,yes,...,5,3,3,1,1,3,4,5,5,6
2,F,15,T,1,1,1,2,3,yes,no,...,4,3,2,2,3,3,10,7,8,10
3,F,15,T,4,2,1,3,0,no,yes,...,3,2,2,1,1,5,2,15,14,15
4,F,16,T,3,3,1,2,0,no,yes,...,4,3,2,1,2,5,4,6,10,10
5,M,16,T,4,3,1,2,0,no,yes,...,5,4,2,1,2,5,10,15,15,15
6,M,16,T,2,2,1,2,0,no,no,...,4,4,4,1,1,3,0,12,12,11
7,F,17,A,4,4,2,2,0,yes,yes,...,4,1,4,1,1,1,6,6,5,6
8,M,15,A,3,2,1,2,0,no,yes,...,4,2,2,1,1,1,0,16,18,19
9,M,15,T,3,4,1,2,0,no,yes,...,5,5,1,1,1,5,0,14,15,15


In [7]:
#Aplicando One_Hot Encoding para as variaveis categoricas
X = pd.get_dummies(X, columns=['sex','Pstatus','schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic'])
X.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,activities_no,activities_yes,nursery_no,nursery_yes,higher_no,higher_yes,internet_no,internet_yes,romantic_no,romantic_yes
0,18,4,4,2,2,0,4,3,4,1,...,1,0,0,1,0,1,1,0,1,0
1,17,1,1,1,2,0,5,3,3,1,...,1,0,1,0,0,1,0,1,1,0
2,15,1,1,1,2,3,4,3,2,2,...,1,0,0,1,0,1,0,1,1,0
3,15,4,2,1,3,0,3,2,2,1,...,0,1,0,1,0,1,0,1,0,1
4,16,3,3,1,2,0,4,3,2,1,...,1,0,0,1,0,1,1,0,1,0


In [8]:
# criando um novo dataframe com a média das duas colunas
df_media = pd.DataFrame({'media': (dados['G1'] + dados['G2'] + dados['G3'])/3})
df_media.head()

Unnamed: 0,media
0,5.666667
1,5.333333
2,8.333333
3,14.666667
4,8.666667


In [9]:
# criando um dataframe com uma coluna binária "Aprovado"
Y = pd.DataFrame({'Aprovado': [1 if media >= 12 else 0 for media in df_media['media']]})
Y.head()

Unnamed: 0,Aprovado
0,0
1,0
2,0
3,1
4,0


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)


In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# definindo quais colunas serão normalizadas
colunas_a_normalizar = ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']

#fazendo os ajustes dos dados
scaler.fit(X_train[colunas_a_normalizar])


In [12]:
# normalizando as colunas selecionadas dos dados de treinamento e substituindo os dados originais no DataFrame
X_train[colunas_a_normalizar] = scaler.transform(X_train[colunas_a_normalizar])

# imprimindo o DataFrame resultante
X_train.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,activities_no,activities_yes,nursery_no,nursery_yes,higher_no,higher_yes,internet_no,internet_yes,romantic_no,romantic_yes
17,0.142857,0.75,0.75,0.666667,0.333333,0.0,1.0,0.5,0.25,0.0,...,0,1,0,1,0,1,1,0,1,0
165,0.142857,0.75,0.5,0.333333,0.0,0.333333,0.75,1.0,0.25,0.0,...,0,1,1,0,1,0,1,0,1,0
24,0.0,0.5,1.0,0.0,0.666667,0.0,0.75,0.5,0.25,0.0,...,0,1,0,1,0,1,0,1,1,0
126,0.0,0.75,1.0,0.0,0.333333,0.0,1.0,0.5,0.25,0.0,...,0,1,0,1,0,1,0,1,0,1
346,0.428571,1.0,0.75,0.0,0.666667,0.0,1.0,0.5,0.25,0.0,...,1,0,0,1,0,1,0,1,0,1


In [13]:
# normalizando as colunas selecionadas dos dados de teste e substituindo os dados originais no DataFrame
X_test[colunas_a_normalizar] = scaler.transform(X_test[colunas_a_normalizar])

# imprimindo o DataFrame resultante
X_test.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,activities_no,activities_yes,nursery_no,nursery_yes,higher_no,higher_yes,internet_no,internet_yes,romantic_no,romantic_yes
78,0.285714,0.5,0.25,0.333333,0.0,1.0,0.75,1.0,0.0,0.0,...,0,1,0,1,1,0,0,1,1,0
371,0.428571,0.25,0.5,0.666667,0.0,0.0,0.75,0.5,0.5,0.25,...,0,1,0,1,1,0,0,1,0,1
248,0.428571,0.75,0.75,0.0,0.333333,0.333333,0.75,0.5,0.5,0.0,...,1,0,0,1,0,1,0,1,0,1
55,0.142857,0.5,0.25,0.0,0.333333,0.0,1.0,0.5,0.75,0.0,...,0,1,0,1,0,1,0,1,0,1
390,0.714286,0.5,0.5,0.0,0.333333,0.666667,1.0,1.0,0.75,0.75,...,1,0,0,1,0,1,1,0,1,0


In [14]:
from sklearn.linear_model import LogisticRegression
modelo = LogisticRegression()

#treinando o modelo
modelo.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)


In [15]:
#fazendo predições com os dados de teste
y_pred = modelo.predict(X_test)


In [16]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

#quantos aprovados e reprovados foram preditos corretamente
print("Acurácia: {:.2f}".format(accuracy))

#do nro total de preditos como aprovados, quantos são mesmo aprovados
print("Precisão: {:.2f}".format(precision))

#do nro total de aprovados na base de dados quantos foram preditos como aprovados
print("Recall: {:.2f}".format(recall))

#média harmônica entre precisão e recall
print("F1: {:.2f}".format(f1))

Acurácia: 0.92
Precisão: 0.91
Recall: 0.89
F1: 0.90
