# Qualidade de Vinhos

## Introdução

Vamos neste notebook implementar o modelo de floresta aleatória com os dados a respeito da qualidade do vinho. As colunas se referem a características como acidez, quantidade de açúcar, dentre outros, e a classe é "quality", que é uma pontuação de 0 a 10.

## Dados Iniciais

In [1]:
from pandas import read_csv

df = read_csv('WineQT.csv')

df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,2
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,3
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1592
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6,1593
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,1594
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1595


## Tratamento dos dados

Primeiramente, verifiquemos se há linhas repetidas e células vazias.

In [2]:
df[ df.duplicated() ]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id


In [3]:
df[ df['fixed acidity'].isnull() ]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id


In [4]:
df[ df['volatile acidity'].isnull() ]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id


In [5]:
df[ df['citric acid'].isnull() ]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id


In [6]:
df[ df['residual sugar'].isnull() ]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id


In [7]:
df[ df['chlorides'].isnull() ]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id


In [8]:
df[ df['free sulfur dioxide'].isnull() ]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id


In [9]:
df[ df['total sulfur dioxide'].isnull() ]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id


In [10]:
df[ df['density'].isnull() ]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id


In [11]:
df[ df['pH'].isnull() ]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id


In [12]:
df[ df['sulphates'].isnull() ]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id


In [13]:
df[ df['alcohol'].isnull() ]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id


In [14]:
df[ df['quality'].isnull() ]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id


In [15]:
df[ df['Id'].isnull() ]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id


## Divisão de atributos e classes

In [18]:
atributos = df[ ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'] ]

atributos

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2


In [20]:
classe = df[['quality']]

classe

Unnamed: 0,quality
0,5
1,5
2,5
3,6
4,5
...,...
1138,6
1139,6
1140,5
1141,6


## Particionamento de treino e teste

In [25]:
from sklearn.model_selection import train_test_split

x = atributos.values
y = classe.iloc[:,0].values

xTreino, xTeste, yTreino, yTeste = train_test_split(x,y,
                                                    test_size = 0.3,
                                                    random_state = 0)

xTreino

array([[7.30e+00, 4.80e-01, 3.20e-01, ..., 3.30e+00, 6.50e-01, 1.00e+01],
       [6.80e+00, 5.60e-01, 3.00e-02, ..., 3.44e+00, 6.30e-01, 1.00e+01],
       [5.30e+00, 5.70e-01, 1.00e-02, ..., 3.57e+00, 8.40e-01, 1.25e+01],
       ...,
       [9.80e+00, 3.90e-01, 4.30e-01, ..., 3.19e+00, 4.60e-01, 1.14e+01],
       [7.10e+00, 5.20e-01, 3.00e-02, ..., 3.50e+00, 6.00e-01, 9.80e+00],
       [8.80e+00, 3.30e-01, 4.10e-01, ..., 3.30e+00, 6.20e-01, 1.21e+01]])

In [26]:
yTreino

array([7, 6, 7, 5, 7, 5, 5, 6, 7, 5, 5, 7, 6, 8, 5, 7, 6, 5, 5, 5, 6, 5,
       6, 5, 5, 6, 6, 5, 6, 5, 5, 7, 6, 5, 8, 5, 6, 6, 5, 7, 5, 5, 6, 5,
       6, 5, 6, 7, 6, 6, 5, 6, 7, 6, 5, 5, 5, 5, 5, 6, 6, 6, 5, 5, 5, 6,
       5, 5, 6, 6, 5, 6, 6, 5, 7, 6, 5, 5, 4, 7, 5, 5, 7, 6, 6, 5, 5, 6,
       4, 5, 8, 5, 5, 6, 6, 5, 6, 4, 7, 8, 7, 6, 4, 7, 6, 6, 7, 6, 6, 6,
       5, 7, 8, 6, 6, 6, 6, 5, 6, 7, 5, 6, 5, 6, 5, 7, 6, 6, 6, 5, 6, 6,
       5, 5, 6, 6, 5, 6, 5, 5, 6, 6, 5, 7, 5, 5, 5, 5, 5, 5, 5, 6, 5, 7,
       6, 5, 5, 5, 7, 5, 5, 8, 6, 5, 6, 5, 7, 5, 6, 6, 5, 5, 6, 7, 6, 5,
       6, 6, 5, 5, 5, 6, 6, 5, 6, 7, 6, 6, 5, 5, 5, 6, 6, 5, 5, 5, 5, 5,
       4, 5, 6, 6, 5, 6, 5, 7, 5, 7, 6, 5, 5, 5, 7, 5, 5, 5, 7, 5, 5, 6,
       6, 6, 5, 5, 5, 6, 5, 5, 6, 6, 5, 6, 6, 7, 6, 5, 6, 7, 6, 6, 7, 5,
       8, 7, 5, 6, 6, 5, 4, 6, 5, 5, 6, 5, 5, 7, 6, 6, 6, 5, 6, 3, 5, 6,
       5, 5, 6, 6, 5, 5, 7, 5, 5, 5, 6, 5, 5, 6, 6, 5, 6, 6, 5, 4, 7, 5,
       7, 6, 5, 6, 6, 5, 4, 6, 7, 5, 7, 7, 5, 6, 5,

## Implementação da Floresta Aleatória

Vamos usar 100 árvores para compor a floresta.

In [27]:
from sklearn.ensemble import RandomForestClassifier

floresta = RandomForestClassifier(n_estimators = 100)

floresta.fit(xTreino,yTreino)

previsao = floresta.predict(xTeste)


## Precisão do modelo

In [28]:
from sklearn.metrics import accuracy_score

taxa = accuracy_score(previsao,yTeste)

taxa

0.6588921282798834