## Estudo: Controle de Qualidade de Peças de uma Indústria

# Objetivo

O objetivo deste estudo é criar e avaliar diferentes modelos de classificação para prever a qualidade de peças de uma indústria.

`pieces` é um data set para o controle de qualidade de peças produzidas por uma indústria. São empregadas quatro medidas (A, B, C e D) para o controle da qualidade das peças. As peças são então *Accept, Refurbish* ou *Reject* segundo o controle de qualidade (atributo `Quality`). A indústria ainda conta com 3 unidades para a produção dessas peças (atributo `Unit`). 

# Imports

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Importação dos dados
piecesdf = pd.read_csv('https://raw.githubusercontent.com/digonfernan/classificacao-qualidade-pecas-industria/main/pieces1.csv')

In [3]:
# Visualização inicial do dataset
display(piecesdf.shape)
display(piecesdf.head())
display(piecesdf.tail())

(500, 7)

Unnamed: 0,id,A,B,C,D,Quality,Unit
0,559,4.9,3.1,1.5,0.1,Reject,SP
1,629,4.8,3.4,1.6,0.2,Reject,SP
2,192,6.7,2.5,5.8,1.8,Refurbish,RJ
3,359,7.6,3.0,6.6,2.1,Refurbish,RJ
4,9,4.9,3.1,1.5,0.1,Reject,RJ


Unnamed: 0,id,A,B,C,D,Quality,Unit
495,155,4.6,3.6,1.0,0.2,Reject,SP
496,120,6.9,3.2,5.7,2.3,Refurbish,RJ
497,321,6.7,3.1,5.6,2.4,Refurbish,RJ
498,448,5.6,3.0,4.1,1.3,Accept,SP
499,290,4.4,2.9,1.4,0.2,Reject,BH


In [4]:
# Verificação da ausência de valores
piecesdf.isnull().values.any()

True

In [5]:
# Verificação da ausência de valores
piecesdf.isnull().sum()

id         0
A          0
B          9
C          6
D          4
Quality    0
Unit       0
dtype: int64

In [6]:
# Verifica-se a ausência de valores, portanto é necessário verificar se tem a necessidade de tratamento
piecesdf[piecesdf.isna().any(axis=1)]

Unnamed: 0,id,A,B,C,D,Quality,Unit
51,57,4.9,,3.3,1.0,Accept,RJ
68,143,6.8,3.2,,2.3,Refurbish,BH
95,574,4.9,,3.3,1.0,Accept,RJ
99,209,6.8,3.2,,2.3,Refurbish,BH
106,43,5.0,3.5,1.6,,Reject,RJ
109,80,5.5,,3.8,1.1,Accept,RJ
191,579,4.9,,3.3,1.0,Accept,RJ
196,394,5.5,,3.8,1.1,Accept,RJ
213,80,5.5,,3.8,1.1,Accept,RJ
218,209,6.8,3.2,,2.3,Refurbish,BH


In [7]:
# Os dados faltantes são nas medidas do controle de qualidade das peças. Esses dados serão substituidos pelo valor médio das demais.
piecesdf[['A', 'B', 'C', 'D']] = piecesdf[['A', 'B', 'C', 'D']].fillna(piecesdf[['A', 'B', 'C', 'D']].mean())

In [8]:
# Verificação da ausência de valores
piecesdf.isnull().values.any()

False

In [9]:
# A coluna ID não irá agregar muito valor para o modelo, portanto será removida
piecesdf = piecesdf.drop(columns = 'id')

In [10]:
# Visualização inicial do dataset
display(piecesdf.head())

Unnamed: 0,A,B,C,D,Quality,Unit
0,4.9,3.1,1.5,0.1,Reject,SP
1,4.8,3.4,1.6,0.2,Reject,SP
2,6.7,2.5,5.8,1.8,Refurbish,RJ
3,7.6,3.0,6.6,2.1,Refurbish,RJ
4,4.9,3.1,1.5,0.1,Reject,RJ


# Construção do modelo

In [11]:
# Imports
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [12]:
# Hot encode
hot_encode = OneHotEncoder(handle_unknown='ignore') 
hot_encode = hot_encode.fit(piecesdf[['Unit']])

In [13]:
# Verifica as colunas com hot encoded
display(hot_encode.categories_)

[array(['BH', 'RJ', 'SP'], dtype=object)]

In [14]:
# Transformar as colunas hot encoded e armazenar em nova variável
hot_encoded_data = hot_encode.transform(piecesdf[['Unit']]).toarray()

In [15]:
# Novo dataframe com as colunas hot encoded
ohepiecesdf = pd.DataFrame(hot_encoded_data, columns=hot_encode.categories_[0])

In [16]:
# Concatena os dataframes
ohepiecesdf = pd.concat([piecesdf, ohepiecesdf], axis=1)

In [17]:
# Visualização inicial do dataset
display(ohepiecesdf.head())

Unnamed: 0,A,B,C,D,Quality,Unit,BH,RJ,SP
0,4.9,3.1,1.5,0.1,Reject,SP,0.0,0.0,1.0
1,4.8,3.4,1.6,0.2,Reject,SP,0.0,0.0,1.0
2,6.7,2.5,5.8,1.8,Refurbish,RJ,0.0,1.0,0.0
3,7.6,3.0,6.6,2.1,Refurbish,RJ,0.0,1.0,0.0
4,4.9,3.1,1.5,0.1,Reject,RJ,0.0,1.0,0.0


# Normalização dos dados com StandardScaler

In [18]:
# Import
from sklearn.preprocessing import StandardScaler

In [19]:
# Normalização
# Cria o Objeto
scaler = StandardScaler()

# Remoção das colunas
scaler.fit(ohepiecesdf.drop(columns=['Quality', 'Unit']))

# Transformação dos dados
X = scaler.transform(ohepiecesdf.drop(columns=['Quality', 'Unit']))

# Visualização
X[0:5]

array([[-1.13864481,  0.05397611, -1.31636544, -1.47692919, -0.65777257,
        -0.57735027,  1.1100193 ],
       [-1.26113242,  0.7514042 , -1.25881814, -1.34683562, -0.65777257,
        -0.57735027,  1.1100193 ],
       [ 1.06613215, -1.34088009,  1.1581686 ,  0.73466146, -0.65777257,
         1.73205081, -0.90088523],
       [ 2.16852062, -0.17849993,  1.61854703,  1.12494216, -0.65777257,
         1.73205081, -0.90088523],
       [-1.13864481,  0.05397611, -1.31636544, -1.47692919, -0.65777257,
         1.73205081, -0.90088523]])

In [20]:
# Verificação da transformação
for l_index_state, l_state in zip([4,5,6], ["BH", "RJ", "SP"]):
    print(f"{l_state} -  {np.abs(X[:, l_index_state]).sum()}")

BH -  459.12525524087647
RJ -  433.0127018922194
SP -  497.2886485734416


# Separação de amostras de treinamento e teste

In [21]:
# Imports
from sklearn.model_selection import train_test_split

In [22]:
# Separação
x = X
y = ohepiecesdf.Quality

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y ,test_size=0.30, random_state=1984)

print( X_train.shape, X_test.shape, y_train.shape, y_test.shape )

(350, 7) (150, 7) (350,) (150,)


# K-vizinhos mais próximos

In [23]:
# Imports
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier
KNeighborsClassifier().get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [24]:
# Construção do modelo
base_estimator = KNeighborsClassifier()
param_grid = {"n_neighbors" : [4,5,6,7,8,9,10,11], "metric" : ["euclidean", "manhattan"]}
clf = GridSearchCV(base_estimator, param_grid, cv=5 , scoring="accuracy")


clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [25]:
# Print verificações
print(f"Best Metric     \n{clf.best_params_}")
print(f"\n\nBest Estimator     \n{clf.best_estimator_}")
print(f"\n\nAccuracy        \n{accuracy_score(y_test, y_pred)}")
print(f"\n\nClassification    \n{classification_report(y_test, y_pred)}")

Best Metric     
{'metric': 'manhattan', 'n_neighbors': 7}


Best Estimator     
KNeighborsClassifier(metric='manhattan', n_neighbors=7)


Accuracy        
0.9533333333333334


Classification    
              precision    recall  f1-score   support

      Accept       0.88      0.98      0.93        47
   Refurbish       0.98      0.89      0.93        55
      Reject       1.00      1.00      1.00        48

    accuracy                           0.95       150
   macro avg       0.95      0.96      0.95       150
weighted avg       0.96      0.95      0.95       150



# Regressão logística

In [26]:
# Imports
from sklearn.linear_model import LogisticRegression
LogisticRegression().get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [27]:
# Construção do modelo
base_estimator = LogisticRegression()
param_grid = {}

class_f_log = GridSearchCV(base_estimator, param_grid, cv=5, scoring="accuracy")
class_f_log.fit(X_train, y_train)

y_pred_log = class_f_log.predict(X_test)

In [28]:
# Print verificações
print(f"Best Estimator     \n{class_f_log.best_estimator_}")
print(f"\n\nAcurracy       \n{accuracy_score(y_test , y_pred_log)}")
print(f"\n\nClassification    \n{classification_report(y_test, y_pred_log)}")

Best Estimator     
LogisticRegression()


Acurracy       
0.9733333333333334


Classification    
              precision    recall  f1-score   support

      Accept       0.92      1.00      0.96        47
   Refurbish       1.00      0.93      0.96        55
      Reject       1.00      1.00      1.00        48

    accuracy                           0.97       150
   macro avg       0.97      0.98      0.97       150
weighted avg       0.98      0.97      0.97       150



In [29]:
# Print verificações
print(f"""y_test : \nAccept - {(y_test == "Accept").sum()} \nRefurbish - {(y_test == "Refurbish").sum()} \nReject - {(y_test == "Reject").sum()}""")
print(f"""\n\ny_pred_log : \nAccept - {(y_pred_log == "Accept").sum()} \nRefurbish - {(y_pred_log == "Refurbish").sum()} \nReject - {(y_pred_log == "Reject").sum()}""")

y_test : 
Accept - 47 
Refurbish - 55 
Reject - 48


y_pred_log : 
Accept - 51 
Refurbish - 51 
Reject - 48


# Aplicando o melhor modelo

In [30]:
# Importação dos dados
newpiecesdf = pd.read_csv('https://raw.githubusercontent.com/digonfernan/classificacao-qualidade-pecas-industria/main/pieces_new.csv')

In [31]:
# Visualização inicial do dataset
display(newpiecesdf.shape)
display(newpiecesdf.head())
display(newpiecesdf.tail())

(5, 6)

Unnamed: 0,id,A,B,C,D,Unit
0,182,5.5,2.6,4.4,1.2,SP
1,345,5.8,2.7,4.1,1.0,SP
2,42,5.1,3.5,1.4,0.2,SP
3,37,6.3,2.5,4.9,1.5,BH
4,61,6.0,3.0,4.8,1.8,RJ


Unnamed: 0,id,A,B,C,D,Unit
0,182,5.5,2.6,4.4,1.2,SP
1,345,5.8,2.7,4.1,1.0,SP
2,42,5.1,3.5,1.4,0.2,SP
3,37,6.3,2.5,4.9,1.5,BH
4,61,6.0,3.0,4.8,1.8,RJ


In [32]:
# Verificação da ausência de valores
piecesdf.isnull().values.any()

False

In [33]:
# A coluna ID não irá agregar muito valor para o modelo, portanto será removida
newpiecesdf = newpiecesdf.drop(columns = 'id')

In [34]:
# Visualização inicial do dataset
display(newpiecesdf.head())

Unnamed: 0,A,B,C,D,Unit
0,5.5,2.6,4.4,1.2,SP
1,5.8,2.7,4.1,1.0,SP
2,5.1,3.5,1.4,0.2,SP
3,6.3,2.5,4.9,1.5,BH
4,6.0,3.0,4.8,1.8,RJ


# Construção do modelo

In [35]:
# Hot encode
hot_encode = OneHotEncoder(handle_unknown='ignore') 
hot_encode = hot_encode.fit(newpiecesdf[['Unit']])

In [36]:
# Verifica as colunas com hot encoded
display(hot_encode.categories_)

[array(['BH', 'RJ', 'SP'], dtype=object)]

In [37]:
# Transformar as colunas hot encoded e armazenar em nova variável
hot_encoded_data = hot_encode.transform(newpiecesdf[['Unit']]).toarray()

In [38]:
# Novo dataframe com as colunas hot encoded
ohenewpiecesdf = pd.DataFrame(hot_encoded_data, columns=hot_encode.categories_[0])

In [39]:
# Concatena os dataframes
ohenewpiecesdf = pd.concat([newpiecesdf, ohenewpiecesdf], axis=1)

In [40]:
# Drop coluna Unit
ohenewpiecesdf = ohenewpiecesdf.drop(columns = 'Unit')

In [41]:
# Visualização inicial do dataset
display(ohenewpiecesdf.head())

Unnamed: 0,A,B,C,D,BH,RJ,SP
0,5.5,2.6,4.4,1.2,0.0,0.0,1.0
1,5.8,2.7,4.1,1.0,0.0,0.0,1.0
2,5.1,3.5,1.4,0.2,0.0,0.0,1.0
3,6.3,2.5,4.9,1.5,1.0,0.0,0.0
4,6.0,3.0,4.8,1.8,0.0,1.0,0.0


# Normalização dos dados com StandardScaler

In [42]:
# Normalização
scaler.fit(ohenewpiecesdf)
X_new = scaler.transform(ohenewpiecesdf)
X_new 

array([[-0.5814019 , -0.72000341,  0.37147964,  0.11058147, -0.5       ,
        -0.5       ,  0.81649658],
       [ 0.14535047, -0.44307902,  0.13930487, -0.25802342, -0.5       ,
        -0.5       ,  0.81649658],
       [-1.55040507,  1.77231608, -1.95026813, -1.73244298, -0.5       ,
        -0.5       ,  0.81649658],
       [ 1.35660443, -0.9969278 ,  0.7584376 ,  0.6634888 ,  2.        ,
        -0.5       , -1.22474487],
       [ 0.62985206,  0.38769414,  0.68104601,  1.21639614, -0.5       ,
         2.        , -1.22474487]])

# Predição

In [43]:
# Predição
base_estimator  = neighbors.KNeighborsClassifier(n_neighbors=7, metric='manhattan') 
param_grid = {}

In [44]:
# Modelo Grid Search Cross Validation
clf = GridSearchCV(base_estimator, param_grid, cv=5, scoring='accuracy')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_new)

ohenewpiecesdf['Prediction'] = clf.predict(X_new)
ohenewpiecesdf

Unnamed: 0,A,B,C,D,BH,RJ,SP,Prediction
0,5.5,2.6,4.4,1.2,0.0,0.0,1.0,Accept
1,5.8,2.7,4.1,1.0,0.0,0.0,1.0,Accept
2,5.1,3.5,1.4,0.2,0.0,0.0,1.0,Reject
3,6.3,2.5,4.9,1.5,1.0,0.0,0.0,Accept
4,6.0,3.0,4.8,1.8,0.0,1.0,0.0,Refurbish


In [45]:
# Regressão logística
class_f_log.fit(X_train, y_train)

y_pred = class_f_log.predict(X_new)

ohenewpiecesdf['Prediction2'] = class_f_log.predict(X_new)
ohenewpiecesdf

Unnamed: 0,A,B,C,D,BH,RJ,SP,Prediction,Prediction2
0,5.5,2.6,4.4,1.2,0.0,0.0,1.0,Accept,Accept
1,5.8,2.7,4.1,1.0,0.0,0.0,1.0,Accept,Accept
2,5.1,3.5,1.4,0.2,0.0,0.0,1.0,Reject,Reject
3,6.3,2.5,4.9,1.5,1.0,0.0,0.0,Accept,Refurbish
4,6.0,3.0,4.8,1.8,0.0,1.0,0.0,Refurbish,Refurbish
