# Import

## Setup

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown

sns.set_theme(style="darkgrid")
pd.set_option('display.max_columns', None)  

DEBUG = False
SEED = 666

## Load Dataset

In [19]:
df = pd.read_pickle("data/apple.pickle")

In [20]:
print(df.shape)
df.head()

(4000, 8)


Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Target
0,-1.798424,-0.950373,2.993421,-1.42415,0.690545,-0.089872,-0.269415,0
1,-0.35906,-1.154404,2.127698,0.429746,0.176767,0.19702,-0.378997,0
2,0.109445,-0.225759,-0.652507,-0.946892,1.205422,-0.286156,1.206044,1
3,-0.079977,-0.800146,0.923916,-0.772399,1.619575,-2.08732,0.338315,0
4,0.968573,-0.19164,0.044164,-1.096894,1.305025,-0.961548,0.201472,0


# Baseline Model

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc

## Split

In [22]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('Target', axis=1), df['Target'], test_size=0.2, random_state=SEED)

# Print the shapes of the training and testing sets
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (3200, 7)
Testing set shape: (800, 7)


## Train

In [23]:
# Create an instance of each classifier
rf_classifier = RandomForestClassifier()
svc_classifier = SVC()
lgbm_classifier = LGBMClassifier()
xgb_classifier = XGBClassifier()
knn_classifier = KNeighborsClassifier()

# Train the models
rf_classifier.fit(X_train, y_train)
svc_classifier.fit(X_train, y_train)
lgbm_classifier.fit(X_train, y_train)
xgb_classifier.fit(X_train, y_train)
knn_classifier.fit(X_train, y_train)

# Make predictions
rf_predictions = rf_classifier.predict(X_test)
svc_predictions = svc_classifier.predict(X_test)
lgbm_predictions = lgbm_classifier.predict(X_test)
xgb_predictions = xgb_classifier.predict(X_test)
knn_predictions = knn_classifier.predict(X_test)



[LightGBM] [Info] Number of positive: 1586, number of negative: 1614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000554 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 3200, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495625 -> initscore=-0.017500
[LightGBM] [Info] Start training from score -0.017500


## Test

In [24]:
# Evaluate the models
rf_accuracy = accuracy_score(y_test, rf_predictions)
svc_accuracy = accuracy_score(y_test, svc_predictions)
lgbm_accuracy = accuracy_score(y_test, lgbm_predictions)
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
knn_accuracy = accuracy_score(y_test, knn_predictions)



## Resultados

In [25]:
# Print the accuracies
print("Random Forest Accuracy:", rf_accuracy)
print("SVC Accuracy:", svc_accuracy)
print("LGBM Accuracy:", lgbm_accuracy)
print("XGB Accuracy:", xgb_accuracy)
print("KNN Accuracy:", knn_accuracy)

Random Forest Accuracy: 0.90875
SVC Accuracy: 0.9025
LGBM Accuracy: 0.89625
XGB Accuracy: 0.9075
KNN Accuracy: 0.915


# Questões

**Pergunta:** Qual é a diferença entre dados de treinamento e dados de teste ao treinar um modelo de classificação?


Dados de treinamento são os dados que o modelo usa para aprender os padrões dos dados. Os dados de testes são dados não vistos anteriormente para o modelo, usados para avaliar se o modelo consegue trabalhar com dados não vistos anteriormente.

**Pergunta:** Com base no dataset fornecido, a abordagem do treinamento do modelo deve ser `supervisionada` ou `não-supervisionada`? Por quê?


Supervisionado, pois o dataset possui a variável target que é a variável que o modelo deve prever.