In [81]:
import pandas as pd
import seaborn as sns

# Load the penguins dataset bundled with seaborn, then discard every row
# that contains a missing value so later steps see complete records only.
df = sns.load_dataset("penguins")
df = df.dropna()

# Transposed preview of the first rows: one line per column.
df.head().T

Unnamed: 0,0,1,2,4,5
species,Adelie,Adelie,Adelie,Adelie,Adelie
island,Torgersen,Torgersen,Torgersen,Torgersen,Torgersen
bill_length_mm,39.1,39.5,40.3,36.7,39.3
bill_depth_mm,18.7,17.4,18.0,19.3,20.6
flipper_length_mm,181.0,186.0,195.0,193.0,190.0
body_mass_g,3750.0,3800.0,3250.0,3450.0,3650.0
sex,Male,Female,Female,Female,Male


In [82]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Separate the target column (species) from the predictor columns.
X = df.drop(columns=['species'])
y = df.species

# Hold out 20% of the rows for testing; `stratify=y` asks for the species
# proportions to be preserved in both subsets and `random_state` makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)

# Replace the species names in y_train / y_test with integer codes (0, 1, 2).
# The encoder is fitted on the training labels ONLY and the learned mapping is
# then applied to the test labels with `transform`. Re-fitting on the test
# labels (`fit_transform`) would derive a second, independent mapping, which
# silently disagrees with the training one whenever the test split does not
# contain exactly the same set of classes.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

# Show the encoded training labels (the last expression is the cell output).
y_train

array([0, 2, 0, 0, 2, 2, 0, 2, 1, 1, 2, 0, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0,
       0, 1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 1, 2, 0, 0, 0, 2, 1,
       1, 0, 0, 2, 0, 1, 2, 0, 1, 2, 0, 2, 0, 1, 2, 2, 2, 0, 0, 0, 0, 2,
       0, 0, 2, 1, 2, 0, 2, 1, 2, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1, 0, 2, 0,
       2, 0, 2, 1, 0, 2, 2, 2, 2, 0, 2, 2, 0, 0, 0, 2, 2, 0, 2, 1, 0, 2,
       2, 1, 1, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       2, 0, 1, 2, 0, 2, 0, 1, 0, 1, 0, 2, 0, 2, 0, 0, 1, 1, 2, 0, 0, 0,
       0, 0, 2, 1, 2, 1, 2, 0, 1, 1, 1, 1, 2, 1, 0, 2, 0, 2, 1, 0, 0, 2,
       0, 2, 0, 0, 1, 2, 1, 2, 0, 2, 1, 2, 2, 0, 0, 0, 1, 2, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 2, 1, 0, 1, 0, 1, 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2,
       0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2,
       0, 2, 2, 2, 0, 1, 1, 0, 0, 0, 1, 0, 0, 2, 2, 2, 2, 1, 0, 2, 1, 1,
       2, 2])

## Codificación one-hot

In [83]:
from sklearn.feature_extraction import DictVectorizer

# Columns handed to the vectorizer, grouped by kind.
# NOTE(review): body_mass_g is not listed here, so it is never used as a
# feature -- confirm that this omission is intentional.
categorical = ['island', 'sex']
numerical = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']

# Turn each split into a list of per-row dicts ({column: value}), which is
# the input format DictVectorizer works with.
train_dict, test_dict = (
    frame[categorical + numerical].to_dict(orient='records')
    for frame in (X_train, X_test)
)

# Fit the vectorizer on the training records only; sparse=False requests a
# dense array instead of a sparse matrix when transforming.
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [84]:
# Encode both splits with the vectorizer fitted on the training records.
# NOTE(review): X_train / X_test are rebound here from DataFrames to the
# vectorizer output, so the cell that builds train_dict / test_dict cannot be
# re-run afterwards without re-running the train/test split first.
X_train, X_test = (dv.transform(records) for records in (train_dict, test_dict))

# Names of the generated feature columns, in output order.
dv.get_feature_names_out()

array(['bill_depth_mm', 'bill_length_mm', 'flipper_length_mm',
       'island=Biscoe', 'island=Dream', 'island=Torgersen', 'sex=Female',
       'sex=Male'], dtype=object)

## Estandarización

In [85]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training matrix only, then apply
# that same fitted scaler to both splits.
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

## Entrenamiento de modelos

#### Logistic regression

In [86]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the standardized features, using a one-vs-rest
# scheme for the multi-class target and the L-BFGS solver.
# NOTE(review): the `multi_class` argument is deprecated in recent
# scikit-learn releases -- check the installed version before upgrading.
lr = LogisticRegression(
    C=100.0,
    solver='lbfgs',
    multi_class='ovr',
    random_state=1,
)
lr.fit(X_train_std, y_train)



#### SVM

In [87]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, trained on the standardized
# features. probability=True requests probability estimates in addition to
# hard class predictions.
svm = SVC(
    kernel='linear',
    C=1.0,
    probability=True,
    random_state=1,
)

svm.fit(X_train_std, y_train)

#### Decision tree

In [88]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree using Gini impurity as the split criterion, limited to a
# depth of 4, trained on the same standardized matrix as the other models.
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    random_state=1,
)
dt.fit(X_train_std, y_train)

#### KNN

In [89]:
from sklearn.neighbors import KNeighborsClassifier
# 3-nearest-neighbours classifier. The Minkowski metric with p=2 is the
# ordinary Euclidean distance.
knn = KNeighborsClassifier(
    n_neighbors=3,
    metric='minkowski',
    p=2,
)

knn.fit(X_train_std, y_train)

## Serialización de los modelos

In [90]:
import pickle

from pathlib import Path

# Directory that receives the serialized models. It is created when missing
# so that a fresh checkout does not fail with FileNotFoundError on open().
MODELS_DIR = Path('../models')
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Each file stores a (scaler, model) tuple, so a consumer can standardize new
# inputs with the same fitted scaler before predicting.
# NOTE(review): the DictVectorizer `dv` and the LabelEncoder `encoder` are not
# part of the payload; whoever loads these files has to rebuild the same
# feature layout and label mapping -- confirm against the consumer code.
for name, model in (('lr', lr), ('svm', svm), ('dt', dt), ('knn', knn)):
    with open(MODELS_DIR / f'{name}.pck', 'wb') as f:
        pickle.dump((sc, model), f)