In [1]:
import pandas as pd
import seaborn as sns

# Cargar el dataset de penguins
df = sns.load_dataset("penguins").dropna()

df.T

Unnamed: 0,0,1,2,4,5,6,7,12,13,14,...,332,333,334,335,337,338,340,341,342,343
species,Adelie,Adelie,Adelie,Adelie,Adelie,Adelie,Adelie,Adelie,Adelie,Adelie,...,Gentoo,Gentoo,Gentoo,Gentoo,Gentoo,Gentoo,Gentoo,Gentoo,Gentoo,Gentoo
island,Torgersen,Torgersen,Torgersen,Torgersen,Torgersen,Torgersen,Torgersen,Torgersen,Torgersen,Torgersen,...,Biscoe,Biscoe,Biscoe,Biscoe,Biscoe,Biscoe,Biscoe,Biscoe,Biscoe,Biscoe
bill_length_mm,39.1,39.5,40.3,36.7,39.3,38.9,39.2,41.1,38.6,34.6,...,43.5,51.5,46.2,55.1,48.8,47.2,46.8,50.4,45.2,49.9
bill_depth_mm,18.7,17.4,18.0,19.3,20.6,17.8,19.6,17.6,21.2,21.1,...,15.2,16.3,14.1,16.0,16.2,13.7,14.3,15.7,14.8,16.1
flipper_length_mm,181.0,186.0,195.0,193.0,190.0,181.0,195.0,182.0,191.0,198.0,...,213.0,230.0,217.0,230.0,222.0,214.0,215.0,222.0,212.0,213.0
body_mass_g,3750.0,3800.0,3250.0,3450.0,3650.0,3625.0,4675.0,3200.0,3800.0,4400.0,...,4650.0,5500.0,4375.0,5850.0,6000.0,4925.0,4850.0,5750.0,5200.0,5400.0
sex,Male,Female,Female,Female,Male,Female,Male,Female,Male,Male,...,Female,Male,Female,Male,Male,Female,Female,Male,Female,Male


In [157]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
X = df.drop(columns=['species'])
y = df.species

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 1, stratify = y)

# Modificamos los valores de los tags en y_train, y_test por 0, 1 y 2
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

## Codificación one-hot

In [158]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler

categorical = ['island', 'sex']
numerical = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']

sc = StandardScaler()
sc.fit(X_train[numerical])
X_train_std = sc.transform(X_train[numerical])
X_test_std = sc.transform(X_test[numerical])

## Codificación One-hot

In [159]:
train_dict = X_train[categorical + numerical].to_dict(orient='records')
test_dict = X_test[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [160]:
X_train = dv.transform(train_dict)
X_test = dv.transform(test_dict)
dv.get_feature_names_out()

array(['bill_depth_mm', 'bill_length_mm', 'flipper_length_mm',
       'island=Biscoe', 'island=Dream', 'island=Torgersen', 'sex=Female',
       'sex=Male'], dtype=object)

## Entrenamiento de modelos

#### Logistic regression

In [161]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=100.0,
                        random_state = 1,
                        solver = 'lbfgs',
                        multi_class='ovr')
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### SVM

In [162]:
from sklearn.svm import SVC

svm = SVC(kernel='linear',C=1.0, random_state=1, probability=True)

svm.fit(X_train, y_train)

#### Decision tree

In [163]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(criterion='gini',max_depth=4,
                                    random_state=1)
dt.fit(X_train,y_train)

#### KNN

In [164]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3, p=2, metric='minkowski')

knn.fit(X_train, y_train)

## Serialización de los modelos

In [165]:
import pickle

with open('../models/lr.pck', 'wb') as f:
    pickle.dump((dv, lr), f)

with open('../models/svm.pck', 'wb') as f:
    pickle.dump((dv, svm), f)

with open('../models/dt.pck', 'wb') as f:
    pickle.dump((dv, dt), f)

with open('../models/knn.pck', 'wb') as f:
    pickle.dump((dv, knn), f)