# ML families

## Data

In [None]:
import seaborn as sns

In [None]:
# List the names of the example datasets that seaborn can load by name.
sns.get_dataset_names()

In [None]:
# Load seaborn's 'penguins' example dataset; it is used for every model below.
penguins = sns.load_dataset('penguins')

In [None]:
# Show the column labels to see which fields are available.
penguins.columns

In [None]:
# Show the dataset dimensions as (rows, columns).
penguins.shape

In [None]:
# Preview the first five rows of the dataset.
penguins.head(n=5)

In [None]:
# Numeric measurement columns used as model inputs.
features = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

In [None]:
# Pairwise plots of the numeric columns, colored by species.
g = sns.pairplot(data=penguins, hue='species')

### Missing data

Note that the data contains missing values, which we need to handle somehow. There are three options:

- Drop rows with missing values
- Impute (guess) missing values
- Use a ML algorithm that can work with missing values

In [None]:
# Feature matrix: every row, restricted to the selected feature columns.
X = penguins.loc[:, features]
X.head(n=5)

In [None]:
# Target vector: the species label for each row.
y = penguins['species']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out one third of the rows for testing.
# - random_state fixes the shuffle so the split is the same on every run.
# - stratify=y keeps the species proportions equal in the train and test
#   partitions, which matters because the classes are not equally frequent.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=42, stratify=y,
)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.impute import SimpleImputer

## K-nearest neighbors

Family = Nearest neighbor

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest neighbors pipeline: standardize, fill missing values, classify.
pipe_knn = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer()),
    ('knn', KNeighborsClassifier()),
])

In [None]:
# Fit on the training split, then report the score on the held-out split.
# fit() returns the pipeline itself, so the two calls can be chained.
pipe_knn.fit(X_train, y_train).score(X_test, y_test)

## Multinomial logistic regression

Family = Linear models

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression pipeline: standardize, fill missing values, classify.
pipe_lm = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer()),
    ('lr', LogisticRegression()),
])

In [None]:
# Fit on the training split, then report the score on the held-out split.
# fit() returns the pipeline itself, so the two calls can be chained.
pipe_lm.fit(X_train, y_train).score(X_test, y_test)

## Support vector classifier

Family = Support Vector Machines

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier pipeline: standardize, fill missing values, classify.
pipe_svm = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer()),
    ('svc', SVC()),
])

In [None]:
# Fit on the training split, then report the score on the held-out split.
# fit() returns the pipeline itself, so the two calls can be chained.
pipe_svm.fit(X_train, y_train).score(X_test, y_test)

## Random forest

Family = Decision Tree

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest pipeline: standardize, fill missing values, classify.
pipe_tree = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer()),
    ('rf', RandomForestClassifier()),
])

In [None]:
# Fit on the training split, then report the score on the held-out split.
# fit() returns the pipeline itself, so the two calls can be chained.
pipe_tree.fit(X_train, y_train).score(X_test, y_test)

### Bonus — ML algorithm that works with missing data

This is not a `scikit-learn` model, but most Python ML models follow the same API and can be "plugged in".

Run `pip install catboost`

GitHub: https://github.com/catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost pipeline: no imputer step here, unlike the other pipelines,
# because this section demonstrates a model that accepts missing values.
pipe_boost = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('cat', CatBoostClassifier(verbose=False)),
])

In [None]:
# Fit on the training split, then report the score on the held-out split.
# fit() returns the pipeline itself, so the two calls can be chained.
pipe_boost.fit(X_train, y_train).score(X_test, y_test)

## Multi-layer perceptron

Family = Neural network

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Neural-network pipeline: standardize, fill missing values, classify with a
# multi-layer perceptron. The final step is named 'mlp' so the step name
# matches the estimator it holds (it was 'rf', copied from the random-forest
# pipeline). max_iter is set to 1000 to give the optimizer more iterations.
pipe_nn = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer()),
    ('mlp', MLPClassifier(max_iter=1000)),
])

In [None]:
# Fit on the training split, then report the score on the held-out split.
# fit() returns the pipeline itself, so the two calls can be chained.
pipe_nn.fit(X_train, y_train).score(X_test, y_test)