In [2]:
import pandas as pd
import numpy as np


In [3]:
penguins = pd.read_csv('penguins.csv')

In [5]:
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
 7   year               344 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 21.6+ KB


In [6]:
penguins.dropna(inplace=True)

In [7]:
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


In [31]:
y = penguins['species']
X = penguins.iloc[:,1:-1]

In [40]:
cateogrical_features = X.select_dtypes(object)
numerical_features = X.select_dtypes(float)

In [33]:
from sklearn.preprocessing import OneHotEncoder

In [34]:
encoder = OneHotEncoder()
cateogrical_features_one_hot = encoder.fit_transform(cateogrical_features)

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [44]:
pipeline = ColumnTransformer([
    ('numerical', 'passthrough', numerical_features.columns)
    ,('categorical', OneHotEncoder(), cateogrical_features.columns)
])

In [45]:
X_prepared = pipeline.fit_transform(X)

In [46]:
X_prepared

array([[ 39.1,  18.7, 181. , ...,   1. ,   0. ,   1. ],
       [ 39.5,  17.4, 186. , ...,   1. ,   1. ,   0. ],
       [ 40.3,  18. , 195. , ...,   1. ,   1. ,   0. ],
       ...,
       [ 49.6,  18.2, 193. , ...,   0. ,   0. ,   1. ],
       [ 50.8,  19. , 210. , ...,   0. ,   0. ,   1. ],
       [ 50.2,  18.7, 198. , ...,   0. ,   1. ,   0. ]])

In [47]:
encoder.categories_

[array(['Biscoe', 'Dream', 'Torgersen'], dtype=object),
 array(['female', 'male'], dtype=object)]

In [49]:
pipeline.get_feature_names()

['bill_length_mm',
 'bill_depth_mm',
 'flipper_length_mm',
 'body_mass_g',
 'categorical__x0_Biscoe',
 'categorical__x0_Dream',
 'categorical__x0_Torgersen',
 'categorical__x1_female',
 'categorical__x1_male']

In [50]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [52]:
codes, uniques = pd.factorize(y)

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X_prepared, codes)

In [56]:
random_forest_classifier = RandomForestClassifier(random_state=15)

In [57]:
random_forest_classifier.fit(X_train, y_train)

RandomForestClassifier(random_state=15)

In [64]:
predictions = random_forest_classifier.predict(X_train)

In [65]:
score = accuracy_score(y_train,predictions )

In [66]:
score

1.0

In [67]:
import pickle

In [68]:
random_forest_pickle = open('random_forest_penguin.pickle', 'wb')
pickle.dump(random_forest_classifier, random_forest_pickle)
random_forest_pickle.close()

In [70]:
target_pickle = open('penguin_targets.pickle', 'wb')
pickle.dump(uniques, target_pickle)
target_pickle.close()