In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from matplotlib import pyplot
import os

In [3]:
# Read the penguin measurements from the CSV in the working directory,
# then preview the first five rows as this cell's output.
csv_path = 'penguins_size.csv'
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [4]:
# Data Preprocessing

# Drop every row that contains at least one missing value.
df = df.dropna()

# Split predictor vs. target variables.
# Both frames are taken as explicit copies: they are modified column-by-column
# when the categorical values are encoded, and assigning into a slice of `df`
# is chained assignment (SettingWithCopyWarning on pandas < 3), which the
# global warnings filter would otherwise hide.
X = df.drop(columns='species').copy()
# Kept as a one-column DataFrame (not a Series) so `y['species']` stays valid.
y = df[['species']].copy()
X

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Torgersen,39.3,20.6,190.0,3650.0,MALE
...,...,...,...,...,...,...
338,Biscoe,47.2,13.7,214.0,4925.0,FEMALE
340,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


In [5]:
# Replace the two string-valued predictor columns with integer codes.
# pd.factorize numbers the categories in order of first appearance; only the
# codes (element 0 of its result) are kept.
for column in ('island', 'sex'):
    codes = pd.factorize(X[column])[0]
    X[column] = codes
X

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,39.1,18.7,181.0,3750.0,0
1,0,39.5,17.4,186.0,3800.0,1
2,0,40.3,18.0,195.0,3250.0,1
4,0,36.7,19.3,193.0,3450.0,1
5,0,39.3,20.6,190.0,3650.0,0
...,...,...,...,...,...,...
338,1,47.2,13.7,214.0,4925.0,1
340,1,46.8,14.3,215.0,4850.0,1
341,1,50.4,15.7,222.0,5750.0,0
342,1,45.2,14.8,212.0,5200.0,1


In [6]:
# Encode the target: swap each species name for the integer code assigned by
# pd.factorize (codes follow order of first appearance).
species_codes = pd.factorize(y['species'])[0]
y['species'] = species_codes

In [7]:
# Partition predictors and target into training and held-out test subsets.
# The fixed seed makes the shuffle reproducible; the test fraction is left at
# scikit-learn's default.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### XGBoost Model 1

The model below classifies each penguin's species with an accuracy of 97.62% on the held-out test set. If we raise the classifier's learning rate to 0.75, the accuracy increases to 98.81%, so increasing the rate at which the model learns is advantageous in this scenario. Because the accuracy is measured on data the model did not see during training, we are confident that the model isn't overfitting.

In [8]:
from xgboost import XGBClassifier

In [9]:
# Baseline booster: depth-1 trees, a base score of 0.2 and a learning rate of
# 0.1; every other hyperparameter keeps its library default.
baseline_params = {'max_depth': 1, 'base_score': .2, 'learning_rate': .1}
model = XGBClassifier(**baseline_params)
# fit() is the last expression so the fitted estimator is shown as cell output.
model.fit(X_train, y_train)

XGBClassifier(base_score=0.2, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=1, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, ...)

In [10]:
from sklearn.metrics import accuracy_score

# Score the baseline model on the held-out test split.
# XGBClassifier.predict already returns class labels, so the predictions are
# compared with the true labels directly; no rounding step is needed.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100))

Accuracy: 97.62%


### XGBoost Model 2

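Increasing the learning rate from 0.1 to 0.5 improved the test accuracy of the model from 97.62% to 98.81%. Note that this model also differs from Model 1 in other ways (only 3 boosting rounds, and the default `max_depth` and `base_score`), so the improvement cannot be attributed to the learning rate alone.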

In [11]:
# Second booster: 3 boosting rounds at a learning rate of 0.5.
# The model is fitted on the training split only. Deliberately no
# `eval_set=[(X_test, y_test)]` / early stopping here:
#   * monitoring the test split during fitting lets it take part in model
#     selection, so the score below would no longer be a clean held-out estimate;
#   * an early-stopping patience of 10 rounds can never trigger with 3 estimators;
#   * `early_stopping_rounds` as a fit() keyword is deprecated since
#     xgboost 1.6 and rejected by later releases.
xgboost = XGBClassifier(n_estimators=3, learning_rate=.5)
xgboost.fit(X_train, y_train)

# Mean accuracy on the held-out test split.
score_xgb = xgboost.score(X_test, y_test)
print("Accuracy: %.2f%%" % (round(score_xgb, 4) * 100))

Accuracy: 98.81%


### XGBoost Model 3

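A learning rate very close to zero greatly decreases the accuracy of our model. Each tree's contribution is scaled by the learning rate, so with a rate this small (and only two boosting rounds) the ensemble barely moves away from its initial prediction, which removes most of the advantage of boosting.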

In [13]:
# Third booster: 2 boosting rounds, each a forest of 5 parallel trees, with a
# learning rate of 1e-7 to show the effect of an extremely small step size.
# The model is fitted on the training split only. Deliberately no
# `eval_set=[(X_test, y_test)]` / early stopping here: monitoring the test
# split during fitting leaks it into model selection, and
# `early_stopping_rounds` as a fit() keyword is deprecated since xgboost 1.6
# and rejected by later releases.
xgboost = XGBClassifier(n_estimators=2, learning_rate=.0000001, num_parallel_tree=5)
xgboost.fit(X_train, y_train)

# Mean accuracy on the held-out test split.
score_xgb = xgboost.score(X_test, y_test)
print("Accuracy: %.2f%%" % (round(score_xgb, 4) * 100))

Accuracy: 47.62%
