In [38]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [39]:
penguins_df = pd.read_csv("https://raw.githubusercontent.com/mcnakhaee/palmerpenguins/master/palmerpenguins/data/penguins-raw.csv")
penguins_df.dropna(inplace = True)
penguins_df.reset_index(inplace = True, drop = True)
penguins_df.head(3)

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,PAL0708,7,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N4A1,No,2007-11-15,38.9,17.8,181.0,3625.0,FEMALE,9.18718,-25.21799,Nest never observed with full clutch.
1,PAL0708,8,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N4A2,No,2007-11-15,39.2,19.6,195.0,4675.0,MALE,9.4606,-24.89958,Nest never observed with full clutch.
2,PAL0708,29,Adelie Penguin (Pygoscelis adeliae),Anvers,Biscoe,"Adult, 1 Egg Stage",N18A1,No,2007-11-10,37.9,18.6,172.0,3150.0,FEMALE,8.38404,-25.19837,Nest never observed with full clutch.


In [40]:
df = penguins_df[penguins_df.dtypes[penguins_df.dtypes == "float64"].index]
df.drop(columns = ["Delta 15 N (o/oo)", "Delta 13 C (o/oo)"], inplace = True)
df["Sex"] = penguins_df["Sex"]
df.head()

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex
0,38.9,17.8,181.0,3625.0,FEMALE
1,39.2,19.6,195.0,4675.0,MALE
2,37.9,18.6,172.0,3150.0,FEMALE
3,40.5,18.9,180.0,3950.0,MALE
4,37.6,19.3,181.0,3300.0,FEMALE


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [42]:
x = df.drop(columns = ["Sex"])
y = df["Sex"]


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [43]:
model = SVC()
model.fit(x_train, y_train)

SVC()

In [44]:
preds = model.predict(x_test)
acc = (preds == y_test).mean()
acc

0.7142857142857143

## HyperParameter Tuning

In [33]:
from sklearn.model_selection import GridSearchCV

In [45]:
parameters = {
    "C": [0.5, 0.7, 0.9, 1.1, 1.3],
    "degree": [1, 2, 4, 10, 12],
    "kernel": ["linear", "sigmoid"]
}

optimizer = GridSearchCV(SVC(), param_grid = parameters, verbose = 2)
optimizer.fit(x_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END .....................C=0.5, degree=1, kernel=linear; total time=   0.3s
[CV] END .....................C=0.5, degree=1, kernel=linear; total time=   0.1s
[CV] END .....................C=0.5, degree=1, kernel=linear; total time=   0.1s
[CV] END .....................C=0.5, degree=1, kernel=linear; total time=   0.0s
[CV] END .....................C=0.5, degree=1, kernel=linear; total time=   0.0s
[CV] END ....................C=0.5, degree=1, kernel=sigmoid; total time=   0.0s
[CV] END ....................C=0.5, degree=1, kernel=sigmoid; total time=   0.0s
[CV] END ....................C=0.5, degree=1, kernel=sigmoid; total time=   0.0s
[CV] END ....................C=0.5, degree=1, kernel=sigmoid; total time=   0.0s
[CV] END ....................C=0.5, degree=1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=0.5, degree=2, kernel=linear; total time=   0.1s
[CV] END .....................C=0.5, degree=2, 

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.5, 0.7, 0.9, 1.1, 1.3],
                         'degree': [1, 2, 4, 10, 12],
                         'kernel': ['linear', 'sigmoid']},
             verbose=2)

In [46]:
optimizer.best_params_

{'C': 0.5, 'degree': 1, 'kernel': 'linear'}

In [47]:
new_model = SVC(C = 0.5, degree = 1, kernel = "linear")
new_model.fit(x_train, y_train)

SVC(C=0.5, degree=1, kernel='linear')

In [48]:
new_preds = new_model.predict(x_test)
new_acc = (new_preds == y_test).mean()
new_acc

0.8571428571428571