In [58]:
#import libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import tree, preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import itertools


#penguins data
url = "https://philchodrow.github.io/PIC16A/datasets/palmer_penguins.csv"
penguins = pd.read_csv(url)
penguins.head()

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,11/11/07,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,11/11/07,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,11/16/07,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,
3,PAL0708,4,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A2,Yes,11/16/07,,,,,,,,Adult not sampled.
4,PAL0708,5,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N3A1,Yes,11/16/07,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426,


In [32]:
#dropping columns: studyName, Individual ID, Date Egg, Sample Number
pdata = penguins.drop(['studyName', 'Individual ID', 'Date Egg', 'Region', 'Sample Number', 'Comments'], axis = 1)
pdata.head()

Unnamed: 0,Species,Island,Stage,Clutch Completion,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo)
0,Adelie Penguin (Pygoscelis adeliae),Torgersen,"Adult, 1 Egg Stage",Yes,39.1,18.7,181.0,3750.0,MALE,,
1,Adelie Penguin (Pygoscelis adeliae),Torgersen,"Adult, 1 Egg Stage",Yes,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454
2,Adelie Penguin (Pygoscelis adeliae),Torgersen,"Adult, 1 Egg Stage",Yes,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302
3,Adelie Penguin (Pygoscelis adeliae),Torgersen,"Adult, 1 Egg Stage",Yes,,,,,,,
4,Adelie Penguin (Pygoscelis adeliae),Torgersen,"Adult, 1 Egg Stage",Yes,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426


In [33]:
#checking info 
pdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Species              344 non-null    object 
 1   Island               344 non-null    object 
 2   Stage                344 non-null    object 
 3   Clutch Completion    344 non-null    object 
 4   Culmen Length (mm)   342 non-null    float64
 5   Culmen Depth (mm)    342 non-null    float64
 6   Flipper Length (mm)  342 non-null    float64
 7   Body Mass (g)        342 non-null    float64
 8   Sex                  334 non-null    object 
 9   Delta 15 N (o/oo)    330 non-null    float64
 10  Delta 13 C (o/oo)    331 non-null    float64
dtypes: float64(6), object(5)
memory usage: 29.7+ KB


In [34]:
#drop nan values
mydata = pdata.dropna()
mydata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 325 entries, 1 to 343
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Species              325 non-null    object 
 1   Island               325 non-null    object 
 2   Stage                325 non-null    object 
 3   Clutch Completion    325 non-null    object 
 4   Culmen Length (mm)   325 non-null    float64
 5   Culmen Depth (mm)    325 non-null    float64
 6   Flipper Length (mm)  325 non-null    float64
 7   Body Mass (g)        325 non-null    float64
 8   Sex                  325 non-null    object 
 9   Delta 15 N (o/oo)    325 non-null    float64
 10  Delta 13 C (o/oo)    325 non-null    float64
dtypes: float64(6), object(5)
memory usage: 30.5+ KB


In [35]:
#get rid of penguin with period for sex
data = mydata.drop(mydata.index[318])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 324 entries, 1 to 343
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Species              324 non-null    object 
 1   Island               324 non-null    object 
 2   Stage                324 non-null    object 
 3   Clutch Completion    324 non-null    object 
 4   Culmen Length (mm)   324 non-null    float64
 5   Culmen Depth (mm)    324 non-null    float64
 6   Flipper Length (mm)  324 non-null    float64
 7   Body Mass (g)        324 non-null    float64
 8   Sex                  324 non-null    object 
 9   Delta 15 N (o/oo)    324 non-null    float64
 10  Delta 13 C (o/oo)    324 non-null    float64
dtypes: float64(6), object(5)
memory usage: 30.4+ KB


In [36]:
#transforming cols that are not floats and ints
le = preprocessing.LabelEncoder()

#species 
data['Species'] = le.fit_transform(data['Species'])
#island
data['Island'] = le.fit_transform(data['Island'])
#stage
data['Stage'] = le.fit_transform(data['Stage'])
#clutch competition
data['Clutch Completion'] = le.fit_transform(data['Clutch Completion'])
#sex
data['Sex'] = le.fit_transform(data['Sex'])

data.head()

Unnamed: 0,Species,Island,Stage,Clutch Completion,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo)
1,0,2,0,1,39.5,17.4,186.0,3800.0,0,8.94956,-24.69454
2,0,2,0,1,40.3,18.0,195.0,3250.0,0,8.36821,-25.33302
4,0,2,0,1,36.7,19.3,193.0,3450.0,0,8.76651,-25.32426
5,0,2,0,1,39.3,20.6,190.0,3650.0,1,8.66496,-25.29805
6,0,2,0,0,38.9,17.8,181.0,3625.0,0,9.18718,-25.21799


In [37]:
#making combos
categories = list(data.columns)[1:]
combos = list(itertools.combinations(categories, 3))
combos

[('Island', 'Stage', 'Clutch Completion'),
 ('Island', 'Stage', 'Culmen Length (mm)'),
 ('Island', 'Stage', 'Culmen Depth (mm)'),
 ('Island', 'Stage', 'Flipper Length (mm)'),
 ('Island', 'Stage', 'Body Mass (g)'),
 ('Island', 'Stage', 'Sex'),
 ('Island', 'Stage', 'Delta 15 N (o/oo)'),
 ('Island', 'Stage', 'Delta 13 C (o/oo)'),
 ('Island', 'Clutch Completion', 'Culmen Length (mm)'),
 ('Island', 'Clutch Completion', 'Culmen Depth (mm)'),
 ('Island', 'Clutch Completion', 'Flipper Length (mm)'),
 ('Island', 'Clutch Completion', 'Body Mass (g)'),
 ('Island', 'Clutch Completion', 'Sex'),
 ('Island', 'Clutch Completion', 'Delta 15 N (o/oo)'),
 ('Island', 'Clutch Completion', 'Delta 13 C (o/oo)'),
 ('Island', 'Culmen Length (mm)', 'Culmen Depth (mm)'),
 ('Island', 'Culmen Length (mm)', 'Flipper Length (mm)'),
 ('Island', 'Culmen Length (mm)', 'Body Mass (g)'),
 ('Island', 'Culmen Length (mm)', 'Sex'),
 ('Island', 'Culmen Length (mm)', 'Delta 15 N (o/oo)'),
 ('Island', 'Culmen Length (mm)', 'De

In [38]:
#splitting the data in test and training
train, test = train_test_split(data, test_size = 0.3)
train.shape, test.shape

((226, 11), (98, 11))

In [39]:
x_train = train.drop(['Species'], axis = 1)
y_train = train['Species'].values

x_test = test.drop(['Species'], axis = 1)
y_test = test['Species'].values

In [40]:
y_train

array([0, 0, 2, 1, 2, 0, 0, 0, 1, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 0, 2, 1,
       1, 2, 2, 0, 0, 0, 2, 2, 0, 2, 1, 1, 2, 0, 1, 2, 0, 2, 2, 2, 1, 0,
       0, 0, 1, 2, 1, 0, 0, 0, 2, 2, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 0, 0,
       1, 0, 2, 0, 1, 0, 0, 2, 2, 2, 2, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 0,
       1, 1, 0, 0, 1, 2, 2, 0, 2, 0, 1, 0, 1, 0, 0, 0, 2, 0, 2, 0, 0, 2,
       0, 1, 2, 0, 0, 2, 1, 1, 0, 1, 2, 0, 2, 0, 0, 1, 2, 0, 0, 1, 0, 0,
       0, 0, 1, 2, 0, 0, 2, 0, 0, 1, 1, 2, 1, 2, 0, 2, 0, 0, 0, 1, 2, 2,
       0, 2, 1, 0, 2, 2, 2, 0, 0, 0, 2, 2, 1, 2, 1, 2, 2, 0, 2, 0, 0, 2,
       2, 1, 1, 2, 0, 0, 2, 0, 0, 1, 1, 0, 2, 0, 0, 1, 0, 2, 2, 0, 2, 0,
       0, 2, 2, 0, 0, 2, 0, 1, 0, 2, 0, 0, 0, 1, 0, 0, 1, 2, 2, 2, 1, 1,
       1, 2, 1, 0, 2, 2])

In [48]:
def check_column_score(cols):
    """
    Trains and evaluates a model via cross-validation on the columns of the data
    with selected indices
    """
    clf = LogisticRegressionCV(cv=5, random_state=0, max_iter = 1000).fit(x_train[cols], y_train)
    return clf.score(x_train[cols], y_train)
    

In [None]:
clf = LogisticRegressionCV(cv=5, random_state=0).fit(X, y)

logreg = LogisticRegression(max_iter = 1000)
return cross_val_score(logreg, x_train[cols], y_train, cv = 5).mean()

clf.score(X, y)

In [49]:
D = {}

for i in range(len(combos)): 
    cols = list(combos[i])
    x = check_column_score(cols)
    D[i] = x

In [51]:
D


{0: 0.6460176991150443,
 1: 0.9513274336283186,
 2: 0.7654867256637168,
 3: 0.8008849557522124,
 4: 0.7699115044247787,
 5: 0.6460176991150443,
 6: 0.8053097345132744,
 7: 0.8230088495575221,
 8: 0.9513274336283186,
 9: 0.7522123893805309,
 10: 0.8185840707964602,
 11: 0.7566371681415929,
 12: 0.6460176991150443,
 13: 0.8053097345132744,
 14: 0.827433628318584,
 15: 1.0,
 16: 0.9690265486725663,
 17: 0.9778761061946902,
 18: 0.9911504424778761,
 19: 0.9823008849557522,
 20: 0.9734513274336283,
 21: 0.7920353982300885,
 22: 0.7876106194690266,
 23: 0.7920353982300885,
 24: 0.8628318584070797,
 25: 0.9336283185840708,
 26: 0.7787610619469026,
 27: 0.8097345132743363,
 28: 0.8761061946902655,
 29: 0.9557522123893806,
 30: 0.7831858407079646,
 31: 0.8362831858407079,
 32: 0.9203539823008849,
 33: 0.8053097345132744,
 34: 0.8230088495575221,
 35: 0.911504424778761,
 36: 0.7566371681415929,
 37: 0.7477876106194691,
 38: 0.8053097345132744,
 39: 0.7389380530973452,
 40: 0.4424778761061947,
 4

In [50]:
def test_column_score(tupCol):
    """
    Trains and evaluates a model on the test set using the columns of the data
    with selected indices
    """
    cols = list(tupCol)
    logreg = LogisticRegression(max_iter = 1000)
    logreg.fit(x_train[cols], y_train)
    return logreg.score(x_test[cols], y_test)

In [52]:
L = list(D.items())
L.sort(key = lambda tup: tup[1], reverse = True)

best = L[0:10]

for where, score in best:
    print("---- " + str(where))
    print(str(combos[where]) + ", Score: " + str(np.round(score, 5)))
    test_score = test_column_score(combos[where])
    print("Test score is: " + str(np.round(test_score, 5)))

---- 15
('Island', 'Culmen Length (mm)', 'Culmen Depth (mm)'), Score: 1.0
Test score is: 0.97959
---- 89
('Culmen Length (mm)', 'Culmen Depth (mm)', 'Delta 13 C (o/oo)'), Score: 1.0
Test score is: 1.0
---- 87
('Culmen Length (mm)', 'Culmen Depth (mm)', 'Sex'), Score: 0.99558
Test score is: 0.9898
---- 88
('Culmen Length (mm)', 'Culmen Depth (mm)', 'Delta 15 N (o/oo)'), Score: 0.99558
Test score is: 0.9898
---- 91
('Culmen Length (mm)', 'Flipper Length (mm)', 'Sex'), Score: 0.99558
Test score is: 0.94898
---- 18
('Island', 'Culmen Length (mm)', 'Sex'), Score: 0.99115
Test score is: 0.97959
---- 85
('Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)'), Score: 0.99115
Test score is: 0.95918
---- 86
('Culmen Length (mm)', 'Culmen Depth (mm)', 'Body Mass (g)'), Score: 0.99115
Test score is: 1.0
---- 93
('Culmen Length (mm)', 'Flipper Length (mm)', 'Delta 13 C (o/oo)'), Score: 0.99115
Test score is: 0.93878
---- 94
('Culmen Length (mm)', 'Body Mass (g)', 'Sex'), Score: 0.99115
T

In [53]:
X_train = train[list(combos[0])]
X_test = test[list(combos[0])]

In [60]:
X_train

Unnamed: 0,Island,Stage,Clutch Completion
64,0,0,1
50,0,0,1
338,0,0,0
174,1,0,0
322,0,0,1
...,...,...,...
288,0,0,1
198,1,0,1
116,2,0,1
279,0,0,1


In [55]:
def check_thing(cols):
    logreg = LogisticRegression(max_iter = 1000)
    return cross_val_score(logreg, x_train[cols], y_train, cv = 5).mean()

A = {}
for i in range(len(combos)): 
    cols = list(combos[i])
    x = check_thing(cols)
    A[i] = x

In [56]:
A

{0: 0.6285990338164251,
 1: 0.9470531400966185,
 2: 0.7522705314009661,
 3: 0.787342995169082,
 4: 0.6988405797101449,
 5: 0.6463768115942028,
 6: 0.7701449275362319,
 7: 0.8233816425120773,
 8: 0.9470531400966185,
 9: 0.7211594202898551,
 10: 0.7828985507246375,
 11: 0.6817391304347827,
 12: 0.6285990338164251,
 13: 0.7657004830917875,
 14: 0.8189371980676329,
 15: 0.9777777777777779,
 16: 0.9557487922705314,
 17: 0.9644444444444444,
 18: 0.9690821256038648,
 19: 0.9601932367149759,
 20: 0.9691787439613527,
 21: 0.7962318840579709,
 22: 0.787632850241546,
 23: 0.7788405797101449,
 24: 0.8406763285024155,
 25: 0.9160386473429952,
 26: 0.7520772946859904,
 27: 0.78743961352657,
 28: 0.8716908212560387,
 29: 0.9514009661835748,
 30: 0.7831884057971015,
 31: 0.7657004830917874,
 32: 0.8143961352657005,
 33: 0.7701449275362319,
 34: 0.8189371980676329,
 35: 0.8676328502415458,
 36: 0.7256038647342995,
 37: 0.7167149758454106,
 38: 0.787342995169082,
 39: 0.7210628019323672,
 40: 0.39362318

In [59]:
#logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test))

NameError: name 'logreg' is not defined

In [73]:
lr = LogisticRegression(max_iter = 1000)
lr.fit(x_train, y_train)

logit_roc_auc = roc_auc_score(y_test, x_test)



ValueError: multi_class must be in ('ovo', 'ovr')

In [74]:
x_test

Unnamed: 0,Island,Stage,Clutch Completion,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo)
138,1,0,0,37.0,16.5,185.0,3400.0,0,8.61651,-26.07021
230,0,0,0,40.9,13.7,214.0,4650.0,0,8.19579,-25.39330
177,1,0,0,52.0,19.0,197.0,4150.0,1,9.36799,-24.47142
264,0,0,1,50.5,15.9,222.0,5550.0,1,8.46894,-26.60436
201,1,0,1,49.8,17.3,198.0,3675.0,0,9.32169,-24.41562
...,...,...,...,...,...,...,...,...,...,...
4,2,0,1,36.7,19.3,193.0,3450.0,0,8.76651,-25.32426
196,1,0,1,50.9,17.9,196.0,3675.0,0,9.43684,-24.16566
197,1,0,1,50.8,18.5,201.0,4450.0,1,9.45827,-24.35575
157,1,0,1,45.2,17.8,198.0,3950.0,0,8.88942,-24.49433
