Multinominal Logistic Regression

In [1]:
#import libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import tree, preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression

In [23]:
#penguins data
url = "https://philchodrow.github.io/PIC16A/datasets/palmer_penguins.csv"
penguins = pd.read_csv(url)
penguins.head()

#need to prep data

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,11/11/07,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,11/11/07,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,11/16/07,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,
3,PAL0708,4,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A2,Yes,11/16/07,,,,,,,,Adult not sampled.
4,PAL0708,5,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N3A1,Yes,11/16/07,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426,


In [24]:
#dropping columns: studyName, Individual ID, Date Egg, Sample Number
pdata = penguins.drop(['studyName', 'Individual ID', 'Date Egg', 'Sample Number', 'Comments'], axis = 1)
pdata.head()

Unnamed: 0,Species,Region,Island,Stage,Clutch Completion,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo)
0,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,39.1,18.7,181.0,3750.0,MALE,,
1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454
2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302
3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,,,,,,,
4,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426


In [25]:
pdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Species              344 non-null    object 
 1   Region               344 non-null    object 
 2   Island               344 non-null    object 
 3   Stage                344 non-null    object 
 4   Clutch Completion    344 non-null    object 
 5   Culmen Length (mm)   342 non-null    float64
 6   Culmen Depth (mm)    342 non-null    float64
 7   Flipper Length (mm)  342 non-null    float64
 8   Body Mass (g)        342 non-null    float64
 9   Sex                  334 non-null    object 
 10  Delta 15 N (o/oo)    330 non-null    float64
 11  Delta 13 C (o/oo)    331 non-null    float64
dtypes: float64(6), object(6)
memory usage: 32.4+ KB


In [26]:
#drop nan values
data = pdata.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 325 entries, 1 to 343
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Species              325 non-null    object 
 1   Region               325 non-null    object 
 2   Island               325 non-null    object 
 3   Stage                325 non-null    object 
 4   Clutch Completion    325 non-null    object 
 5   Culmen Length (mm)   325 non-null    float64
 6   Culmen Depth (mm)    325 non-null    float64
 7   Flipper Length (mm)  325 non-null    float64
 8   Body Mass (g)        325 non-null    float64
 9   Sex                  325 non-null    object 
 10  Delta 15 N (o/oo)    325 non-null    float64
 11  Delta 13 C (o/oo)    325 non-null    float64
dtypes: float64(6), object(6)
memory usage: 33.0+ KB


In [27]:
data.head()

Unnamed: 0,Species,Region,Island,Stage,Clutch Completion,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo)
1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454
2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302
4,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426
5,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,39.3,20.6,190.0,3650.0,MALE,8.66496,-25.29805
6,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",No,38.9,17.8,181.0,3625.0,FEMALE,9.18718,-25.21799


In [52]:
for index, row in data.iterrows():
    print("row : " + str(row))
    print("index : " + str(index))

row : Species                Adelie Penguin (Pygoscelis adeliae)
Region                                              Anvers
Island                                           Torgersen
Stage                                   Adult, 1 Egg Stage
Clutch Completion                                      Yes
Culmen Length (mm)                                    39.5
Culmen Depth (mm)                                     17.4
Flipper Length (mm)                                    186
Body Mass (g)                                         3800
Sex                                                 FEMALE
Delta 15 N (o/oo)                                  8.94956
Delta 13 C (o/oo)                                 -24.6945
Name: 1, dtype: object
index : 1
row : Species                Adelie Penguin (Pygoscelis adeliae)
Region                                              Anvers
Island                                           Torgersen
Stage                                   Adult, 1 Egg Stage
Clutch Comp

Name: 124, dtype: object
index : 124
row : Species                Adelie Penguin (Pygoscelis adeliae)
Region                                              Anvers
Island                                           Torgersen
Stage                                   Adult, 1 Egg Stage
Clutch Completion                                      Yes
Culmen Length (mm)                                    40.6
Culmen Depth (mm)                                       19
Flipper Length (mm)                                    199
Body Mass (g)                                         4000
Sex                                                   MALE
Delta 15 N (o/oo)                                  9.22537
Delta 13 C (o/oo)                                 -25.6083
Name: 125, dtype: object
index : 125
row : Species                Adelie Penguin (Pygoscelis adeliae)
Region                                              Anvers
Island                                           Torgersen
Stage                        

Name: 182, dtype: object
index : 182
row : Species                Chinstrap penguin (Pygoscelis antarctica)
Region                                                    Anvers
Island                                                     Dream
Stage                                         Adult, 1 Egg Stage
Clutch Completion                                            Yes
Culmen Length (mm)                                          54.2
Culmen Depth (mm)                                           20.8
Flipper Length (mm)                                          201
Body Mass (g)                                               4300
Sex                                                         MALE
Delta 15 N (o/oo)                                        9.49283
Delta 13 C (o/oo)                                          -24.6
Name: 183, dtype: object
index : 183
row : Species                Chinstrap penguin (Pygoscelis antarctica)
Region                                                    Anvers
Isla

Name: 303, dtype: object
index : 303
row : Species                Gentoo penguin (Pygoscelis papua)
Region                                            Anvers
Island                                            Biscoe
Stage                                 Adult, 1 Egg Stage
Clutch Completion                                    Yes
Culmen Length (mm)                                  44.9
Culmen Depth (mm)                                   13.8
Flipper Length (mm)                                  212
Body Mass (g)                                       4750
Sex                                               FEMALE
Delta 15 N (o/oo)                                8.11238
Delta 13 C (o/oo)                               -26.2037
Name: 304, dtype: object
index : 304
row : Species                Gentoo penguin (Pygoscelis papua)
Region                                            Anvers
Island                                            Biscoe
Stage                                 Adult, 1 Egg Stage
Cl

In [54]:
for col in data:
    print(data[col])

1      Adelie Penguin (Pygoscelis adeliae)
2      Adelie Penguin (Pygoscelis adeliae)
4      Adelie Penguin (Pygoscelis adeliae)
5      Adelie Penguin (Pygoscelis adeliae)
6      Adelie Penguin (Pygoscelis adeliae)
                      ...                 
338      Gentoo penguin (Pygoscelis papua)
340      Gentoo penguin (Pygoscelis papua)
341      Gentoo penguin (Pygoscelis papua)
342      Gentoo penguin (Pygoscelis papua)
343      Gentoo penguin (Pygoscelis papua)
Name: Species, Length: 325, dtype: object
1      Anvers
2      Anvers
4      Anvers
5      Anvers
6      Anvers
        ...  
338    Anvers
340    Anvers
341    Anvers
342    Anvers
343    Anvers
Name: Region, Length: 325, dtype: object
1      Torgersen
2      Torgersen
4      Torgersen
5      Torgersen
6      Torgersen
         ...    
338       Biscoe
340       Biscoe
341       Biscoe
342       Biscoe
343       Biscoe
Name: Island, Length: 325, dtype: object
1      Adult, 1 Egg Stage
2      Adult, 1 Egg Stage
4      Adu

In [56]:
myData = data
myData

'Sp'

Unnamed: 0,Species,Region,Island,Stage,Clutch Completion,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo)
1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454
2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302
4,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426
5,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,39.3,20.6,190.0,3650.0,MALE,8.66496,-25.29805
6,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",No,38.9,17.8,181.0,3625.0,FEMALE,9.18718,-25.21799
...,...,...,...,...,...,...,...,...,...,...,...,...
338,Gentoo penguin (Pygoscelis papua),Anvers,Biscoe,"Adult, 1 Egg Stage",No,47.2,13.7,214.0,4925.0,FEMALE,7.99184,-26.20538
340,Gentoo penguin (Pygoscelis papua),Anvers,Biscoe,"Adult, 1 Egg Stage",Yes,46.8,14.3,215.0,4850.0,FEMALE,8.41151,-26.13832
341,Gentoo penguin (Pygoscelis papua),Anvers,Biscoe,"Adult, 1 Egg Stage",Yes,50.4,15.7,222.0,5750.0,MALE,8.30166,-26.04117
342,Gentoo penguin (Pygoscelis papua),Anvers,Biscoe,"Adult, 1 Egg Stage",Yes,45.2,14.8,212.0,5200.0,FEMALE,8.24246,-26.11969


In [62]:
#transforming cols that are not floats and ints
le = preprocessing.LabelEncoder()

#species 
myData['Species'] = le.fit_transform(myData['Species'])
#regions
myData['Region'] = le.fit_transform(myData['Region'])
#island
myData['Island'] = le.fit_transform(myData['Island'])
#stage
myData['Stage'] = le.fit_transform(myData['Stage'])
#clutch competition
myData['Clutch Completion'] = le.fit_transform(myData['Clutch Completion'])
#sex
myData['Sex'] = le.fit_transform(myData['Sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  myData['Species'] = le.fit_transform(myData['Species'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  myData['Region'] = le.fit_transform(myData['Region'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  myData['Island'] = le.fit_transform(myData['Island'])
A value is trying to be set on a copy of 

In [63]:
myData

Unnamed: 0,Species,Region,Island,Stage,Clutch Completion,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo)
1,0,0,2,0,1,39.5,17.4,186.0,3800.0,1,8.94956,-24.69454
2,0,0,2,0,1,40.3,18.0,195.0,3250.0,1,8.36821,-25.33302
4,0,0,2,0,1,36.7,19.3,193.0,3450.0,1,8.76651,-25.32426
5,0,0,2,0,1,39.3,20.6,190.0,3650.0,2,8.66496,-25.29805
6,0,0,2,0,0,38.9,17.8,181.0,3625.0,1,9.18718,-25.21799
...,...,...,...,...,...,...,...,...,...,...,...,...
338,2,0,0,0,0,47.2,13.7,214.0,4925.0,1,7.99184,-26.20538
340,2,0,0,0,1,46.8,14.3,215.0,4850.0,1,8.41151,-26.13832
341,2,0,0,0,1,50.4,15.7,222.0,5750.0,2,8.30166,-26.04117
342,2,0,0,0,1,45.2,14.8,212.0,5200.0,1,8.24246,-26.11969


In [69]:
x_slice = myData[['Region', 'Island', 'Stage', 'Clutch Completion', 'Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)', 'Body Mass (g)', 'Sex', 'Delta 15 N (o/oo)', 'Delta 13 C (o/oo)']]
y_slice = myData[['Species']]

KeyError: ('Region', 'Island', 'Stage', 'Clutch Completion', 'Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)', 'Body Mass (g)', 'Sex', 'Delta 15 N (o/oo)', 'Delta 13 C (o/oo)')

In [68]:
x_slice

['Region',
 'Island',
 'Stage',
 'Clutch Completion',
 'Culmen Length (mm)',
 'Culmen Depth (mm)',
 'Flipper Length (mm)',
 'Body Mass (g)',
 'Sex',
 'Delta 15 N (o/oo)',
 'Delta 13 C (o/oo)']

In [67]:
x_train, x_test, y_train, y_test = train_test_split(x_slice, y_slice, test_size = 0.2)

ValueError: Found input variables with inconsistent numbers of samples: [11, 1]