In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [6]:
# Read the raw survey records from the project's data directory
surveys_csv = "../data/raw/surveys.csv"
surveys_df = pd.read_csv(surveys_csv)

In [7]:
# Peek at the first two rows to check which columns were loaded.
surveys_df.head(2)

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,
1,2,7,16,1977,3,NL,M,33.0,


In [8]:
# (rows, columns) of the frame as loaded, before any cleaning.
surveys_df.shape

(35549, 9)

In [9]:
# Drop every row that has a missing value in any column.
# NOTE(review): this rebinds `surveys_df`, so the unfiltered frame is no
# longer available after this cell; re-run the load cell to get it back.
surveys_df = surveys_df.dropna()

In [10]:
# (rows, columns) after dropping incomplete rows, to see how many were removed.
surveys_df.shape

(30676, 9)

In [11]:
# Feature matrix: the two body measurements used to predict an animal's sex
feature_columns = ['hindfoot_length', 'weight']
X = surveys_df[feature_columns]

In [12]:
# Check that X holds only the two selected feature columns.
X.head(2)

Unnamed: 0,hindfoot_length,weight
62,35.0,40.0
63,37.0,48.0


In [13]:
# Target vector: the `sex` column, taken from the same rows as X.
y = surveys_df['sex']

In [14]:
# The labels are strings; the index matches the rows shown for X.
y.head(2)

62    M
63    M
Name: sex, dtype: object

In [15]:
# A random forest is built with randomness (bootstrap samples and feature
# choices). Fix the seed so the fitted model, and the accuracy computed from
# it later, are the same on every Restart & Run All.
clf = RandomForestClassifier(random_state=42)

In [16]:
# (samples, features) that will be passed to fit().
X.shape

(30676, 2)

In [17]:
# Train on the full cleaned dataset; no rows are held out for evaluation.
clf.fit(X, y)

RandomForestClassifier()

In [18]:
# Predict on the same rows the model was fitted on, so these are
# training-set predictions.
predictions = clf.predict(X)

In [19]:
# Show the first two feature rows again, to compare with the predictions.
X.head(2)

Unnamed: 0,hindfoot_length,weight
62,35.0,40.0
63,37.0,48.0


In [20]:
# Predicted labels, one per row of X (the display is truncated).
predictions

array(['M', 'M', 'F', ..., 'M', 'F', 'M'], dtype=object)

In [21]:
# True labels, for an eyeball comparison against the predictions.
y

62       M
63       M
64       F
65       F
66       M
        ..
35540    F
35541    F
35542    F
35546    F
35547    M
Name: sex, Length: 30676, dtype: object

In [22]:
from sklearn.metrics import accuracy_score

In [23]:
# Fraction of rows where the predicted label equals the true label.
# NOTE: the predictions were made on the rows used for fitting, so this is
# training accuracy, not an estimate of performance on unseen data.
accuracy_score(y, predictions)

0.6406637110444647

#### Challenge - Try out another classifier
* Pick another classifier from https://scikit-learn.org/stable/supervised_learning.html
* Train this classifier on the data in the same way
* (Optional) Try out setting different hyperparameters for the classifier; look at the documentation of the classifier you picked. HINT: for the `RandomForestClassifier` you can set the number of trees with `clf = RandomForestClassifier(n_estimators=200)`

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html#sklearn.linear_model.RidgeClassifier

In [24]:
from sklearn import ensemble

In [25]:
ensemble # A module (subpackage) of the sklearn package
# Consists of functions and classes

<module 'sklearn.ensemble' from '/Users/svenvanderburg/opt/anaconda3/envs/teaching/lib/python3.7/site-packages/sklearn/ensemble/__init__.py'>

In [27]:
# Challenge solution: swap in a gradient-boosting classifier, reached through
# the imported `ensemble` module. This rebinds `clf`, replacing the earlier model.
clf = ensemble.GradientBoostingClassifier()

In [28]:
# Train the new classifier on the same X and y as before.
clf.fit(X, y)

GradientBoostingClassifier()

In [29]:
# Overwrite `predictions` with the new model's output on the training rows.
predictions = clf.predict(X)

In [30]:
# Training accuracy of the gradient-boosting model (same rows as used in fit).
accuracy_score(y, predictions)

0.6097600730212545

In [31]:
# Option 1: import the submodule and reach the class through it.
from sklearn import linear_model
linear_model.Lasso()

In [32]:
# Option 2: import the class directly so it can be called by its bare name.
from sklearn.linear_model import Lasso
Lasso()

Lasso()

In [33]:
# NOTE(review): Lasso is a linear regression model, not a classifier, yet it
# is bound to the name `clf` used for the classifiers above.
clf = Lasso()

In [35]:
# The target still holds the string labels 'M' / 'F'.
y

62       M
63       M
64       F
65       F
66       M
        ..
35540    F
35541    F
35542    F
35546    F
35547    M
Name: sex, Length: 30676, dtype: object

In [34]:
# Lasso needs numeric targets, so fitting on the string labels in `y` fails.
# The error is the point of this cell: catch it and show the message so the
# notebook still runs top to bottom instead of stopping here.
try:
    clf.fit(X, y)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: could not convert string to float: 'M'

In [36]:
from sklearn.preprocessing import LabelBinarizer

In [37]:
# Encoder that will turn the string labels into numeric values.
binarizer = LabelBinarizer()

In [42]:
# Learn the label set from y and encode it numerically in one step; the
# result replaces the string target that Lasso could not handle.
new_y = binarizer.fit_transform(y)

In [44]:
# With a numeric target the fit now succeeds.
# NOTE(review): Lasso is a regressor, so presumably its predictions are
# continuous values rather than class labels — confirm before scoring it
# with accuracy_score.
clf.fit(X, new_y)

Lasso()

In [39]:
# Binarize a column with many categories: the result is a 0/1 matrix.
# Note that `binarizer` is rebound here to a fresh encoder fitted on species.
species_column = surveys_df['species_id']
binarizer = LabelBinarizer()
binarizer.fit_transform(species_column)

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [41]:
# The raw species codes, for comparison with the binarized matrix.
surveys_df['species_id']

62       DM
63       DM
64       DM
65       DM
66       DM
         ..
35540    PB
35541    PB
35542    PB
35546    RM
35547    DO
Name: species_id, Length: 30676, dtype: object

In [None]:
# Scratch note, kept as a comment because it is not valid Python:
#   DM = 1 PB = 2 [1, 1, 2, , 3, 45,1,]
# NOTE(review): this looks like a sketch of encoding each species_id as an
# integer code (DM -> 1, PB -> 2, ...) instead of one column per species —
# confirm the intent or delete the cell.