In [87]:
# Standard libs
import numpy as np
import pandas as pd

# Debugging
from icecream import ic

# Predictive modelling
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, BaggingClassifier, BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVR, SVC

# NN functions
from keras.layers import Dense, Dropout, BatchNormalization
from keras.models import Sequential

# Metrics
from sklearn.metrics import accuracy_score, confusion_matrix
# CV ; splitting
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

We can actually try to predict every numeric and categorical feature in this dataset, but for now let's do classification on possum population class `Pop`.

In [2]:
d = pd.read_csv('possum.csv')

In [3]:
d.head()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [4]:
d.isna().sum()

case        0
site        0
Pop         0
sex         0
age         2
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    1
earconch    0
eye         0
chest       0
belly       0
dtype: int64

**Preprocessing**

In [5]:
d['age'] = d['age'].fillna(d['age'].median())
d['footlgth'] = d['footlgth'].fillna(d['footlgth'].mean())

In [7]:
d.isna().sum()

case        0
site        0
Pop         0
sex         0
age         0
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    0
earconch    0
eye         0
chest       0
belly       0
dtype: int64

In [8]:
# One-hot encode categorical variables and concatenate with dataframe
d = pd.concat([pd.get_dummies(d['site'], drop_first=True), 
               pd.get_dummies(d['Pop'], drop_first=True),
               pd.get_dummies(d['sex'], drop_first=True),
              d.drop(['site','Pop','sex', 'case'], axis = 1)], axis = 1)

In [9]:
# Convert age to integer value for categorization
d = d.astype({'age': int})

In [10]:
# Create models
# For later...
# regressors = [LinearRegression(), 
#               DecisionTreeRegressor(), 
#               RandomForestRegressor(), 
#               BaggingRegressor(), 
#               AdaBoostRegressor(), 
#               GradientBoostingRegressor(), 
#               KNeighborsRegressor()]


# classifiers = [LogisticRegression(), 
#                DecisionTreeClassifier(), 
#                RandomForestClassifier(), 
#                BaggingClassifier(),
#                AdaBoostClassifier(), 
#                GradientBoostingClassifier(), 
#                KNeighborsClassifier()]

In [11]:
d.head()

Unnamed: 0,2,3,4,5,6,7,other,m,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,0,0,0,0,0,0,0,1,8,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,0,0,0,0,0,0,0,0,6,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,0,0,0,0,0,0,0,0,6,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,0,0,0,0,0,0,0,0,6,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,0,0,0,0,0,0,0,0,2,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [12]:
X = d.drop('other', axis = 1)
y = d['other']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15)

In [14]:
ic(X_train.head()); ic(X_test.head())

ic| X_train.head():     2  3  4  5  6  7  m  age  hdlngth  skullw  totlngth  taill  footlgth  \
                    71  0  0  0  1  0  0  1    1     85.9    52.4      80.5   35.0      62.0   
                    73  0  0  0  0  1  0  0    4     88.7    52.0      83.0   38.0      61.5   
                    77  0  0  0  0  1  0  1    1     86.5    51.0      81.0   36.5      63.0   
                    26  0  0  0  0  0  0  0    2     90.5    54.5      85.0   35.0      70.3   
                    11  0  0  0  0  0  0  0    5     94.9    55.6      92.0   35.5      71.7   
                    
                        earconch   eye  chest  belly  
                    71      42.4  14.1   25.5   30.0  
                    73      45.9  14.7   26.0   34.0  
                    77      44.3  13.2   23.0   28.0  
                    26      50.8  14.2   23.0   28.0  
                    11      51.0  15.3   28.0   33.0  
ic| X_test.head():     2  3  4  5  6  7  m  age  hdlngth  skullw  totlngt

Unnamed: 0,2,3,4,5,6,7,m,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
64,0,0,0,1,0,0,0,5,93.5,57.4,88.5,38.0,68.2,41.7,14.0,29.0,38.5
36,1,0,0,0,0,0,0,2,89.3,54.8,82.5,35.0,71.2,52.0,13.6,28.0,31.5
15,0,0,0,0,0,0,1,4,91.6,56.0,86.0,34.5,73.0,51.4,14.4,28.0,32.0
31,0,0,0,0,0,0,0,4,94.3,56.7,94.0,39.0,74.8,52.0,14.9,28.0,34.0
43,1,0,0,0,0,0,1,3,85.1,51.5,76.0,35.5,70.3,52.6,14.4,23.0,27.0


In [32]:
# Predict where the population came from
lrc = LogisticRegression(max_iter=1000)
# Site has numeric dtype names and forces using .values attr
lrc.fit(X_train.values, y_train.values)

In [33]:
lrc.predict(X_test[:3].values)

array([1, 0, 0], dtype=uint8)

In [34]:
accuracy_score(lrc.predict(X_test[:3].values), y_test[:3].values)

1.0

In [35]:
accuracy_score(lrc.predict(X_test.values), y_test.values)

0.9375

In [36]:
confusion_matrix(lrc.predict(X_test.values), y_test.values)

array([[9, 1],
       [0, 6]], dtype=int64)

The test set size is quite small due to having only 104 possums for our sample. We can use cross validation to get a better understanding of how our model is performing.

In [39]:
cv = cross_val_score(LogisticRegression(max_iter=1000), X_train.values, y_train.values, cv = 5);
ic(cv)
cv = cross_val_score(LogisticRegression(max_iter=1000), X_train.values, y_train.values, cv = 15);
ic(cv)


ic| cv: array([1.        , 1.        , 1.        , 1.        , 0.94117647])
ic| cv: array([1.        , 1.        , 1.        , 1.        , 1.        ,
               1.        , 1.        , 1.        , 1.        , 1.        ,
               1.        , 1.        , 0.83333333, 1.        , 1.        ])


array([1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 0.83333333, 1.        , 1.        ])

The classifier is working pretty well. We should expect a poorer performance on at least one of the splits due to the sample size.

In [40]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train.values, y_train.values)

In [45]:
# Just to show the unregularized tree will severly overfit the data
dtc.predict(X_test.values) == y_test.values

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [167]:
max_depth = 10 # Set up depth of heirarchy
min_samples_split = 10

dtc_params = {
    'criterion' : ['gini', 'entropy', 'log_loss'], # Expect similar trees with all methods
    'splitter' : ['best', 'random'],               # Random should speed up training if the task was bigger
    'max_depth' : list(range(1, max_depth)),          # Prevent tree from becoming "pure" (overfit)
    'min_samples_split' : list(range(1, min_samples_split)), # Regularizing : prevents unnecessary computations
    'min_samples_leaf' : [1]                      # Regularizing
}

gs = GridSearchCV(DecisionTreeClassifier(), param_grid=dtc_params, verbose = 0)

In [168]:
gs.fit(X_train.values, y_train.values)

In [166]:
accuracy_score(gs.predict(X_test.values), y_test.values)

1.0

The accuracy is 100%, but on multiple runs we see a drop... < 10%. Again not surprising.

In [140]:
# store importances because the attr name is long
importances = gs.best_estimator_.feature_importances_.round(2)
col_names = X_train.columns # also longish

# zip features with its importance
feats = list(zip(importances, col_names))
ic(feats)

# sort by importance
feats.sort(key=lambda feat: feat[0])
# important features to the front
feats.reverse()

ic| feats: [(0.18, 2),
            (0.0, 3),
            (0.0, 4),
            (0.0, 5),
            (0.0, 6),
            (0.09, 7),
            (0.0, 'm'),
            (0.0, 'age'),
            (0.0, 'hdlngth'),
            (0.0, 'skullw'),
            (0.0, 'totlngth'),
            (0.0, 'taill'),
            (0.0, 'footlgth'),
            (0.74, 'earconch'),
            (0.0, 'eye'),
            (0.0, 'chest'),
            (0.0, 'belly')]


In [171]:
# Print important features
headers = ['feature', 'importance']
print('{:>15} {:>15}'.format(headers[0], headers[1]))
print('---------------------------------------------')

for (i, j) in feats:
    
    # If i is not 0
    if i:
        print(f'{j:>15}', f'{i:>15}')
        
    # At the first 0 occurence, break
    # bc everything else is 0
    else: 
        break

        feature      importance
---------------------------------------------
       earconch            0.74
              2            0.18
              7            0.09


What is moderately surprising is that the decision tree finds the possum's ear size as the best predictor of the possum's population. This is confirmed by choosing "best" for the `splitter` argument as it will choose `earconch` as the only parameter to determine the `Pop`.

In [69]:
# Function for testing

'''
1. Input kwargs ()
2. Do cross validation (cv)
3. write outputs (verbose)
4. df (T/F)
5. save

'''

'\n1. Input kwargs ()\n2. Do cross validation (cv)\n3. write outputs (verbose)\n4. df (T/F)\n5. save\n\n'