In [47]:
import numpy as np
import pandas as pd

from icecream import ic

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, BaggingClassifier, BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVR, SVC

from keras.layers import Dense, Dropout, BatchNormalization
from keras.models import Sequential

We can actually try to predict every numeric and categorical feature in this dataset.

In [34]:
# Load the possum measurements; the relative path is resolved against the
# notebook's working directory.
possum_csv_path = 'possum.csv'
d = pd.read_csv(possum_csv_path)

In [14]:
# Preview the first five rows of the loaded frame.
d.head(n=5)

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [15]:
# Count the missing values in each column.
d.isna().sum(axis=0)

case        0
site        0
Pop         0
sex         0
age         2
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    1
earconch    0
eye         0
chest       0
belly       0
dtype: int64

**Preprocessing**

In [35]:
# Impute the two columns that contain gaps: the median for age and the mean
# for footlgth. All other columns are left untouched.
d = d.fillna({'age': d['age'].median(),
              'footlgth': d['footlgth'].mean()})

In [36]:
# Re-count the missing values in each column after imputation.
d.isna().sum(axis=0)

case        0
site        0
Pop         0
sex         0
age         0
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    0
earconch    0
eye         0
chest       0
belly       0
dtype: int64

In [38]:
# One-hot encode the categorical columns, drop the `case` identifier, and put
# the dummy columns in front of the remaining measurements.
#
# Each dummy group is prefixed with its source column name. Without a prefix the
# `site` dummies are named with bare integers (2, 3, ...) while every other
# column name is a string; scikit-learn rejects frames whose column names mix
# int and str. The prefix also keeps `Pop_other` and `sex_m` self-describing.
categorical_columns = ['site', 'Pop', 'sex']
dummy_frames = [pd.get_dummies(d[column], prefix=column, drop_first=True)
                for column in categorical_columns]
d = pd.concat(dummy_frames + [d.drop(categorical_columns + ['case'], axis=1)],
              axis=1)

In [43]:
# Cast age to whole numbers so it can be treated as a categorical value.
d = d.assign(age=d['age'].astype(int))

In [48]:
# Create models
# One shared seed is passed to every estimator that takes `random_state`
# (trees and ensembles), so repeated runs of the notebook give the same fitted
# models and scores. The remaining estimators are built with their defaults.
RANDOM_STATE = 42

regressors = [LinearRegression(),
              DecisionTreeRegressor(random_state=RANDOM_STATE),
              RandomForestRegressor(random_state=RANDOM_STATE),
              BaggingRegressor(random_state=RANDOM_STATE),
              AdaBoostRegressor(random_state=RANDOM_STATE),
              GradientBoostingRegressor(random_state=RANDOM_STATE),
              KNeighborsRegressor()]


classifiers = [LogisticRegression(),
               DecisionTreeClassifier(random_state=RANDOM_STATE),
               RandomForestClassifier(random_state=RANDOM_STATE),
               BaggingClassifier(random_state=RANDOM_STATE),
               AdaBoostClassifier(random_state=RANDOM_STATE),
               GradientBoostingClassifier(random_state=RANDOM_STATE),
               KNeighborsClassifier()]

In [56]:
# Show the concrete class of a freshly constructed estimator.
# (The stray trailing `.` after the call was a SyntaxError and has been removed.)
type(LinearRegression())

sklearn.linear_model._base.LinearRegression

In [59]:
# Show the concrete class of a freshly constructed estimator.
LinearRegression().__class__

sklearn.linear_model._base.LinearRegression

In [None]:
# Scratch notes only: this cell holds a bare string literal and defines nothing.
# NOTE(review): the items read like a to-do list for a model-evaluation helper
# (keyword arguments, cross-validation, verbose output, a DataFrame flag,
# saving results) — confirm the intent before implementing.
'''
1. Input kwargs ()
2. Do cross validation (cv)
3. write outputs (verbose)
4. df (T/F)
5. save

'''