In [97]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, model_selection, preprocessing, metrics

# Objetivos

Tentaremos criar e otimizar alguns modelos de regressão para a variável 'hdlngth' e um classificador para 'Pop' e 'sex'.

In [98]:
# Load the possum measurements and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='data/possum.csv')
df.head(n=5)

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


# Separação dos dados

In [99]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
df_train, df_test = model_selection.train_test_split(
    df,
    random_state=42,
    test_size=0.25,
)

# Processando os dados

Dados categóricos serão transformados em numéricos, e aplicaremos algumas normalizações para ajustar as escalas.

## Dados categóricos

In [105]:
# Fit a one-hot encoder on the categorical columns of the training split and
# show the encoded result as a dense array.
oh_enc = preprocessing.OneHotEncoder()
oh_enc.fit_transform(df_train.loc[:, ['Pop', 'sex']]).toarray()

array([[1., 0., 0., 1.],
       [1., 0., 1., 0.],
       [1., 0., 1., 0.],
       [1., 0., 1., 0.],
       [0., 1., 1., 0.],
       [1., 0., 1., 0.],
       [0., 1., 1., 0.],
       [0., 1., 0., 1.],
       [1., 0., 1., 0.],
       [0., 1., 0., 1.],
       [1., 0., 1., 0.],
       [0., 1., 1., 0.],
       [0., 1., 1., 0.],
       [1., 0., 1., 0.],
       [1., 0., 0., 1.],
       [1., 0., 1., 0.],
       [0., 1., 0., 1.],
       [1., 0., 0., 1.],
       [1., 0., 1., 0.],
       [1., 0., 0., 1.],
       [0., 1., 0., 1.],
       [0., 1., 1., 0.],
       [1., 0., 0., 1.],
       [1., 0., 1., 0.],
       [0., 1., 0., 1.],
       [1., 0., 0., 1.],
       [0., 1., 1., 0.],
       [0., 1., 1., 0.],
       [1., 0., 0., 1.],
       [1., 0., 0., 1.],
       [1., 0., 1., 0.],
       [1., 0., 0., 1.],
       [1., 0., 1., 0.],
       [1., 0., 1., 0.],
       [0., 1., 1., 0.],
       [1., 0., 0., 1.],
       [0., 1., 0., 1.],
       [1., 0., 1., 0.],
       [0., 1., 0., 1.],
       [0., 1., 0., 1.],


In [107]:
# List the output column names produced by the fitted encoder.
oh_enc.get_feature_names_out()

array(['Pop_Vic', 'Pop_other', 'sex_f', 'sex_m'], dtype=object)

In [109]:
# Preview the one-hot encoding as a labelled frame that shares the row index
# of the training split.
pd.DataFrame(
    data=oh_enc.fit_transform(df_train[['Pop', 'sex']]).toarray(),
    index=df_train.index,
    columns=oh_enc.get_feature_names_out(),
)

Unnamed: 0,Pop_Vic,Pop_other,sex_f,sex_m
15,1.0,0.0,0.0,1.0
42,1.0,0.0,1.0,0.0
40,1.0,0.0,1.0,0.0
9,1.0,0.0,1.0,0.0
85,0.0,1.0,1.0,0.0
...,...,...,...,...
71,0.0,1.0,0.0,1.0
14,1.0,0.0,0.0,1.0
92,0.0,1.0,0.0,1.0
51,0.0,1.0,0.0,1.0


In [92]:
# Replace the categorical columns of the training split with their one-hot
# encoding. The guard makes the cell safe to re-run: once 'Pop' and 'sex' have
# been dropped, a second execution would otherwise raise a KeyError when
# selecting them.
categorical_cols = ['Pop', 'sex']
if set(categorical_cols).issubset(df_train.columns):
    # The encoder is fitted on the training split; it also names the new columns.
    onehot_train = pd.DataFrame(
        oh_enc.fit_transform(df_train[categorical_cols]).toarray(),
        columns=oh_enc.get_feature_names_out(),
        index=df_train.index,  # same row labels, so concat aligns row by row
    )
    # drop() returns a new frame instead of mutating df_train in place; the
    # encoded columns are appended after the remaining original columns.
    df_train = pd.concat(
        [df_train.drop(columns=categorical_cols), onehot_train],
        axis=1,
    )

In [93]:
# Display the training frame, whose 'Pop' and 'sex' columns have been replaced
# by their one-hot encoded columns.
df_train

Unnamed: 0,case,site,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly,Pop_Vic,Pop_other,sex_f,sex_m
15,16,1,4.0,91.6,56.0,86.0,34.5,73.0,51.4,14.4,28.0,32.0,1.0,0.0,0.0,1.0
42,43,2,2.0,90.0,55.5,81.0,32.0,72.0,49.4,13.4,29.0,31.0,1.0,0.0,1.0,0.0
40,41,2,5.0,88.4,57.0,83.0,36.5,,40.3,15.9,27.0,30.5,1.0,0.0,1.0,0.0
9,10,1,6.0,91.8,58.0,89.5,37.5,70.9,53.4,14.4,27.5,32.0,1.0,0.0,1.0,0.0
85,86,6,3.0,88.2,53.2,86.5,38.5,60.3,43.7,13.6,26.0,31.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,72,5,1.0,85.9,52.4,80.5,35.0,62.0,42.4,14.1,25.5,30.0,0.0,1.0,0.0,1.0
14,15,1,5.0,92.9,57.6,85.5,34.0,69.7,51.8,15.7,28.0,35.0,1.0,0.0,0.0,1.0
92,93,7,3.0,89.2,54.0,82.0,38.0,63.8,44.9,12.8,24.0,31.0,0.0,1.0,0.0,1.0
51,52,3,6.0,97.6,61.0,93.5,40.0,67.9,44.3,15.8,28.5,32.5,0.0,1.0,0.0,1.0
