In [1]:
import pandas as pd

https://archive.ics.uci.edu/ml/datasets/Abalone

In [2]:
columns = ['Sex', 'Length', 'Diameter', 'Height', 'Whole_Weight', 
           'Shucked_Weight', 'Viscera_Weight', 'Shell_Weight', 'Rings']
df = pd.read_csv('abalone.data', names=columns)

In [3]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_Weight,Shucked_Weight,Viscera_Weight,Shell_Weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [4]:
label = 'Rings'
categorical = ['Sex']
numerical = ['Length', 'Diameter', 'Height', 'Whole_Weight', 
             'Shucked_Weight', 'Viscera_Weight', 'Shell_Weight']

In [5]:
(df[label] >= 10).sum(), (df[label] < 10).sum()

(2081, 2096)

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X = df.drop(labels=label, axis=1)
y = (df[label] < 10)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y,
                                                    random_state=42)

A biblioteca sklearn.base possui ferramentas para criarmos nossos proprios transformadores e estimadores. Usaremos ela para criar um transformador que separa o nosso dataset apenas nas colunas que desejamos usar. Assim, podemos criar um dataframe apenas com dados categóricos e outra apenas com dados numéricos.

In [8]:
from sklearn.base import TransformerMixin

In [9]:
class DataframeSelector(TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        assert isinstance(X, pd.DataFrame)
        try:
            return pd.DataFrame(data=X[self.columns].values, columns=self.columns)
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("The DataFrame does not include the columns: %s" % cols_error)

Com essa separação, criaremos pipelines para tratar de cada tipo de dado separadamente

In [10]:
num_df = DataframeSelector(numerical)
cat_df = DataframeSelector(categorical)

Tratamos os dados faltantes de cada um dos dataframes, aplicando metodos de preenchimento diferentes

In [11]:
from sklearn.impute import SimpleImputer
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

Como os dados numéricos e categóricos estão separados, podemos aplicar a normalização em um dataframe, e a codificação dos dados categóricos em outro

In [12]:
from sklearn.preprocessing import StandardScaler
num_scaler = StandardScaler()

In [13]:
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder(sparse=False)

Com os transformadores criados, criamos as pipelines. Numa pipeline, os transformadores são aplicados na ordem passada, com cada um passando os dados transformados por si para o próximo.

In [14]:
from sklearn.pipeline import make_pipeline
num_pipeline = make_pipeline(num_df, num_imputer, num_scaler)
cat_pipeline = make_pipeline(cat_df, cat_imputer, cat_encoder)

Como dividimos o nosso dataset, precisamos uni-lo. Para isso podemos usar o make_union. Esse transformador aplica cada transformador que foi passado ao dataset, e ao fim realiza a união dos resultados de cada um.

In [15]:
from sklearn.pipeline import make_union
union = make_union(num_pipeline, cat_pipeline)

In [16]:
X_train_transformed = union.fit_transform(X_train)

In [17]:
X_train_transformed

array([[-0.61286535, -0.5794474 ,  0.1227306 , ...,  1.        ,
         0.        ,  0.        ],
       [ 1.21496297,  1.17862016,  0.94211414, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.59183968,  0.57585414,  0.1227306 , ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.77903156, -0.67990841, -3.27185836, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.88263055,  0.67631514,  0.82505935, ...,  1.        ,
         0.        ,  0.        ],
       [-1.19444709, -1.08175242, -1.16487211, ...,  0.        ,
         1.        ,  0.        ]])

In [18]:
X_test_transformed = union.transform(X_test)

In [19]:
X_test_transformed

array([[ 0.34259036,  0.32470163,  0.82505935, ...,  0.        ,
         0.        ,  1.        ],
       [-1.02828088, -0.93106092, -0.93076252, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.841089  ,  0.62608464,  0.1227306 , ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.58883695,  1.78138619,  2.11266206, ...,  1.        ,
         0.        ,  0.        ],
       [-1.27753019, -1.33290493, -1.16487211, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.01025794,  0.22424063, -0.34548857, ...,  1.        ,
         0.        ,  0.        ]])

Temos por fim nossos dados de treino e test transformados. Podemos agora aplicar os dados a um estimador.

In [20]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()

In [21]:
clf.fit(X_train_transformed, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [22]:
clf.score(X_test_transformed, y_test)

0.7930622009569378

Produzido por: Matheus Xavier