This is a simple classification task. With the default labels, the results are poor. To improve accuracy, the data is treated as a 3-category classification problem, as suggested in the dataset documentation.

Here the labels are binned into categories and the problem is solved in two ways:
* KBinsDiscretizer / LogisticRegression
* custom transformation / Keras MLP 

ref: http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.names

In [1]:
from pandas import read_csv
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np

In [2]:
# Abalone data file from the UCI repository; it is read without a header row,
# so the columns get integer names.
ABALONE_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
df = read_csv(ABALONE_URL, header=None)

In [3]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [4]:
# Inspect the column dtypes to see which columns need encoding before modelling.
df.dtypes

0     object
1    float64
2    float64
3    float64
4    float64
5    float64
6    float64
7    float64
8      int64
dtype: object

In [5]:
# Split the raw table into features (columns 0-7) and the integer target (column 8).
array = df.values
X = array[:, 0:8]
y = array[:, 8].astype('int')

# Ordinal-encode the string-valued column 0; the remaining feature columns
# pass through unchanged.
ct = ColumnTransformer(
    [('c1', OrdinalEncoder(), [0])],
    remainder='passthrough'
)

# `array` is an object array because column 0 holds strings, and the
# passthrough columns keep that dtype. Cast the encoded matrix to float so
# every downstream estimator receives a plain numeric array.
enc_X = ct.fit_transform(X).astype('float64')

From here on, `enc_X` and `y` are treated as read-only source data.

## KBinsDiscretizer / LogisticRegression

In [6]:
# Discretize the target into three equal-width bins, encoded as ordinals.
kbin = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
y_column = y.reshape(-1, 1)  # the discretizer is fed a 2-D (n_samples, 1) array
kbin_y = np.squeeze(kbin.fit_transform(y_column)).astype('int')

In [7]:
# Class labels and how many samples fall into each uniform-width bin.
np.unique(kbin_y, return_counts=True)

(array([0, 1, 2]), array([2730, 1385,   62], dtype=int64))

In [8]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    enc_X,
    kbin_y,
    test_size=0.2,
    random_state=4,
)

In [9]:
# Fit a liblinear logistic regression on the training split (fit returns the
# estimator itself), then label the held-out split.
model = LogisticRegression(solver='liblinear', multi_class='auto').fit(train_X, train_y)
pred_y = model.predict(test_X)

In [10]:
# NOTE(review): labels=[0, 1] restricts the report to classes 0 and 1, so
# class 2 is left out of the per-class rows and the averages - confirm this
# omission is intended.
report = classification_report(test_y, pred_y, labels=[0, 1])
print(report)

              precision    recall  f1-score   support

           0       0.79      0.87      0.83       565
           1       0.61      0.50      0.55       260

   micro avg       0.74      0.75      0.75       825
   macro avg       0.70      0.69      0.69       825
weighted avg       0.73      0.75      0.74       825



## custom transformation / Keras MLP

In [11]:
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import Dense
from matplotlib import pyplot as plt

Using TensorFlow backend.


In [12]:
def cbin_fnc(x):
    """Bucket a raw target value into one of three classes.

    Returns 0 when ``x < 9``, 1 when ``x < 11`` (and not below 9),
    and 2 for everything else.
    """
    if x < 9:
        return 0
    return 1 if x < 11 else 2

# Wrap the scalar binning function so it applies element-wise to an array,
# then bin the whole target vector with it.
v_cbin_fnc = np.vectorize(cbin_fnc)
cbin_y = v_cbin_fnc(y)

In [13]:
# Class labels and sample counts for the custom bins.
np.unique(cbin_y, return_counts=True)

(array([0, 1, 2]), array([1407, 1323, 1447], dtype=int64))

In [14]:
# Same 80/20 split and seed as before, this time using the custom-binned labels.
train_X, test_X, train_y, test_y = train_test_split(
    enc_X,
    cbin_y,
    test_size=0.2,
    random_state=4,
)

In [15]:
# One-hot encode the integer class labels (0, 1, 2) for categorical
# cross-entropy. Indexing an identity matrix by label yields the one-hot rows
# in plain NumPy, which avoids OneHotEncoder's `sparse` argument (renamed in
# newer scikit-learn releases) and building the encoder twice.
one_hot = np.eye(3)
train_y_onehot = one_hot[train_y]
test_y_onehot = one_hot[test_y]

# Small MLP: 8 input features -> 25 -> 25 -> 3-way softmax.
model = Sequential()
model.add(Dense(25, input_dim=8, activation='relu'))
model.add(Dense(25, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, and no random
# seed is set, so scores will vary between runs.
history = model.fit(
    train_X, train_y_onehot,
    validation_data=(test_X, test_y_onehot),
    epochs=150, verbose=0,
)

# `Sequential.predict_classes` is not available in recent Keras releases;
# taking the argmax over the softmax outputs gives the same class indices.
pred_y = np.argmax(model.predict(test_X, verbose=0), axis=1)

In [16]:
# Per-class precision / recall / F1 for the MLP on the held-out split.
report = classification_report(test_y, pred_y)
print(report)

              precision    recall  f1-score   support

           0       0.76      0.70      0.73       286
           1       0.53      0.40      0.46       279
           2       0.58      0.78      0.67       271

    accuracy                           0.63       836
   macro avg       0.62      0.63      0.62       836
weighted avg       0.63      0.63      0.62       836

