In [1]:
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
import numpy

In [2]:
# The dataset we will use in this tutorial is the Sonar dataset. 
# This is a dataset that describes sonar chirp returns bouncing off different surfaces. 
# The 60 input variables are the strength of the returns at different angles. 
# It is a binary classification problem that requires a model to differentiate rocks from metal cylinders.
# All of the variables are continuous and generally in the range of 0 to 1. 
# The output variable is a string M for mine and R for rock, which will need to be converted to integers 1 and 0. 

In [3]:
_seed = 7
numpy.random.seed(_seed)

In [4]:
_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data'

In [5]:
_dataframe = read_csv(_uri)

In [17]:
print('{}\nShape:\n{}\n{}'.format(100*'=',_dataframe.shape,100*'='))

Shape:
(207, 61)


In [7]:
_array = _dataframe.values

In [18]:
print('_array:\n{}\n{}'.format(_array[:5,:],100*'='))

_array:
[[0.0453 0.0523 0.0843 0.0689 0.1183 0.2583 0.2156 0.3481 0.3337 0.2872
  0.4918 0.6552 0.6919 0.7797 0.7464 0.9444 1.0 0.8874 0.8024 0.7818 0.5212
  0.4052 0.3957 0.3914 0.325 0.32 0.3271 0.2767 0.4423 0.2028 0.3788 0.2947
  0.1984 0.2341 0.1306 0.4182 0.3835 0.1057 0.184 0.197 0.1674 0.0583
  0.1401 0.1628 0.0621 0.0203 0.053 0.0742 0.0409 0.0061 0.0125 0.0084
  0.0089 0.0048 0.0094 0.0191 0.014 0.0049 0.0052 0.0044 'R']
 [0.0262 0.0582 0.1099 0.1083 0.0974 0.228 0.2431 0.3771 0.5598 0.6194
  0.6333 0.706 0.5544 0.532 0.6479 0.6931 0.6759 0.7551 0.8929 0.8619
  0.7974 0.6737 0.4293 0.3648 0.5331 0.2413 0.507 0.8533 0.6036 0.8514
  0.8512 0.5045 0.1862 0.2709 0.4232 0.3043 0.6116 0.6756 0.5375 0.4719
  0.4647 0.2587 0.2129 0.2222 0.2111 0.0176 0.1348 0.0744 0.013 0.0106
  0.0033 0.0232 0.0166 0.0095 0.018 0.0244 0.0316 0.0164 0.0095 0.0078 'R']
 [0.01 0.0171 0.0623 0.0205 0.0205 0.0368 0.1098 0.1276 0.0598 0.1264
  0.0881 0.1992 0.0184 0.2261 0.1729 0.2131 0.0693 0.2281 0.406 

In [9]:
_X = _array[:,0:60]

In [19]:
print('_X\n{}\n{}'.format(_X[:5,:],100*'='))

_X
[[0.0453 0.0523 0.0843 0.0689 0.1183 0.2583 0.2156 0.3481 0.3337 0.2872
  0.4918 0.6552 0.6919 0.7797 0.7464 0.9444 1.0 0.8874 0.8024 0.7818 0.5212
  0.4052 0.3957 0.3914 0.325 0.32 0.3271 0.2767 0.4423 0.2028 0.3788 0.2947
  0.1984 0.2341 0.1306 0.4182 0.3835 0.1057 0.184 0.197 0.1674 0.0583
  0.1401 0.1628 0.0621 0.0203 0.053 0.0742 0.0409 0.0061 0.0125 0.0084
  0.0089 0.0048 0.0094 0.0191 0.014 0.0049 0.0052 0.0044]
 [0.0262 0.0582 0.1099 0.1083 0.0974 0.228 0.2431 0.3771 0.5598 0.6194
  0.6333 0.706 0.5544 0.532 0.6479 0.6931 0.6759 0.7551 0.8929 0.8619
  0.7974 0.6737 0.4293 0.3648 0.5331 0.2413 0.507 0.8533 0.6036 0.8514
  0.8512 0.5045 0.1862 0.2709 0.4232 0.3043 0.6116 0.6756 0.5375 0.4719
  0.4647 0.2587 0.2129 0.2222 0.2111 0.0176 0.1348 0.0744 0.013 0.0106
  0.0033 0.0232 0.0166 0.0095 0.018 0.0244 0.0316 0.0164 0.0095 0.0078]
 [0.01 0.0171 0.0623 0.0205 0.0205 0.0368 0.1098 0.1276 0.0598 0.1264
  0.0881 0.1992 0.0184 0.2261 0.1729 0.2131 0.0693 0.2281 0.406 0.3973
  0.27

In [11]:
_Y = _array[:,60:]

In [20]:
print('_Y\n{}\n{}'.format(_Y[:5],100*'='))

_Y
['R' 'R' 'R' 'R' 'R']


In [13]:
_Y = numpy.ravel(_Y)

In [22]:
print('_Y raveled\n{}\n{}'.format(_Y[:5],100*'='))

_Y raveled
['R' 'R' 'R' 'R' 'R']


In [15]:
# encode class values as integers
_Y_encoded = LabelEncoder().fit_transform(_Y)

In [23]:
print('_Y_encoded\n{}\n{}'.format(_Y_encoded[:5],100*'='))

_Y_encoded
[1 1 1 1 1]
