In [1]:
from pandas import read_csv
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

In [2]:
# read the breast cancer CSV; row 0 of the file supplies the column names
dataset = read_csv(filepath_or_buffer='breast-cancer.csv', header=0)
# keep the frame as the cell's final expression so the notebook renders it
dataset

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,40-49,premefalse,15-19,0-2,True,3,right,left_up,False,recurrence-events
1,50-59,ge40,15-19,0-2,False,1,right,central,False,false-recurrence-events
2,50-59,ge40,35-39,0-2,False,2,left,left_low,False,recurrence-events
3,40-49,premefalse,35-39,0-2,True,3,right,left_low,True,false-recurrence-events
4,40-49,premefalse,30-34,3-5,True,2,left,right_up,False,recurrence-events
...,...,...,...,...,...,...,...,...,...,...
267,50-59,ge40,30-34,6-8,True,2,left,left_low,False,false-recurrence-events
268,50-59,premefalse,25-29,3-5,True,2,left,left_low,True,false-recurrence-events
269,30-39,premefalse,30-34,6-8,True,2,right,right_up,False,false-recurrence-events
270,50-59,premefalse,15-19,0-2,False,2,right,left_low,False,false-recurrence-events


In [3]:
# pull the underlying NumPy array out of the DataFrame (rows x columns)
data = dataset.to_numpy()

In [4]:
# separate into input columns (everything but the last) and the output column (the last),
# casting every value to str so all columns are handled uniformly as strings
X, y = data[:, :-1].astype(str), data[:, -1].astype(str)

In [5]:
# summarize: print the dimensions of the input matrix and the output vector
for name, arr in (('Input', X), ('Output', y)):
    print(name, arr.shape)

Input (272, 9)
Output (272,)


In [6]:
# ordinal encode input variables: learn the categories of each column,
# then replace X with the corresponding numeric codes
ordinal = OrdinalEncoder()
ordinal.fit(X)
X = ordinal.transform(X)

In [7]:
# encode the target variable: learn the class labels, then replace y with integer codes
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

In [8]:
# summarize the transformed data: for inputs then outputs, show the shape
# followed by a preview of the first five encoded entries
for title, encoded in (('Input', X), ('Output', y)):
    print(title, encoded.shape)
    print(encoded[:5])

Input (272, 9)
[[2. 2. 2. 0. 1. 2. 1. 2. 0.]
 [3. 0. 2. 0. 0. 0. 1. 0. 0.]
 [3. 0. 6. 0. 0. 1. 0. 1. 0.]
 [2. 2. 6. 0. 1. 2. 1. 1. 1.]
 [2. 2. 5. 4. 1. 1. 0. 5. 0.]]
Output (272,)
[1 0 1 0 1]


In [9]:
# evaluate logistic regression on the breast cancer dataset with an ordinal encoding
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [10]:
# split the dataset into train and test sets
# X and y were overwritten above with codes from encoders fitted on the *whole*
# dataset, so take the raw string columns from `data` again here. The encoders
# fitted after the split then learn the original category labels from the training
# rows only, and their categories_/classes_/inverse_transform refer to real labels
# rather than to already-encoded numbers.
X_raw = data[:, :-1].astype(str)
y_raw = data[:, -1].astype(str)
# hold out 33% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.33, random_state=1)

In [11]:
# ordinal encode input variables: the encoder is fitted on the training split only
# and the same fitted mapping is then applied to both splits
ordinal_encoder = OrdinalEncoder().fit(X_train)
X_train, X_test = ordinal_encoder.transform(X_train), ordinal_encoder.transform(X_test)

In [12]:
# encode the target variable: fit the label encoder on the training labels only,
# then apply that mapping to the train and test labels
# (this rebinds label_encoder, replacing the encoder created earlier in the notebook)
label_encoder = LabelEncoder().fit(y_train)
y_train, y_test = label_encoder.transform(y_train), label_encoder.transform(y_test)

In [13]:
# define the model: a logistic regression classifier created with all-default settings
model = LogisticRegression()

In [14]:
# fit the model on the encoded training inputs and training labels
model.fit(X_train, y_train)

In [15]:
# predict labels for the held-out test inputs using the fitted model
yhat = model.predict(X_test)

In [16]:
# evaluate predictions: compare predicted and true test labels,
# then report the accuracy as a percentage with two decimals
accuracy = accuracy_score(y_test, yhat)
accuracy_pct = accuracy * 100
print('Accuracy: %.2f' % accuracy_pct)

Accuracy: 74.44
