In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import chi2
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

### Get Data

In [2]:
# Location of the raw breast-cancer file; it is read without a header row,
# so the column names are supplied explicitly below.
# NOTE(review): the dataset description reportedly uses "?" for missing
# values; no na_values are passed here, so such entries would be kept as
# ordinary string categories — confirm this is intended.
path = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases"
    "/breast-cancer/breast-cancer.data"
)

columns = (
    "target", "age", "menopause", "tumor_size", "inv_nodes",
    "node_caps", "deg_malig", "breast", "breast_quad", "irradiat",
)

data = pd.read_csv(path, names=columns, header=None)

In [3]:
# Preview the first rows of the freshly loaded frame (raw string categories).
data.head()

Unnamed: 0,target,age,menopause,tumor_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [4]:
# (rows, columns) of the raw frame, target column included.
data.shape

(286, 10)

### Preprocessing

In [5]:
# Convert every column's categories (the target included) to integer codes.
# A fresh LabelEncoder is fitted per column, so codes are independent across
# columns. Columns are indexed by their label directly rather than through a
# string round-trip, which would mis-index any non-string column label.
for column in data.columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])

In [6]:
# Check that each column now holds integer codes instead of strings.
data.head()

Unnamed: 0,target,age,menopause,tumor_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irradiat
0,0,1,2,5,0,1,2,0,2,0
1,0,2,2,3,0,1,1,1,5,0
2,0,2,2,3,0,1,1,0,2,0
3,0,4,0,2,0,1,1,1,3,0
4,0,2,2,0,0,1,1,1,4,0


In [7]:
# Label encoding replaces values in place, so the shape should be unchanged.
data.shape

(286, 10)

In [8]:
# Split the label column off from the features: keep a reference to it,
# then drop it from `data` so only predictors remain.
target = data["target"]
del data["target"]

In [9]:
# One-hot encode the integer-coded features. `data` is rebound to the
# encoder's output, so re-running this cell would encode the encoded matrix.
ohe = OneHotEncoder().fit(data)
data = ohe.transform(data)

In [10]:
# Shape after one-hot expansion: same rows, one column per category level.
data.shape

(286, 43)

### Train/Test Split

In [11]:
# Hold out 10% of the rows for testing; the seed fixes which rows are chosen.
# NOTE(review): no `stratify` argument is passed, so the class balance of the
# two partitions is left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    random_state=43,
    test_size=0.10,
)

### Feature Selection with the Chi-Squared Test

In [12]:
# Chi-squared statistic and p-value for each one-hot column against the
# label, computed on the training partition only.
chi2_stats, pval = chi2(X=X_train, y=y_train)

In [13]:
# One chi-squared statistic per one-hot column.
chi2_stats

array([4.27777778e-01, 2.03802733e+00, 2.66481997e-01, 1.72921910e-01,
       1.53662400e-01, 5.05351130e-01, 2.18660219e-01, 3.25156325e-02,
       1.43673469e-01, 1.16234668e+00, 8.87094223e+00, 2.81493506e-01,
       1.53662400e-01, 8.63498204e-01, 2.35761387e+00, 4.28708134e-01,
       2.36994000e-02, 1.62578163e-02, 1.71111111e+00, 6.44197073e-03,
       6.18340660e+00, 1.92614238e+00, 2.15001443e+00, 2.33766234e+00,
       3.98976431e+00, 8.07288961e+00, 2.80955588e+00, 5.54772212e-01,
       4.27207719e+00, 1.41370629e+01, 4.09617161e+00, 5.43769120e+00,
       2.21774564e+01, 4.00658496e-01, 4.94726143e-01, 2.33766234e+00,
       1.81842485e+00, 5.52765753e-01, 4.72164850e-01, 1.64505301e-01,
       8.77832512e-01, 3.03992837e+00, 9.36104927e+00])

In [14]:
# One p-value per one-hot column, aligned with chi2_stats.
pval

array([5.13081378e-01, 1.53408479e-01, 6.05701502e-01, 6.77527963e-01,
       6.95059721e-01, 4.77158271e-01, 6.40062666e-01, 8.56900637e-01,
       7.04656057e-01, 2.80979391e-01, 2.89745554e-03, 5.95723986e-01,
       6.95059721e-01, 3.52761592e-01, 1.24672757e-01, 5.12623529e-01,
       8.77652239e-01, 8.98539812e-01, 1.90841273e-01, 9.36028969e-01,
       1.28953770e-02, 1.65181107e-01, 1.42568550e-01, 1.26278929e-01,
       4.57774672e-02, 4.49324065e-03, 9.37043141e-02, 4.56374300e-01,
       3.87435417e-02, 1.69961871e-04, 4.29804319e-02, 1.97067018e-02,
       2.48575472e-06, 5.26749378e-01, 4.81826621e-01, 1.26278929e-01,
       1.77500886e-01, 4.57189806e-01, 4.91992919e-01, 6.85042017e-01,
       3.48796025e-01, 8.12395017e-02, 2.21645225e-03])

In [15]:
# Keep only the one-hot columns whose chi2 p-value is below 0.05.
# `cols` is a 1-tuple holding the array of selected column positions.
cols = np.nonzero(pval < 0.05)

# Apply the same column subset to both partitions.
X_train_feature_selection, X_test_feature_selection = (
    split[:, cols[0]] for split in (X_train, X_test)
)

### Model Building

In [16]:
# Baseline ("kitchen sink") model: L2-penalised logistic regression fitted on
# every one-hot column of X_train.
# NOTE(review): `multi_class` is reportedly deprecated in recent scikit-learn
# releases — confirm the targeted version before upgrading.
kitchen_sink = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    penalty='l2',
    random_state=42,
).fit(X_train, y_train)

# Show the fitted estimator's repr as the cell output.
kitchen_sink

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [17]:
# Same L2-penalised logistic regression as the baseline, but fitted only on
# the chi2-selected columns.
fs = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    penalty='l2',
    random_state=42,
).fit(X_train_feature_selection, y_train)

# Show the fitted estimator's repr as the cell output.
fs

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [18]:
# L1-penalised logistic regression on all columns; the 'saga' solver is used
# here in place of 'lbfgs' for the L1 penalty.
lasso = LogisticRegression(
    solver='saga',
    multi_class='multinomial',
    penalty='l1',
    random_state=42,
).fit(X_train, y_train)

# Show the fitted estimator's repr as the cell output.
lasso

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l1', random_state=42, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

### Predictions

In [19]:
# Predicted labels from the all-features model on the held-out rows.
ks_preds = kitchen_sink.predict(X_test)
ks_preds

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

In [20]:
# Predicted labels from the chi2-selected model; the test matrix must carry
# the same column subset the model was fitted on.
fs_preds = fs.predict(X_test_feature_selection)
fs_preds

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

In [21]:
# Predicted labels from the L1-penalised model on the held-out rows.
lasso_preds = lasso.predict(X_test)
lasso_preds

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

In [22]:
# Held-out accuracy of each model, in the order: all features, chi2-selected
# features, L1-penalised.
ks_accuracy, fs_accuracy, lasso_accuracy = (
    accuracy_score(y_test, preds)
    for preds in (ks_preds, fs_preds, lasso_preds)
)

In [23]:
# Collect the three accuracies under short model labels and show them as a
# float Series for side-by-side comparison.
mydict = dict(ks=ks_accuracy, fs=fs_accuracy, lasso=lasso_accuracy)
pd.Series(mydict, dtype='float')

fs       0.689655
ks       0.620690
lasso    0.689655
dtype: float64

### Decision Tree & Cross-Validation

In [47]:
# Compare decision trees of increasing depth on the full and the chi2-selected
# training matrices. Each tree is scored with 5-fold cross-validation using
# negative log-loss, so values closer to zero are better.
datasets = (X_train, X_train_feature_selection)
dataset_names = ("X_train", "X_train_feature_selection")
depths = (1, 2, 3, 4, 5, 6)

for i, (name, dataset) in enumerate(zip(dataset_names, datasets), start=1):
    # A dashed rule and a blank line separate every block after the first.
    if i > 1:
        print("-" * 25)
        print()
    print("[{}]".format(name))

    for depth in depths:
        # A fixed seed makes tie-breaking between equally good splits
        # repeatable, so the printed scores are stable across re-runs.
        dt = DecisionTreeClassifier(criterion='gini',
                                    max_depth=depth,
                                    min_samples_split=4,
                                    random_state=42)
        output = cross_validate(dt,
                                dataset, y_train,
                                scoring='neg_log_loss',
                                cv=5,
                                n_jobs=-1,
                                return_train_score=True)
        print("Depth:", depth)
        print("Train:", np.mean(output['train_score']))
        # "Test" is the mean score on the held-out CV folds, not on X_test.
        print("Test:", np.mean(output['test_score']))
        print()

[X_train]
Depth: 1
Train: -0.5509345983393196
Test: -0.5555417212831533

Depth: 2
Train: -0.5060648714707121
Test: -0.8425206463370373

Depth: 3
Train: -0.46684567149659256
Test: -1.094986903241076

Depth: 4
Train: -0.4195330177521549
Test: -1.7527059030996726

Depth: 5
Train: -0.36858350031551146
Test: -3.387515488163936

Depth: 6
Train: -0.32072001795414273
Test: -4.156940341208259

-------------------------

[X_train_feature_selection]
Depth: 1
Train: -0.5509345983393196
Test: -0.5555417212831533

Depth: 2
Train: -0.5047619494050309
Test: -0.5698325022023213

Depth: 3
Train: -0.48178093791654664
Test: -0.8107584209591044

Depth: 4
Train: -0.45065433208516587
Test: -1.567188516583124

Depth: 5
Train: -0.4221276741502645
Test: -2.574520283778304

Depth: 6
Train: -0.40206252571573325
Test: -2.9496799181254123

