In [None]:
# setup
from mlwpy import *
%matplotlib inline

# Load the iris data and hold out a third of it for testing.
iris = datasets.load_iris()

# a fixed random_state keeps the split the same on every run
tts = skms.train_test_split(iris.data, iris.target,
                            random_state=21, test_size=.33)

# unpack in the order the names imply: features (train, test), targets (train, test)
iris_train_ftrs, iris_test_ftrs, iris_train_tgt, iris_test_tgt = tts

In [None]:
# normal usage:  build-fit-predict-evaluate
# A most-frequent-class dummy model serves as the baseline here.
baseline = dummy.DummyClassifier(strategy="most_frequent")
baseline.fit(iris_train_ftrs, iris_train_tgt)
base_preds = baseline.predict(iris_test_ftrs)

# sklearn metrics are called as (y_true, y_pred); keep that order so this
# cell matches the other metric calls in the notebook
base_acc = metrics.accuracy_score(iris_test_tgt, base_preds)
print(base_acc)

In [None]:
strategies = ['constant', 'uniform', 'stratified',
              'prior', 'most_frequent']

# setup args to create diff. DummyClassifier strategies
baseline_args = [{'strategy':s} for s in strategies]
# only the 'constant' strategy takes the extra `constant` argument;
# look it up by name rather than by hard-coded position
baseline_args[strategies.index('constant')]['constant'] = 0 # class 0 is setosa

# fit and score one dummy model per strategy on the same train/test split
accuracies = []
for bla in baseline_args:
    baseline = dummy.DummyClassifier(**bla)
    baseline.fit(iris_train_ftrs, iris_train_tgt)
    base_preds = baseline.predict(iris_test_ftrs)
    # (y_true, y_pred) order, as elsewhere in the notebook
    accuracies.append(metrics.accuracy_score(iris_test_tgt, base_preds))

# after the loop, `baseline` and `base_preds` hold the last strategy listed
display(pd.DataFrame({'accuracy':accuracies}, index=strategies))

In [None]:
# helpful stdlib tool for cleaning up printouts
import textwrap

# Newer scikit-learn releases expose the scorer names through
# get_scorer_names() and no longer provide the SCORERS dict; older
# releases only have SCORERS.  Support both.
try:
    scorer_names = metrics.get_scorer_names()
except AttributeError:
    scorer_names = metrics.SCORERS.keys()

print(textwrap.fill(str(sorted(scorer_names)),
                    width=70))

In [None]:
knn = neighbors.KNeighborsClassifier()

# help(knn.score) # verbose, but complete

# show only the first line and the last six lines of the docstring
score_doc_lines = knn.score.__doc__.splitlines()
doc_head = score_doc_lines[0]
doc_tail = "\n".join(score_doc_lines[-6:])

print(doc_head)
print('\n---and---\n')
print(doc_tail)

In [None]:
# fit a default k-NN on the training split, then predict the test split
fit_knn = neighbors.KNeighborsClassifier().fit(iris_train_ftrs,
                                               iris_train_tgt)
tgt_preds = fit_knn.predict(iris_test_ftrs)

test_acc = metrics.accuracy_score(iris_test_tgt, tgt_preds)
print("accuracy:", test_acc)

# confusion matrix of the same predictions, printed on its own lines
cm = metrics.confusion_matrix(iris_test_tgt, tgt_preds)
print("confusion matrix:", cm, sep="\n")

In [None]:
# draw the k-NN confusion matrix as an annotated heatmap
fig, ax = plt.subplots(1, 1, figsize=(4, 4))
cm = metrics.confusion_matrix(iris_test_tgt, tgt_preds)

# the same species names label both axes
species = iris.target_names
ax = sns.heatmap(cm, annot=True, square=True,
                 xticklabels=species, yticklabels=species)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual');

In [None]:
macro_prec = metrics.precision_score(iris_test_tgt, tgt_preds,
                                     average='macro')
print("macro:", macro_prec)

# recompute the same number by hand from the confusion matrix
cm = metrics.confusion_matrix(iris_test_tgt, tgt_preds)
n_labels = len(iris.target_names)

# per-class value: diagonal entry (correct) over its column total
per_class = np.diag(cm) / cm.sum(axis=0)
# then a plain average over the number of columns
print("should equal 'macro avg':", per_class.sum() / n_labels)

In [None]:
micro_prec = metrics.precision_score(iris_test_tgt, tgt_preds,
                                     average='micro')
print("micro:", micro_prec)

# recompute by hand: TP.sum() / (TP&FP).sum()  -->  all correct / all preds
cm = metrics.confusion_matrix(iris_test_tgt, tgt_preds)
all_correct = np.diag(cm).sum()
all_preds = cm.sum()
print("should equal avg='micro':", all_correct / all_preds)

In [None]:
report = metrics.classification_report(iris_test_tgt, tgt_preds)
print(report)
# average is a weighted macro average (see text)

# verify sums-across-rows
cm = metrics.confusion_matrix(iris_test_tgt, tgt_preds)
row_totals = cm.sum(axis=1)
print("row counts equal support:", row_totals)

In [None]:
# Boolean target: is this example class 1?
# warning: this is 1 "one" not l "ell"
is_versicolor = iris.target == 1

# same test_size / random_state as the multi-class split
tts_1c = skms.train_test_split(iris.data, is_versicolor,
                               random_state=21, test_size=.33)
iris_1c_train_ftrs, iris_1c_test_ftrs, iris_1c_train_tgt, iris_1c_test_tgt = tts_1c

# build, fit, predict (probability scores) for NB model
gnb = naive_bayes.GaussianNB()
gnb.fit(iris_1c_train_ftrs, iris_1c_train_tgt)
test_probs_1c = gnb.predict_proba(iris_1c_test_ftrs)
prob_true = test_probs_1c[:, 1]   # [:,1]=="True"

In [None]:
# ROC points for the one-class problem, scored by the NB probability of True
fpr, tpr, thresh = metrics.roc_curve(iris_1c_test_tgt, prob_true)
auc = metrics.auc(fpr, tpr)
print("FPR : {}".format(fpr))
print("TPR : {}".format(tpr))


# create the main graph
fig, ax = plt.subplots(figsize=(8,4))
ax.plot(fpr, tpr, 'o--')
ax.set_title("1-Class Iris ROC Curve\nAUC:{:.3f}".format(auc))
ax.set_xlabel("FPR")
ax.set_ylabel("TPR");

# label a few of the ROC points with the threshold that produced them
investigate = np.array([1,3,5])
picked = zip(thresh[investigate], fpr[investigate], tpr[investigate])
for th, f, t in picked:
    # arrow tip sits just beside the point; the text is pushed to the right
    ax.annotate('thresh = {:.3f}'.format(th),
                xy=(f+.01, t-.01), xytext=(f+.1, t),
                arrowprops={'arrowstyle':'->'})

In [None]:
# One confusion-matrix heatmap per threshold index listed in `investigate`,
# each titled with the threshold index, its value and the matching TPR/FPR.
title_fmt = "Threshold {}\n~{:5.3f}\nTPR : {:.3f}\nFPR : {:.3f}"

# tick labels: 'Positive' is listed first on both axes
pn = ['Positive', 'Negative']
add_args = {'xticklabels': pn,
            'yticklabels': pn,
            'square':True}

fig, axes = plt.subplots(1,3, sharey = True, figsize=(12,4))
for ax, thresh_idx in zip(axes.flat, investigate):
    # True where the score falls below the cutoff
    preds_at_th = prob_true < thresh[thresh_idx]
    # the truth is flipped too (True -> 0, False -> 1); presumably both are
    # negated so the positive class lands in the first row/column, matching
    # the order of `pn` -- confirm against confusion_matrix label ordering
    cm = metrics.confusion_matrix(1-iris_1c_test_tgt, preds_at_th)
    sns.heatmap(cm, annot=True, cbar=False, ax=ax,
                **add_args)

    ax.set_xlabel('Predicted')
    ax.set_title(title_fmt.format(thresh_idx, 
                                  thresh[thresh_idx],
                                  tpr[thresh_idx], 
                                  fpr[thresh_idx]))

# sharey=True above, so only the left-most panel gets the y label
axes[0].set_ylabel('Actual');
# note: e.g. for threshold 3
# FPR = 1-spec = 1 - 31/(31+2) = 1 - 31/33 = 0.0606...

In [None]:
# 10-fold cross-validated ROC AUC for 3-NN on the "is class 1" problem
model = neighbors.KNeighborsClassifier(n_neighbors=3)
cv_auc = skms.cross_val_score(model, iris.data, iris.target==1,
                              cv=10, scoring='roc_auc')

# one dot per fold
fig,ax = plt.subplots(1,1,figsize=(3,3))
ax = sns.swarmplot(cv_auc, orient='v')
ax.set_title('10-Fold AUCs');

In [None]:
# rows to spot-check (presumably one per class -- confirm the data ordering)
checkout = [0, 50, 100]
print("Original Encoding",
      iris.target[checkout], sep="\n")

In [None]:
print("'Multi-label' Encoding")
# `classes` is passed by keyword: newer scikit-learn releases reject it
# as a positional argument
print(skpre.label_binarize(iris.target, classes=[0,1,2])[checkout])

In [None]:
# indicator encoding of the target, one column per class in [0,1,2];
# `classes` is passed by keyword because newer scikit-learn releases
# reject it as a positional argument
iris_multi_tgt = skpre.label_binarize(iris.target, classes=[0,1,2])

# im --> "iris multi"
# same test_size / random_state as the earlier iris splits
(im_train_ftrs, im_test_ftrs,
 im_train_tgt,  im_test_tgt) = skms.train_test_split(iris.data,
                                                     iris_multi_tgt,
                                                     test_size=.33,
                                                     random_state=21)

# knn wrapped up in one-versus-rest (3 classifiers)
knn        = neighbors.KNeighborsClassifier(n_neighbors=5)
ovr_knn    = skmulti.OneVsRestClassifier(knn)
pred_probs = (ovr_knn.fit(im_train_ftrs, im_train_tgt)
                     .predict_proba(im_test_ftrs))

# make ROC plots: one curve per class, using that class's indicator
# column as truth and its probability column as the score
lbl_fmt = "Class {} vs Rest (AUC = {:.2f})"
fig,ax = plt.subplots(figsize=(8,4))
for cls in [0,1,2]:
    fpr, tpr, _ = metrics.roc_curve(im_test_tgt[:,cls],
                                    pred_probs[:,cls])
    label = lbl_fmt.format(cls, metrics.auc(fpr,tpr))
    ax.plot(fpr, tpr, 'o--', label=label)
ax.legend()
ax.set_xlabel("FPR")
ax.set_ylabel("TPR");

In [None]:
# k-NN wrapped in a one-versus-one classifier
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
ovo_knn = skmulti.OneVsOneClassifier(knn)
ovo_knn.fit(iris_train_ftrs, iris_train_tgt)

# one score column per class; the largest score in a row picks the class
pred_scores = ovo_knn.decision_function(iris_test_ftrs)
df = pd.DataFrame(pred_scores)
df['class'] = pred_scores.argmax(axis=1)
display(df.head())

In [None]:
# Two-level column header: ('Class Indicator' | 'Vote') x (0, 1, 2)
# note: ugly to make column headers
header_levels = [['Class Indicator', 'Vote'], [0, 1, 2]]
header_codes  = [[0]*3 + [1]*3,          # outer: three of each group
                 list(range(3)) * 2]     # inner: 0,1,2 repeated per group
mi = pd.MultiIndex(header_levels, header_codes)

# indicator columns and vote scores side by side
side_by_side = np.c_[im_test_tgt, pred_scores]
df = pd.DataFrame(side_by_side, columns=mi)
display(df.head())

In [None]:
def hand_and_till_M_statistic(test_tgt, test_probs, weighted=False):
    """Average pairwise AUC over all pairs of classes (Hand & Till's M).

    For every pair of classes (i, j) the examples belonging to either
    class are selected, an AUC is computed for "is i" scored by column i
    and for "is j" scored by column j, and the two are combined.

    Parameters
    ----------
    test_tgt : array of class labels, one per example.
    test_probs : 2-D array of per-class scores, one row per example.
        The class values themselves are used as column indices, so the
        labels are assumed to be 0..n_classes-1 and to line up with the
        columns of `test_probs`.
    weighted : bool, default False
        If True, each pair's contribution is scaled by the fraction of
        examples that belong to that pair.

    Returns
    -------
    float
        Sum of the pairwise values divided by n_classes * (n_classes - 1).
    """
    def auc_helper(truth, probs):
        # area under the ROC curve for one binary truth/score pairing
        fpr, tpr, _ = metrics.roc_curve(truth, probs)
        return metrics.auc(fpr, tpr)

    classes   = np.unique(test_tgt)
    n_classes = len(classes)

    # `classes` by keyword: newer scikit-learn rejects it positionally
    indicator = skpre.label_binarize(test_tgt, classes=classes)
    avg_auc_sum = 0.0

    # comparing class i and class j
    for ij in it.combinations(classes, 2):
        # use sum to act like a logical or
        # (builtin bool: the np.bool alias is missing from some NumPy releases)
        ij_indicator = indicator[:,ij].sum(axis=1,
                                           dtype=bool)

        # slightly ugly, can't broadcast these as indexes
        # use .ix_ to save the day
        ij_probs    = test_probs[np.ix_(ij_indicator, ij)]
        ij_test_tgt = test_tgt[ij_indicator]

        i,j = ij
        auc_ij = auc_helper(ij_test_tgt==i, ij_probs[:,0])
        auc_ji = auc_helper(ij_test_tgt==j, ij_probs[:,1])

        # compared to Hand & Till reference
        # no / 2 ... factor it out since it will cancel
        avg_auc_ij = (auc_ij + auc_ji)

        if weighted:
            # fraction of all test examples that fall in this pair
            avg_auc_ij *= ij_indicator.sum() / len(test_tgt)
        avg_auc_sum += avg_auc_ij

    # compared to Hand & Till reference
    # no * 2 ... factored out above and they cancel
    M = avg_auc_sum / (n_classes * (n_classes-1))
    return M

In [None]:
# default k-NN fit on the training split; score its test-set class
# probabilities with the H&T M statistic
knn = neighbors.KNeighborsClassifier().fit(iris_train_ftrs, iris_train_tgt)
test_probs = knn.predict_proba(iris_test_ftrs)
hand_and_till_M_statistic(iris_test_tgt, test_probs)

In [None]:
# wrap the M statistic as a scorer that is fed class probabilities
htm_scorer = metrics.make_scorer(hand_and_till_M_statistic,
                                 needs_proba=True)

# note: `model` is not defined in this cell; it is whatever estimator
# is currently bound to that name
cv_auc = skms.cross_val_score(model,
                              iris.data, iris.target,
                              cv=10, scoring=htm_scorer)

fig,ax = plt.subplots(1,1,figsize=(3,3))
sns.swarmplot(cv_auc, orient='v')
ax.set_title('10-Fold H&T Ms');

In [None]:
# Precision-recall curve per class, using each class's indicator column
# as truth and its probability column as the score.
prc = metrics.precision_recall_curve

# balanced parentheses, same shape as the ROC legend labels
pr_lbl_fmt = "Class {} vs Rest (AUC = {:.2f})"

fig,ax = plt.subplots(figsize=(6,3))
for cls in [0,1,2]:
    precision, recall, _ = prc(im_test_tgt[:,cls],
                               pred_probs[:,cls])
    # area under the curve with recall on x and precision on y
    prc_auc = metrics.auc(recall, precision)
    label = pr_lbl_fmt.format(cls, prc_auc)
    ax.plot(recall, precision, 'o--', label=label)
ax.legend()
ax.set_xlabel('Recall')
ax.set_ylabel('Precision');

In [None]:
# indices of the test examples from highest to lowest score
# (negate b/c argsort is ascending and we want big values first)
myorder = np.argsort(-prob_true)

# running count of true hits in that order, scaled by the total
# (the last cumulative value is the total number of hits)
hits_so_far = iris_1c_test_tgt[myorder].cumsum()
realpct_myorder = hits_so_far / hits_so_far[-1]

# convert counts of data into percents: 1/N, 2/N, ..., 1
N = iris_1c_test_tgt.size
xs = np.linspace(1/N, 1, N)

print(myorder[:3])

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(8,4))
fig.tight_layout()

# both panels share the same x-axis caption
pop_caption = ("Percent Of Population\n" +
               "Starting with Highest Predicted Hits")

# cumulative response: red dots for the model, blue diagonal for reference
ax1.plot(xs, realpct_myorder, 'r.')
ax1.plot(xs, xs, 'b-')
ax1.axes.set_aspect('equal')

ax1.set_title("Cumulative Response")
ax1.set_ylabel("Percent of Actual Hits")
ax1.set_xlabel(pop_caption)

# lift: cumulative response relative to the diagonal
# replace divide by zero with 1.0
safe_xs = np.where(xs > 0, xs, 1)
lift = realpct_myorder / safe_xs
ax2.plot(xs, lift)

ax2.set_title("Lift Versus Random")
ax2.set_ylabel("X-Fold Improvement") # not cross-fold!
ax2.set_xlabel(pop_caption)
# put the right-hand panel's ticks and label on its right edge
ax2.yaxis.tick_right()
ax2.yaxis.set_label_position('right');

In [None]:
# Models to compare, keyed by the label used in plots.
# The baseline is built explicitly here instead of reusing whatever
# `baseline` happened to be bound to by an earlier cell.
classifiers = {'base'  : dummy.DummyClassifier(strategy="most_frequent"),
               'gnb'   : naive_bayes.GaussianNB(),
               '3-NN'  : neighbors.KNeighborsClassifier(n_neighbors=3),
               '10-NN' : neighbors.KNeighborsClassifier(n_neighbors=10)}

In [None]:
# define the one_class iris problem so we don't have random ==1 around:
# features are the full iris data, target is "is this class 1?"
iris_onec_ftrs, iris_onec_tgt = iris.data, iris.target==1

In [None]:
# One stacked panel per scoring measure; each model contributes one
# line of ten cross-validation scores to every panel.
msrs = ['accuracy', 'average_precision', 'roc_auc']

fig, axes = plt.subplots(len(msrs), 1, figsize=(6, 2*len(msrs)))
fig.tight_layout()

for mod_name, model in classifiers.items():
    # 10-fold scores of this model under every measure
    cv_results = {}
    for msr in msrs:
        cv_results[msr] = skms.cross_val_score(model,
                                               iris_onec_ftrs,
                                               iris_onec_tgt,
                                               scoring=msr, cv=10)

    for ax, (msr, msr_results) in zip(axes, cv_results.items()):
        # legend entry: model name, mean score, standard deviation
        fold_mean = msr_results.mean()
        fold_std  = msr_results.std()
        my_lbl = "{:12s} {:.3f} {:.2f}".format(mod_name, fold_mean, fold_std)
        ax.plot(msr_results, 'o--', label=my_lbl)
        ax.set_title(msr)
        ax.legend(loc='lower center', ncol=2)

In [None]:
# 2x2 grid: one cross-validated confusion matrix per classifier
fig, axes = plt.subplots(2,2, figsize=(4,4), sharex=True, sharey=True)
fig.tight_layout()

for ax, (mod_name, model) in zip(axes.flat, classifiers.items()):
    preds = skms.cross_val_predict(model,
                                   iris_onec_ftrs, iris_onec_tgt,
                                   cv=10)

    # compare against the same target the predictions were made for
    cm = metrics.confusion_matrix(iris_onec_tgt, preds)
    sns.heatmap(cm, annot=True, ax=ax,
                cbar=False, square=True, fmt="d")

    ax.set_title(mod_name)

# axes are shared, so label only the bottom row and the left column
axes[1,0].set_xlabel('Predicted')
axes[1,1].set_xlabel('Predicted')
axes[0,0].set_ylabel('Actual')
axes[1,0].set_ylabel('Actual');

In [None]:
# ROC curve per classifier, built from cross-validated probabilities
fig, ax = plt.subplots(1, 1, figsize=(6,4))

# model name -> cross-validated probability of the True class
cv_prob_true = {}
for mod_name, model in classifiers.items():
    cv_probs = skms.cross_val_predict(model,
                                      iris_onec_ftrs, iris_onec_tgt,
                                      method='predict_proba', cv=10)
    prob_of_true = cv_probs[:,1]
    cv_prob_true[mod_name] = prob_of_true

    fpr, tpr, thresh = metrics.roc_curve(iris_onec_tgt, prob_of_true)
    auc = metrics.auc(fpr, tpr)

    roc_label = "{}:{}".format(mod_name, auc)
    ax.plot(fpr, tpr, 'o--', label=roc_label)

ax.set_title('ROC Curves')
ax.legend();

In [None]:
# Left: cumulative response per model; right: lift per model.
fig, (ax1,ax2) = plt.subplots(1, 2, figsize=(10,5))

N = len(iris_onec_tgt)
xs = np.linspace(1/N,1,N)

# blue diagonal as the reference line on the left panel
ax1.plot(xs, xs, 'b-')

for mod_name in classifiers:
    # negate b/c we want big values first
    myorder = np.argsort(-cv_prob_true[mod_name])

    # cumulative sum then to percent (last value is total)
    hits_so_far = iris_onec_tgt[myorder].cumsum()
    realpct_myorder = hits_so_far / hits_so_far[-1]

    # cumulative response relative to the diagonal
    lift = realpct_myorder / np.where(xs > 0, xs, 1)

    ax1.plot(xs, realpct_myorder, '.', label=mod_name)
    ax2.plot(xs, lift, label=mod_name)

for panel, panel_title in ((ax1, "Cumulative Response"),
                           (ax2, "Lift versus Random")):
    panel.legend()
    panel.set_title(panel_title)

In [None]:
# grade levels from lowest to highest
grade_levels = ['low', 'mid', 'high']

student_df = pd.read_csv('data/portugese_student_numeric_discrete.csv')
# turn the grade column into an ordered categorical
ordered_grade = pd.Categorical(student_df['grade'],
                               categories=grade_levels,
                               ordered=True)
student_df['grade'] = ordered_grade

In [None]:
# every column except the last is a feature
ftr_columns  = student_df.columns[:-1]
student_ftrs = student_df[ftr_columns]
# integer codes of the categorical grade column are the target
student_tgt  = student_df['grade'].cat.codes

In [None]:
# 10-fold cross-validated accuracy of 3-NN on the student data
model = neighbors.KNeighborsClassifier(n_neighbors=3)
cv_auc = skms.cross_val_score(model,
                              student_ftrs, student_tgt,
                              cv=10, scoring='accuracy')

# one dot per fold
fig,ax = plt.subplots(1,1,figsize=(3,3))
ax = sns.swarmplot(cv_auc, orient='v')
ax.set_title('10-Fold Accuracy');

In [None]:
# precision_score wrapped as a scorer with macro averaging
my_scorer = metrics.make_scorer(metrics.precision_score,
                                average='macro')

model = neighbors.KNeighborsClassifier(n_neighbors=3)
cv_auc = skms.cross_val_score(model,
                              student_ftrs, student_tgt,
                              cv=10, scoring=my_scorer)

# one dot per fold
fig,ax = plt.subplots(1,1,figsize=(3,3))
sns.swarmplot(cv_auc, orient='v')
ax.set_title('10-Fold Macro Precision');

In [None]:
# the M statistic wrapped as a scorer that is fed class probabilities
htm_scorer = metrics.make_scorer(hand_and_till_M_statistic,
                                 needs_proba=True)

# note: `model` is not defined in this cell; it is whatever estimator
# is currently bound to that name
cv_auc = skms.cross_val_score(model,
                              student_ftrs, student_tgt,
                              cv=10, scoring=htm_scorer)

# one dot per fold
fig,ax = plt.subplots(1,1,figsize=(3,3))
sns.swarmplot(cv_auc, orient='v')
ax.set_title('10-Fold H&T Ms');

In [None]:
# Models to compare on the student data, keyed by the label used in plots.
# Each k-NN label matches its n_neighbors setting.
classifiers = {'base'  : dummy.DummyClassifier(strategy="most_frequent"),
               'gnb'   : naive_bayes.GaussianNB(),
               '3-NN'  : neighbors.KNeighborsClassifier(n_neighbors=3),
               '10-NN' : neighbors.KNeighborsClassifier(n_neighbors=10)}

In [None]:
# Scorers for the multi-class student problem: macro-averaged
# precision and recall, plus the H&T M statistic on class probabilities.
macro_precision = metrics.make_scorer(metrics.precision_score,
                                      average='macro')
macro_recall    = metrics.make_scorer(metrics.recall_score,
                                      average='macro')
htm_scorer = metrics.make_scorer(hand_and_till_M_statistic,
                                 needs_proba=True)

msrs = ['accuracy', macro_precision,
        macro_recall, htm_scorer]

# one stacked panel per measure
fig, axes = plt.subplots(len(msrs), 1, figsize=(6, 2*len(msrs)))
fig.tight_layout()

for mod_name, model in classifiers.items():
    # abbreviate
    cvs = skms.cross_val_score
    # 10-fold scores of this model under every measure
    cv_results = {msr:cvs(model, student_ftrs, student_tgt,
                          scoring=msr, cv=10) for msr in msrs}

    for ax, msr in zip(axes, msrs):
        msr_results = cv_results[msr]
        # legend entry: model name, mean score, standard deviation
        my_lbl = "{:12s} {:.3f} {:.2f}".format(mod_name,
                                               msr_results.mean(),
                                               msr_results.std())
        # attach the label so the optional legend below has entries
        ax.plot(msr_results, 'o--', label=my_lbl)
        ax.set_title(msr)
        # uncomment to see summary stats (clutters plots)
        #ax.legend(loc='lower center')

In [None]:
# 2x2 grid: one cross-validated confusion matrix per classifier
fig, axes = plt.subplots(2,2, figsize=(5,5), sharex=True, sharey=True)
fig.tight_layout()

# tick labels spelled the same as the grade categories ('mid', not 'med')
grade_names = ['low', 'mid', 'high']

for ax, (mod_name, model) in zip(axes.flat,
                                 classifiers.items()):
    preds = skms.cross_val_predict(model,
                                   student_ftrs, student_tgt,
                                   cv=10)

    cm = metrics.confusion_matrix(student_tgt, preds)
    sns.heatmap(cm, annot=True, ax=ax,
                cbar=False, square=True, fmt="d",
                xticklabels=grade_names,
                yticklabels=grade_names)

    ax.set_title(mod_name)

# axes are shared, so label only the bottom row and the left column
axes[1,0].set_xlabel('Predicted')
axes[1,1].set_xlabel('Predicted')
axes[0,0].set_ylabel('Actual')
axes[1,0].set_ylabel('Actual');

In [None]:
student_url = ('https://archive.ics.uci.edu/' +
               'ml/machine-learning-databases/00320/student.zip')
def grab_student_numeric_discrete():
    """Download the student data and save a numeric, discretized copy.

    Downloads the zip at `student_url` to 'port_student.zip', extracts
    'student-mat.csv' into the current directory, keeps only the numeric
    columns (without G1 and G2), replaces G3 with a three-level 'grade'
    column and writes the result to
    'portugese_student_numeric_discrete.csv' in the current directory.

    Returns nothing; its effects are the files written and the download.
    """
    # download zip file and unzip
    # unzipping unknown files can be a security hazard
    import urllib.request, zipfile
    urllib.request.urlretrieve(student_url,
                               'port_student.zip')
    # context manager so the archive handle is closed after extraction;
    # only the single named member is extracted
    with zipfile.ZipFile('port_student.zip') as archive:
        archive.extract('student-mat.csv')

    # preprocessing
    df = pd.read_csv('student-mat.csv', sep=';')

    # g1 & g2 are highly correlated with g3;
    # dropping them makes the problem sig. harder
    # we also remove all non-numeric columns
    # and discretize the final grade by 0-50-75-100 percentile
    # which were determined by hand
    df = df.drop(columns=['G1', 'G2']).select_dtypes(include=['number'])
    # bin edges 0-11-14-20; include_lowest keeps a grade of 0 in 'low'
    df['grade'] = pd.cut(df['G3'], [0, 11, 14, 20],
                         labels=['low', 'mid', 'high'],
                         include_lowest=True)
    # the raw score is dropped once it has been binned
    df = df.drop(columns=['G3'])

    # save as
    # NOTE(review): this writes to the current directory -- confirm it
    # matches the location the file is later read from
    df.to_csv('portugese_student_numeric_discrete.csv', index=False)