# Stratified Cross Validation

Example solution: stratified k-fold cross-validation on the UCI wine dataset.

April 2019

In [1]:
import os
import re
import numpy as np
import pandas as pd
import urllib.request
from collections import defaultdict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix)

from bm_util import download_url_to_filepath

def stratified_sampling(df, cls_col, k):
    """Assign row positions to k folds so that each fold mirrors the class mix.

    Rows are grouped by the value found in ``cls_col``. The positions of each
    class are then drawn at random (through NumPy's global RNG) and dealt out
    round-robin, restarting at fold 0 for every class. A summary of the class
    distribution is printed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split. Only ``cls_col`` and the number of rows are used.
    cls_col : str
        Name of the column holding the class label. Labels are used as-is,
        so they may be integers, strings, or any other hashable value.
    k : int
        Number of folds; must be at least 1.

    Returns
    -------
    collections.defaultdict
        Maps a fold number (starting at 0) to the list of positional
        (``iloc``) row indexes placed in that fold. A fold that receives no
        rows (fewer rows than folds) has no entry.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    class_dict = defaultdict(list)
    fold_idxs = defaultdict(list)

    # Group row positions by class label, in order of first appearance.
    for pos, cls in enumerate(df[cls_col].tolist()):
        class_dict[cls].append(pos)

    # Report each class by its actual label, as a share of *this* frame.
    print("Initial Class Distribution:")
    for cls, idxs in class_dict.items():
        print(f"{cls}: {len(idxs)}, {100*(len(idxs)/len(df)):.1f}%")

    # Deal every class's positions out to the folds in random order.
    for idxs in class_dict.values():
        fold = 0
        while idxs:
            # The draw is made exactly as before (one call per remaining
            # position) so that a seeded run keeps its fold assignment.
            idx = np.random.choice(idxs, replace=False)
            fold_idxs[fold].append(idxs.pop(idxs.index(idx)))
            fold = (fold + 1) % k

    # Every row position must land in exactly one fold.
    assigned = [idx for members in fold_idxs.values() for idx in members]
    assert len(assigned) == len(df)
    assert len(set(assigned)) == len(assigned)
    return fold_idxs

def test_stratified_sampling(df, cls_col, fold_idxs):
    """Test the functionality of stratified_sampling."""
    print(f"\nThere are {len(fold_idxs.keys())} folds.")
    for k in fold_idxs.keys():
        dfk = df.iloc[fold_idxs[k]]
        print(f"\nFold {k}\nshape: {dfk.shape}")
        dfk = pd.DataFrame(dfk.groupby(cls_col)[cls_col].count())
        dfk.columns = ['count']
        dfk['pct'] = dfk['count'].apply(lambda x: f"{100*(x / np.sum(dfk['count'])):.1f}%")
        print(dfk)
        
def train_test_fold_split(df, fold_idxs):
    """Build the train/test frames for every fold.

    For each fold, the rows at that fold's positions form the test set and
    all remaining rows form the training set. Both sides are selected by
    position, so the split is correct whatever index ``df`` carries (the
    fold indexes are positional, not index labels).

    Parameters
    ----------
    df : pandas.DataFrame
        The full data set.
    fold_idxs : mapping
        Maps a fold number to the list of positional (``iloc``) row indexes
        that belong to that fold.

    Returns
    -------
    tuple of (collections.defaultdict, collections.defaultdict)
        ``(train_dict, test_dict)``, each mapping the fold number to a
        DataFrame sorted by index. The shapes of each pair are printed as a
        side effect.
    """
    train_dict = defaultdict(list)
    test_dict = defaultdict(list)

    for k in fold_idxs.keys():
        print(f"\nFold {k}:")
        positions = np.asarray(fold_idxs[k], dtype=int)

        # Mark the held-out positions; everything unmarked is training data.
        held_out = np.zeros(len(df), dtype=bool)
        held_out[positions] = True

        train = df.iloc[np.flatnonzero(~held_out)]
        test = df.iloc[positions]
        print(train.shape, test.shape)
        train_dict[k] = train.sort_index()
        test_dict[k] = test.sort_index()
    return train_dict, test_dict

In [2]:
# Directory prefix under which the downloaded data file is stored.
PATH = "/tmp/data/"
# Seed NumPy's global RNG; the fold assignment draws from np.random, so this
# makes the random choices repeatable from one run to the next.
np.random.seed(21)

In [3]:
# download_url_to_filepath is a project-local helper (bm_util). Presumably it
# saves the URL to the given path unless the file already exists, and returns
# that path -- confirm against bm_util. The result is passed to pd.read_csv.
fp = download_url_to_filepath(PATH + 'wine.data','https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data')

/tmp/data/wine.data already exists.


In [4]:
# The raw file has no header row, so the column names are supplied here.
# The first column ('wine') is the class label; the rest are features.
wine_columns = [
    'wine', 'alcohol', 'malic_acid', 'ash', 'alcalinity', 'magnesium',
    'phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins',
    'color_intensity', 'hue', 'od280', 'proline',
]

wine_df = pd.read_csv(fp, header=None)
wine_df.columns = wine_columns

print(wine_df.shape)
wine_df.head()

(178, 14)


Unnamed: 0,wine,alcohol,malic_acid,ash,alcalinity,magnesium,phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [5]:
# Split the row positions into 5 folds stratified on the 'wine' class column,
# then print each fold's class counts and shares to check the balance.
fold_idxs = stratified_sampling(wine_df, "wine",5)
test_stratified_sampling(wine_df, "wine", fold_idxs)

Initial Class Distribution:
0: 59, 33.1%
1: 71, 39.9%
2: 48, 27.0%

There are 5 folds.

Fold 0
shape: (37, 14)
      count    pct
wine              
1        12  32.4%
2        15  40.5%
3        10  27.0%

Fold 1
shape: (36, 14)
      count    pct
wine              
1        12  33.3%
2        14  38.9%
3        10  27.8%

Fold 2
shape: (36, 14)
      count    pct
wine              
1        12  33.3%
2        14  38.9%
3        10  27.8%

Fold 3
shape: (35, 14)
      count    pct
wine              
1        12  34.3%
2        14  40.0%
3         9  25.7%

Fold 4
shape: (34, 14)
      count    pct
wine              
1        11  32.4%
2        14  41.2%
3         9  26.5%


In [6]:
# Build one train/test pair per fold: the fold's own rows are the test set,
# all other rows are the training set.
train_folds, test_folds = train_test_fold_split(wine_df, fold_idxs)


Fold 0:
(141, 14) (37, 14)

Fold 1:
(142, 14) (36, 14)

Fold 2:
(142, 14) (36, 14)

Fold 3:
(143, 14) (35, 14)

Fold 4:
(144, 14) (34, 14)


### Random Forest

In [7]:
# Preview the first training rows of fold 0.
train_folds[0].head()

Unnamed: 0,wine,alcohol,malic_acid,ash,alcalinity,magnesium,phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
5,1,14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450


In [8]:
# Fit a 100-tree random forest on fold 0 only.
rf = RandomForestClassifier(n_estimators=100)

# Shuffle the row order of both splits (frac=1 keeps every row).
train = train_folds[0].sample(frac=1)
test = test_folds[0].sample(frac=1)

# Column 0 is the class label; all remaining columns are features.
train_X, train_y = train.iloc[:, 1:], train.iloc[:, 0]
test_X, test_y = test.iloc[:, 1:], test.iloc[:, 0]

rf.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Score

In [9]:
# Score of the fitted forest on the held-out fold.
print(rf.score(test_X, test_y))

# One (predicted label, true label, per-class probabilities) tuple per test row.
[
    (predicted, actual, proba)
    for predicted, actual, proba in zip(rf.predict(test_X), test_y, rf.predict_proba(test_X))
]

0.972972972972973


[(3, 3, array([0.02, 0.15, 0.83])),
 (2, 2, array([0.01, 0.96, 0.03])),
 (3, 3, array([0. , 0.1, 0.9])),
 (2, 2, array([0.04, 0.96, 0.  ])),
 (2, 2, array([0., 1., 0.])),
 (1, 1, array([1., 0., 0.])),
 (1, 1, array([0.96, 0.03, 0.01])),
 (1, 1, array([0.64, 0.32, 0.04])),
 (2, 2, array([0.1 , 0.88, 0.02])),
 (2, 2, array([0.25, 0.71, 0.04])),
 (2, 2, array([0.01, 0.56, 0.43])),
 (1, 1, array([0.45, 0.45, 0.1 ])),
 (2, 2, array([0., 1., 0.])),
 (3, 3, array([0.01, 0.18, 0.81])),
 (2, 2, array([0.05, 0.89, 0.06])),
 (1, 1, array([0.95, 0.04, 0.01])),
 (3, 3, array([0.  , 0.14, 0.86])),
 (3, 3, array([0.05, 0.1 , 0.85])),
 (2, 2, array([0.01, 0.87, 0.12])),
 (2, 2, array([0.01, 0.99, 0.  ])),
 (1, 1, array([1., 0., 0.])),
 (3, 3, array([0.01, 0.03, 0.96])),
 (2, 2, array([0.24, 0.71, 0.05])),
 (1, 1, array([0.95, 0.04, 0.01])),
 (1, 1, array([0.72, 0.28, 0.  ])),
 (1, 2, array([0.67, 0.32, 0.01])),
 (1, 1, array([1., 0., 0.])),
 (1, 1, array([0.94, 0.06, 0.  ])),
 (3, 3, array([0.  , 0.01

Feature Importances

In [10]:
# Pair each importance with its feature name and list the largest first.
importance_pairs = zip(rf.feature_importances_, train_X.columns)
sorted(importance_pairs, reverse=True)

[(0.1759489150917862, 'proline'),
 (0.16030966623738002, 'color_intensity'),
 (0.15864423848197073, 'flavanoids'),
 (0.1171600877698009, 'hue'),
 (0.1164881828983997, 'alcohol'),
 (0.09088313556281631, 'od280'),
 (0.0436186334971713, 'phenols'),
 (0.033403477758907965, 'alcalinity'),
 (0.030985283933621786, 'magnesium'),
 (0.026276475287698133, 'proanthocyanins'),
 (0.021010401281257453, 'malic_acid'),
 (0.016037036337822087, 'ash'),
 (0.00923446586136741, 'nonflavanoid_phenols')]

K Folds

In [11]:
def random_forest(train, test, y_col, n_estimators=100, random_state=None):
    """Fit a random forest on one train/test pair, then print and return its metrics.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training and evaluation rows; both contain the label column.
    y_col : int
        Positional index of the label column. Every other column is used as
        a feature.
    n_estimators : int, optional
        Number of trees in the forest (default 100, as before).
    random_state : int or None, optional
        Passed to ``RandomForestClassifier``. The default ``None`` keeps the
        previous behaviour of leaving the estimator unseeded.

    Returns
    -------
    dict
        ``{"accuracy": float, "confusion_matrix": ndarray,
        "classification_report": str}`` for the test rows, so that callers
        can aggregate the results of several folds. The same values are
        printed.
    """
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

    # Separate the label column from the feature columns.
    train_X = train.drop(train.columns[y_col], axis=1)
    train_y = train.iloc[:, y_col]
    test_X = test.drop(test.columns[y_col], axis=1)
    test_y = test.iloc[:, y_col]

    rf.fit(train_X, train_y)
    pred_y = rf.predict(test_X)

    acc = accuracy_score(test_y, pred_y)
    cm = confusion_matrix(test_y, pred_y)
    cr = classification_report(test_y, pred_y)
    print(f"Accuracy:{acc:.2f}\nConfusionMatrix:\n{cm}\nClassificationReport:\n{cr}")

    return {
        "accuracy": acc,
        "confusion_matrix": cm,
        "classification_report": cr,
    }

In [12]:
# Train and evaluate one forest per fold; each split is shuffled first.
n_folds = len(train_folds)
for k in range(n_folds):
    print(f"\nFold {k}:\n")
    train = train_folds[k].sample(frac=1)
    test = test_folds[k].sample(frac=1)
    random_forest(train, test, y_col=0)


Fold 0:

Accuracy:0.95
ConfusionMatrix:
[[11  1  0]
 [ 1 14  0]
 [ 0  0 10]]
ClassificationReport:
              precision    recall  f1-score   support

           1       0.92      0.92      0.92        12
           2       0.93      0.93      0.93        15
           3       1.00      1.00      1.00        10

   micro avg       0.95      0.95      0.95        37
   macro avg       0.95      0.95      0.95        37
weighted avg       0.95      0.95      0.95        37


Fold 1:

Accuracy:1.00
ConfusionMatrix:
[[12  0  0]
 [ 0 14  0]
 [ 0  0 10]]
ClassificationReport:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00        10

   micro avg       1.00      1.00      1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36


Fold 2:

Accuracy:0.94
ConfusionMatri