# Stratified Cross Validation

Example solution: building stratified k-fold cross-validation splits by hand, using the UCI Wine dataset.

April 2019

In [1]:
import os
import re
import numpy as np
import pandas as pd
import urllib.request
from collections import defaultdict

def setup_path(filepath, url):
    """Create the directory for ``filepath`` and download ``url`` into it.

    The download is skipped when something already exists at ``filepath``.

    Parameters
    ----------
    filepath : str
        Destination path written with ``/`` separators and containing at
        least one directory component, e.g. ``'/tmp/data/wine.data'``.
    url : str
        Location handed to ``urllib.request.urlretrieve`` when the file is
        missing.

    Returns
    -------
    tuple of (str, str, str)
        ``(fd, fn, fp)``: the directory part including its trailing ``/``,
        the file name, and the two concatenated.

    Raises
    ------
    IndexError
        If ``filepath`` is not of the form ``<directory>/<file name>``.
    """
    parts = re.findall(r"^(.+\/)([^\/]+)$", filepath)
    if not parts:
        raise IndexError(
            f"filepath must look like '<directory>/<file name>', got {filepath!r}"
        )
    fd, fn = parts[0]
    fp = fd + fn
    # makedirs creates every missing parent level and does not fail when the
    # directory already exists (os.mkdir handles neither case).
    os.makedirs(fd, exist_ok=True)
    if not os.path.exists(fp):
        urllib.request.urlretrieve(url, fp)
    return fd, fn, fp

def stratified_sampling(df, cls_col, k):
    """Randomly assign row positions to ``k`` folds, preserving class proportions.

    Rows are grouped by the integer value found in ``cls_col``. Each group is
    shuffled with NumPy's global random generator and dealt round-robin to the
    folds ``0 .. k-1``, restarting at fold 0 for every class, so the per-class
    counts of any two folds differ by at most one.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split. Only ``cls_col`` and the number of rows are used.
    cls_col : str
        Name of the column holding the class label; values must be
        convertible with ``int()``.
    k : int
        Number of folds, at least 1.

    Returns
    -------
    collections.defaultdict
        Maps each fold number ``0 .. k-1`` to a list of positional row
        indexes (suitable for ``df.iloc``). Folds may be empty when a class
        has fewer than ``k`` rows.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    class_dict = defaultdict(list)
    # Create every fold up front so the result always has exactly k entries,
    # in fold order, even when some folds receive no rows.
    fold_idxs = defaultdict(list, {fold: [] for fold in range(k)})

    # find distribution by class (row positions, not index labels)
    for pos, cls in enumerate(df[cls_col]):
        class_dict[int(cls)].append(pos)

    print("Initial Class Distribution:")
    # NOTE: the number printed first is the ordinal of the class in
    # first-seen order, not the class label itself.
    for i, idxs in enumerate(class_dict.values()):
        print(f"{i}: {len(idxs)}, {100*(len(idxs)/len(df)):.1f}%")

    # distribute each class's row positions evenly over the k folds
    for idxs in class_dict.values():
        for j, idx in enumerate(np.random.permutation(idxs)):
            # int() keeps plain Python ints in the result, not NumPy scalars
            fold_idxs[j % k].append(int(idx))

    # sanity check: every row position was assigned to exactly one fold
    assigned = [idx for fold in fold_idxs.values() for idx in fold]
    assert len(set(assigned)) == len(assigned) == len(df)
    return fold_idxs

def test_stratified_sampling(df, cls_col, fold_idxs):
    """Test the functionality of stratified_sampling."""
    print(f"\nThere are {len(fold_idxs.keys())} folds.")
    for k in fold_idxs.keys():
        dfk = df.iloc[fold_idxs[k]]
        print(f"\nFold {k}\nshape: {dfk.shape}")
        dfk = pd.DataFrame(dfk.groupby(cls_col)[cls_col].count())
        dfk.columns = ['count']
        dfk['pct'] = dfk['count'].apply(lambda x: f"{100*(x / np.sum(dfk['count'])):.1f}%")
        print(dfk)
        
def train_test_fold_split(df, fold_idxs):
    """Build the train and test frames for every fold.

    For each fold the test set is the rows of ``df`` at the fold's positions
    and the training set is every other row. Both are selected by position,
    so the split is correct whatever index labels ``df`` carries.

    Parameters
    ----------
    df : pandas.DataFrame
        The full data set.
    fold_idxs : mapping
        Maps a fold key to a list of positional row indexes of ``df``.

    Returns
    -------
    tuple of (collections.defaultdict, collections.defaultdict)
        ``(train_dict, test_dict)``, each mapping the fold key to a DataFrame
        sorted by index.
    """
    train_dict = defaultdict(list)
    test_dict = defaultdict(list)

    for k in fold_idxs.keys():
        print(f"\nFold {k}:")
        # Mark the fold's rows by position; df.drop would interpret the
        # numbers as index labels, which only matches for a default RangeIndex.
        is_test = np.zeros(len(df), dtype=bool)
        is_test[fold_idxs[k]] = True
        train = df.iloc[~is_test]
        test = df.iloc[fold_idxs[k]]
        print(train.shape, test.shape)
        train_dict[k] = train.sort_index()
        test_dict[k] = test.sort_index()
        print(train_dict[k].head())
        print(test_dict[k].head())
    return train_dict, test_dict

In [2]:
# Fetch the wine data into a local directory; setup_path skips the download
# when the file is already there.
fd, fn, fp = setup_path(
    filepath='/tmp/data/wine.data',
    url='https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
)

In [3]:
# The raw file has no header row, so the column names are supplied here.
# The first column ('wine') is the class label used for stratification.
wine_columns = [
    'wine',
    'alcohol',
    'malic_acid',
    'ash',
    'alcalinity',
    'magnesium',
    'phenols',
    'flavanoids',
    'nonflavanoid_phenols',
    'proanthocyanins',
    'color_intensity',
    'hue',
    'od280',
    'proline',
]
wine_df = pd.read_csv(fp, header=None)
wine_df.columns = wine_columns
print(wine_df.shape)
wine_df.head()

(178, 14)


Unnamed: 0,wine,alcohol,malic_acid,ash,alcalinity,magnesium,phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:
# stratified_sampling draws from NumPy's global random generator; seed it so
# the fold assignment (and everything derived from it below) is reproducible
# under Restart Kernel -> Run All.
SEED = 42
N_FOLDS = 5
np.random.seed(SEED)

fold_idxs = stratified_sampling(wine_df, "wine", N_FOLDS)
test_stratified_sampling(wine_df, "wine", fold_idxs)

Initial Class Distribution:
0: 59, 33.1%
1: 71, 39.9%
2: 48, 27.0%

There are 5 folds.

Fold 0
shape: (37, 14)
      count    pct
wine              
1        12  32.4%
2        15  40.5%
3        10  27.0%

Fold 1
shape: (36, 14)
      count    pct
wine              
1        12  33.3%
2        14  38.9%
3        10  27.8%

Fold 2
shape: (36, 14)
      count    pct
wine              
1        12  33.3%
2        14  38.9%
3        10  27.8%

Fold 3
shape: (35, 14)
      count    pct
wine              
1        12  34.3%
2        14  40.0%
3         9  25.7%

Fold 4
shape: (34, 14)
      count    pct
wine              
1        11  32.4%
2        14  41.2%
3         9  26.5%


In [5]:
# One train/test DataFrame pair per fold, keyed by fold number.
train_folds, test_folds = train_test_fold_split(
    df=wine_df,
    fold_idxs=fold_idxs,
)


Fold 0:
(141, 14) (37, 14)
   wine  alcohol  malic_acid   ash  alcalinity  magnesium  phenols  \
0     1    14.23        1.71  2.43        15.6        127     2.80   
1     1    13.20        1.78  2.14        11.2        100     2.65   
3     1    14.37        1.95  2.50        16.8        113     3.85   
4     1    13.24        2.59  2.87        21.0        118     2.80   
5     1    14.20        1.76  2.45        15.2        112     3.27   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   
5        3.39                  0.34             1.97             6.75  1.05   

   od280  proline  
0   3.92     1065  
1   3.40     1050  
3   3.45     148