### Stratified Sampling Example
Use stratified sampling when the sample set contains inhomogenous populations (non equal proportions). The stratified shuffle split object contains the train_index & test_index, NOTE: that this might only be accessed by looping within the strat-shuf-split object

In [1]:
import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4],[8,7],[5,4],[19,32], [5,1],[7,4],[9,4],
             [1, 2], [3, 4], [1, 2], [3, 4],[8,7],[5,4],[19,32], [5,1],[7,4],[9,4]])
y = np.array([0, 0, 0,0,0,0,0, 0, 0,0,0,0,2,2,2,2,2,2,1,1])

In [2]:
sss = StratifiedShuffleSplit(y, 2, test_size=0.33, random_state=0)
print(sss)

StratifiedShuffleSplit(labels=[0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 1 1], n_iter=2, test_size=0.33, random_state=0)


In [3]:
type(sss)

sklearn.cross_validation.StratifiedShuffleSplit

In [4]:
for train_index, test_index in sss:
    print("TRAIN indices:", train_index, "TEST indices:", test_index)
#...    X_train, X_test = X[train_index], X[test_index]

TRAIN indices: [ 4  1 13 12  6 10  2 15 17  8 11 19  7] TEST indices: [18 14  3 16  5  0  9]
TRAIN indices: [ 2 18  1 10  4 16 12  9 13  6  8 14  5] TEST indices: [19 17  3  7  0 15 11]


In [5]:
print(y[train_index])
print(y[test_index])

[0 1 0 0 0 2 2 0 2 0 0 2 0]
[1 2 0 0 0 2 0]


#### Test on Real Data

In [6]:
# Test on real data
import pandas as pd
features_df = pd.DataFrame.from_csv("well_data.csv") # features data
labels_df   = pd.DataFrame.from_csv("well_labels.csv") # labels

In [7]:
#function to map the labels from string to ints
def label_map(y):
   if y=="functional":
       return 2
   elif y=="functional needs repair":
       return 1
   else:
       return 0
labels_df = labels_df.applymap(label_map) # map labels
#test
print( labels_df.head() )

       status_group
id                 
69572             2
8776              2
34310             2
67743             0
19728             2


In [8]:
# Check that the labels have been encoded to ints 0,1,2
labels_df['status_group'].unique()

array([2, 0, 1])

In [9]:
# NOTE the data prep is ommited - droping columns, string encoding and OHE
# Can't use the data as is in ML algo

# The features and labels are taken out of their dataframe
# and put into a numpy.ndarray and list, respectively.
X = features_df.as_matrix()
y = labels_df["status_group"]# .tolist()

In [10]:
# Data setup
#X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X, y, test_size=0.33, random_state=42)

#sss = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=0)
#X_train, X_test, y_train, y_test = StratifiedShuffleSplit(X, y, test_size=0.33, random_state=42)

sss = StratifiedShuffleSplit(y,2, test_size=0.2, random_state=42)
for train_index, test_index in sss:
    print("TRAIN:", train_index, "TEST:", test_index)
    #X_train, X_test = X[train_index], X[test_index]
    #y_train, y_test = y[train_index], y[test_index]

TRAIN: [56033 42149 50644 ..., 23965 52811 37112] TEST: [52237  4107 28169 ..., 54782  2110  3833]
TRAIN: [31788 54920  3600 ..., 37340 40168 39623] TEST: [21495  5759  4607 ..., 28518   447 26835]


In [11]:
print ('Training n: ' + str(len(train_index)))
print ('Test n: ' + str(len(test_index)))

Training n: 47520
Test n: 11880


In [12]:
test_proportion = len(test_index) / len(train_index)
print(test_proportion)

0.25


In [13]:
#y[test_index] # <- NaNs in here are I guess the training data
test_label_df = pd.DataFrame(y[test_index], columns=['labels'])

Check the proportions of labels in % the training set

In [14]:
train0 = y[train_index]
train0 = train0.to_frame()
train0.groupby('status_group').size() * 100 / len(train_index)

status_group
0    30.677609
1     5.784933
2    43.396465
dtype: float64

Check the proportions in % of labels in the test set

In [15]:
test0 = y[test_index]
test0 = test0.to_frame()
test0.groupby('status_group').size() * 100 / len(test_index)

status_group
0    30.808081
1     6.153199
2    43.367003
dtype: float64