Loading data
------------
 
Loading tabular data that mixes numerical and categorical features into sklearn can be a bit cumbersome.
You can use the following function as an example of how to process your own dataset.


In [7]:
import numpy
import csv
from sklearn.feature_extraction import DictVectorizer

def read_adult_data(f):
    '''Read adult dataset and return feature array, and sequence of targets.

    Every non-empty row must have at least 15 fields. Columns 0, 2, 4, 10,
    11 and 12 are parsed as floats. Columns 1, 3, 5, 6, 7, 8, 9 and 13 are
    treated as categorical and converted to dummies with DictVectorizer.
    Column 14 is the target and is kept as is.

    Parameters
    ----------
    f : path of the CSV file to read (anything accepted by open()).

    Returns
    -------
    (X, Y) : tuple of two numpy arrays
        X has one row per record: the six numerical columns first, followed
        by the dummy columns. Y holds the target value of each record.

    Raises
    ------
    ValueError
        If the file contains no data rows, or if a numerical field cannot
        be converted to float.
    IndexError
        If a non-empty row has fewer than 15 fields.
    '''
    data = []
    # Use a context manager so the file handle is closed deterministically,
    # even when parsing a row raises.
    with open(f) as handle:
        for row in csv.reader(handle):
            if row: # skip empty lines
                # convert each row into three items:
                # (a) list of numerical features
                # (b) dict of categorical features
                # (c) target value
                nums = [ row[0], row[2], row[4], row[10], row[11], row[12] ]
                cats = { 'workclass': row[1], 'education': row[3], 'marital-status': row[5],
                         'occupation': row[6], 'relationship': row[7], 'race': row[8],
                         'sex': row[9], 'native-country': row[13] }
                target = row[14]
                data.append((nums, cats, target))
    if not data:
        # Without this guard the unpacking of zip(*data) below fails with an
        # unhelpful "not enough values to unpack" ValueError.
        raise ValueError('no data rows found in %r' % (f,))
    # Now we can use the DictVectorizer to create dummy variables for the categorical features
    (Nums, Cats, Y) = zip(*data)
    vec = DictVectorizer(sparse=False)
    X_cat = vec.fit_transform(Cats)
    # Convert numerical features into an array of floats
    X_num = numpy.array(Nums, dtype='float')
    # We now need to put the two types of variables in a single array
    X = numpy.hstack([X_num, X_cat]) # hstack puts columns next to each other
    return (X, numpy.array(Y))

In [8]:
# Load the dataset: X is the feature array, Y the corresponding targets
X, Y = read_adult_data('adult.data.csv')

In [9]:
# Dimensions of the feature array, as (rows, columns)
X.shape

(32561, 108)

In [10]:
# One target per record, so this should match the number of rows in X
Y.shape

(32561,)