In [34]:
import numpy as np

In [37]:
def read_training_data(fname, D=None):
    """Read patient tissue-sample records from a comma-separated file.

    Each non-blank line of the file is expected to hold a patient ID, a
    diagnosis letter, and then 30 numeric feature values ordered as the ten
    parameters for "(mean)", then for "(stderr)", then for "(worst)".

    Args:
        fname: Path of the data file to read.
        D: Optional iterable of feature labels such as "radius(mean)" or
            "area(worst)".  When None (the default), all 30 features are used.

    Returns:
        A 4-tuple (feature_labels, patient_ids, A, b) where
          - feature_labels is the set of selected feature labels,
          - patient_ids is the list of integer patient IDs in file order,
          - A is an np.matrix with one row per patient and one column per
            selected feature; the columns follow the iteration order of the
            returned feature_labels set,
          - b is an np.array holding -1 where the diagnosis letter is 'B'
            (benign) and +1 otherwise (malignant).

        NOTE: set iteration order for strings is not stable across
        interpreter runs, so always pair columns with labels by iterating the
        returned feature_labels object rather than assuming a fixed order.

    Raises:
        ValueError: If D contains a label that is not a known feature.
    """
    params = ["radius", "texture", "perimeter", "area", "smoothness",
              "compactness", "concavity", "concave points", "symmetry",
              "fractal dimension"]
    stats = ["(mean)", "(stderr)", "(worst)"]
    # Label -> zero-based position among the 30 feature columns of a row.
    feature_map = {
        param + stat: j * len(params) + i
        for j, stat in enumerate(stats)
        for i, param in enumerate(params)
    }

    if D is None:
        feature_labels = set(feature_map)
    else:
        feature_labels = set(D)
        unknown = feature_labels.difference(feature_map)
        if unknown:
            raise ValueError(
                "unknown features: " + ", ".join(sorted(map(str, unknown)))
            )

    # Resolve the file columns once; the +2 skips the ID and diagnosis fields.
    columns = [feature_map[key] + 2 for key in feature_labels]

    patient_ids = []
    feature_vectors = []
    patient_diagnoses = []
    # The context manager guarantees the file is closed even if parsing fails.
    with open(fname) as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            row = line.split(",")
            patient_ids.append(int(row[0]))
            patient_diagnoses.append(-1 if row[1].strip() == 'B' else +1)
            feature_vectors.append([float(row[col]) for col in columns])
    return (feature_labels, patient_ids, np.matrix(feature_vectors),
            np.array(patient_diagnoses))

In [38]:
# Load 'train.data' without a feature selection; as the cell's last
# expression, the returned tuple is echoed as the cell output.
read_training_data('train.data')

({'area(mean)',
  'area(stderr)',
  'area(worst)',
  'compactness(mean)',
  'compactness(stderr)',
  'compactness(worst)',
  'concave points(mean)',
  'concave points(stderr)',
  'concave points(worst)',
  'concavity(mean)',
  'concavity(stderr)',
  'concavity(worst)',
  'fractal dimension(mean)',
  'fractal dimension(stderr)',
  'fractal dimension(worst)',
  'perimeter(mean)',
  'perimeter(stderr)',
  'perimeter(worst)',
  'radius(mean)',
  'radius(stderr)',
  'radius(worst)',
  'smoothness(mean)',
  'smoothness(stderr)',
  'smoothness(worst)',
  'symmetry(mean)',
  'symmetry(stderr)',
  'symmetry(worst)',
  'texture(mean)',
  'texture(stderr)',
  'texture(worst)'},
 [842302,
  842517,
  84300903,
  84348301,
  84358402,
  843786,
  844359,
  84458202,
  844981,
  84501001,
  845636,
  84610002,
  846226,
  846381,
  84667401,
  84799002,
  848406,
  84862001,
  849014,
  8510426,
  8510653,
  8510824,
  8511133,
  851509,
  852552,
  852631,
  852763,
  852781,
  852973,
  853201,
  