In [1]:
import numpy as np

In [2]:
def read_training_data(fname, D=None):
    """Read a comma-separated tissue-sample file and build the training data.

    Each non-blank line of the file is expected to hold, in order: an integer
    patient ID, a diagnosis letter ('B' for benign, anything else is treated
    as malignant), and then 30 numeric features laid out as the ten
    parameters for "(mean)", then for "(stderr)", then for "(worst)".

    Parameters
    ----------
    fname : str
        Path of the data file.
    D : iterable of str, optional
        Feature names to keep, e.g. ``{"radius(mean)", "area(worst)"}``.
        Must be a subset of the 30 known feature names.  ``None`` (the
        default) keeps every feature.

    Returns
    -------
    feature_labels : set of str
        The selected feature names.  Iterating this set yields the names in
        the same order as the columns of the returned matrix.
    patient_ids : list of int
        Patient IDs in file order (one per row of the matrix).
    A : numpy.matrix
        One row per patient, one column per selected feature.
    b : numpy.ndarray
        +1 for a malignant sample and -1 for a benign one, in file order.

    Raises
    ------
    ValueError
        If D names a feature that is not in the data, or if a line cannot be
        parsed as numbers.
    """
    params = ["radius", "texture", "perimeter", "area", "smoothness",
              "compactness", "concavity", "concave points", "symmetry",
              "fractal dimension"]
    stats = ["(mean)", "(stderr)", "(worst)"]
    # Feature params[i]+stats[j] lives at offset j*len(params)+i among the
    # feature columns (which start after the ID and diagnosis columns).
    feature_map = {param + stat: j * len(params) + i
                   for j, stat in enumerate(stats)
                   for i, param in enumerate(params)}

    if D is None:
        feature_labels = set(feature_map)
    else:
        feature_labels = set(D)
        unknown = feature_labels - set(feature_map)
        if unknown:
            raise ValueError("unknown feature(s): "
                             + ", ".join(sorted(repr(u) for u in unknown)))

    # Resolve the column indices once; the order follows the iteration order
    # of feature_labels so callers can line labels up with matrix columns.
    columns = [feature_map[key] + 2 for key in feature_labels]

    patient_ids = []
    feature_vectors = []
    patient_diagnoses = []
    # The context manager guarantees the file is closed even on a parse error.
    with open(fname) as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            row = line.split(",")
            patient_ids.append(int(row[0]))
            patient_diagnoses.append(-1 if row[1].strip() == 'B' else +1)
            feature_vectors.append([float(row[c]) for c in columns])
    return feature_labels, patient_ids, np.matrix(feature_vectors), np.array(patient_diagnoses)

In [3]:
# Load the training set: feature names, patient IDs, the feature matrix
# (one row per patient) and the +1/-1 diagnosis vector.
labels, ids, matrix, vectors = read_training_data('train.data')

In [4]:
# Show the feature names; the set's iteration order is the column order of `matrix`.
print(labels)

{'area(worst)', 'texture(mean)', 'texture(worst)', 'smoothness(worst)', 'perimeter(mean)', 'concavity(stderr)', 'area(mean)', 'concavity(mean)', 'compactness(mean)', 'radius(worst)', 'radius(stderr)', 'fractal dimension(worst)', 'fractal dimension(stderr)', 'texture(stderr)', 'compactness(stderr)', 'concave points(mean)', 'smoothness(stderr)', 'perimeter(stderr)', 'symmetry(mean)', 'smoothness(mean)', 'concave points(stderr)', 'fractal dimension(mean)', 'area(stderr)', 'perimeter(worst)', 'symmetry(stderr)', 'symmetry(worst)', 'compactness(worst)', 'concave points(worst)', 'concavity(worst)', 'radius(mean)'}


In [5]:
def signum(u):
    """Return the element-wise sign of u as a flat array of +1 / -1.

    Every entry that is >= 0 (including zero) maps to 1; every other entry
    maps to -1.  The input may be a scalar, a sequence, an ndarray or a
    numpy.matrix of any shape; the result is always 1-D, with entries in
    row-major order.
    """
    # np.asarray drops the matrix subclass so ravel() really yields 1-D data.
    flat = np.asarray(u).ravel()
    # Vectorized comparison instead of a Python-level loop over the elements.
    return np.where(flat >= 0, 1, -1)

In [6]:
# Sanity check: non-negative entries should map to 1, negative entries to -1.
signum(np.array([10, -20, 5, -3, 6]))

array([ 1, -1,  1, -1,  1])

In [7]:
def fraction_wrong(A, b, w):
    """Return the fraction of rows of A whose predicted sign disagrees with b.

    The prediction for each row is signum(A.dot(w)).  The count below is a
    true misclassification count only when the entries of b are +1 / -1.
    """
    total = len(b)
    predicted = signum(A.dot(w))
    # Matching signs contribute +1 to the dot product and mismatches -1,
    # so agreement == total - 2 * (number of mismatches).
    agreement = predicted.dot(b)
    return (total - agreement) * 0.5 / total
    

In [8]:
# Try a random weight vector: 30 uniform draws, kept as a list of 1-element arrays.
w = list(np.random.rand(30, 1))
print(w)
fraction_wrong(matrix, vectors, w)

[array([ 0.57403485]), array([ 0.05954288]), array([ 0.60268446]), array([ 0.31165047]), array([ 0.38284123]), array([ 0.42890859]), array([ 0.08663268]), array([ 0.02679945]), array([ 0.16893255]), array([ 0.04541315]), array([ 0.64565223]), array([ 0.96468283]), array([ 0.84961533]), array([ 0.52311106]), array([ 0.60688703]), array([ 0.88388626]), array([ 0.73639869]), array([ 0.9467534]), array([ 0.13561554]), array([ 0.13146509]), array([ 0.508663]), array([ 0.65752444]), array([ 0.33211096]), array([ 0.21561649]), array([ 0.95230612]), array([ 0.65418449]), array([ 0.86757825]), array([ 0.64151012]), array([ 0.93600711]), array([ 0.49694614])]


0.51333333333333331

In [9]:
def loss(A, b, w):
    """Return the squared-error loss: the sum over rows of (A.dot(w) - b)**2.

    A.dot(w) is converted to a plain ndarray and squeezed to drop
    length-1 axes before b is subtracted.
    """
    predictions = np.asarray(A.dot(w)).squeeze()
    residual = predictions - b
    return residual.dot(residual)

In [10]:
# Evaluate the loss at a fresh random weight vector (30 uniform draws).
w = list(np.random.rand(30, 1))
loss(matrix, vectors, w)

576691675.86760688

In [11]:
# Second random draw, to see how strongly the loss varies with w.
w = list(np.random.rand(30, 1))
loss(matrix, vectors, w)

415756459.6047259

In [12]:
# Third random draw of the weights, again evaluating the loss.
w = list(np.random.rand(30, 1))
loss(matrix, vectors, w)

232299724.45958215

In [13]:
def find_grad(A, b, w):
    """Return the gradient, with respect to w, of the squared-error loss.

    For L(w) = (A.dot(w) - b) . (A.dot(w) - b), the gradient is
    2 * (A.dot(w) - b) . A.  The result is a plain ndarray with one entry
    per column of A (length-1 axes are squeezed away).
    """
    predictions = np.squeeze(np.asarray(A.dot(w)))
    residual = predictions - b
    # The factor 2 comes from differentiating the square; without it the
    # result is only half of the true gradient of the loss.
    return 2 * np.squeeze(np.asarray(residual.dot(A)))

In [14]:
# Evaluate the gradient at a random weight vector (30 uniform draws).
w = list(np.random.rand(30, 1))
find_grad(matrix, vectors, w)

array([  2.73918078e+08,   4.54415932e+06,   6.10168390e+06,
         3.07765066e+04,   2.37988659e+07,   8.29364082e+03,
         1.92187824e+08,   2.83012990e+04,   2.80275301e+04,
         4.31359685e+06,   1.20955669e+05,   1.93596424e+04,
         8.85090237e+02,   2.69672987e+05,   6.56808200e+03,
         1.56899049e+04,   1.53903264e+03,   8.57704314e+05,
         4.20137911e+04,   2.22030488e+04,   3.00576650e+03,
         1.39093972e+04,   1.39200822e+07,   2.86375160e+07,
         4.66116348e+03,   6.86350668e+04,   6.97084078e+04,
         3.33284679e+04,   7.95026307e+04,   3.62505634e+06])