In [1]:
import numpy as np

In [2]:
def read_training_data(fname, D=None):
    """Given a file in appropriate format, and given a set D of features,
    returns the pair (A, b) consisting of
    a P-by-D matrix A and a P-vector b,
    where P is a set of patient identification integers (IDs).

    For each patient ID p,
      - row p of A is the D-vector describing patient p's tissue sample,
      - entry p of b is +1 if patient p's tissue is malignant, and -1 if it is benign.

    The set D of features must be a subset of the features in the data (see text).
    """
    file = open(fname)
    params = ["radius", "texture", "perimeter","area","smoothness","compactness","concavity","concave points","symmetry","fractal dimension"];
    stats = ["(mean)", "(stderr)", "(worst)"]
    feature_labels = set([y+x for x in stats for y in params])
    feature_map = {params[i]+stats[j]:j*len(params)+i for i in range(len(params)) for j in range(len(stats))}
    
    patient_ids = []
    feature_vectors = []
    patient_diagnoses = []
    for line in file:
        row = line.split(",")
        patient_ID = int(row[0])
        patient_ids.append(patient_ID)
        
        patient_diagnoses.append(-1 if row[1]=='B' else +1)
        feature_vectors.append([float(row[feature_map[key]+2]) for key in feature_labels])
    return feature_labels, patient_ids, np.matrix(feature_vectors), np.array(patient_diagnoses) 

In [38]:
labels, ids, matrix, vectors = read_training_data('train.data')

In [42]:
print(labels)

{'concave points(mean)', 'area(mean)', 'concavity(worst)', 'concave points(worst)', 'concavity(mean)', 'fractal dimension(mean)', 'perimeter(mean)', 'symmetry(stderr)', 'texture(worst)', 'smoothness(stderr)', 'area(worst)', 'texture(mean)', 'symmetry(worst)', 'compactness(worst)', 'smoothness(worst)', 'texture(stderr)', 'radius(mean)', 'smoothness(mean)', 'fractal dimension(stderr)', 'perimeter(worst)', 'fractal dimension(worst)', 'area(stderr)', 'compactness(mean)', 'symmetry(mean)', 'radius(worst)', 'concavity(stderr)', 'concave points(stderr)', 'perimeter(stderr)', 'compactness(stderr)', 'radius(stderr)'}


In [101]:
def signum(u):
    return np.array([1 if v >= 0 else -1 for k, v in np.ndenumerate(u)])

In [102]:
signum(np.array([10, -20, 5, -3, 6]))

array([ 1, -1,  1, -1,  1])

In [197]:
def fraction_wrong(A, b, w):
    hypers_size = len(b)
#     hypers = []
#     for row in A:
#         row = np.array(row)
#         hypers.append(row.dot(w)[0])
#     hypers = np.array(hypers)
#     diff = signum(hypers).dot(b)

#     print(hypers_size)
#     print(signum(A.dot(w)).dot(b))
    hypers = signum(A.dot(w))
    diff = hypers.dot(b)
    diff = (hypers_size - diff) * 0.5
    ratio  = diff / hypers_size
    
    return ratio
    

In [198]:
w = [v for v in np.random.rand(30, 1)]
print(w)
# w = [1 for v in range(30)]
fraction_wrong(matrix, vectors, w)

[array([ 0.46911291]), array([ 0.60486061]), array([ 0.81597776]), array([ 0.34118145]), array([ 0.45119072]), array([ 0.96129867]), array([ 0.30786488]), array([ 0.70298255]), array([ 0.25242172]), array([ 0.46328544]), array([ 0.06505453]), array([ 0.326035]), array([ 0.76896294]), array([ 0.50223629]), array([ 0.15570007]), array([ 0.75132947]), array([ 0.52716082]), array([ 0.93762763]), array([ 0.35465296]), array([ 0.75453297]), array([ 0.5082789]), array([ 0.66293391]), array([ 0.70231719]), array([ 0.39615524]), array([ 0.1711996]), array([ 0.64834661]), array([ 0.08300025]), array([ 0.85899526]), array([ 0.01317044]), array([ 0.0270838])]


0.51333333333333331

In [294]:
def loss(A, b, w):
    hypersis = np.squeeze(np.asarray(A.dot(w)))
    error = hypersis - b
    return error.dot(error)

In [295]:
w = [v for v in np.random.rand(30, 1)]
loss(matrix, vectors, w)

165274300.22408837

In [296]:
w = [v for v in np.random.rand(30, 1)]
loss(matrix, vectors, w)

1154981887.8781612

In [297]:
w = [v for v in np.random.rand(30, 1)]
loss(matrix, vectors, w)

332459090.07432556

In [301]:
def find_grad(A, b, w):
    hypersis = np.squeeze(np.asarray(A.dot(w)))
    error = np.transpose(hypersis - b)
    print(error.dot(A))
    return error

In [302]:
w = [v for v in np.random.rand(30, 1)]
find_grad(matrix, vectors, w)

[[  1.56294990e+04   1.91874521e+08   7.86955962e+04   3.30483041e+04
    2.82106243e+04   1.38540990e+04   2.37323817e+07   4.67446871e+03
    6.05692232e+06   1.54272896e+03   2.71593310e+08   4.52049397e+06
    6.81665853e+04   6.89546465e+04   3.05812068e+04   2.69795013e+05
    3.61485122e+06   2.21166557e+04   8.85559958e+02   2.84724265e+07
    1.92266481e+04   1.39691500e+07   2.78799062e+04   4.18489218e+04
    4.28848910e+06   8.30611541e+03   3.00568364e+03   8.59542121e+05
    6.56592621e+03   1.21049198e+05]]


array([ 1207.58417909,  1387.98215533,  1275.23175806,   465.29136366,
        1314.61083001,   551.90285178,  1123.56409545,   668.00716729,
         589.65337185,   554.7418403 ,   866.87751222,   883.44764606,
        1180.33610101,   808.2854771 ,   625.9040868 ,   738.47897663,
         782.42866777,   903.32843859,  1441.28066649,   607.38228819,
         557.98975682,   313.81901504,   768.18402154,  1570.05896817,
        1150.54574224,  1034.93445598,   709.89988873,  1145.94641795,
         853.87879558,   996.39770047,  1201.52191027,   564.42373919,
         988.4520909 ,  1258.54300757,   886.21228743,   935.09409009,
         684.09594042,   545.36718926,   747.93630299,   613.30000903,
         622.71307283,   436.02983818,  1219.67985997,   636.86737965,
         592.66066237,  1148.41220265,   247.78540461,   597.69522434,
         497.95847371,   609.87092605,   480.66544204,   599.82105794,
         480.10207846,  1088.39809327,   778.61645857,   459.20964652,
      