In [2]:
import os, glob, csv, time
import numpy as np

In [3]:
# Fixed seed so that weight initialisation and shuffling are repeatable.
np.random.seed(1234)


def randomize():
    """Re-seed NumPy's global RNG from the wall clock (non-reproducible runs).

    ``np.random.seed`` only accepts integer seeds in ``[0, 2**32)``; handing it
    the float returned by ``time.time()`` raises ``TypeError``. The timestamp is
    therefore truncated to whole seconds and wrapped into the valid range.
    """
    np.random.seed(int(time.time()) % (2 ** 32))

In [4]:
# Mean and standard deviation of the normal distribution that init_model()
# samples the initial weights from.
RND_MEAN = 0
RND_STD = 0.003

# Step size that backprop_neuralnet() multiplies each gradient by.
LEARNING_RATE = 1e-3

In [5]:
def main(epoch_count=10, mb_size=10, report=1):
    """Run the whole experiment: load the data, initialise the model, train.

    Args:
        epoch_count: number of training epochs handed to ``trainer``.
        mb_size: mini-batch size handed to ``trainer``.
        report: reporting interval (in epochs) handed to ``trainer``.
    """
    # The three stages communicate through module-level globals, so they
    # must run in exactly this order.
    load_dataset()
    init_model()
    trainer(epoch_count, mb_size, report)

In [23]:
def load_dataset():
    """Read ``./pulsar_stars.csv`` into the module-level ``data`` array.

    The first line of the file is skipped as a header; every remaining row is
    kept. Besides ``data`` (float32), the globals ``input_cnt`` and
    ``output_cnt`` are set to 8 and 1 respectively.
    """
    global data, input_cnt, output_cnt
    with open('./pulsar_stars.csv') as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # discard the header line
        records = list(reader)
    input_cnt, output_cnt = 8, 1
    # Turn the nested list of strings into a numeric array.
    data = np.asarray(records, dtype='float32')
    
def init_model():
    """Create the global ``weight`` matrix and ``bias`` vector.

    ``weight`` is drawn from a normal distribution parameterised by
    ``RND_MEAN`` / ``RND_STD`` with shape ``(input_cnt, output_cnt)``;
    ``bias`` starts at zero with ``output_cnt`` entries.
    """
    global weight, bias, input_cnt, output_cnt
    weight_shape = (input_cnt, output_cnt)
    weight = np.random.normal(loc=RND_MEAN, scale=RND_STD, size=weight_shape)
    bias = np.zeros(output_cnt)
    
def trainer(epoch_count, mb_size, report):
    """Train the global model with mini-batch updates and print the accuracy.

    Args:
        epoch_count: number of passes over the training split.
        mb_size: number of rows per mini-batch.
        report: print a progress line every ``report`` epochs; a value <= 0
            disables the per-epoch lines (the final line is always printed).

    ``run_train`` / ``run_test`` may hand back either a scalar accuracy or a
    metric sequence whose first entry is the accuracy (the form produced by
    ``refine_eval_accuracy``). Only the accuracy is reported here.
    """
    def primary_metric(result):
        # Reduce a scalar or an [accuracy, precision, recall, f1] sequence to
        # its leading value. Formatting a list with '{:5.3f}' raises TypeError,
        # and averaging whole metric lists would mix four metrics together.
        return float(np.ravel(result)[0])

    step_count = arrange_data(mb_size)
    test_x, test_y = get_test_data()
    for epoch in range(epoch_count):
        losses, accs = [], []
        for n in range(step_count):
            train_x, train_y = get_train_data(mb_size, n)
            loss, acc = run_train(train_x, train_y)
            losses.append(loss)
            accs.append(primary_metric(acc))

        if report > 0 and (epoch + 1) % report == 0:
            acc = primary_metric(run_test(test_x, test_y))
            print("Epoch {}: loss={:5.3f}, accuracy={:5.3f}/{:5.3f}".format(epoch+1, np.mean(losses), np.mean(accs), acc))
    final_acc = primary_metric(run_test(test_x, test_y))
    print('\nFinal Test: final accuracy = {:5.3f}'.format(final_acc))

In [7]:
## Data-loader helpers
def arrange_data(mb_size):
    """Shuffle the row order and fix the train/test boundary.

    Stores a random permutation of the row indices of ``data`` in the global
    ``shuffle_map`` and the first test position in ``test_begin_idx``.

    Returns:
        The number of full mini-batches of size ``mb_size`` that fit into
        80% of the rows.
    """
    global data, shuffle_map, test_begin_idx
    sample_cnt = data.shape[0]
    shuffle_map = np.arange(sample_cnt)
    np.random.shuffle(shuffle_map)
    step_count = int(sample_cnt * .8) // mb_size
    # Rows past the last full training batch are used as the test split.
    test_begin_idx = step_count * mb_size
    return step_count

def get_test_data():
    """Return the held-out split as ``(features, labels)``.

    The split consists of the rows addressed by ``shuffle_map`` from
    ``test_begin_idx`` onwards; the last ``output_cnt`` columns are the labels.
    """
    global data, shuffle_map, test_begin_idx, output_cnt
    held_out = data[shuffle_map[test_begin_idx:]]
    features = held_out[:, :-output_cnt]
    labels = held_out[:, -output_cnt:]
    return features, labels

def get_train_data(mb_size, nth):
    """Return the ``nth`` training mini-batch as ``(features, labels)``.

    On the first batch of an epoch (``nth == 0``) the training part of
    ``shuffle_map`` is reshuffled in place; the test part is left untouched.
    """
    global data, shuffle_map, test_begin_idx, output_cnt
    if nth == 0:
        # Slicing yields a view, so shuffling it reorders shuffle_map itself.
        train_part = shuffle_map[:test_begin_idx]
        np.random.shuffle(train_part)
    first = mb_size * nth
    last = mb_size * (nth + 1)
    batch = data[shuffle_map[first:last]]
    features = batch[:, :-output_cnt]
    labels = batch[:, -output_cnt:]
    return features, labels

In [34]:
def run_train(x, y):
    """Run one training step on a mini-batch and return ``(loss, accuracy)``.

    ``accuracy`` is whatever ``refine_eval_accuracy`` returns for the batch
    (the plain ``eval_accuracy`` was used here previously).
    """
    # Forward pass through the network, then through the loss.
    logits, nn_cache = forward_neuralnet(x)
    loss, loss_cache = forward_postproc(logits, y)

    # Evaluation on the pre-update predictions.
    accuracy = refine_eval_accuracy(logits, y)

    # Backward pass: the gradient of the loss w.r.t. itself is 1.
    grad_logits = backprop_postproc(1.0, loss_cache)
    backprop_neuralnet(grad_logits, nn_cache)

    return loss, accuracy

def run_test(x, y):
    """Evaluate the current model on ``(x, y)`` without updating it.

    Returns the result of ``refine_eval_accuracy`` on the forward-pass output
    (the plain ``eval_accuracy`` was used here previously).
    """
    logits = forward_neuralnet(x)[0]
    return refine_eval_accuracy(logits, y)

In [9]:
def forward_neuralnet(x):
    """Apply the single linear layer: ``x . weight + bias``.

    Returns:
        ``(output, x)`` - the layer output plus the input, which
        ``backprop_neuralnet`` needs to form the weight gradient.
    """
    global weight, bias
    projected = np.matmul(x, weight)
    return np.add(projected, bias), x

def backprop_neuralnet(G_output, x):
    """Update the global ``weight`` and ``bias`` from the output gradient.

    Args:
        G_output: gradient of the loss with respect to the layer output.
        x: the layer input saved by ``forward_neuralnet``.
    """
    global weight, bias
    # Gradient w.r.t. the weights is x^T . G_output; the bias gradient is
    # the output gradient summed over the batch axis.
    grad_weight = np.matmul(x.T, G_output)
    grad_bias = G_output.sum(axis=0)

    # Gradient-descent step, applied in place.
    weight -= LEARNING_RATE * grad_weight
    bias -= LEARNING_RATE * grad_bias

In [10]:
def forward_postproc(output, y):
    """Compute the mean sigmoid cross-entropy loss of ``output`` against ``y``.

    Returns:
        ``(loss, aux)`` where ``aux`` is ``[y, output, entropy]``, the values
        ``backprop_postproc`` unpacks.
    """
    entropy = sigmoid_cross_entropy_with_logits(y, output)
    return np.mean(entropy), [y, output, entropy]

def backprop_postproc(G_loss, aux):
    """Propagate the loss gradient back to the network output.

    Args:
        G_loss: gradient flowing into the loss.
        aux: ``[y, output, entropy]`` as produced by ``forward_postproc``.

    Returns:
        Gradient of the loss with respect to ``output``.
    """
    y, output, entropy = aux
    # The loss is the mean of the entropy values, so each element receives
    # 1/N of the incoming gradient.
    mean_share = 1.0 / np.prod(entropy.shape)
    G_entropy = mean_share * G_loss
    return sigmoid_cross_entropy_with_logits_derv(y, output) * G_entropy

In [12]:
def eval_accuracy(output, y):
    """Return the fraction of entries where prediction and label agree.

    A logit above 0 counts as a positive prediction; a label above 0.5 counts
    as a positive answer (labels are expected to be 0 or 1).
    """
    predicted_positive = np.greater(output, 0)
    labelled_positive = np.greater(y, 0.5)
    return np.mean(np.equal(predicted_positive, labelled_positive))

In [13]:
def relu(x):
    """Element-wise rectifier: negative entries of ``x`` become 0."""
    rectified = np.maximum(x, 0)
    return rectified

def sigmoid(x):
    """Logistic function evaluated without overflow for large ``|x|``.

    Both exponentials take a non-positive argument, so neither can overflow.
    """
    numerator = np.exp(-relu(-x))
    denominator = 1.0 + np.exp(-np.abs(x))
    return numerator / denominator

def sigmoid_derv(x, y):
    """Derivative of the sigmoid expressed through its output ``y``.

    ``x`` is accepted for signature symmetry but is not used.
    """
    complement = 1 - y
    return y * complement

def sigmoid_cross_entropy_with_logits(z, x):
    """Element-wise cross-entropy between labels ``z`` and ``sigmoid(x)``.

    Computed directly from the logits ``x`` so that no exponential receives a
    positive argument.
    """
    linear_term = relu(x) - x * z
    log_term = np.log(1 + np.exp(-np.abs(x)))
    return linear_term + log_term

def sigmoid_cross_entropy_with_logits_derv(z, x):
    """Gradient of the sigmoid cross-entropy w.r.t. the logits: ``sigmoid(x) - z``."""
    probability = sigmoid(x)
    return -z + probability

In [25]:
main()

Epoch 1: loss=0.151, accuracy=0.959/0.948
Epoch 2: loss=0.146, accuracy=0.964/0.974
Epoch 3: loss=0.130, accuracy=0.966/0.975
Epoch 4: loss=0.137, accuracy=0.966/0.955
Epoch 5: loss=0.128, accuracy=0.969/0.976
Epoch 6: loss=0.125, accuracy=0.969/0.977
Epoch 7: loss=0.137, accuracy=0.968/0.972
Epoch 8: loss=0.142, accuracy=0.967/0.976
Epoch 9: loss=0.132, accuracy=0.970/0.910
Epoch 10: loss=0.131, accuracy=0.969/0.976

Final Test: final accuracy = 0.976


In [45]:
def main_used_f1(epoch_count=10, mb_size=10, report=1, adjust_ratio=False):
    """Run the experiment with the multi-metric report of ``refine_trainer``.

    Args:
        epoch_count: number of training epochs.
        mb_size: mini-batch size.
        report: reporting interval in epochs.
        adjust_ratio: forwarded to ``load_dataset_refine``, which uses it to
            decide whether the two classes are balanced.
    """
    # Same pipeline as main(), with the class-aware loader and the trainer
    # that prints all four metrics.
    load_dataset_refine(adjust_ratio)
    init_model()
    refine_trainer(epoch_count, mb_size, report)

In [44]:
def load_dataset_refine(adjust_ratio):
    """Load ``./pulsar_stars.csv`` into ``data``, optionally balancing classes.

    Rows whose ninth column is the string ``'1'`` are treated as pulsars, all
    others as ordinary stars. The resulting float64 ``data`` array lists the
    stars first and the pulsars after them. ``input_cnt`` / ``output_cnt`` are
    set to 8 / 1.

    Args:
        adjust_ratio: when true, the pulsar rows are repeated cyclically until
            there are exactly as many of them as star rows, giving
            ``2 * star_cnt`` rows in total. When false, every row is used once.

    Raises:
        ValueError: ``adjust_ratio`` is true but the file has no pulsar row to
            repeat.
    """
    global data, input_cnt, output_cnt

    stars, pulses = [], []
    with open('./pulsar_stars.csv') as file:
        csvreader = csv.reader(file)
        next(csvreader, None)  # skip the header line
        for row in csvreader:
            if row[8] == '1':
                pulses.append(row)
            else:
                stars.append(row)

    input_cnt, output_cnt = 8, 1
    column_cnt = input_cnt + output_cnt
    star_cnt, pulses_cnt = len(stars), len(pulses)

    # Explicit reshape keeps an empty class two-dimensional; a bare
    # np.asarray([]) has shape (0,) and cannot be combined with 9-column rows.
    star_rows = np.asarray(stars, dtype='float32').reshape(star_cnt, column_cnt)
    pulse_rows = np.asarray(pulses, dtype='float32').reshape(pulses_cnt, column_cnt)

    if adjust_ratio:
        if star_cnt > 0 and pulses_cnt == 0:
            raise ValueError(
                'adjust_ratio=True requires at least one pulsar row to oversample')
        # Cycle through the pulsar rows until they match the star count.
        repeat_idx = np.arange(star_cnt) % max(pulses_cnt, 1)
        pulse_rows = pulse_rows[repeat_idx]

    # Values are parsed as float32 and then stored in a float64 array.
    data = np.vstack((star_rows, pulse_rows)).astype(np.float64)
                        

In [47]:
def refine_eval_accuracy(output, y):
    """Evaluate predictions with four metrics derived from the confusion counts.

    A logit above 0 is a positive prediction and a label above 0.5 is a
    positive answer.

    Returns:
        ``[accuracy, precision, recall, f1]``; ratios with a zero denominator
        are resolved by ``safe_div``.
    """
    predicted_pos = np.greater(output, 0)
    actual_pos = np.greater(y, .5)
    predicted_neg = np.logical_not(predicted_pos)
    actual_neg = np.logical_not(actual_pos)

    def count_both(first, second):
        # Number of positions where both masks are set.
        return np.sum(np.logical_and(first, second))

    tp = count_both(predicted_pos, actual_pos)
    fp = count_both(predicted_pos, actual_neg)
    fn = count_both(predicted_neg, actual_pos)
    tn = count_both(predicted_neg, actual_neg)

    accuracy = safe_div(tp + tn, tp + tn + fn + fp)
    precision = safe_div(tp, tp + fp)
    recall = safe_div(tp, tp + fn)
    f1 = 2 * safe_div(recall * precision, recall + precision)
    return [accuracy, precision, recall, f1]

def safe_div(p, q):
    """Divide ``p`` by ``q``, falling back to ``sign(p)`` for a near-zero ``q``.

    Both arguments are converted to ``float`` first; a denominator whose
    magnitude is below 1e-20 is treated as zero.
    """
    numerator, denominator = float(p), float(q)
    return np.sign(numerator) if np.abs(denominator) < 1.0e-20 else numerator / denominator

In [42]:
def refine_trainer(epoch_count, mb_size, report):
    """Train the global model and report the four test metrics.

    Args:
        epoch_count: number of passes over the training split.
        mb_size: number of rows per mini-batch.
        report: print a progress line every ``report`` epochs; a value <= 0
            disables the per-epoch lines (the final line is always printed).
    """
    def render(metrics):
        # Four comma-separated values, each printed with '%5.3f'.
        return ','.join(['%5.3f'] * 4) % tuple(metrics)

    step_count = arrange_data(mb_size)
    test_x, test_y = get_test_data()
    for epoch_no in range(1, epoch_count + 1):
        epoch_losses = []
        for step in range(step_count):
            batch_x, batch_y = get_train_data(mb_size, step)
            batch_loss, _ = run_train(batch_x, batch_y)
            epoch_losses.append(batch_loss)

        if report > 0 and epoch_no % report == 0:
            summary = render(run_test(test_x, test_y))
            print("Epoch {}: loss={:5.3f}, result={}".format(
                epoch_no, np.mean(epoch_losses), summary))

    final_summary = render(run_test(test_x, test_y))
    print('\nFinal Test: final result = {}'.format(final_summary))

In [48]:
main_used_f1(adjust_ratio=False)

Epoch 1: loss=0.138, result=0.971,0.929,0.744,0.826
Epoch 2: loss=0.129, result=0.972,0.936,0.750,0.833
Epoch 3: loss=0.132, result=0.973,0.924,0.774,0.843
Epoch 4: loss=0.146, result=0.973,0.940,0.759,0.840
Epoch 5: loss=0.122, result=0.975,0.914,0.804,0.856
Epoch 6: loss=0.125, result=0.973,0.931,0.768,0.842
Epoch 7: loss=0.121, result=0.974,0.886,0.819,0.851
Epoch 8: loss=0.121, result=0.975,0.890,0.828,0.858
Epoch 9: loss=0.129, result=0.973,0.880,0.819,0.849
Epoch 10: loss=0.125, result=0.976,0.924,0.804,0.860

Final Test: final result = 0.976,0.924,0.804,0.860
