<a href="https://colab.research.google.com/github/devyulbae/AIClass/blob/main/ex_w1d3_logisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Show the installed TensorFlow version as this cell's output.
tf.__version__

In [None]:
# Colab only: mount Google Drive at /content/drive so the dataset stored there
# can be read through an ordinary file path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the CSV from the mounted Drive. With dtype=None NumPy infers the column
# types itself; the numeric conversion to float32 happens on the slices below.
# Row 0 is excluded by every training slice (presumably the header line --
# TODO confirm against the file).
DATA_PATH = '/content/drive/MyDrive/datas/logistic_regression_dataset_16_features.csv'
TEST_SIZE = 100  # number of trailing rows held out as the test split

xy = np.genfromtxt(DATA_PATH, delimiter=',', dtype=None, encoding=None)

# Training split: everything between row 0 and the held-out tail.
# All columns but the last are features; the last column is the label, indexed
# with a list so it stays 2-D (N, 1).
x_train = xy[1:-TEST_SIZE, 0:-1].astype(np.float32)
y_train = xy[1:-TEST_SIZE, [-1]].astype(np.float32)

# Test split: the final TEST_SIZE rows, same column layout.
x_test = xy[-TEST_SIZE:, 0:-1].astype(np.float32)
y_test = xy[-TEST_SIZE:, [-1]].astype(np.float32)

# Quick sanity check of the split shapes and of the first test example.
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
print("x_test: \n", x_test[0])
print("y_test: \n", y_test[0])

In [None]:
# Add a constant (intercept) column for statsmodels only.
# The augmented matrix is kept in its own variable: overwriting x_train here
# would turn it into a 17-column matrix, which no longer matches the [16, 1]
# weight matrix used by the TensorFlow model (nor the 16-column x_test).
x_train_const = sm.add_constant(x_train)

# Fit the logistic regression model.
logit_model = sm.Logit(y_train, x_train_const)
result = logit_model.fit()

# Print the fit summary.
print(result.summary())

# Wald test for each coefficient.
wald_test_results = result.wald_test_terms()

# Print the Wald test results.
print("\nWald 검정 결과:")
print(wald_test_results)

# Conclusions drawn from the test output below:
# 1. "Optimization terminated successfully" -> the optimizer converged.
# 2. No coefficient has a p-value (P>|z|) below 0.05 -> no parameter is
#    statistically significant at the 5% level.
#    NOTE(review): the original note said "no value exceeds 0.05", which
#    contradicts both its own conclusion and item 3; the wording is corrected
#    here -- verify against the printed summary.
# 3. In the Wald test no (P>chi2) value is below 0.05 -> again, no parameter is
#    statistically significant.
# 4. "Possibly complete quasi-separation" -> statsmodels warns that a fraction
#    of the observations can be perfectly predicted; in that case some
#    parameters are not identified, so the coefficients and p-values above
#    should be read with caution (the original note interpreted this as a
#    possible sign of overfitting).

In [None]:
# Wrap the training arrays in a tf.data pipeline. The batch size equals the
# number of training rows, so each pass over `dataset` yields one full batch.
full_batch_size = len(x_train)
train_slices = tf.data.Dataset.from_tensor_slices((x_train, y_train))
dataset = train_slices.batch(full_batch_size)

In [None]:
# Trainable parameters, initialised from a standard normal distribution:
# a (16, 1) weight column that right-multiplies the feature matrix, and a
# single bias value.
# NOTE(review): no random seed is set, so the starting point differs per run.
W = tf.Variable(initial_value=tf.random.normal(shape=[16, 1]), name='weight')
b = tf.Variable(initial_value=tf.random.normal(shape=[1]), name='bias')

In [None]:
# functions
def logistic_regression(features):
    """Apply the logistic (sigmoid) model to a batch of feature rows.

    Uses the notebook-level variables ``W`` and ``b``.

    Args:
        features: 2-D float input whose column count matches the row count
            of ``W``.

    Returns:
        Tensor of sigmoid outputs, one row per input row.
    """
    logits = tf.matmul(features, W) + b
    # Sigmoid written out explicitly: 1 / (1 + e^(-z)).
    return 1. / (1. + tf.exp(-logits))

def loss_fn(hypothesis, labels, eps=1e-7):
    """Mean binary cross-entropy between predicted probabilities and labels.

    Args:
        hypothesis: predicted probabilities, same shape as ``labels``.
        labels: target values (0 or 1), as floats.
        eps: small margin used to keep the probabilities strictly inside
            (0, 1) before taking logarithms.

    Returns:
        Scalar tensor holding the mean cross-entropy.
    """
    # A saturated sigmoid can return exactly 0 or 1 in float32; log(0) is -inf
    # and 0 * -inf is NaN, which would poison the loss and every later update.
    # Clipping keeps both logarithms finite. Values already inside
    # [eps, 1 - eps] are left unchanged.
    hypothesis = tf.clip_by_value(hypothesis, eps, 1. - eps)
    cost = -tf.reduce_mean(
        labels * tf.math.log(hypothesis)
        + (1 - labels) * tf.math.log(1 - hypothesis)
    )
    return cost

def accuracy_fn(hypothesis, labels):
    """Fraction of predictions that match the labels at a 0.5 threshold.

    Args:
        hypothesis: predicted probabilities.
        labels: target values, compared element-wise with the thresholded
            predictions.

    Returns:
        Scalar float32 tensor with the mean of the element-wise matches.
    """
    # Probabilities strictly above 0.5 become class 1.0, everything else 0.0.
    is_positive = tf.greater(hypothesis, 0.5)
    predicted = tf.cast(is_positive, dtype=tf.float32)
    hits = tf.cast(tf.equal(predicted, labels), dtype=tf.float32)
    return tf.reduce_mean(hits)

In [None]:
# Gradient-descent optimizer (TF1-compat API) with a fixed learning rate.
LEARNING_RATE = 1e-2
optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE)

In [None]:
# Training loop: full-batch gradient descent on W and b.
epochs = 10001
trainable_vars = [W, b]

for step in range(epochs):
    should_log = (step % 500 == 0)
    for features, labels in dataset:
        # Only the forward pass needs to be recorded by the tape.
        with tf.GradientTape() as tape:
            pred = logistic_regression(features)
            loss_value = loss_fn(pred, labels)

        # Backward pass and parameter update.
        grads = tape.gradient(loss_value, trainable_vars)
        optimizer.apply_gradients(grads_and_vars=zip(grads, trainable_vars))

        if should_log:
            # The loss is re-evaluated here, i.e. it reflects the parameters
            # *after* this step's update.
            print("Iter: {}, Loss: {:.4f}".format(step, loss_fn(logistic_regression(features), labels)))

In [None]:
# Evaluate the trained parameters on the held-out test rows.
test_pred = logistic_regression(x_test)
test_acc = accuracy_fn(test_pred, y_test)
print("Testset Accuracy: {:.4f}".format(test_acc))