In [1]:
#@title
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col

In [2]:
import tensorflow as tf

In [3]:
tf.__version__

'2.11.0'

In [8]:
df = pd.read_csv('education2020.csv', index_col=0)

In [9]:
df.head(3)

Unnamed: 0,year,metro,id,sex,age,number,education,marriage,asset,debt,income,income_d,industry,job,house,education_year
0,2020,G1,10000112,1,34,3,6,2,112000,54500,6593,4599,F,3,2,16
1,2020,G1,10000132,2,45,2,8,2,42500,17500,17720,15257,J,2,3,21
2,2020,G1,10000162,2,73,1,2,3,5712,0,908,725,T,4,2,6


In [10]:
df1 = df[~df['house'].isin([5])] #1 자기집, 2 전세, 3 보증금 있는 월세, 4 보증금 없는 월세, 5 기타

In [11]:
df2 = df1.copy()

In [12]:
def h_status(x):                #자가 0, 전세 1, 월세 2로 만드는 함수 
    if x <= 2:
        return x-1
    else:
        return 2   

In [14]:
df2['house_type'] = df2['house'].apply(h_status)

In [15]:
df2 = df2[["sex","age","marriage", "education_year","asset", "debt", "income", "metro", "number", "job", "house_type", ]] 

In [16]:
#성별, 결혼, 거주지, 직업 더미
sex_dummies = pd.get_dummies(df2['sex']).rename(columns=lambda x: 's'+str(x))
marriage_dummies = pd.get_dummies(df2['marriage']).rename(columns=lambda x: 'm'+str(x))
metro_dummies = pd.get_dummies(df2['metro']).rename(columns=lambda x: 'g'+str(x))
job_dummies = pd.get_dummies(df2['job']).rename(columns=lambda x: 'j'+str(x))

In [17]:
df2 = pd.concat([df2, sex_dummies, marriage_dummies, metro_dummies, job_dummies], axis=1)
df3 = df2.copy()
df3['age2'] = df3['age']**2

In [18]:
x_data =df3[['income', 'asset', 'debt', 'age', 'age2', 'number', 'education_year', 's2', 'm2', 'm3', 'm4', 'gG2', 'j2', 'j3', 'j4', 'j5', 'j6', 'j7', 'j8', 'j9', 'jA']].to_numpy() #더미변수 통합

In [19]:
y_data = df3[['house_type']]

In [20]:
X = np.asarray(x_data, dtype=np.float32)
y = np.asarray(y_data, dtype=np.float32)
rows = y.shape[0]

In [21]:
def normalization(data):  #정규화를 위한 함수(최대, 최소 값이 1과 0으로 바꿉니다)
    numerator = data - np.min(data, 0)
    denominator = np.max(data, 0) - np.min(data, 0)
    return numerator / denominator

In [22]:
x_train = normalization(X)
k = x_data.shape[1] #설명 변수 개수
nb_classes = 3      #y의 분류 갯수
Y_one_hot = tf.one_hot(y.astype(np.int32), nb_classes) # y 변수를 3개의 one hot encoding 방식으로 표현
Y_one_hot = tf.reshape(Y_one_hot,[rows, -1] ) #차원을 2차원으로 변경
Y_one_hot  

<tf.Tensor: shape=(16789, 3), dtype=float32, numpy=
array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]], dtype=float32)>

In [23]:
W = tf.Variable(tf.random.normal((k, nb_classes)), name='weight') #무작위로 가중치 선택
b = tf.Variable(tf.random.normal((nb_classes,)), name='bias')       #무작위로 절편 선택

for i in range(10000+1):
    with tf.GradientTape() as tape:
        logits = tf.matmul(x_train, W) + b
        hypothesis  = tf.nn.softmax(logits) #분류를 위한 소프트맥스 함수 사용
        cost_i = tf.keras.losses.categorical_crossentropy(y_true=Y_one_hot, y_pred=logits, from_logits=True) 
        reg= tf.nn.l2_loss(W) * 0.01 #규제화(regularization) 변수로 0.01 사용
        cost = tf.reduce_mean(cost_i+ reg)
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=0.005,decay_steps=1000,decay_rate=0.96,staircase=True) # 초기 학습률을 0.005를 이용하다가 점차 감소
        optimizer = tf.optimizers.SGD(learning_rate=lr_schedule) # Stochastic Gradient Descent algorithm

        grads = tape.gradient(cost, [W, b])
        optimizer.apply_gradients(grads_and_vars=zip(grads,[W,b]))
        
        predicted = tf.argmax(hypothesis, 1)
        correct_prediction = tf.equal(predicted, tf.argmax(Y_one_hot,1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, dtype=tf.float32))
    
    if i % 1000 == 0: #1000번 시행시마다 비용과 정확도 출력
        print("Steps: {:5} | Costs: {:10.4f} | Accuracy {:10.4f}".format(i, cost.numpy(), accuracy.numpy())) 

Steps:     0 | Costs:     2.3059 | Accuracy     0.2253
Steps:  1000 | Costs:     1.4001 | Accuracy     0.5870
Steps:  2000 | Costs:     1.2675 | Accuracy     0.6186
Steps:  3000 | Costs:     1.1699 | Accuracy     0.6401
Steps:  4000 | Costs:     1.0955 | Accuracy     0.6575
Steps:  5000 | Costs:     1.0381 | Accuracy     0.6627
Steps:  6000 | Costs:     0.9931 | Accuracy     0.6735
Steps:  7000 | Costs:     0.9574 | Accuracy     0.6781
Steps:  8000 | Costs:     0.9287 | Accuracy     0.6798
Steps:  9000 | Costs:     0.9054 | Accuracy     0.6821
Steps: 10000 | Costs:     0.8862 | Accuracy     0.6835


In [24]:
y_Predicted = predicted.numpy().flatten()
y_Actual = y.flatten()
data = {'y_Actual': y_Actual,
        'y_Predicted': y_Predicted}

In [25]:
df = pd.DataFrame(data, columns = ['y_Actual', 'y_Predicted'])
cross = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames = ['Actual'], colnames=['Predicted'])
cross

Predicted,0,1,2
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,10570,52,601
1.0,1772,104,303
2.0,2445,141,801


In [26]:
confusion_matrix = np.zeros([3,3])  # 비어 있는 오분류표 작성

In [27]:
confusion_matrix

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [28]:
try : 
    confusion_matrix[0,0] = cross.loc[0,0]
    confusion_matrix[0,1] = cross.loc[0,1]
    confusion_matrix[0,2] = cross.loc[0,2]
    confusion_matrix[1,0] = cross.loc[1,0]
    confusion_matrix[1,1] = cross.loc[1,1]
    confusion_matrix[1,2] = cross.loc[1,2]
    confusion_matrix[2,0] = cross.loc[2,0]
    confusion_matrix[2,1] = cross.loc[2,1]
    confusion_matrix[2,2] = cross.loc[2,2]

except Exception as e:
    print(e)



In [29]:
confusion_matrix

array([[10570.,    52.,   601.],
       [ 1772.,   104.,   303.],
       [ 2445.,   141.,   801.]])

In [30]:
TRUE  = confusion_matrix[0,0]+confusion_matrix[1,1]+confusion_matrix[2,2]
FALSE = confusion_matrix[0,1]+confusion_matrix[0,2]+confusion_matrix[1,0]+confusion_matrix[1,2]+confusion_matrix[2,0]+confusion_matrix[2,1]

In [31]:
accuracy = TRUE/(TRUE+FALSE)

In [32]:
accuracy

0.6834832330692715