<a href="https://colab.research.google.com/github/dolmani38/non-linear-regression/blob/master/Regession_test_0804.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
!pip install lime

import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from keras.layers import Input, Embedding, Dense
from keras.models import Model
from keras.callbacks import Callback
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from lime import lime_tabular, lime_text
from keras.utils import to_categorical
import datetime
import random

# Bayesian Methods for Hackers style sheet
plt.style.use('bmh')

np.random.seed(1234567890)

from keras import backend as K

def r2(y_true, y_pred):
    """
    # custom R2-score metrics for keras backend
    :param y_true: 실측 데이터
    :param y_pred: 모델에 의한 예측 데이테
    :return: R^2 value, 이 값이 높다고 (예:0.99)해서 예측 값이 정확 하다고 할 수 는 없음...
    """
    SS_res = K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return (1 - SS_res / (SS_tot + K.epsilon()))

def soft_100_acc(y_true, y_pred):
    delta = 100
    return K.mean(K.less_equal(K.abs(y_true-y_pred),delta))

def soft_300_acc(y_true, y_pred):
    delta = 300
    return K.mean(K.less_equal(K.abs(y_true-y_pred),delta))

def structure(df):
    """
    DataFrame의 column 등 전반적인 구조를 표출 한다.
    :param df:
    :return:
    """
    df1 = df.describe(include='all').T
    df1['type'] = df.dtypes
    df1['null count'] = df.isnull().sum()
    if 'freq' in df1.columns:
        df2 = df1[
            ['type', 'count', 'null count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'unique', 'top',
             'freq']]
    else:
        df2 = df1[['type', 'count', 'null count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']]

    return df2

class VerboseCallback(keras.callbacks.Callback):
    """
    kerase.model.fit 함수에서, log 출력을 위한 callback
    """
    def __init__(self, epoch):
        self.epoch = epoch
        self.progress("learning:", 0, [])

    def progress(self, text, current=0, log=None):
        g = self.epoch - current
        f = 50 / self.epoch
        c = int(current * f)
        pg = ['=' for i in range(c)] + ['.' for i in range(50 - c)]
        print("\r" + str(text) + "{}/{}[{}]{}".format(current, self.epoch, ''.join(pg), ('' if log is None else str(log).strip())),
              end="", flush=True)
        if self.epoch - current == 0:
            print("")

    def on_epoch_end(self, epoch, logs=None):
        self.progress("learning:", epoch + 1, logs)


class PeriodicLogger(Callback):
    """
    A helper callback class that only prints the losses once in 'display' epochs
    """
    def __init__(self, display=100):
        self.display = display

    def on_train_begin(self, logs={}):      
        self.epochs = 0    

    def on_epoch_end(self, batch, logs={}):    
        self.epochs += 1     
        if self.epochs % self.display == 0:
            print ("Epoch: %d - %s" % (self.epochs, str(logs)))

periodic_logger_10 = PeriodicLogger(10) 
periodic_logger_50 = PeriodicLogger(50)
periodic_logger_250 = PeriodicLogger(250)
periodic_logger_1000 = PeriodicLogger(1000)



In [2]:
def nafyc_func(row):
    code,real_yy,rate = row['CODE'],row['REAL_YY'],row['COM_RATE']
    return real_yy * rate

In [39]:
def create_samples(n_samples):
    rate_table = {}
    samples = []
    for i in range(n_samples):
      code = np.random.choice(range(1000,1520))
      real_yy = random.choice(range(1000)) * 1000
      if code not in rate_table:
        rate = np.random.choice([0.20,0.25,0.30,0.33,0.65,0.70,0.75,0.80,0.85])
        rate_table[code] = rate
      else:
        rate = rate_table[code]


      samples.append([code,real_yy,rate])
        
    return pd.DataFrame(samples, columns=['CODE','REAL_YY','COM_RATE'])

In [40]:
dataset = create_samples(n_samples=10000)
dataset['NAFYC'] = dataset.apply(nafyc_func, axis=1)
dataset

Unnamed: 0,CODE,REAL_YY,COM_RATE,NAFYC
0,1386,331000,0.75,248250.0
1,1047,578000,0.33,190740.0
2,1289,875000,0.65,568750.0
3,1288,863000,0.75,647250.0
4,1036,507000,0.20,101400.0
...,...,...,...,...
9995,1152,957000,0.65,622050.0
9996,1290,316000,0.85,268600.0
9997,1452,215000,0.25,53750.0
9998,1234,266000,0.25,66500.0


In [41]:
structure(dataset)

Unnamed: 0,type,count,null count,mean,std,min,25%,50%,75%,max
CODE,int64,10000.0,0,1258.3063,149.829982,1000.0,1128.0,1258.0,1388.0,1519.0
REAL_YY,int64,10000.0,0,500475.2,289492.327646,0.0,248000.0,502000.0,753000.0,999000.0
COM_RATE,float64,10000.0,0,0.518756,0.24304,0.2,0.3,0.65,0.75,0.85
NAFYC,float64,10000.0,0,259847.093,205712.138808,0.0,99712.5,198000.0,386400.0,849150.0


In [42]:
dataset = dataset.astype({"CODE": object})

In [48]:
structure(dataset)

Unnamed: 0,type,count,null count,mean,std,min,25%,50%,75%,max,unique,top,freq
CODE,object,10000.0,0,,,,,,,,520.0,1119.0,31.0
REAL_YY,int64,10000.0,0,500475.2,289492.327646,0.0,248000.0,502000.0,753000.0,999000.0,,,
COM_RATE,float64,10000.0,0,0.518756,0.24304,0.2,0.3,0.65,0.75,0.85,,,
NAFYC,float64,10000.0,0,259847.093,205712.138808,0.0,99712.5,198000.0,386400.0,849150.0,,,


In [50]:
dataset[dataset['CODE'] == 1386]

Unnamed: 0,CODE,REAL_YY,COM_RATE,NAFYC
0,1386,331000,0.75,248250.0
301,1386,506000,0.75,379500.0
332,1386,199000,0.75,149250.0
1076,1386,43000,0.75,32250.0
1698,1386,117000,0.75,87750.0
2537,1386,775000,0.75,581250.0
2570,1386,126000,0.75,94500.0
3004,1386,260000,0.75,195000.0
3404,1386,357000,0.75,267750.0
3814,1386,16000,0.75,12000.0


In [51]:
# zscore norm

ds = dataset[['REAL_YY','COM_RATE']]

_mean = ds.mean(axis=0)
_std = ds.std(axis=0)

ds = ds - _mean
ds /= _std

dataset.update(ds)
dataset

Unnamed: 0,CODE,REAL_YY,COM_RATE,NAFYC
0,1386,-0.585422,0.951464,248250.0
1,1047,0.267796,-0.776645,190740.0
2,1289,1.293730,0.540010,568750.0
3,1288,1.252278,0.951464,647250.0
4,1036,0.022539,-1.311537,101400.0
...,...,...,...,...
9995,1152,1.576984,0.540010,622050.0
9996,1290,-0.637237,1.362919,268600.0
9997,1452,-0.986124,-1.105809,53750.0
9998,1234,-0.809953,-1.105809,66500.0


In [52]:
class EmbeddingMapping():
    """
    Helper class for handling categorical variables
    An instance of this class should be defined for each categorical variable we want to use.
    """
    def __init__(self, series):
        # get a list of unique values
        values = series.unique().tolist()
        
        # Set a dictionary mapping from values to integer value
        # In our example this will be {'Mercaz': 1, 'Old North': 2, 'Florentine': 3}
        self.embedding_dict = {value: int_value+1 for int_value, value in enumerate(values)}
        
        # The num_values will be used as the input_dim when defining the embedding layer. 
        # It will also be returned for unseen values 
        self.num_values = len(values) + 1

    def get_mapping(self, value):
        # If the value was seen in the training set, return its integer mapping
        if value in self.embedding_dict:
            return self.embedding_dict[value]
        
        # Else, return the same integer for unseen values
        else:
            return self.num_values

In [53]:
code_mapping = EmbeddingMapping(dataset['CODE'])
dataset = dataset.assign(code_mapping=dataset['CODE'].apply(code_mapping.get_mapping))
dataset

Unnamed: 0,CODE,REAL_YY,COM_RATE,NAFYC,code_mapping
0,1386,-0.585422,0.951464,248250.0,1
1,1047,0.267796,-0.776645,190740.0,2
2,1289,1.293730,0.540010,568750.0,3
3,1288,1.252278,0.951464,647250.0,4
4,1036,0.022539,-1.311537,101400.0,5
...,...,...,...,...,...
9995,1152,1.576984,0.540010,622050.0,35
9996,1290,-0.637237,1.362919,268600.0,174
9997,1452,-0.986124,-1.105809,53750.0,443
9998,1234,-0.809953,-1.105809,66500.0,361


In [54]:
# trainset과 validationset 나누고... 기타 정리

X_train_org, X_val_org, Y_train, Y_val = train_test_split(dataset[['REAL_YY','code_mapping']],
    dataset['NAFYC'], test_size=0.1, random_state=0)

# 학습 대상 변수만 선택
X_train_continuous = X_train_org[['REAL_YY']]
X_train_categorical = X_train_org[['code_mapping']]

X_val_continuous  = X_val_org[['REAL_YY']]
X_val_categorical = X_val_org[['code_mapping']]


In [81]:
# Define the embedding input
code_input = Input(shape=(1,), dtype='int32') 
output_dim=32*2   #X_train_categorical['code_mapping'].max()*1
code_embedings = Embedding(output_dim=output_dim, input_dim=X_train_categorical['code_mapping'].max()+1)(code_input)
code_embedings = keras.layers.Reshape((output_dim,))(code_embedings)

In [82]:
# Define the continuous variables input (just like before)
continuous_input = Input(shape=(X_train_continuous.shape[1], ))

# Concatenate continuous and embeddings inputs
all_input = keras.layers.concatenate([continuous_input, code_embedings])
print(all_input)

Tensor("concatenate_9/concat:0", shape=(None, 65), dtype=float32)


In [83]:
# Define the model
dense1 = Dense(all_input.shape[1]*3, activation='relu')(all_input)
dense2 = Dense(5, activation='relu')(dense1)
predictions = Dense(1)(dense2)

# Note using the input object 'area_input' not 'area_embeddings'
model = Model(inputs=[continuous_input, code_input], outputs=predictions)
model.compile(loss='mse', optimizer=keras.optimizers.Adam(lr=.8, beta_1=0.9, beta_2=0.999, decay=1e-03, amsgrad=True),metrics=[soft_100_acc,soft_300_acc,r2])
model.summary()

Model: "functional_24"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 1, 64)        33344       input_19[0][0]                   
__________________________________________________________________________________________________
input_20 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
reshape_9 (Reshape)             (None, 64)           0           embedding_9[0][0]                
______________________________________________________________________________________

In [None]:
epochs = 10000

# Note continuous and categorical columns are inserted in the same order as defined in all_inputs
history = model.fit([X_train_continuous, X_train_categorical['code_mapping']], Y_train, 
          epochs=epochs, batch_size=128, 
          callbacks=[periodic_logger_250], verbose=0,
          validation_data=([X_val_continuous, X_val_categorical['code_mapping']], Y_val))

Epoch: 250 - {'loss': 441042.5625, 'soft_100_acc': 0.17134682834148407, 'soft_300_acc': 0.4805017411708832, 'r2': 0.9999895691871643, 'val_loss': 10389590.0, 'val_soft_100_acc': 0.10674579441547394, 'val_soft_300_acc': 0.3583984375, 'val_r2': 0.9997135996818542}
Epoch: 500 - {'loss': 268260.6875, 'soft_100_acc': 0.21452464163303375, 'soft_300_acc': 0.5711487531661987, 'r2': 0.9999936819076538, 'val_loss': 10069593.0, 'val_soft_100_acc': 0.15587440133094788, 'val_soft_300_acc': 0.4388521611690521, 'val_r2': 0.999721348285675}
Epoch: 750 - {'loss': 216338.453125, 'soft_100_acc': 0.2353213131427765, 'soft_300_acc': 0.6181338429450989, 'r2': 0.9999947547912598, 'val_loss': 9928115.0, 'val_soft_100_acc': 0.17247596383094788, 'val_soft_300_acc': 0.47641226649284363, 'val_r2': 0.9997251033782959}
Epoch: 1000 - {'loss': 191067.84375, 'soft_100_acc': 0.2533670961856842, 'soft_300_acc': 0.6395686268806458, 'r2': 0.9999953508377075, 'val_loss': 9848173.0, 'val_soft_100_acc': 0.18539664149284363, 