In [1]:
# import things
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tensorflow.python.data import Dataset

# Only let TensorFlow emit log messages at ERROR severity.
tf.logging.set_verbosity(tf.logging.ERROR)

# Relative paths of the training and test CSV files used by this notebook.
train_path = './data/train.csv'
test_path = './data/test.csv'

In [None]:
class get_data:
    """Load an image CSV and prepare features (and labels) for training.

    The CSV at ``path`` is read with pandas. Every column except ``label`` is
    treated as a feature and the rows are reshaped to ``(-1, 28, 28, 1)``.
    For training data the ``label`` column is one-hot encoded over
    ``NUM_CLASSES`` classes and the data is split 90/10 into a training and a
    validation part.

    Attributes:
        is_train: whether the CSV carries a ``label`` column.
        feature: feature array reshaped to ``(-1, 28, 28, 1)``.
        raw_label: the ``label`` column as read from the CSV, or ``None`` when
            ``is_train`` is false. Used by the EDA helpers.
        label: one-hot encoded labels, or ``None`` when ``is_train`` is false.
        ts_f, ts_l: training features / labels (training data only).
        vs_f, vs_l: validation features / labels (training data only).
    """

    # Number of one-hot columns produced by ``deal_label``.
    NUM_CLASSES = 10

    def __init__(self, path, is_train=True):
        """Read ``path`` and build the feature/label arrays.

        Args:
            path: location of the CSV file.
            is_train: when true, a ``label`` column is expected and the data is
                additionally encoded and split.
        """
        self.is_train = is_train

        data = pd.read_csv(path)
        if is_train:
            self.feature = data.drop('label', axis=1)
            self.raw_label = data['label']
        else:
            self.feature = data
            self.raw_label = None
        # Always define the attribute so test-only objects fail predictably.
        self.label = None

        self.feature = self.deal_feature()
        if is_train:
            self.label = self.deal_label()
            self.ts_f, self.ts_l, self.vs_f, self.vs_l = self.split_t_v()

    def _require_raw_label(self):
        """Return the raw label column or raise if this object has none."""
        raw = getattr(self, 'raw_label', None)
        if raw is None:
            raise ValueError(
                'no labels available: this object was built with is_train=False'
            )
        return raw

    # EDA
    def describe_plot(self):
        """Draw a count plot of the raw class labels."""
        # The raw labels are used because ``self.label`` holds the one-hot matrix.
        sns.countplot(x=self._require_raw_label())

    def describe_num(self):
        """Print how many rows belong to each raw class label."""
        res = self._require_raw_label().value_counts()
        print(res)

    # reference: How to Check If Any Value is NaN in a Pandas DataFrame
    # https://chartio.com/resources/tutorials/how-to-check-if-any-value-is-nan-in-a-pandas-dataframe/
    def check_missing_val(self, data_df):
        """Print (and return) whether ``data_df`` contains any missing value."""
        res = data_df.isnull().values.any()
        print(res)
        return res

    def deal_feature(self):
        """Reshape the feature table to ``(-1, 28, 28, 1)``.

        Values are passed through unscaled; no normalisation is applied here.
        """
        return self.feature.values.reshape(-1, 28, 28, 1)

    def show_digit(self):
        """Display the first image of the feature array."""
        plt.imshow(self.feature[0][:, :, 0])

    def deal_label(self):
        """One-hot encode the raw labels into a ``(n, NUM_CLASSES)`` float32 array.

        Raises:
            ValueError: if there are no labels or a label lies outside
                ``[0, NUM_CLASSES)``.
        """
        class_ids = np.asarray(self._require_raw_label(), dtype='int64')
        if class_ids.size and (
            class_ids.min() < 0 or class_ids.max() >= self.NUM_CLASSES
        ):
            raise ValueError(
                'labels must lie in [0, %d)' % self.NUM_CLASSES
            )
        # Row i of the identity matrix is the one-hot vector of class i.
        return np.eye(self.NUM_CLASSES, dtype='float32')[class_ids]

    def split_t_v(self):
        """Split features and labels into training and validation parts.

        Returns:
            ``(ts_f, ts_l, vs_f, vs_l)``: training features, training labels,
            validation features, validation labels.
        """
        # train_test_split returns X_train, X_test, y_train, y_test -- features
        # first, then labels -- so unpack in that order before re-ordering.
        ts_f, vs_f, ts_l, vs_l = train_test_split(
            self.feature,
            self.label,
            test_size=0.1,
            random_state=2
        )
        return ts_f, ts_l, vs_f, vs_l

In [2]:
class create_data:
    """Thin holder for a CSV file loaded into a pandas DataFrame."""

    def __init__(self, path):
        """Read the CSV at ``path`` and keep the result on ``self.data``."""
        loaded_frame = pd.read_csv(path)
        self.data = loaded_frame
 

In [3]:
   class create_model:
    """Build and train a ``tf.estimator.DNNClassifier`` on a labelled DataFrame.

    The input frame is split 80/20 into training and validation rows; every
    column except ``label_col`` becomes a numeric feature column.

    Attributes:
        label: name of the target column.
        te, tt: training examples / targets (DataFrames).
        ve, vt: validation examples / targets (DataFrames).
        feature_cols: set of numeric feature columns.
        optimizer: gradient-clipped optimizer passed to the estimator.
        lr: the ``DNNClassifier`` instance.
    """

    def __init__(
        self,
        train_df,
        label_col='label',
        learning_rate=0.02,
        steps=100,
        batch_size=10,
        periods=10,
        hidden_units=(512, 126, 64),
        load_model=False,
        load_model_name='default',
        save_model=False,
        save_model_name='default',
        n_classes=10,
    ):
        """Store hyper-parameters, split the data and build the estimator.

        Args:
            train_df: DataFrame holding the features and ``label_col``.
            label_col: name of the target column.
            learning_rate: learning rate handed to the optimizer.
            steps: total number of training steps over all periods.
            batch_size: batch size used for training.
            periods: number of train/evaluate rounds ``steps`` is divided into.
            hidden_units: sizes of the hidden layers.
            load_model: warm-start from ``./model/<load_model_name>``.
            load_model_name: directory name used when ``load_model`` is true.
            save_model: use ``./model/<save_model_name>`` as ``model_dir``.
            save_model_name: directory name used when ``save_model`` is true.
            n_classes: number of target classes.
        """
        self.label = label_col
        # learning rate: optimizer
        self.learning_rate = learning_rate
        # steps, batch_size, periods: train
        self.steps = steps
        self.batch_size = batch_size
        self.periods = periods
        # Copy so that a caller's (or the default) sequence is never shared.
        self.h_units = list(hidden_units)
        self.n_classes = n_classes
        # save / load model
        self.save_model = save_model
        self.load_model = load_model
        self.save_model_name = save_model_name
        self.load_model_name = load_model_name
        self.save_model_path = './model/' + self.save_model_name
        self.load_model_path = './model/' + self.load_model_name
        # split data
        self.te, self.tt, self.ve, self.vt = self.get_split_data(train_df)

        # create the classifier
        self.feature_cols = self.get_feature_cols(train_df)
        self.optimizer = self.get_optimizer(learning_rate)
        self.lr = self._build_estimator()
        print('build the model')

    def _build_estimator(self):
        """Create the DNNClassifier, adding save/load options only when enabled."""
        estimator_kwargs = {
            'hidden_units': self.h_units,
            'feature_columns': self.feature_cols,
            'optimizer': self.optimizer,
            'n_classes': self.n_classes,
        }
        if self.save_model:
            estimator_kwargs['model_dir'] = self.save_model_path
        if self.load_model:
            estimator_kwargs['warm_start_from'] = self.load_model_path
        return tf.estimator.DNNClassifier(**estimator_kwargs)

    def get_split_data(self, train_df):
        """Return ``(te, tt, ve, vt)``: an 80/20 split into examples and targets."""
        train_set, vali_set = self.split_train(train_df, 0.8)
        te, tt = self.get_e_t(train_set)
        ve, vt = self.get_e_t(vali_set)
        return te, tt, ve, vt

    def split_train(self, data_df, per):
        """Randomly sample a fraction ``per`` of the rows; the rest is validation.

        Returns:
            ``(t_s, v_s)``: the sampled rows and the remaining rows.
        """
        t_s = data_df.sample(frac=per, replace=False, random_state=42)
        # Drop by index label rather than indexing ``.loc`` with a set, which
        # pandas no longer accepts as an indexer.
        v_s = data_df.drop(t_s.index)
        return t_s, v_s

    def get_e_t(self, data_df):
        """Split ``data_df`` into examples (all other columns) and targets.

        Returns:
            ``(d_e, d_t)``: a frame without the label column, and a one-column
            frame containing only the label column.
        """
        d_e = data_df.drop(self.label, axis=1)
        d_t = data_df[[self.label]].copy()
        return d_e, d_t

    def get_feature_cols(self, train_df):
        """Return a set with one numeric feature column per non-label column."""
        feature_df = train_df.drop(self.label, axis=1)
        return {
            tf.feature_column.numeric_column(my_feature)
            for my_feature in feature_df
        }

    def get_optimizer(self, learning_rate):
        """Return a gradient-descent optimizer with gradients clipped to norm 2.0."""
        my_opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        my_opt = tf.contrib.estimator.clip_gradients_by_norm(my_opt, 2.0)
        return my_opt

    def my_input_fn(
        self,
        features,
        targets,
        batch_size=1,
        shuffle=True,
        num_epochs=None
    ):
        """Build the ``(features, labels)`` tensors consumed by the estimator.

        Args:
            features: DataFrame of feature columns.
            targets: label values aligned with ``features``.
            batch_size: number of rows per batch.
            shuffle: whether to shuffle the dataset.
            num_epochs: number of repetitions; ``None`` repeats indefinitely.

        Returns:
            The next ``(features, labels)`` element of a one-shot iterator.
        """
        # Convert pandas data into a dict of np arrays.
        features = {key: np.array(value) for key, value in dict(features).items()}
        # Construct a dataset, and configure batching/repeating.
        ds = Dataset.from_tensor_slices((features, targets))  # warning: 2GB limit
        ds = ds.batch(batch_size).repeat(num_epochs)
        # Shuffling is applied after batching, so it reorders whole batches.
        if shuffle:
            ds = ds.shuffle(10000)
        # Return the next batch of data.
        features, labels = ds.make_one_shot_iterator().get_next()
        return features, labels

    def _predict_class_ids(self, input_fn):
        """Return the predicted class id of every row produced by ``input_fn``."""
        # A classifier's predict() yields 'class_ids', not 'predictions'.
        return np.array(
            [item['class_ids'][0] for item in self.lr.predict(input_fn=input_fn)]
        )

    def _rmse(self, targets, predictions):
        """Root mean squared error between target and predicted class ids."""
        return math.sqrt(metrics.mean_squared_error(targets, predictions))

    def train(self):
        """Train for ``periods`` rounds, printing losses and plotting RMSE.

        ``steps`` is divided evenly over ``periods`` with integer division, so a
        remainder is not trained.

        Raises:
            ValueError: if ``periods`` is not positive or ``steps`` is smaller
                than ``periods``.
        """
        if self.periods <= 0:
            raise ValueError('periods must be a positive integer')
        # The estimator needs a whole number of steps.
        steps_per_period = int(self.steps) // int(self.periods)
        if steps_per_period < 1:
            raise ValueError('steps must be at least as large as periods')

        # create input functions
        training_input_fn = lambda: self.my_input_fn(
            self.te, self.tt[self.label], batch_size=self.batch_size)
        predict_training_input_fn = lambda: self.my_input_fn(
            self.te, self.tt[self.label], num_epochs=1, shuffle=False)
        predict_vali_input_fn = lambda: self.my_input_fn(
            self.ve, self.vt[self.label], num_epochs=1, shuffle=False)

        print('Training model...')
        # recording RMSE
        training_rmse = []
        validation_rmse = []
        for period in range(self.periods):
            self.lr.train(
                input_fn=training_input_fn,
                steps=steps_per_period
            )

            training_predictions = self._predict_class_ids(predict_training_input_fn)
            validation_predictions = self._predict_class_ids(predict_vali_input_fn)

            # evaluation losses reported by the estimator
            training_eval = self.lr.evaluate(input_fn=predict_training_input_fn)
            validation_eval = self.lr.evaluate(input_fn=predict_vali_input_fn)
            print("training_eval: {}".format(training_eval['average_loss']))
            print("validation_eval: {}".format(validation_eval['average_loss']))

            training_root_mean_squared_error = self._rmse(
                self.tt[self.label], training_predictions)
            validation_root_mean_squared_error = self._rmse(
                self.vt[self.label], validation_predictions)
            # Occasionally print the current loss.
            print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
            # Add the loss metrics from this period to our list.
            training_rmse.append(training_root_mean_squared_error)
            validation_rmse.append(validation_root_mean_squared_error)

        print("Model training finished.")
        # No explicit save call is made here; this only reports the setting.
        if self.save_model:
            print("Saving model...")

        # output a graph of loss metrics over periods.
        self.result_plot(training_rmse, validation_rmse)

    def result_plot(self, t_rmse, v_rmse):
        """Plot training and validation RMSE against the period index."""
        plt.ylabel("RMSE")
        plt.xlabel("Periods")
        plt.title("Root Mean Squared Error vs. Periods")
        plt.plot(t_rmse, label="training")
        plt.plot(v_rmse, label="validation")
        plt.legend()
        # Lay out last so the lines and legend are taken into account.
        plt.tight_layout()

class predict_file:
    """Run a trained model on test data and assemble a submission table.

    Attributes:
        test_df: the feature DataFrame fed to the estimator.
        id_frame: the caller's frame of row identifiers (left unmodified).
        predict_col: name of the column that receives the predictions.
        predict_input_fn: input function built by ``create_predict_fn``.
        predictions: array with one prediction per test row.
        evaluation: copy of ``id_frame`` with the prediction column added.
    """

    def __init__(self, model, test_df, id_frame, predict_col='SalePrice', scale=1000):
        """Predict on ``test_df`` and build ``self.evaluation``.

        Args:
            model: object whose ``lr`` attribute is an estimator with ``predict``.
            test_df: DataFrame of test features.
            id_frame: frame of identifiers, one row per test row.
            predict_col: name of the output column.
            scale: factor applied to regression outputs (``'predictions'`` key);
                class ids from a classifier are not scaled.
        """
        self.test_df = test_df
        self.id_frame = id_frame
        self.predict_col = predict_col
        self.predict_input_fn = self.create_predict_fn()

        raw_outputs = list(model.lr.predict(input_fn=self.predict_input_fn))
        self.predictions = self._extract_predictions(raw_outputs, scale)

        # Work on a copy so the caller's id frame does not gain a column.
        self.evaluation = self.id_frame.copy()
        self.evaluation[predict_col] = self.predictions

    @staticmethod
    def _extract_predictions(raw_outputs, scale):
        """Turn the estimator's per-row dicts into a flat array.

        Regressors emit a ``'predictions'`` entry, which is multiplied by
        ``scale``; classifiers emit ``'class_ids'``, which is returned as is.
        """
        if not raw_outputs:
            return np.array([])
        if 'predictions' in raw_outputs[0]:
            values = np.array([item['predictions'][0] for item in raw_outputs])
            return values * scale
        return np.array([item['class_ids'][0] for item in raw_outputs])

    def create_predict_fn(self):
        """Return a single-epoch, unshuffled pandas input function over ``test_df``."""
        predict_input_fn = tf.estimator.inputs.pandas_input_fn(
            x=self.test_df,
            num_epochs=1,
            shuffle=False
        )
        return predict_input_fn

    def save_predict(self, path):
        """Write ``self.evaluation`` to ``path`` as CSV without the index."""
        self.evaluation.to_csv(path, index=False)