In [3]:
# import things
import math
from sklearn import metrics
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import numpy as np
from tensorflow.python.data import Dataset

# Locations of the train / test CSV files, relative to the notebook.
test_path = './data/test.csv'
train_path = './data/train.csv'

# Only surface TensorFlow log records at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

In [4]:
class create_data:
    """Load a CSV file and turn it into a model-ready feature frame.

    Attributes set by ``__init__``:
        data: the raw frame returned by ``pd.read_csv(path)``.
        missing_data: per-column missing-value summary (see ``get_missing``).

    ``chose_model`` additionally sets ``model_df``. Each cleaning helper also
    caches its result on the instance (``drop_data``, ``fill_data``,
    ``ol_data``).
    """

    def __init__(self, path):
        self.data = pd.read_csv(path)
        self.missing_data = self.get_missing()

    def get_missing(self):
        """Summarise missing values per column.

        Returns:
            DataFrame indexed by column name with two columns: 'Total'
            (number of nulls) and 'Percent' (fraction of rows that are null).
        """
        null_mask = self.data.isnull()
        total = null_mask.sum().sort_values(ascending=False)
        # mean of a boolean mask == null count / row count
        percent = null_mask.mean().sort_values(ascending=False)
        return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

    def chose_model(self, model, using_col, cat_col, is_test=False):
        """Build ``self.model_df`` with the selected cleaning strategy.

        Args:
            model: 1 -> ``drop_nan_data``, 2 -> ``deal_nan_data``,
                3 -> ``deal_ol_data``.
            using_col: columns to standardise (zero mean, unit std).
            cat_col: columns to one-hot encode; must contain 'Id' because the
                two parts are joined on it.
            is_test: when False, 'SalePrice' (divided by 1000) is attached as
                the label column; when True no label column is added.

        Raises:
            NotImplementedError: if ``model`` is not 1, 2 or 3.
        """
        if model == 1:
            self.model_df = self.drop_nan_data(self.missing_data)
        elif model == 2:
            self.model_df = self.deal_nan_data()
        elif model == 3:
            self.model_df = self.deal_ol_data()
        else:
            # The original fell through to a non-existent ``self.handle()``.
            raise NotImplementedError(
                'model {!r} is not build yet; choose 1, 2 or 3.'.format(model))

        # Id is kept aside so the scaled and categorical parts can be joined.
        id_lst = self.model_df['Id']

        # Standardise the numeric columns. Any 'Id' / 'SalePrice' that were
        # part of using_col are overwritten with their real values below.
        # NOTE(review): the statistics come from whichever file this instance
        # loaded, so a test file is scaled with its own mean/std - confirm
        # that this is intended.
        scaled_df = self.model_df[using_col]
        scaled_df = (scaled_df - scaled_df.mean()) / scaled_df.std()
        if not is_test:
            # New series instead of an in-place ``/=`` so the cached cleaned
            # frame keeps its original prices.
            scaled_df['SalePrice'] = self.model_df['SalePrice'] / 1000
        scaled_df['Id'] = id_lst

        category_df = self.model_df[cat_col]

        # Join both parts on Id, drop the key, then one-hot encode.
        merged_df = pd.merge(scaled_df, category_df, on='Id')
        merged_df = merged_df.drop(['Id'], axis=1)
        self.model_df = self.dum_data(merged_df)

    def drop_nan_data(self, missing_df):
        """Drop columns with more than one null, then rows lacking 'Electrical'.

        Args:
            missing_df: frame shaped like the result of ``get_missing``.

        Returns:
            The reduced copy of ``self.data`` (also stored as ``self.drop_data``).
        """
        sparse_cols = missing_df[missing_df['Total'] > 1].index
        # ``axis`` must be passed by keyword on current pandas releases.
        drop_data = self.data.drop(sparse_cols, axis=1)
        # 'Electrical' may itself have been removed as a sparse column.
        if 'Electrical' in drop_data.columns:
            null_rows = drop_data.index[drop_data['Electrical'].isnull()]
            drop_data = drop_data.drop(null_rows)
        self.drop_data = drop_data
        return drop_data

    def deal_nan_data(self):
        """Return a copy of the data with null 'BsmtQual' replaced by 'NOVAL'."""
        fill_data = self.data.copy()
        fill_data['BsmtQual'] = fill_data['BsmtQual'].fillna('NOVAL')
        self.fill_data = fill_data
        return fill_data

    def deal_ol_data(self):
        """Return a copy of the data with three area columns capped.

        '1stFlrSF' is capped at 2500, 'GrLivArea' and 'TotalBsmtSF' at 3000.
        """
        ol_data = self.data.copy()
        caps = (('1stFlrSF', 2500), ('GrLivArea', 3000), ('TotalBsmtSF', 3000))
        for col, cap in caps:
            ol_data[col] = ol_data[col].clip(upper=cap)
        self.ol_data = ol_data
        return ol_data

    def dum_data(self, data_df):
        """One-hot encode ``data_df`` with ``pd.get_dummies``."""
        return pd.get_dummies(data_df)
#     def out_liars(self):
    
class create_model:
    """Build, train and evaluate a ``tf.estimator.DNNRegressor``.

    The constructor splits ``train_df`` 80/20 into training and validation
    parts, derives one numeric feature column per non-label column and
    builds the estimator (stored as ``self.lr``).

    Args:
        train_df: prepared frame containing the features and the label.
        label_col: name of the label column.
        learning_rate: learning rate handed to the optimizer.
        steps: total number of training steps run by ``train``.
        batch_size: batch size used for the training input function.
        periods: number of train/evaluate rounds ``steps`` is divided into.
        hidden_units: layer sizes of the DNN.
        load_model: when True, warm-start from ``./model/<load_model_name>``.
        load_model_name: directory name under ``./model/`` to warm-start from.
        save_model: when True, use ``./model/<save_model_name>`` as model_dir.
        save_model_name: directory name under ``./model/`` used as model_dir.
    """

    def __init__(
        self,
        train_df,
        label_col='SalePrice',
        learning_rate=0.02,
        steps=100,
        batch_size=10,
        periods=10,
        hidden_units=[8, 4],
        load_model=False,
        load_model_name='default',
        save_model=False,
        save_model_name='default',
    ):
        self.label = label_col
        # learning rate: optimizer
        self.learning_rate = learning_rate
        # steps, batch_size, periods: train
        self.steps = steps
        self.batch_size = batch_size
        self.periods = periods
        # Copy so the shared default list can never be modified through us.
        self.h_units = list(hidden_units)
        # save / load model
        self.save_model = save_model
        self.load_model = load_model
        self.save_model_name = save_model_name
        self.load_model_name = load_model_name
        self.save_model_path = './model/' + self.save_model_name
        self.load_model_path = './model/' + self.load_model_name
        # split data: train examples/targets, validation examples/targets
        self.te, self.tt, self.ve, self.vt = self.get_split_data(train_df)

        self.feature_cols = self.get_feature_cols(train_df)
        self.optimizer = self.get_optimizer(learning_rate)

        # One construction path for all four load/save combinations. The
        # original duplicated this call four times and one copy hard-coded
        # hidden_units=[20, 10, 5], silently ignoring the argument.
        estimator_kwargs = {
            'hidden_units': self.h_units,
            'feature_columns': self.feature_cols,
            'optimizer': self.optimizer,
        }
        if self.save_model:
            estimator_kwargs['model_dir'] = self.save_model_path
        if self.load_model:
            estimator_kwargs['warm_start_from'] = self.load_model_path
        self.lr = tf.estimator.DNNRegressor(**estimator_kwargs)
        print('build the model')

    def get_split_data(self, train_df):
        """Return (train examples, train targets, vali examples, vali targets)."""
        # sample 80% for train data, 20% for vali data
        train_set, vali_set = self.split_train(train_df, 0.8)
        te, tt = self.get_e_t(train_set)
        ve, vt = self.get_e_t(vali_set)
        return te, tt, ve, vt

    def split_train(self, data_df, per):
        """Randomly split ``data_df``; ``per`` is the training fraction.

        The sample is seeded (random_state=42), so the split is repeatable.
        """
        t_s = data_df.sample(frac=per, replace=False, random_state=42)
        # Everything not sampled goes to validation. ``drop`` keeps the
        # original row order and avoids indexing ``.loc`` with a set, which
        # is unordered and not accepted by current pandas releases.
        v_s = data_df.drop(t_s.index)
        return t_s, v_s

    def get_e_t(self, data_df):
        """Split a frame into (examples without the label, single-column targets)."""
        d_e = data_df.drop(self.label, axis=1)
        d_t = data_df[[self.label]].copy()
        return d_e, d_t

    def get_feature_cols(self, train_df):
        """Return a set with one numeric feature column per non-label column."""
        feature_df = train_df.drop(self.label, axis=1)
        # Iterating a DataFrame yields its column names.
        return {tf.feature_column.numeric_column(name) for name in feature_df}

    def get_optimizer(self, learning_rate):
        """Return a gradient-descent optimizer with gradient norms clipped to 2.0."""
        my_opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
#         my_opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
        my_opt = tf.contrib.estimator.clip_gradients_by_norm(my_opt, 2.0)
        return my_opt

    def my_input_fn(
        self,
        features,
        targets,
        batch_size=1,
        shuffle=True,
        num_epochs=None
    ):
        """Input function for the estimator.

        Args:
            features: pandas DataFrame of features.
            targets: pandas Series (or DataFrame) of targets.
            batch_size: size of the batches passed to the model.
            shuffle: whether to shuffle the dataset.
            num_epochs: number of repeats; None repeats indefinitely.

        Returns:
            Tuple (features, labels) for the next batch.
        """
        # Convert pandas data into a dict of np arrays.
        features = {key: np.array(value) for key, value in dict(features).items()}
        # Construct a dataset, and configure batching/repeating.
        ds = Dataset.from_tensor_slices((features, targets))  # warning: 2GB limit
        ds = ds.batch(batch_size).repeat(num_epochs)
        # Shuffle the data, if specified.
        if shuffle:
            ds = ds.shuffle(10000)
        # Return the next batch of data.
        features, labels = ds.make_one_shot_iterator().get_next()
        return features, labels

    def train(self):
        """Train for ``self.steps`` steps in ``self.periods`` rounds.

        After each round the average loss and RMSE on the training and
        validation sets are computed; the RMSE curves are plotted at the end.
        """
        steps_per_period = self.steps / self.periods
        # create input functions
        training_input_fn = lambda: self.my_input_fn(
            self.te, self.tt[self.label], batch_size=self.batch_size)
        predict_training_input_fn = lambda: self.my_input_fn(
            self.te, self.tt[self.label], num_epochs=1, shuffle=False)
        predict_vali_input_fn = lambda: self.my_input_fn(
            self.ve, self.vt[self.label], num_epochs=1, shuffle=False)

        print('Training model...')
        # recording RMSE
        training_rmse = []
        validation_rmse = []
        for period in range(self.periods):
            self.lr.train(
                input_fn=training_input_fn,
                steps=steps_per_period
            )

            # compute training predictions
            training_predictions = self.lr.predict(input_fn=predict_training_input_fn)
            training_predictions = np.array(
                [item['predictions'][0] for item in training_predictions])
            # compute validation predictions
            validation_predictions = self.lr.predict(input_fn=predict_vali_input_fn)
            validation_predictions = np.array(
                [item['predictions'][0] for item in validation_predictions])
            # estimator-side evaluation
            training_eval = self.lr.evaluate(input_fn=predict_training_input_fn)
            validation_eval = self.lr.evaluate(input_fn=predict_vali_input_fn)
            print("training_eval: {}".format(training_eval['average_loss']))
            print("validation_eval: {}".format(validation_eval['average_loss']))

            # RMSE; arguments in sklearn's (y_true, y_pred) order
            training_root_mean_squared_error = math.sqrt(
                metrics.mean_squared_error(self.tt, training_predictions))
            validation_root_mean_squared_error = math.sqrt(
                metrics.mean_squared_error(self.vt, validation_predictions))
            # Occasionally print the current loss.
            print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
            # Add the loss metrics from this period to our list.
            training_rmse.append(training_root_mean_squared_error)
            validation_rmse.append(validation_root_mean_squared_error)

        print("Model training finished.")
        # No explicit save call is made here; this only reports the setting.
        if self.save_model:
            print("Saving model...")

        # output a graph of loss metrics over periods.
        self.result_plot(training_rmse, validation_rmse)

    def result_plot(self, t_rmse, v_rmse):
        """Plot training and validation RMSE against the period index."""
        plt.ylabel("RMSE")
        plt.xlabel("Periods")
        plt.title("Root Mean Squared Error vs. Periods")
        plt.plot(t_rmse, label="training")
        plt.plot(v_rmse, label="validation")
        plt.legend()
        # Lay out once every artist exists.
        plt.tight_layout()
        

class predict_file:
    """Run a trained model on a feature frame and collect the predictions.

    Args:
        model: object exposing an estimator as ``model.lr``.
        test_df: feature frame fed to the estimator.
        id_frame: frame with one row per row of ``test_df``; a copy of it,
            extended with a 'SalePrice' column, becomes ``self.evaluation``.

    Raises:
        ValueError: if the number of predictions differs from the number of
            rows in ``id_frame``.
    """

    def __init__(self, model, test_df, id_frame):
        self.test_df = test_df
        self.id_frame = id_frame
        self.predict_input_fn = self.create_predict_fn()

        raw_predictions = model.lr.predict(input_fn=self.predict_input_fn)
        self.predictions = np.array(
            [item['predictions'][0] for item in raw_predictions])
        # Scale the predictions by 1000 before they are written out.
        self.predictions = self.predictions * 1000

        if len(self.predictions) != len(self.id_frame):
            raise ValueError(
                'got {} predictions for {} ids'.format(
                    len(self.predictions), len(self.id_frame)))

        # Work on a copy so the caller's id_frame does not gain a column.
        self.evaluation = self.id_frame.copy()
        self.evaluation['SalePrice'] = self.predictions

    def create_predict_fn(self):
        """Return a single-pass, unshuffled pandas input function over ``test_df``."""
        predict_input_fn = tf.estimator.inputs.pandas_input_fn(
            x=self.test_df,
            num_epochs=1,
            shuffle=False
        )
        return predict_input_fn

    def save_predict(self, path):
        """Write ``self.evaluation`` to ``path`` as CSV without the index."""
        self.evaluation.to_csv(path, index=False)

In [5]:
# Model 3: cap outliers (see create_data.deal_ol_data), then fit a DNN.
# earlier column choice: ['SalePrice', 'OverallQual', 'TotalBsmtSF', 'YearBuilt']
m3_using_col = [
    'Id', 'SalePrice', 'OverallQual', 'GrLivArea',
    'TotalBsmtSF', 'FullBath', '1stFlrSF',
]
m3_category_col = ['Id', 'SaleType', 'SaleCondition']
# predict_col = using_col[1:]

# Read the training CSV.
train_df = create_data(train_path)

# Strategy 3: outlier capping, scaling and one-hot encoding.
train_df.chose_model(model=3, using_col=m3_using_col, cat_col=m3_category_col)
train_df.model_df.head()

# Build the estimator, warm-started from ./model/model3_test_v1.
model3 = create_model(
    train_df=train_df.model_df,
    learning_rate=0.08,
    steps=200,
    batch_size=15,
    load_model=True,
    load_model_name="model3_test_v1",
#     save_model=True,
    save_model_name="model3_test_v1",
#     hidden_units=[3]
)

# Training is currently switched off.
# NOTE(review): with train() disabled the estimator relies entirely on the
# warm-start checkpoint; confirm that predicting without a training step
# actually picks those weights up.
# model3.train()


# data = pd.concat([t_df.data['SalePrice'], t_df.data['GrLivArea']], axis=1)
# data.plot.scatter(x='GrLivArea', y='SalePrice', ylim=(0, 800000))

# data2 = pd.concat([t_df.ol_data['SalePrice'], t_df.ol_data['GrLivArea']], axis=1)
# data2.plot.scatter(x='GrLivArea', y='SalePrice', ylim=(0, 800000))


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

build the model


In [8]:
# Feature selection for the test set (no 'SalePrice' column there).
m1_using_col = ['Id', 'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'FullBath', '1stFlrSF']
m1_category_col = ['Id', 'SaleType', 'SaleCondition']

# Load the test CSV and prepare it with strategy 1 in test mode.
test_df = create_data(test_path)
test_df.chose_model(1, m1_using_col, m1_category_col, is_test=True)

# Ids come from the raw file, as a single-column frame.
id_frame = test_df.data[['Id']].copy()

# Predict with model3 and write the result to CSV.
predict = predict_file(model3, test_df.model_df, id_frame)
# predict.evaluation.head()
predict.save_predict('m3_predict.csv')