source: https://github.com/aymericdamien/TensorFlow-Examples/blob/master/notebooks/2_BasicModels/random_forest.ipynb


In [16]:
import tensorflow as tf
from tensorflow.python.ops import resources
from tensorflow.contrib.tensor_forest.python import tensor_forest

import pandas as pd
import numpy as np

from dateutil.parser import parse


# Hide all GPUs from this process: the TF random forest does not benefit from them.
# The variable must be spelled exactly "CUDA_VISIBLE_DEVICES"; an empty value
# exposes no CUDA devices. NOTE(review): this needs to run before TensorFlow
# first initializes its devices (e.g. before the first session) -- confirm.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [17]:
# Load the training data, indexed by SalesID.
# A forward slash is accepted on every platform (including Windows) and avoids
# the invalid '\B' escape sequence of the previous backslash-separated literal.
raw = pd.read_csv('data/BullDozersTrain.csv', index_col='SalesID')
len(raw)

401125

In [18]:
# Display the dtype pandas inferred for every column of the raw frame.
raw.dtypes

SalePrice                     int64
MachineID                     int64
ModelID                       int64
datasource                    int64
auctioneerID                float64
YearMade                      int64
MachineHoursCurrentMeter    float64
UsageBand                    object
saledate                     object
fiModelDesc                  object
fiBaseModel                  object
fiSecondaryDesc              object
fiModelSeries                object
fiModelDescriptor            object
ProductSize                  object
fiProductClassDesc           object
state                        object
ProductGroup                 object
ProductGroupDesc             object
Drive_System                 object
Enclosure                    object
Forks                        object
Pad_Type                     object
Ride_Control                 object
Stick                        object
Transmission                 object
Turbocharged                 object
Blade_Extension             

In [50]:
def add_times(df, column):
    """Add year, month and day columns derived from a datetime column.

    The new columns are written into ``df`` in place and are named
    ``<column>Year``, ``<column>Month`` and ``<column>Day``.

    Args:
        df: frame to extend; modified in place.
        column: name of a column in ``df`` that supports the ``.dt`` accessor.

    Returns:
        The names of the added columns, in the order Year, Month, Day.
    """
    # (suffix of the new column, attribute read from the .dt accessor)
    components = (('Year', 'year'), ('Month', 'month'), ('Day', 'day'))

    created = []
    for suffix, attribute in components:
        new_name = column + suffix
        df[new_name] = getattr(df[column].dt, attribute)
        created.append(new_name)

    return created

def convert_to_date(df, column):
    """Parse a text column of ``month/day/year hour:minute`` stamps into datetimes.

    The parsed values replace the original column of ``df`` in place; nothing
    is returned.

    Args:
        df: frame holding the column; modified in place.
        column: name of the column to parse.
    """
    # Format codes: https://docs.python.org/2.7/library/datetime.html#datetime.datetime.strptime
    timestamp_format = '%m/%d/%Y %H:%M'
    parsed = pd.to_datetime(df[column], format=timestamp_format)
    df[column] = parsed

class CleanedDataFrame:
    """Bundle of a cleaned frame and the names of its input and output columns.

    All three attributes start empty and are assigned by the caller after
    construction.
    """

    def __init__(self):
        # Names of the columns designated as inputs.
        self.input_columns = []
        # Names of the columns designated as outputs.
        self.output_columns = []
        # The cleaned frame; None until one is assigned.
        self.df = None

    def __repr__(self):
        """Summarize the column lists and the frame's shape (``None`` if no frame is set)."""
        # A freshly constructed instance has df=None; report that instead of
        # raising AttributeError when reading `.shape` from None.
        shape = None if self.df is None else self.df.shape
        return f'output_columns:{str(self.output_columns)} input_columns:{str(self.input_columns)} df:{shape}'

def clean_data_frame(raw_df):
    """Build a CleanedDataFrame from the raw frame.

    Works on a copy of ``raw_df``. The result keeps only ``SalePriceLog``
    (natural log of ``SalePrice``) and the year/month/day columns derived from
    ``saledate``; every other column is dropped.

    Args:
        raw_df: raw frame with at least ``SalePrice`` and ``saledate`` columns.

    Returns:
        A CleanedDataFrame whose ``output_columns`` is ``['SalePriceLog']``,
        whose ``input_columns`` lists the derived date columns, and whose
        ``df`` is the reduced frame.
    """
    frame = raw_df.copy()

    frame['SalePriceLog'] = np.log(frame.SalePrice)
    targets = ['SalePriceLog']

    # Snapshot the columns to discard *now*, before the date features exist,
    # so that only the original raw columns (not the derived ones added
    # below) are scheduled for removal.
    discard = [name for name in frame if name not in targets]

    convert_to_date(frame, 'saledate')
    features = list(add_times(frame, 'saledate'))

    frame = frame.drop(columns=discard)

    cleaned = CleanedDataFrame()
    cleaned.input_columns = features
    cleaned.output_columns = targets
    cleaned.df = frame
    return cleaned

In [51]:
# Clean the raw frame; the bare `result` on the last line displays its repr
# (column lists plus the shape of the cleaned frame).
result = clean_data_frame(raw)
result

output_columns:['SalePriceLog'] input_columns:['saledateYear', 'saledateMonth', 'saledateDay'] df:(401125, 4)

In [53]:
# Display the dtypes of the columns that survived cleaning.
result.df.dtypes

SalePriceLog     float64
saledateYear       int64
saledateMonth      int64
saledateDay        int64
dtype: object