In [1]:
import re, os, feather
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

### Load Data (parse datetimes)

In [2]:
# % time df_raw = pd.read_csv('tmp_data/Train.csv', low_memory=False, parse_dates=["saledate"])

#### Save as Feather

In [3]:
# os.makedirs('tmp_data', exist_ok=True)
# %time df_raw.to_feather('tmp_data/raw_data')

In [4]:
# Reads the cached copy of the training data. 'tmp_data/raw_data' is only written by the
# commented-out read_csv / to_feather cells above, so on a fresh checkout those cells have
# to be uncommented and run once before this one can succeed.
%time df_raw = pd.read_feather('tmp_data/raw_data')

CPU times: user 1.15 s, sys: 1.25 s, total: 2.4 s
Wall time: 4.96 s


### Check Data Types

In [5]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401125 entries, 0 to 401124
Data columns (total 53 columns):
SalesID                     401125 non-null int64
SalePrice                   401125 non-null int64
MachineID                   401125 non-null int64
ModelID                     401125 non-null int64
datasource                  401125 non-null int64
auctioneerID                380989 non-null float64
YearMade                    401125 non-null int64
MachineHoursCurrentMeter    142765 non-null float64
UsageBand                   69639 non-null object
saledate                    401125 non-null datetime64[ns]
fiModelDesc                 401125 non-null object
fiBaseModel                 401125 non-null object
fiSecondaryDesc             263934 non-null object
fiModelSeries               56908 non-null object
fiModelDescriptor           71919 non-null object
ProductSize                 190350 non-null object
fiProductClassDesc          401125 non-null object
state                

### Convert Non-Numerical Data to Categories and their Codes

In [6]:
def non_numeric_to_cat(dataframe):
    '''Return a copy of `dataframe` with every non-numeric, non-datetime column cast to `category`.

    Numeric columns of any width (int8, float32, ...) and datetime columns, whether
    timezone-naive or timezone-aware, are left untouched. The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose string/object (and other non-numeric) columns should become categoricals.

    Returns
    -------
    pandas.DataFrame
        A copy of `dataframe` with the selected columns converted to `category` dtype.
    '''
    df_cats = dataframe.copy()
    # 'number' covers every numeric width; the bare 'int'/'float' aliases only match some
    # widths, which would let narrower numeric columns be turned into categories.
    non_numeric = dataframe.select_dtypes(exclude=['number', 'datetime', 'datetimetz']).columns
    # Convert column by column so a frame with nothing to convert is simply returned as a copy.
    for col in non_numeric:
        df_cats[col] = df_cats[col].astype('category')
    return df_cats

### Convert NaN values for Numerical Variables
#### Fill NaN values with the column median and add a `<column>_is_null` indicator column to detect whether missing values are significant
#### Create a dictionary with column_name as key, and column_median as value

In [7]:
def null_column_add(dataframe, na_dict=None):
    '''Fill missing numeric values with a per-column median and flag the rows that were filled.

    For each numeric column that contains nulls, a boolean `<column>_is_null` column is
    added and the nulls are replaced by the column median. The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to process.
    na_dict : dict, optional
        Mapping of column name -> fill value from an earlier call. Columns listed here are
        filled with the stored value (instead of this frame's own median) and always receive
        an `_is_null` column, even when this frame has no nulls in them, so that a second
        frame ends up with the same columns as the first. The passed dict is not mutated.

    Returns
    -------
    (pandas.DataFrame, dict)
        The filled copy and the mapping of column name -> fill value that was used.
    '''
    na_dict = {} if na_dict is None else dict(na_dict)
    interp_data = dataframe.copy()
    # 'number' matches every numeric width; the bare 'int'/'float' aliases only match some.
    numeric_cols = dataframe.select_dtypes(include='number').columns
    for col in numeric_cols:
        is_null = dataframe[col].isnull()
        if col not in na_dict:
            if not is_null.any():
                continue
            na_dict[col] = dataframe[col].median()
        interp_data[col + '_is_null'] = is_null
        interp_data[col] = dataframe[col].fillna(na_dict[col])
    return interp_data, na_dict

### Make 2 copies: 1 for feeding into the model and 1 for keeping variable labels

In [8]:
def process_dataframe(dataframe):
    '''Build a labelled frame, a model-ready frame and the median lookup from a raw frame.

    The raw frame is first passed through `non_numeric_to_cat` and then `null_column_add`.
    The result is returned twice: once as-is (categorical columns keep their labels) and
    once with every categorical column replaced by its integer category codes.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame, dict)
        The labelled frame, the integer-coded frame, and the dict returned by
        `null_column_add`.
    '''
    labelled, medians = null_column_add(non_numeric_to_cat(dataframe))
    encoded = labelled.copy()
    categorical = labelled.select_dtypes(['category']).columns
    # Swap each categorical column for its integer codes; the labelled frame is untouched.
    encoded[categorical] = labelled[categorical].apply(lambda series: series.cat.codes)
    return labelled, encoded, medians

In [9]:
interp_data, model_data, na_dict = process_dataframe(df_raw)

In [10]:
na_dict

{'auctioneerID': 2.0, 'MachineHoursCurrentMeter': 0.0}

In [11]:
interp_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401125 entries, 0 to 401124
Data columns (total 55 columns):
SalesID                             401125 non-null int64
SalePrice                           401125 non-null int64
MachineID                           401125 non-null int64
ModelID                             401125 non-null int64
datasource                          401125 non-null int64
auctioneerID                        401125 non-null float64
YearMade                            401125 non-null int64
MachineHoursCurrentMeter            401125 non-null float64
UsageBand                           69639 non-null category
saledate                            401125 non-null datetime64[ns]
fiModelDesc                         401125 non-null category
fiBaseModel                         401125 non-null category
fiSecondaryDesc                     263934 non-null category
fiModelSeries                       56908 non-null category
fiModelDescriptor                   71919 non-null 

In [12]:
model_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401125 entries, 0 to 401124
Data columns (total 55 columns):
SalesID                             401125 non-null int64
SalePrice                           401125 non-null int64
MachineID                           401125 non-null int64
ModelID                             401125 non-null int64
datasource                          401125 non-null int64
auctioneerID                        401125 non-null float64
YearMade                            401125 non-null int64
MachineHoursCurrentMeter            401125 non-null float64
UsageBand                           401125 non-null int8
saledate                            401125 non-null datetime64[ns]
fiModelDesc                         401125 non-null int16
fiBaseModel                         401125 non-null int16
fiSecondaryDesc                     401125 non-null int16
fiModelSeries                       401125 non-null int8
fiModelDescriptor                   401125 non-null int16
ProductS

### Add Datepart

In [13]:
def add_datepart(df, fldname, drop=True, time=False, errors="raise"):
    '''Expand the datetime column `fldname` of `df` into date-part columns, in place.

    New columns are named `<prefix><Part>`, where the prefix is `fldname` with a trailing
    'date'/'Date' removed (e.g. 'saledate' -> 'saleYear', 'saleMonth', ...). An additional
    `<prefix>Elapsed` column holds whole seconds since 1970-01-01 (UTC for timezone-aware
    columns). Rows whose date is missing (NaT) get NaN in Week and Elapsed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.
    fldname : str
        Name of the date column. If it is not already a datetime column it is parsed with
        `pd.to_datetime` and the parsed values are written back to `df[fldname]`.
    drop : bool, default True
        Remove `fldname` from `df` once the parts have been added.
    time : bool, default False
        Also add Hour, Minute and Second columns.
    errors : str, default "raise"
        Passed through to `pd.to_datetime`.
    '''
    fld = df[fldname]
    # Public dtype check that covers both timezone-naive and timezone-aware datetimes.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld, errors=errors)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week no longer exists in current pandas; isocalendar() replaces it.
            # Cast its nullable result to a plain numpy dtype like the other parts.
            week = fld.dt.isocalendar().week
            df[targ_pre + n] = week.astype('float64' if week.isnull().any() else 'int64')
        else:
            df[targ_pre + n] = getattr(fld.dt, n.lower())
    # Subtracting the epoch gives seconds regardless of the column's stored resolution,
    # which a raw integer cast followed by // 10**9 would silently assume to be nanoseconds.
    epoch = pd.Timestamp('1970-01-01', tz=None if fld.dt.tz is None else 'UTC')
    df[targ_pre + 'Elapsed'] = (fld - epoch) // pd.Timedelta(seconds=1)
    if drop: df.drop(fldname, axis=1, inplace=True)

In [14]:
# add_datepart modifies the frame it is given and, with its default drop=True, removes
# 'saledate'. Re-running this cell without first rebuilding the frames therefore fails on
# the missing column.
add_datepart(interp_data, 'saledate')
add_datepart(model_data, 'saledate')

In [15]:
model_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401125 entries, 0 to 401124
Data columns (total 67 columns):
SalesID                             401125 non-null int64
SalePrice                           401125 non-null int64
MachineID                           401125 non-null int64
ModelID                             401125 non-null int64
datasource                          401125 non-null int64
auctioneerID                        401125 non-null float64
YearMade                            401125 non-null int64
MachineHoursCurrentMeter            401125 non-null float64
UsageBand                           401125 non-null int8
fiModelDesc                         401125 non-null int16
fiBaseModel                         401125 non-null int16
fiSecondaryDesc                     401125 non-null int16
fiModelSeries                       401125 non-null int8
fiModelDescriptor                   401125 non-null int16
ProductSize                         401125 non-null int8
fiProductClassDesc

#### Save as Feather

In [17]:
# %time interp_data.to_feather('tmp_data/interp_data')
# %time model_data.to_feather('tmp_data/model_data')

In [18]:
# Replaces the in-memory frames built above with the cached copies on disk. The two files
# are only written by the commented-out to_feather cell above, so that cell has to be run
# once (and re-run after any change to the processing functions) for this to load
# up-to-date data.
%time interp_data = pd.read_feather('tmp_data/interp_data')
%time model_data = pd.read_feather('tmp_data/model_data')

CPU times: user 284 ms, sys: 205 ms, total: 489 ms
Wall time: 943 ms
CPU times: user 85.4 ms, sys: 132 ms, total: 217 ms
Wall time: 515 ms


### Test on Model

In [19]:
# Features are every column except the target, SalePrice.
X,y = model_data.drop('SalePrice',axis=1), model_data['SalePrice']

# No random_state is passed, so the fitted forest (and the score below) can differ
# slightly between runs.
model = RandomForestRegressor(n_jobs=-1)
%time model.fit(X,y)
# NOTE: scored on the same rows the model was fitted on, so this is a training-set score
# and not an estimate of performance on unseen data (see "Train/Test Split" in Phase 2).
model.score(X,y)

CPU times: user 1min 47s, sys: 1.46 s, total: 1min 49s
Wall time: 38.6 s


0.9824672333771376

## Phase 2:
### Train/Test Split
### Feature Importances
### Remove non-predictive features
### Remove redundant features
### Test Model on Get Dummies to find important predictors within a category
### Use Partial Dependency Plots
### Extrapolate data 