This project is based on the Kaggle competition

[Blue Book for Bulldozers](https://www.kaggle.com/c/bluebook-for-bulldozers)

Objective - Predict the auction sale price for a piece of heavy equipment to create a "blue book" for bulldozers

We're going to take the following approach:  

1. Problem definition
2. Data  
    Train and Valid are combined in TrainAndValid. Both of them have the price column.
3. Evaluation
4. Features
5. Modelling
6. Experimentation


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from datetime import datetime

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder#, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import classification_report

In [2]:
# Load the data dictionary and abbreviate "machine configuration" to "MC"
# so that more of each description fits in the rendered table.
data_dictionary = pd.read_excel("Data Dictionary.xlsx").assign(
    Description=lambda frame: frame["Description"].str.replace("machine configuration", "MC")
)
data_dictionary

Unnamed: 0,Variable,Description,Unnamed: 2
0,SalesID,unique identifier of a particular sale of a ...,
1,MachineID,identifier for a particular machine; machin...,
2,ModelID,identifier for a unique machine model (i.e. ...,
3,datasource,source of the sale record; some sources are...,
4,auctioneerID,"identifier of a particular auctioneer, i.e. ...",
5,YearMade,year of manufacturer of the Machine,
6,MachineHoursCurrentMeter,current usage of the machine in hours at tim...,
7,UsageBand,"value (low, medium, high) calculated compari...",
8,Saledate,time of sale,
9,Saleprice,cost of sale in USD,


In [3]:
# TrainAndValid.csv is the combination of the train and valid sets (2 separate CSVs).
trainAndValid = pd.read_csv(
    "TrainAndValid.csv",
    parse_dates=["saledate"],
    low_memory=False,
)
trainAndValid.head()

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,saledate,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,1139246,66000.0,999089,3157,121,3.0,2004,68.0,Low,2006-11-16,...,,,,,,,,,Standard,Conventional
1,1139248,57000.0,117657,77,121,3.0,1996,4640.0,Low,2004-03-26,...,,,,,,,,,Standard,Conventional
2,1139249,10000.0,434808,7009,121,3.0,2001,2838.0,High,2004-02-26,...,,,,,,,,,,
3,1139251,38500.0,1026470,332,121,3.0,2001,3486.0,High,2011-05-19,...,,,,,,,,,,
4,1139253,11000.0,1057373,17311,121,3.0,2007,722.0,Medium,2009-07-23,...,,,,,,,,,,


In [4]:
# The training set on its own, with `saledate` parsed as a datetime column.
train = pd.read_csv(
    "Train.csv",
    parse_dates=["saledate"],
    low_memory=False,
)
train.head()

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,saledate,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,1139246,66000,999089,3157,121,3.0,2004,68.0,Low,2006-11-16,...,,,,,,,,,Standard,Conventional
1,1139248,57000,117657,77,121,3.0,1996,4640.0,Low,2004-03-26,...,,,,,,,,,Standard,Conventional
2,1139249,10000,434808,7009,121,3.0,2001,2838.0,High,2004-02-26,...,,,,,,,,,,
3,1139251,38500,1026470,332,121,3.0,2001,3486.0,High,2011-05-19,...,,,,,,,,,,
4,1139253,11000,1057373,17311,121,3.0,2007,722.0,Medium,2009-07-23,...,,,,,,,,,,


In [5]:
#Splitting the data in X and y
# train_X = train.drop("SalePrice", axis = 1)
# train_y = train["SalePrice"]

In [6]:
# Frame dimensions, followed by the number of missing values in each column.
trainAndValid.shape, trainAndValid.isna().sum()

((412698, 53),
 SalesID                          0
 SalePrice                        0
 MachineID                        0
 ModelID                          0
 datasource                       0
 auctioneerID                 20136
 YearMade                         0
 MachineHoursCurrentMeter    265194
 UsageBand                   339028
 saledate                         0
 fiModelDesc                      0
 fiBaseModel                      0
 fiSecondaryDesc             140727
 fiModelSeries               354031
 fiModelDescriptor           337882
 ProductSize                 216605
 fiProductClassDesc               0
 state                            0
 ProductGroup                     0
 ProductGroupDesc                 0
 Drive_System                305611
 Enclosure                      334
 Forks                       214983
 Pad_Type                    331602
 Ride_Control                259970
 Stick                       331602
 Transmission                224691
 Turbocharged

In [7]:
# There is not much point in inspecting the test data: it will look the same
# as train and valid.
test = pd.read_csv(
    "Test.csv",
    parse_dates=["saledate"],
    low_memory=False,
)
test

Unnamed: 0,SalesID,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,saledate,fiModelDesc,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,1227829,1006309,3168,121,3,1999,3688.0,Low,2012-05-03,580G,...,,,,,,,,,,
1,1227844,1022817,7271,121,3,1000,28555.0,High,2012-05-10,936,...,,,,,,,,,Standard,Conventional
2,1227847,1031560,22805,121,3,2004,6038.0,Medium,2012-05-10,EC210BLC,...,None or Unspecified,"9' 6""",Manual,None or Unspecified,Double,,,,,
3,1227848,56204,1269,121,3,2006,8940.0,High,2012-05-10,330CL,...,None or Unspecified,None or Unspecified,Manual,Yes,Triple,,,,,
4,1227863,1053887,22312,121,3,2005,2286.0,Low,2012-05-10,650K,...,,,,,,None or Unspecified,PAT,None or Unspecified,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12452,6643171,2558317,21450,149,2,2008,,,2012-10-24,80NX3,...,None or Unspecified,None or Unspecified,None or Unspecified,None or Unspecified,Double,,,,,
12453,6643173,2558332,21434,149,2,2005,,,2012-10-24,28N,...,None or Unspecified,None or Unspecified,None or Unspecified,None or Unspecified,Double,,,,,
12454,6643184,2558342,21437,149,2,1000,,,2012-10-24,35N,...,None or Unspecified,None or Unspecified,None or Unspecified,None or Unspecified,Double,,,,,
12455,6643186,2558343,21437,149,2,2006,,,2012-10-24,35N,...,None or Unspecified,None or Unspecified,None or Unspecified,None or Unspecified,Double,,,,,


In [8]:
# Abandoned attempt: DataFrame.info() prints its summary and returns None,
# so it cannot be wrapped in a DataFrame to capture the null counts.
# test_null = pd.DataFrame(test.info())
# test_null

# Print the dtype and non-null count of every column in the test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12457 entries, 0 to 12456
Data columns (total 52 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   SalesID                   12457 non-null  int64         
 1   MachineID                 12457 non-null  int64         
 2   ModelID                   12457 non-null  int64         
 3   datasource                12457 non-null  int64         
 4   auctioneerID              12457 non-null  int64         
 5   YearMade                  12457 non-null  int64         
 6   MachineHoursCurrentMeter  2129 non-null   float64       
 7   UsageBand                 1834 non-null   object        
 8   saledate                  12457 non-null  datetime64[ns]
 9   fiModelDesc               12457 non-null  object        
 10  fiBaseModel               12457 non-null  object        
 11  fiSecondaryDesc           8482 non-null   object        
 12  fiModelSeries     

In [9]:
# (rows, columns) of each frame, in the order: trainAndValid, test, train.
tuple(frame.shape for frame in (trainAndValid, test, train))


((412698, 53), (12457, 52), (401125, 53))

In [10]:
# #slicing the time part and converting in datetime format
# # from datetime import datetime

# def slice_and_convert(df):
#     temp = df["saledate"].str.slice(stop = -5) #extracting th date


#     df["saledate"] = pd.to_datetime(temp, 
#                           format = "%m/%d/%y", #format
#                           infer_datetime_format=True) #deal with multiple format
    

## Pipeline

Which didn't work

In [11]:
%%time
np.random.seed(42)

fields_categorical = ["fiProductClassDesc", "state", "fiBaseModel", 
                              "ProductGroup", "ProductGroupDesc", 
                              "Enclosure", "Hydraulics"]

categorical_transformer = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = 'constant', fill_value = 'missing')),
    ("OrdinalEncoder", OrdinalEncoder())])

fields_numercial = ["SalesID", "MachineID", 
                             "ModelID", "datasource", 
                             "auctioneerID", "YearMade"]

numeric_transform = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "median"))
])

preprocessor = ColumnTransformer(transformers = [
    ("cat", categorical_transformer, fields_categorical),
    ("num", numeric_transform, fields_numercial)
])

model = Pipeline(steps = [("preprocessor", preprocessor),
                         ("model", RandomForestRegressor(max_samples = 10000))])

# Taking a sample test set
# we need to take a random sample with result so the order doesn't disturb

train_X = train[fields_categorical + fields_numercial]
train_y = train["SalePrice"]

training_X, valid_X, training_y, valid_y = train_test_split(train_X, 
                                                            train_y, 
                                                            test_size = 0.2)

model.fit(training_X, training_y);

Wall time: 9.3 s


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('OrdinalEncoder',
                                                                   OrdinalEncoder())]),
                                                  ['fiProductClassDesc',
                                                   'state', 'fiBaseModel',
                                                   'ProductGroup',
                                                   'ProductGroupDesc',
                                                   'Enclosure', 'Hydraulics']),
                                                 ('num',
                             

Even though the structure of the two is the same, the second part, `Pipeline.score()`, is not working.

In [12]:
# model.score(valid_X, valid_y)

In [13]:
# #creating a sample which will be preprocessed to fit a model

# np.random.seed(42)

# train_sample = train.sample(frac = 1.0)

# fields_numercial = ["SalesID", "MachineID", 
#                              "ModelID", "datasource",
#                              "auctioneerID", "YearMade"]

# #Imputing the numeric data
# num_trans = SimpleImputer(strategy='median')
# train_sample[fields_numercial] = num_trans.fit_transform(train_sample[fields_numercial])
# # num_trans.transform([fields_numerical])

# fields_categorical = ["fiProductClassDesc", "state", "fiBaseModel", 
#                       "ProductGroup", "Hydraulics",
#                       "ProductGroupDesc", "Enclosure"]

# # #Imputing the categorical data
# cat_trans = SimpleImputer(strategy='constant', fill_value='Missing')
# train_sample[fields_categorical] = cat_trans.fit_transform(train_sample[fields_categorical])
# # cat_trans.transform([fields_categorical])

# cat_enc = OrdinalEncoder()
# train_sample[fields_categorical] = cat_enc.fit_transform(train_sample[fields_categorical])

In [14]:
# #check the transformed data 
# train_sample[fields_categorical].info()

In [15]:
# #convert the type from float to int

# # train_sample[fields_categorical] = train_sample[fields_categorical].apply(pd.to_numeric)
# train_sample[fields_categorical + fields_numercial] = train_sample[fields_categorical + fields_numercial].astype(int)
# train_sample[fields_categorical + fields_numercial].info()

Now that we have transformed the data into a format suitable for training, we can split it into X and y, and into train and test sets.

In [16]:
# np.random.seed(42)

# train_sample_X = train_sample[fields_categorical + fields_numercial]
# train_sample_y = train_sample["SalePrice"]

In [17]:
# not required for RandomizedSearchCV
# sample_train_X, sample_test_X, \
# sample_train_y, sample_test_y = train_test_split(train_sample_X, train_sample_y, 
#                                                  test_size = 0.2)  
# Hyperparameter search space for RandomizedSearchCV over a RandomForestRegressor.
rf_grid = {"n_estimators" : np.arange(10, 100, 10),        # number of trees
          "max_depth": [None, 3, 5, 10],
          "min_samples_split" : np.arange(2, 20, 2),
          "min_samples_leaf" : np.arange(1, 20, 2),
          # "auto" was deprecated in scikit-learn 1.1 and removed in 1.3; for
          # regressors it meant "use all features", which is spelled 1.0 (float).
          # The int 1 is a different setting: exactly one feature per split.
          "max_features" : [0.5, 1, "sqrt", 1.0],
          "max_samples" : [10000]}                         # rows drawn per tree

In [18]:
# %%time 
# # Use the random grid to search for best hyperparameters
# # First create the base model to tune
# model = RandomForestRegressor()
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = model, 
#                                param_distributions = rf_grid, 
#                                n_iter = 30, cv = 3, verbose=2, 
#                                random_state=42, )

# rf_random.fit(train_sample_X, train_sample_y)

In [19]:
# rf_random.best_params_


In [20]:
# rf_random.best_estimator_

In [21]:

# Number of trees in the random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 2000, num = 10)]

# Number of features to consider at every split.
# "auto" was deprecated in scikit-learn 1.1 and removed in 1.3; for regressors
# it meant "use all features", which is spelled 1.0.
max_features = [1.0, 'sqrt']

# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree.
# Only bootstrap=True is searched: `max_samples` is set below, and
# RandomForestRegressor.fit raises a ValueError when `max_samples` is combined
# with bootstrap=False, so those candidates could never be fitted.
bootstrap = [True]

# Create the random grid
# NOTE(review): max_samples is 1000 here but 10000 in the other cells - confirm
# that this is intentional.
random_grid_2 = {'n_estimators': n_estimators,
                 'max_features': max_features,
                 'max_depth': max_depth,
                 'min_samples_split': min_samples_split,
                 'min_samples_leaf': min_samples_leaf,
                 'bootstrap': bootstrap,
                 'max_samples': [1000]}


With 2% data, accuracy was 69%  
With 20% data, accuracy was 75.94%

In [None]:
# deleting variable not required to save space (RAM)
# del validSolution, valid, trainAndValid, test, rfbt,\
# machine_appendix, data_dictionary, median_benchmark

# # to delete the variable
# import gc
# gc.collect()

In [None]:
# %%time
# # best_parameter = {'n_estimators': 1200,
# #  'min_samples_split': 2,
# #  'min_samples_leaf': 2,
# #  'max_features': 'sqrt',
# #  'max_depth': 50,
# #  'bootstrap': False}

# np.random.seed(42)

# train_sample_X = train_sample[fields_categorical + fields_numercial]
# train_sample_y = train_sample["SalePrice"]

# #to save the space (RAM)
# # del train_sample

# # not requied in RandomSearchCV
# sample_train_X, sample_valid_X, \
# sample_train_y, sample_valid_y = train_test_split(train_sample_X, train_sample_y, 
#                                                  test_size = 0.2)  
# #to save space (RAM) we will delete
# # del train_sample_X, train_sample_y


# # model_2 = RandomForestRegressor(n_estimators = 400, min_samples_split = 2, min_samples_leaf = 2, 
# #                                 max_features = 'sqrt', max_depth = 30, bootstrap =  False,
# #                                 max_samples = 10000)
# model_2 = RandomForestRegressor(max_samples = 10000)

# model_2.fit(sample_train_X, sample_train_y)
# model_2.score(sample_valid_X, sample_valid_y)

Revisit - 75%, with max_samples = 10000

In [None]:
# temp = pd.DataFrame(data = model_2.feature_importances_, index = sample_train_X.columns, )
# temp.plot.bar();

In [None]:
# model.fit(sample_train_X.drop(["datasource", "Hydraulics"], axis = 1), sample_train_y)
# model.score(sample_test_X.drop(["datasource", "Hydraulics"], axis = 1), sample_test_y)

Without "datasource" and "Hydraulics", accuracy was 76.05%

In [None]:
# print(model.classification_report)

To reduce the number of features, we will use `np.where`.

After hyperparameter tuning and 2% data - 71%  
After hyperparameter tuning and 20% data - 77.93%  
After hyperparameter tuning and 100% data - 81.94%


 ## Save the Model