In [3]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, normalize
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split

import xgboost as xg
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV

import seaborn as sns
import matplotlib.pyplot as plt
from seaborn import barplot
%matplotlib inline

# IMPORT TRAIN DATA

In [19]:
# Read the raw training file, set the row identifier aside, and drop the
# auctioneer column.
df = pd.read_csv('Train.csv', low_memory=False)
sid = df.pop('SalesID').values
df = df.drop('auctioneerID', axis=1)
# Keep only the calendar year of the sale date.
df['saledate'] = pd.to_datetime(df['saledate']).dt.year

# ENCODE CATEGORICAL COLUMNS AS INTEGERS USING LABEL ENCODER

In [20]:
class MultiColumnLabelEncoder:
    """Integer-encode several DataFrame columns, one LabelEncoder per column.

    Parameters
    ----------
    columns : sequence of str, optional
        Names of the columns to encode. When None, every column of the frame
        passed to ``transform`` is encoded.

    Notes
    -----
    The encoder is stateless: ``fit`` learns nothing and ``transform`` fits a
    fresh ``LabelEncoder`` on whatever frame it receives. Codes produced by
    two separate calls (for example on a training frame and on a test frame)
    are derived independently of each other.
    """

    def __init__(self, columns=None):
        self.columns = columns  # column names to encode; None means all columns

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for fit/transform chaining)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to encode; it is copied, not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` in which each selected column has been replaced by
            the output of ``LabelEncoder().fit_transform`` on that column.
        """
        output = X.copy()
        # Iterate over column labels instead of DataFrame.iteritems(), which
        # was removed in pandas 2.0; this also lets both cases share one loop.
        targets = self.columns if self.columns is not None else list(output.columns)
        for col in targets:
            output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X, y).transform(X)

In [21]:
# Names of the columns that MultiColumnLabelEncoder converts to integer codes.
# The same list is applied to both the training frame and the test frame.
lst = ['MachineID',
 'ModelID',
 'datasource',
 'UsageBand',
 'fiModelDesc',
 'fiBaseModel',
 'fiSecondaryDesc',
 'fiModelSeries',
 'fiModelDescriptor',
 'ProductSize',
 'fiProductClassDesc',
 'state',
 'ProductGroup',
 'ProductGroupDesc',
 'Drive_System',
 'Enclosure',
 'Forks',
 'Pad_Type',
 'Ride_Control',
 'Stick',
 'Transmission',
 'Turbocharged',
 'Blade_Extension',
 'Blade_Width',
 'Enclosure_Type',
 'Engine_Horsepower',
 'Hydraulics',
 'Pushblock',
 'Ripper',
 'Scarifier',
 'Tip_Control',
 'Tire_Size',
 'Coupler',
 'Coupler_System',
 'Grouser_Tracks',
 'Hydraulics_Flow',
 'Track_Type',
 'Undercarriage_Pad_Width',
 'Stick_Length',
 'Thumb',
 'Pattern_Changer',
 'Grouser_Type',
 'Backhoe_Mounting',
 'Blade_Type',
 'Travel_Controls',
 'Differential_Type',
 'Steering_Controls']

# Convert new Encoded DataFrame to X, y Matrices

In [22]:
# Label-encode the listed columns, then replace every remaining missing value
# with a large sentinel.
df2 = (
    MultiColumnLabelEncoder(columns=lst)
    .fit_transform(df)
    .fillna(999999)
)
# Split off the target; what is left in df2 is the feature table.
y = df2.pop('SalePrice').values
X = df2.values

In [64]:
# Persist the encoded feature frame; index = True also writes the row index as the first CSV column.
df2.to_csv("training_data_cleaned.csv", index = True)


In [66]:
# Save the SalePrice target array next to the feature file.
np.savetxt("training_data_cleaned_y.csv", y)

In [23]:
# Sanity check: the feature matrix and the target must have the same number of rows.
X.shape, y.shape

((401125, 50), (401125,))

# Split Data

In [24]:
# Hold out part of the labelled data so the model can be scored on unseen rows.
# random_state pins the shuffle so the split, and the scores computed from it, are reproducible.
# NOTE(review): train_size = 0.30 trains on only 30% of the rows; confirm that test_size = 0.30 was not intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.30, random_state = 42)

In [25]:
# model = RandomForestRegressor()
# model.fit(X_train, y_train)
# model.score(X_test, y_test)

# RUN GRIDSEARCH FOR BEST RANDOM FOREST REGRESSOR

In [26]:
# Candidate settings: two ensemble sizes crossed with three max_features
# rules, tree depth left unlimited.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None],
    max_features=['auto', 'sqrt', 'log2'],
)
model = RandomForestRegressor()
# Rank candidates by negated mean squared error, using every available core.
gs_cv = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
gs_cv.fit(X_train, y_train)

gs_cv.best_params_, gs_cv.best_score_, gs_cv.best_estimator_

# Fit Model to Training Data

In [27]:
# Only n_estimators differs from the library defaults, so pass just that.
# The previous call spelled out every default, including arguments that later
# scikit-learn releases removed (min_impurity_split) or no longer accept
# (criterion='mse', max_features='auto'), which makes the cell fail there.
model = RandomForestRegressor(n_estimators=50)

In [28]:
# Train the forest on the training split.
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

# IMPORT/CLEAN FINAL TEST DATA

In [29]:
# Read the submission file and give it the same clean-up as the training
# file: drop the auctioneer column, set the row identifier aside, and reduce
# the sale date to its year.
dft = pd.read_csv('test.csv', low_memory=False).drop('auctioneerID', axis=1)
Sales_ID = dft.pop('SalesID').values
dft['saledate'] = pd.to_datetime(dft['saledate']).dt.year

In [30]:
# Label-encode the same column list used for the training frame, then apply
# the same missing-value sentinel.
# NOTE(review): fit_transform fits fresh LabelEncoders on `dft` alone, so the
# integer codes here are derived from the test values only and may not line up
# with the codes the model was trained on. Confirm, and if so reuse encoders
# fitted on the training frame.
dft2 = (
    MultiColumnLabelEncoder(columns=lst)
    .fit_transform(dft)
    .fillna(999999)
)
X_test_data = dft2.values

In [65]:
# Persist the encoded test features; index = True also writes the row index as the first CSV column.
dft2.to_csv("testing_data_cleaned.csv", index = True)

In [31]:
# The column count here must equal that of the training matrix X for predict() to accept it.
X_test_data.shape

(11573, 50)

# PREDICT ON TEST DATA FILE

In [32]:
# Predict a sale price for every row of the submission file.
predicted_values = model.predict(X_test_data)

In [41]:
# Turn both 1-D arrays into single-column 2-D arrays. Using -1 lets NumPy
# infer the row count, so the cell keeps working if the test file changes size
# (the row count used to be hard-coded as 11573).
predicted_values = predicted_values.reshape(-1, 1)
Sales_ID = Sales_ID.reshape(-1, 1)

In [44]:
# Pair each SalesID with its prediction as a two-column array.
# The integer IDs are upcast to float by the stack (visible in the output below).
final = np.column_stack((Sales_ID, predicted_values))
final

array([[ 1222837.,    30820.],
       [ 1222839.,    75760.],
       [ 1222841.,    46540.],
       ..., 
       [ 6333347.,    12930.],
       [ 6333348.,    14270.],
       [ 6333349.,    18440.]])

In [56]:
# Build the submission frame from `final` right here, so this cell does not
# rely on `final_df` having been created by a cell that appears further down.
final_df = pd.DataFrame(final, columns=['SalesID', 'SalePrice']).round(1)
# np.column_stack upcast the integer IDs to float; restore them to int.
final_df['SalesID'] = final_df['SalesID'].astype(int)
final_df

Unnamed: 0,SalesID,SalePrice
0,1222837,30820.0
1,1222839,75760.0
2,1222841,46540.0
3,1222843,33380.0
4,1222845,33265.0
5,1222847,9071.0
6,1222849,32405.0
7,1222850,32195.0
8,1222855,11395.0
9,1222863,26190.0


In [47]:
# Build the submission frame with prices rounded to one decimal and SalesID
# cast back to int. This cell sits directly above the to_csv cell, so leaving
# the IDs as the floats produced by np.column_stack would write them to the
# CSV as e.g. 1222837.0 when the notebook is run top to bottom.
final_df = pd.DataFrame(final, columns=['SalesID', 'SalePrice']).round(1)
final_df['SalesID'] = final_df['SalesID'].astype(int)

In [58]:
# Write the submission file; index = False leaves the row index out of the CSV.
final_df.to_csv("rafa_shawn_predictions.csv", index = False)

In [50]:
# Series has no .float() method; round to one decimal instead, matching the
# final_df.round(1) call used elsewhere in this notebook.
final_df['SalePrice'].round(1)

In [52]:
# Limit NumPy's printed float precision to two decimals (a global print setting).
np.set_printoptions(precision=2)
# final_f is another name for the same array, not a copy.
final_f = predicted_values
final_f

array([[ 30820.],
       [ 75760.],
       [ 46540.],
       ..., 
       [ 12930.],
       [ 14270.],
       [ 18440.]])

In [59]:
# Score the fitted forest on the held-out part of the labelled data (R^2 for a regressor).
model.score(X_test, y_test)

0.88437349664012799

In [60]:
# Same search space as the earlier grid search: 100 or 200 trees, unlimited
# depth, three max_features rules.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None],
    max_features=['auto', 'sqrt', 'log2'],
)
model = RandomForestRegressor()
search_options = dict(scoring='neg_mean_squared_error', n_jobs=-1)
gs_cv = GridSearchCV(model, param_grid, **search_options)
gs_cv.fit(X_train, y_train)

gs_cv.best_params_, gs_cv.best_score_, gs_cv.best_estimator_

({'max_depth': None, 'max_features': 'auto', 'n_estimators': 200},
 -65740028.645814724,
 RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))

In [61]:
# Score the grid search's best estimator on the same held-out rows used for the earlier model.score call.
gs_cv.best_estimator_.score(X_test, y_test)

0.8861219346434176

In [None]:
# Single-candidate grid: cross-validate one forest with 500 trees.
param_grid = dict(
    n_estimators=[500],
    max_depth=[None],
    max_features=['auto'],
)
model = RandomForestRegressor()
gs_cv = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
gs_cv.fit(X_train, y_train)

gs_cv.best_params_, gs_cv.best_score_, gs_cv.best_estimator_