Now that you've learned about random forests and decision trees let's do an exercise in accuracy. You know that random forests are basically a collection of decision trees. But how do the accuracies of the two models compare?

So here's what you should do. Pick a dataset. It could be one you've worked with before or it could be a new one. Then build the best decision tree you can.

Now try to match that with the simplest random forest you can. For our purposes measure simplicity with runtime. Compare that to the runtime of the decision tree. This is imperfect but just go with it.

Hopefully out of this you'll see the power of random forests, but also their potential costs. Remember, in the real world you won't necessarily be dealing with thousands of rows. It could be millions, billions, or even more.

In [101]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [102]:
# We import the data from csv file:
df = pd.read_csv('think capstone1_slim.csv' ,encoding ='latin1')


In [103]:
# Get a feel for the data:
df.head()

Unnamed: 0,YYYY_MM,IdProduct,Product_Name,Manufacturer_Name,Product Classification,IdLocation,Drugstore_Name,Total_Sales,No_of_transactions,No_of_medical_prescriptions,Units_Sold,Net_Profit,row_num_sales,row_num_profit
0,2015-1,20177,Nut Seringa Ject 10 Ml Cu Ac,MINUT,_DispozitivMedicalMultiplu,3,Stad,45.73,15,0,86.0,18.07,2,3
1,2015-1,2130,Panthen Unguent 50 Mg /G X 100 G,BAYER SCHERING,Cosmetice,3,Stad,265.55,4,0,5.0,46.66,1,1
2,2015-1,18,Oderma Abc Derm Gel Spumant X 200 Ml,BIODERMA,Cosmetice,3,Stad,37.42,1,0,1.0,9.45,14,9
3,2015-1,20697,Ltene Tratament Pt Unghii,FOLTENE,Cosmetice,3,Stad,48.9,1,0,1.0,4.38,8,33
4,2015-1,25030,Nsiprod Tensiometru Pentru Brat,PANSIPROD,Dispozitive Medicale,3,Stad,408.16,2,0,2.0,96.13,1,2


In [104]:
df=df.drop(['IdProduct','row_num_sales','row_num_profit','IdLocation','No_of_transactions'] , axis=1)

In [105]:
# What kind of data is in the dataframe?
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1686 entries, 0 to 1685
Data columns (total 9 columns):
YYYY_MM                        1686 non-null object
Product_Name                   1686 non-null object
Manufacturer_Name              1686 non-null object
Product Classification         1686 non-null object
Drugstore_Name                 1686 non-null object
Total_Sales                    1686 non-null float64
No_of_medical_prescriptions    1686 non-null int64
Units_Sold                     1686 non-null float64
Net_Profit                     1686 non-null float64
dtypes: float64(3), int64(1), object(5)
memory usage: 118.6+ KB
None


In [106]:
categorical = df.select_dtypes(include=['object'])
for i in categorical:
    column = categorical[i]
    print('Column name:',i ,' \t\t/ Unique values:', column.nunique())

Column name: YYYY_MM  		/ Unique values: 3
Column name: Product_Name  		/ Unique values: 488
Column name: Manufacturer_Name  		/ Unique values: 132
Column name: Product Classification  		/ Unique values: 12
Column name: Drugstore_Name  		/ Unique values: 5


In [107]:
pd.get_dummies(df)

Unnamed: 0,Total_Sales,No_of_medical_prescriptions,Units_Sold,Net_Profit,YYYY_MM_2015-1,YYYY_MM_2015-2,YYYY_MM_2015-3,Product_Name_ - Spa 40 Mg X 24 Tb.,Product_Name_ - Spa Forte 80 Mg X 24 Cpr,Product_Name_ Insulina 12 Mm X 1 Buc.,...,Product Classification_Parafarm.,Product Classification_RX,Product Classification_RX - 120,Product Classification_Suplimente,Product Classification__DispozitivMedicalMultiplu,Drugstore_Name_Bica,Drugstore_Name_Buhu,Drugstore_Name_Ener,Drugstore_Name_Repu,Drugstore_Name_Stad
0,45.73,0,86.00,18.07,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,265.55,0,5.00,46.66,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,37.42,0,1.00,9.45,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,48.90,0,1.00,4.38,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,408.16,0,2.00,96.13,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,41.94,0,1.00,13.86,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,169.68,1,2.00,8.84,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,378.08,0,2.00,-3.00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8,736.71,0,68.85,154.69,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,129.33,0,3.00,34.51,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


# Decision Tree:

In [108]:
# This is the model we'll be using.
from sklearn import tree

In [109]:
X = df.drop('Net_Profit', axis=1)
Y = df['Net_Profit']
X = pd.get_dummies(X)
X = X.dropna(axis=1)

In [110]:
from sklearn.model_selection import train_test_split

# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=20)

print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)


(1348, 643) (1348,)
(338, 643) (338,)


In [111]:
from sklearn.tree import DecisionTreeRegressor
import numpy as np

import time
start_time = time.time()

regressor = DecisionTreeRegressor(max_depth=5)
regressor.fit( X_train , y_train )

print("--- %s seconds ---" % (time.time() - start_time))

--- 0.018047571182250977 seconds ---


In [112]:
R2_tesr = regressor.score(X_test,y_test)
R2_trained

0.8163744253148448

### To make predictions on the test set, use the predict method:

In [115]:
y_pred = regressor.predict(X_test) 


In [116]:
# Now let's compare some of our predicted values with the actual values and see how accurate we were:
df_compare=pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})  
df_compare  

Unnamed: 0,Actual,Predicted
1566,555.20,391.210870
1569,203.11,247.160000
799,765.17,611.807089
1040,744.33,736.180000
307,79.39,49.410736
975,1485.84,1194.968182
996,32.04,49.410736
544,276.66,247.160000
460,95.54,177.088995
1155,10.74,6.529397


### Evaluating the Algorithm
To evaluate performance of the regression algorithm, the commonly used metrics are 
- mean absolute error, 
- mean squared error, and 
- root mean squared error. 

In [117]:
from sklearn import metrics  
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred))) 

Mean Absolute Error: 66.83262966437677
Mean Squared Error: 12262.641985253413
Root Mean Squared Error: 110.7368140468806


In [118]:
# Calculate the absolute errors
errors = abs(y_pred - y_test)

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 243.52 %.


### Overfitting ...?

### Cross validation for Decision Tree:

In [121]:
from sklearn.model_selection import cross_val_score

print(cross_val_score(regressor, X, Y, cv=10))

[0.784 0.791 0.848 0.894 0.691 0.797 0.912 0.872 0.77  0.826]


### Random Forest

In [78]:
# This is the model we'll be using.
from sklearn import tree

In [79]:
X = df.drop('Net_Profit', axis=1)
Y = df['Net_Profit']
X = pd.get_dummies(X)
X = X.dropna(axis=1)

In [80]:
from sklearn.model_selection import train_test_split

# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=20)

print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)


(1348, 643) (1348,)
(338, 643) (338,)


In [81]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

import time
start_time = time.time()

# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(X_train, y_train);

print("--- %s seconds ---" % (time.time() - start_time))

--- 15.240860939025879 seconds ---


In [82]:
col_names = X.columns
col_names

Index(['Total_Sales', 'No_of_medical_prescriptions', 'Units_Sold',
       'YYYY_MM_2015-1', 'YYYY_MM_2015-2', 'YYYY_MM_2015-3',
       'Product_Name_ - Spa 40 Mg X 24 Tb.',
       'Product_Name_ - Spa Forte 80 Mg X 24 Cpr',
       'Product_Name_ Insulina 12 Mm X 1 Buc.',
       'Product_Name_. Soleil Sampon Antiparazitar Uman X 200 Ml',
       ...
       'Product Classification_Parafarm.', 'Product Classification_RX',
       'Product Classification_RX - 120', 'Product Classification_Suplimente',
       'Product Classification__DispozitivMedicalMultiplu',
       'Drugstore_Name_Bica', 'Drugstore_Name_Buhu', 'Drugstore_Name_Ener',
       'Drugstore_Name_Repu', 'Drugstore_Name_Stad'],
      dtype='object', length=643)

In [83]:
print ("Features sorted by their score:")
feat_importance = sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), col_names), 
             reverse=True)

feat_importance[0:10]

Features sorted by their score:


[(0.6503, 'Total_Sales'),
 (0.0761, 'No_of_medical_prescriptions'),
 (0.0703, 'Units_Sold'),
 (0.0352, 'Product_Name_O-Dis L-Arginina 500 Mg X 50 Cps.'),
 (0.0317, 'Manufacturer_Name_BIO-DIS'),
 (0.031, 'Product Classification_PROPRII'),
 (0.0132, 'Drugstore_Name_Buhu'),
 (0.0093, 'Manufacturer_Name_ANTIBIOTICE'),
 (0.0048, 'Product Classification_OTC'),
 (0.0038, 'Product_Name_Penter 75 Mg X 28 Cpr. Film.')]

### Make Predictions on the Test Set
We compare the predictions to the known answers. When performing regression, we need to make sure to use the absolute error because we expect some of our answers to be low and some to be high. We are interested in how far away our average prediction is from the actual value so we take the absolute value

In [84]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2 ), 'Net_Profit')

Mean Absolute Error: 36.58 Net_Profit


In [85]:
# Now let's compare some of our predicted values with the actual values and see how accurate we were:
df_compare=pd.DataFrame({'Actual':y_test, 'Predicted':predictions})  
df_compare

Unnamed: 0,Actual,Predicted
1566,555.20,570.47825
1569,203.11,168.96022
799,765.17,587.29355
1040,744.33,636.80225
307,79.39,87.71612
975,1485.84,1269.58047
996,32.04,44.30425
544,276.66,294.93167
460,95.54,242.19076
1155,10.74,10.66814


### Determine Performance Metrics
To put our predictions in perspective, we can calculate an accuracy using the mean average percentage error subtracted from 100 %.

In [86]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 117.41 %.


In [87]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

import time
start_time = time.time()

rfc = ensemble.RandomForestRegressor()
X = df.drop('Net_Profit', axis=1)
Y = df['Net_Profit']
X = pd.get_dummies(X)
X = X.dropna(axis=1)

print(cross_val_score(rfc, X, Y, cv=10))


print("--- %s seconds ---" % (time.time() - start_time))

[0.89827373 0.94069872 0.90922433 0.96520229 0.91943638 0.92954568
 0.92598974 0.94882975 0.87440189 0.91787411]
--- 1.8850045204162598 seconds ---


## SelectKbest

In [88]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [92]:
# feature extraction
kbest = SelectKBest(score_func=f_regression, k=4)

#Fitting finds the internal parameters of a model that will be used to transform data. 
fit = kbest.fit(X, Y)

# summarize scores
np.set_printoptions(precision=3)
print(fit.scores_)

#Transforming applies the parameters to data. You may fit a model to one set of data, and then 
#transform it on a completely different set.
features = fit.transform(X)
# summarize selected features
print(features[0:5,:])

[1.627e+03 9.076e+02 1.032e+00 1.261e-02 2.374e-03 2.560e-02 5.552e-02
 8.104e-02 3.785e-01 8.968e-01 2.721e-01 6.743e-01 2.006e-01 7.426e-03
 1.453e+00 3.611e-02 1.240e-02 2.242e+00 2.828e-01 2.971e-01 3.693e+01
 1.168e-01 5.808e+00 1.305e+00 9.177e-01 4.506e-03 3.009e-01 1.578e+00
 1.839e+00 9.071e-01 6.291e-01 2.765e-01 3.028e-01 9.295e-01 5.828e-02
 4.304e+01 4.411e-02 3.393e-01 2.360e-01 7.094e-01 4.275e-01 1.541e-02
 1.012e+00 1.645e+00 5.714e-01 5.429e+00 4.029e-02 3.336e+00 1.125e+00
 2.337e-01 1.985e+00 5.661e-02 1.299e+00 1.416e+00 4.610e+00 3.015e-01
 3.037e-01 3.031e-01 1.120e+00 3.334e-01 3.082e-01 8.351e-01 5.317e-01
 1.742e-01 2.670e-01 2.010e-01 8.509e-02 1.446e+00 1.672e+00 5.082e+01
 3.793e-01 1.094e+00 1.440e+00 2.956e-01 3.549e-01 5.847e+01 5.905e-01
 3.402e-01 2.850e-01 7.790e-01 4.859e-01 7.848e-01 1.415e+00 3.388e-01
 1.089e+00 2.868e-01 1.719e+02 1.115e+00 9.413e+00 2.468e-01 4.125e-02
 8.771e-02 1.213e-01 3.307e-01 3.853e-02 4.541e-01 2.920e-02 7.618e-01
 1.197

In [94]:
selected_features = kbest.fit_transform(X, Y)

In [96]:
selected_features[[0:]]

array([[4.573e+01, 0.000e+00, 0.000e+00, 0.000e+00],
       [2.656e+02, 0.000e+00, 0.000e+00, 0.000e+00],
       [3.742e+01, 0.000e+00, 0.000e+00, 0.000e+00],
       ...,
       [3.930e+03, 7.500e+01, 0.000e+00, 1.000e+00],
       [1.749e+03, 0.000e+00, 0.000e+00, 0.000e+00],
       [6.090e+02, 0.000e+00, 0.000e+00, 0.000e+00]])

### SVM Parameter Tuning in Scikit Learn using GridSearchCV:
- grid search cross validation

In [122]:
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn import svm
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge




In [123]:
X = df.drop('Net_Profit', axis=1)
Y = df['Net_Profit']
X = pd.get_dummies(X)
X = X.dropna(axis=1)

In [124]:
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])

In [125]:
# create and fit a ridge regression model, testing each alpha
model = Ridge()

By default, the GridSearchCV’s cross validation uses 3-fold KFold or StratifiedKFold depending on the situation.


In [126]:
estimator = RandomForestRegressor()

param_grid = { 
            "n_estimators"      : [10,20,30],
            "max_features"      : ["auto", "sqrt", "log2"],
            "min_samples_split" : [2,4,8],
            "bootstrap": [True, False],
            }

In [127]:
grid = GridSearchCV(estimator, param_grid ,  n_jobs=-1, cv=5 )
grid.fit( X, Y )
print(grid)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [True, False], 'min_samples_split': [2, 4, 8], 'max_features': ['auto', 'sqrt', 'log2'], 'n_estimators': [10, 20, 30]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)


In [128]:
# summarize the results of the grid search
print('best_score_: ',grid.best_score_)
print('best_estimator_: ',grid.best_estimator_)
print('best_params_: ',grid.best_params_)

best_score_:  0.9272174141736841
best_estimator_:  RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
best_params_:  {'n_estimators': 30, 'min_samples_split': 2, 'bootstrap': True, 'max_features': 'auto'}
