In [2]:
# Import modules
%matplotlib inline

import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier 
from urllib.request import urlopen 

# Use the 'ggplot' style sheet for matplotlib figures drawn in this notebook.
plt.style.use('ggplot')
# Let pandas display up to 500 columns before it starts eliding them.
pd.set_option('display.max_columns', 500) 

In [3]:
# Load data
# Both CSV files are read from the relative 'mobile_price/' directory, so the
# working directory must contain that folder.
# NOTE(review): `test` is not referenced again in the cells visible here.
train = pd.read_csv('mobile_price/train.csv')
test = pd.read_csv('mobile_price/test.csv')

In [4]:
# Show the first five rows of the training frame as plain text.
print(train[:5])

   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  pc  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2   2         20       756  2549     9     7         19   
1        136        3   6        905      1988  2631    17     3          7   
2        145        5   6       1263      1716  2603    11     2          9   
3        131        6   9       1216      1786  2769    16     8         11   
4        141        2  14       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range 

In [5]:
# Create two derived features:
#   area_pixel = px_height * px_width
#   area_cm    = sc_h * sc_w
train['area_pixel'] = train['px_height']*train['px_width']
train['area_cm'] = train['sc_h']*train['sc_w']

In [6]:
# Delete the unnecessary raw columns that the two area features were built from.
train = train.drop(columns=['px_height', 'px_width', 'sc_h', 'sc_w'])

In [7]:
# Feature names, in the same order as the model's input columns: every column
# of `train` except the 'price_range' target. Deriving the list from the frame
# (instead of typing it by hand) keeps the labels aligned with the column
# positions used when ranking feature importances.
names_index = [column for column in train.columns if column != 'price_range']

Inspect the data to get a general understanding of what we're working with.

In [8]:
# Column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  ram            2000 non-null   int64  
 12  talk_time      2000 non-null   int64  
 13  three_g        2000 non-null   int64  
 14  touch_screen   2000 non-null   int64  
 15  wifi           2000 non-null   int64  
 16  price_range    2000 non-null   int64  
 17  area_pixel     2000 non-null   int64  
 18  area_cm 

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
train.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,ram,talk_time,three_g,touch_screen,wifi,price_range,area_pixel,area_cm
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,9.9165,2124.213,11.011,0.7615,0.503,0.507,1.5,905260.1,80.257
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,6.064315,1084.732044,5.463955,0.426273,0.500116,0.500076,1.118314,829762.5,76.824156
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,0.0,256.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,5.0,1207.5,6.0,1.0,0.0,0.0,0.75,263200.5,19.0
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,10.0,2146.5,11.0,1.0,1.0,1.0,1.5,601359.0,55.0
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,15.0,3064.5,16.0,1.0,1.0,1.0,2.25,1359027.0,121.5
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,20.0,3998.0,20.0,1.0,1.0,1.0,3.0,3886306.0,342.0


Prepare the data for training.

In [10]:
# Separate the predictors from the target column; both pieces stay DataFrames.
target_column = 'price_range'
feature_space = train.drop(columns=[target_column])
feature_class = train[[target_column]]

In [11]:
# Hold out 25% of the rows for evaluation; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(feature_space,
                                                    feature_class,
                                                    test_size = 0.25, 
                                                    random_state = 42)


In [12]:
# Display the training feature matrix.
X_train

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,ram,talk_time,three_g,touch_screen,wifi,area_pixel,area_cm
1738,511,0,0.9,1,15,1,24,0.6,136,3,18,2378,4,1,0,0,463888,54
548,641,1,1.1,0,0,1,7,0.9,192,1,3,3595,19,1,1,1,316503,35
936,805,0,0.8,0,1,0,34,0.6,88,6,2,3647,9,1,0,1,1940956,198
1389,1801,0,0.5,1,6,0,52,0.9,120,6,10,258,9,1,0,0,170800,112
1607,744,0,1.7,1,0,1,33,0.5,105,4,2,2700,19,1,0,0,2457676,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,1975,1,1.9,1,2,0,31,0.9,151,1,17,3022,19,0,0,1,1245425,65
1294,589,1,0.5,0,1,1,59,0.7,146,8,4,362,6,1,1,1,1410222,160
860,1829,1,0.5,0,0,1,15,0.4,160,5,7,2080,12,1,0,1,923643,176
1459,1927,0,0.9,1,3,0,11,0.4,190,8,12,2916,18,0,1,1,739446,176


Call ravel() to turn each single-column target frame into a contiguous, flattened 1-D array.

In [10]:
# Flatten the single-column targets into 1-D arrays.
# np.ravel accepts both the DataFrame produced by the split and an array that
# has already been flattened, so this cell can be re-run safely; calling
# `.values.ravel()` a second time would fail because an ndarray has no
# `.values` attribute.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

In [12]:
# Report the dimensions of each piece of the split, one per line.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(1500, 18)
(500, 18)
(1500,)
(500,)


In [13]:
# Create the random forest model; random_state=42 seeds it so fits are reproducible.
fit_rf = RandomForestClassifier(random_state=42)

Use GridSearchCV to find the best hyperparameters for our model.

In [15]:
np.random.seed(42)
start = time.time()

# Hyperparameter grid that GridSearchCV evaluates exhaustively.
# 'auto' is deliberately not listed under max_features: for classifiers it was
# an alias of 'sqrt' (so it only duplicated work), and scikit-learn 1.3 removed
# it, so on current releases those candidates would just be failed fits.
param_dist = {'max_depth': [7, 8, 9, 10],
              'bootstrap': [True, False],
              'max_features': ['sqrt', 'log2', None],
              'criterion': ['gini', 'entropy']}

# 10-fold cross-validation over every combination, using 3 parallel jobs.
cv_rf = GridSearchCV(fit_rf, cv = 10,
                     param_grid=param_dist, 
                     n_jobs = 3)

cv_rf.fit(X_train, y_train)
print('Best Parameters using grid search: \n', 
      cv_rf.best_params_)
# Stop the clock after the search so the elapsed time covers the whole fit.
end = time.time()
print('Time taken in grid search: {0: .2f}'.format(end - start))

Best Parameters using grid search: 
 {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 9, 'max_features': None}
Time taken in grid search:  90.85


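We'll configure our random forest to split on entropy (information gain) and limit each tree to a maximum depth of 9. Note that scikit-learn's trees are built with an optimized CART algorithm rather than ID3; only the split criterion changes.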

In [17]:
# Set the parameters found for the model.
# These values are a hard-coded copy of the best parameters printed by the
# grid search, so they must be updated by hand if the search is re-run with a
# different result.
fit_rf.set_params(criterion = 'entropy',
                  max_features = None, 
                  bootstrap = True,
                  max_depth = 9)

RandomForestClassifier(criterion='entropy', max_depth=9, max_features=None,
                       random_state=42)

In [19]:
# Train the model on the training split.
fit_rf.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=9, max_features=None,
                       random_state=42)

Define two helper functions to show the importance of each feature.

In [20]:
def variable_importance(fit):
    """
    Purpose
    ----------
    Checks that `fit` is a trained model exposing feature importances, then
    returns the importance scores together with the ranking of the columns.

    Parameters
    ----------
    * fit:  Fitted model containing the attribute feature_importances_
            (a tree ensemble or a single tree)

    Returns
    ----------
    Dictionary containing two arrays:
      * 'importance': importance scores in the original column order
      * 'index': column positions ordered in descending order of importance
    Returns None, after printing the reason, when `fit` is not a model or
    does not appear to be trained.
    """
    if not hasattr(fit, 'fit'):
        print("'{0}' is not an instantiated model from scikit-learn".format(fit))
        return None

    # Ensembles keep their fitted trees in `estimators_`. len() is used instead
    # of truthiness because some ensembles store the trees in a NumPy array,
    # whose truth value is ambiguous and would raise ValueError.
    estimators = getattr(fit, 'estimators_', None)
    if estimators is not None and len(estimators) == 0:
        print("Model does not appear to be trained.")
        return None

    # Reading the attribute is the authoritative check: an untrained
    # scikit-learn model raises NotFittedError (an AttributeError subclass)
    # here, and a model without importances has no such attribute at all.
    try:
        importances = fit.feature_importances_
    except AttributeError:
        print("Model does not appear to be trained or does not expose "
              "'feature_importances_'.")
        return None

    indices = np.argsort(importances)[::-1]
    return {'importance': importances,
            'index': indices}

In [22]:
def print_var_importance(importance, indices, names_index):
    """
    Purpose
    ----------
    Prints one line per feature, ranked from the most to the least important
    according to the supplied ordering.

    Parameters
    ----------
    * importance: Array of importance scores in dataframe column order
                (as returned by feature_importances_)
    * indices: Array of column positions ordered from largest to smallest
                importance score
    * names_index: Names of the columns included in the model, in dataframe
                column order

    Returns
    ----------
    None; the ranking is written to standard output
    """
    print("Feature ranking:")

    line_template = "{0}. The feature '{1}' has a Mean Decrease in Impurity of {2:.5f}"
    total = indices.shape[0]
    for rank in range(1, total + 1):
        # `rank` is 1-based for display; the ranking array itself is 0-based.
        column = indices[rank - 1]
        print(line_template.format(rank, names_index[column], importance[column]))

In [21]:


# Compute the importances once, then unpack the scores and the ranking.
var_imp_rf = variable_importance(fit_rf)
importances_rf, indices_rf = var_imp_rf['importance'], var_imp_rf['index']

In [23]:
# Print the ranked importances, labelling each column with its name from names_index.
print_var_importance(importances_rf, indices_rf, names_index)

Feature ranking:
1. The feature 'ram' has a Mean Decrease in Impurity of 0.66625
2. The feature 'area_pixel' has a Mean Decrease in Impurity of 0.13183
3. The feature 'battery_power' has a Mean Decrease in Impurity of 0.13041
4. The feature 'mobile_wt' has a Mean Decrease in Impurity of 0.01529
5. The feature 'int_memory' has a Mean Decrease in Impurity of 0.01051
6. The feature 'area_cm' has a Mean Decrease in Impurity of 0.00732
7. The feature 'talk_time' has a Mean Decrease in Impurity of 0.00719
8. The feature 'm_dep' has a Mean Decrease in Impurity of 0.00617
9. The feature 'clock_speed' has a Mean Decrease in Impurity of 0.00565
10. The feature 'pc' has a Mean Decrease in Impurity of 0.00530
11. The feature 'n_cores' has a Mean Decrease in Impurity of 0.00496
12. The feature 'fc' has a Mean Decrease in Impurity of 0.00333
13. The feature 'dual_sim' has a Mean Decrease in Impurity of 0.00117
14. The feature 'blue' has a Mean Decrease in Impurity of 0.00116
15. The feature 'four_g'

As we can see, the top five features, 'ram', 'area_pixel', 'battery_power', 'mobile_wt' and 'int_memory', contribute the most to the model's final prediction. The first three alone account for roughly 93% of the total importance, with 'ram' dominating.

In [25]:
# Mean accuracy of the fitted forest on the held-out split.
accuracy_rf = fit_rf.score(X_test, y_test)

accuracy_message = "Here is our mean accuracy on the test set:\n {0:.3f}"
print(accuracy_message.format(accuracy_rf))

Here is our mean accuracy on the test set:
 0.908


As we can see, the model achieves 90.8% accuracy on the test set (the 25% of train.csv that was held out from training).

In [26]:
# Predicted class labels for the held-out split.
y_pred = fit_rf.predict(X_test)

In [28]:
# Confusion matrix of the held-out predictions (rows: true class, columns: predicted class).
print(confusion_matrix(y_test, y_pred))

[[122  10   0   0]
 [  6 108   4   0]
 [  0  10 100  10]
 [  0   0   6 124]]


In [29]:
# Per-class precision, recall and F1 score, printed with three decimal places.
print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.953     0.924     0.938       132
           1      0.844     0.915     0.878       118
           2      0.909     0.833     0.870       120
           3      0.925     0.954     0.939       130

    accuracy                          0.908       500
   macro avg      0.908     0.907     0.906       500
weighted avg      0.910     0.908     0.908       500

