In [1]:
# Import modules
%matplotlib inline

import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier 
from urllib.request import urlopen 

# Notebook-wide display settings: ggplot styling for every figure, and let
# pandas render up to 500 columns so wide frames are not truncated.
plt.style.use('ggplot')
pd.options.display.max_columns = 500

In [2]:
# Load data: read the training and test CSVs from the mobile_price folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('mobile_price/train.csv', 'mobile_price/test.csv')
)

In [3]:
# Preview the first five rows of the training data.
print(train.head(5))

   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  pc  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2   2         20       756  2549     9     7         19   
1        136        3   6        905      1988  2631    17     3          7   
2        145        5   6       1263      1716  2603    11     2          9   
3        131        6   9       1216      1786  2769    16     8         11   
4        141        2  14       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  price_range 

In [4]:
# Engineered feature area_pixel: pixel resolution, px_height * px_width.
# Engineered feature area_cm: screen area, sc_h * sc_w
# (units presumably cm, going by the column name -- confirm against the data dictionary).
train['area_pixel'] = train['px_height'].mul(train['px_width'])
train['area_cm'] = train['sc_h'].mul(train['sc_w'])

In [5]:
# Feature names, grouped: device spec columns first, then the two engineered
# area features, then the raw dimensions those areas are computed from.
# The target column price_range is deliberately not part of this list.
names_index = [
    # device spec columns
    'battery_power', 'blue', 'clock_speed', 'dual_sim',
    'fc', 'four_g', 'int_memory', 'm_dep',
    'mobile_wt', 'n_cores', 'pc', 'ram',
    'talk_time', 'three_g', 'touch_screen', 'wifi',
    # engineered area features
    'area_pixel', 'area_cm',
    # raw pixel / screen dimensions
    'px_height', 'px_width', 'sc_h', 'sc_w',
]

Inspect the data to get a general understanding of what we're working with.

In [6]:
# Show each column's dtype and non-null count, to check for missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
train.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range,area_pixel,area_cm
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,9.9165,645.108,1251.5155,2124.213,12.3065,5.767,11.011,0.7615,0.503,0.507,1.5,905260.1,80.257
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,6.064315,443.780811,432.199447,1084.732044,4.213245,4.356398,5.463955,0.426273,0.500116,0.500076,1.118314,829762.5,76.824156
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,0.0,0.0,500.0,256.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,5.0,282.75,874.75,1207.5,9.0,2.0,6.0,1.0,0.0,0.0,0.75,263200.5,19.0
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,10.0,564.0,1247.0,2146.5,12.0,5.0,11.0,1.0,1.0,1.0,1.5,601359.0,55.0
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,15.0,947.25,1633.0,3064.5,16.0,9.0,16.0,1.0,1.0,1.0,2.25,1359027.0,121.5
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,20.0,1960.0,1998.0,3998.0,19.0,18.0,20.0,1.0,1.0,1.0,3.0,3886306.0,342.0


Prepare the data to train

In [8]:
# Separate predictors from the target: every column except price_range is a
# feature; price_range alone (kept as a one-column frame) is the label.
is_target = train.columns == 'price_range'
feature_space = train.loc[:, ~is_target]
feature_class = train.loc[:, is_target]

In [9]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(feature_space, feature_class,
                               test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts


Call ravel() to return a contiguous flattened array.

In [10]:
# Flatten the single-column label frames into 1-D arrays.
# np.asarray accepts both the DataFrame produced by the split and an array
# that has already been flattened, so re-running this cell is harmless
# (accessing `.values` on an ndarray would raise AttributeError on a second run).
y_train = np.asarray(y_train).ravel()
y_test = np.asarray(y_test).ravel()

In [11]:
# Sanity-check the shapes of the four split pieces, in order:
# X_train, X_test, y_train, y_test.
for split_piece in (X_train, X_test, y_train, y_test):
    print(split_piece.shape)

(1500, 22)
(500, 22)
(1500,)
(500,)


Create the random forest model.

In [15]:

# Base random forest with a fixed seed; it is handed to GridSearchCV for
# tuning and later configured through set_params.
fit_rf = RandomForestClassifier(random_state=42)

Using GridSearchCV to find the best parameter for our model.

In [16]:
np.random.seed(42)
start = time.time()

# Hyperparameter grid to search.
# 'auto' is intentionally not listed under max_features: for
# RandomForestClassifier it was an alias of 'sqrt' (so it only duplicated
# work), and it was deprecated in scikit-learn 1.1 and removed in 1.3, where
# every candidate using it fails to fit.
param_dist = {'max_depth': [7, 8, 9, 10],
              'bootstrap': [True, False],
              'max_features': ['sqrt', 'log2', None],
              'criterion': ['gini', 'entropy']}

# Exhaustive search over the grid with 10-fold cross-validation on 3 workers.
cv_rf = GridSearchCV(fit_rf, cv = 10,
                     param_grid=param_dist,
                     n_jobs = 3)

cv_rf.fit(X_train, y_train)
# Stop the clock right after fitting so the timing excludes the printing below.
end = time.time()

print('Best Parameters using grid search: \n',
      cv_rf.best_params_)
print('Time taken in grid search: {0: .2f}'.format(end - start))

Best Parameters using grid search: 
 {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 8, 'max_features': None}
Time taken in grid search:  106.33


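We'll initialize our random forest model with the parameters found by the grid search: each tree uses the entropy (information gain) split criterion, and the max depth of each tree will be 8. (Note that scikit-learn's trees are based on an optimized CART algorithm rather than ID3 itself.)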

In [18]:
# Apply the settings reported as best by the grid search to the estimator.
best_settings = dict(criterion='entropy',
                     max_features=None,
                     bootstrap=True,
                     max_depth=8)
fit_rf.set_params(**best_settings)

RandomForestClassifier(criterion='entropy', max_depth=8, max_features=None,
                       random_state=42)

In [19]:
# Train the model: fit the configured random forest on the training split.
fit_rf.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=8, max_features=None,
                       random_state=42)

Define two helper functions to show the importance level of each feature.

In [20]:
# Mean accuracy of the fitted forest on the held-out test split.
accuracy_rf = fit_rf.score(X_test, y_test)

report_template = "Here is our mean accuracy on the test set:\n {0:.3f}"
print(report_template.format(accuracy_rf))

Here is our mean accuracy on the test set:
 0.908


As we can see, the model achieves 90.8% accuracy on the test set.

In [21]:
# Predicted price_range class for every row of the test split.
y_pred = fit_rf.predict(X_test)

In [22]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_test, y_pred))

[[121  11   0   0]
 [  7 110   1   0]
 [  0  11  98  11]
 [  0   0   5 125]]


In [23]:
# Per-class precision, recall, F1 and support, reported to three decimals.
print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.945     0.917     0.931       132
           1      0.833     0.932     0.880       118
           2      0.942     0.817     0.875       120
           3      0.919     0.962     0.940       130

    accuracy                          0.908       500
   macro avg      0.910     0.907     0.906       500
weighted avg      0.911     0.908     0.908       500

