In [2]:
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection

In [4]:
if __name__=='__main__':
  df = pd.read_csv('https://raw.githubusercontent.com/Nithya-Vasudevan/Mobile-Price-Classification/master/train.csv')
  X = df.drop('price_range', axis=1).values
  y = df.price_range.values

  classifier = ensemble.RandomForestClassifier(n_jobs=-1)
  param_grid = {
      'n_estimators':[100,200,300,400],
      'max_depth':[1,3,5,7],
      'criterion':['gini','entropy'],

  }
  model = model_selection.GridSearchCV(
      estimator=classifier,
      param_grid = param_grid,
      scoring = 'accuracy',
      verbose=10,
      n_jobs = 1,
      cv=5
  )

  model.fit(X,y)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV 1/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 1/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.580 total time=   0.3s
[CV 2/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 2/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.593 total time=   0.3s
[CV 3/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 3/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.593 total time=   0.3s
[CV 4/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 4/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.588 total time=   0.3s
[CV 5/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 5/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.588 total time=   0.3s
[CV 1/5; 2/32] START criterion=gini, max_de

In [5]:
model.best_score_

0.873

In [6]:
model.best_estimator_

In [7]:
model.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 7,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 300,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [11]:
#Random Search CV
if __name__=='__main__':
  df = pd.read_csv('https://raw.githubusercontent.com/Nithya-Vasudevan/Mobile-Price-Classification/master/train.csv')
  X = df.drop('price_range', axis=1).values
  y = df.price_range.values

  classifier = ensemble.RandomForestClassifier(n_jobs=-1)
  param_grid = {
      'n_estimators':np.arange(100,1500,100),
      'max_depth':np.arange(1,20),
      'criterion':['gini','entropy'],

  }
  model = model_selection.RandomizedSearchCV(
      estimator=classifier,
      param_distributions = param_grid,
      n_iter=10,
      scoring = 'accuracy',
      verbose=10,
      n_jobs = 1,
      cv=5
  )

  model.fit(X,y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START criterion=entropy, max_depth=5, n_estimators=1400..........
[CV 1/5; 1/10] END criterion=entropy, max_depth=5, n_estimators=1400;, score=0.843 total time=   4.8s
[CV 2/5; 1/10] START criterion=entropy, max_depth=5, n_estimators=1400..........
[CV 2/5; 1/10] END criterion=entropy, max_depth=5, n_estimators=1400;, score=0.840 total time=   5.7s
[CV 3/5; 1/10] START criterion=entropy, max_depth=5, n_estimators=1400..........
[CV 3/5; 1/10] END criterion=entropy, max_depth=5, n_estimators=1400;, score=0.868 total time=   4.9s
[CV 4/5; 1/10] START criterion=entropy, max_depth=5, n_estimators=1400..........
[CV 4/5; 1/10] END criterion=entropy, max_depth=5, n_estimators=1400;, score=0.838 total time=   5.2s
[CV 5/5; 1/10] START criterion=entropy, max_depth=5, n_estimators=1400..........
[CV 5/5; 1/10] END criterion=entropy, max_depth=5, n_estimators=1400;, score=0.848 total time=   5.7s
[CV 1/5; 2/10] START cri

In [12]:
model.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 19,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1300,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [13]:
model.best_score_

0.8879999999999999

In [14]:
from sklearn import preprocessing
from sklearn import decomposition
from sklearn import pipeline

In [15]:
#using pipeline
#Random Search CV
if __name__=='__main__':
  df = pd.read_csv('https://raw.githubusercontent.com/Nithya-Vasudevan/Mobile-Price-Classification/master/train.csv')
  X = df.drop('price_range', axis=1).values
  y = df.price_range.values


  scl = preprocessing.StandardScaler()
  pca = decomposition.PCA()
  rf = ensemble.RandomForestClassifier(n_jobs=-1)

  classifier = pipeline.Pipeline([
      ('scaling',scl),
      ('pca', pca),
      ('rf',rf)
  ])
  param_grid = {
      "pca__n_components":np.arange(5,10),
      'rf__n_estimators':np.arange(100,1500,100),
      'rf__max_depth':np.arange(1,20),
      'rf__criterion':['gini','entropy'],

  }
  model = model_selection.RandomizedSearchCV(
      estimator=classifier,
      param_distributions = param_grid,
      n_iter=10,
      scoring = 'accuracy',
      verbose=10,
      n_jobs = 1,
      cv=5
  )

  model.fit(X,y)
  print(model.best_score_)
  print(model.best_estimator_.get_params())



Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START pca__n_components=7, rf__criterion=gini, rf__max_depth=13, rf__n_estimators=1100
[CV 1/5; 1/10] END pca__n_components=7, rf__criterion=gini, rf__max_depth=13, rf__n_estimators=1100;, score=0.395 total time=   5.1s
[CV 2/5; 1/10] START pca__n_components=7, rf__criterion=gini, rf__max_depth=13, rf__n_estimators=1100
[CV 2/5; 1/10] END pca__n_components=7, rf__criterion=gini, rf__max_depth=13, rf__n_estimators=1100;, score=0.407 total time=   5.8s
[CV 3/5; 1/10] START pca__n_components=7, rf__criterion=gini, rf__max_depth=13, rf__n_estimators=1100
[CV 3/5; 1/10] END pca__n_components=7, rf__criterion=gini, rf__max_depth=13, rf__n_estimators=1100;, score=0.365 total time=   4.9s
[CV 4/5; 1/10] START pca__n_components=7, rf__criterion=gini, rf__max_depth=13, rf__n_estimators=1100
[CV 4/5; 1/10] END pca__n_components=7, rf__criterion=gini, rf__max_depth=13, rf__n_estimators=1100;, score=0.425 total time=   4.9s

In [18]:
#Bayeian optimization with gaussian process
from functools import partial
from skopt import space
from skopt import gp_minimize

def optimize(params,param_names,x,y):
  params = dict(zip(param_names,params))
  model = ensemble.RandomForestClassifier(params)

  kf = model_selection.StratifiedKFold(n_splits=5)
  accuracies = []
  for idx in kf.split(X=x,y=y):
    train_idx,test_idx = idx[0],idx[1]
    X_train = X[train_idx]
    y_train = y[train_idx]

    X_test = X[test_idx]
    y_test = y[test_idx]

    model.fit(X_train,y_train)

    preds = model.predict(X_test)
    fold_acc = metrics.accuracy_score(y_test,preds)
    accuracies.append(fold_acc)

  return -1.0*np.mean(accuracies)

if __name__=='__main__':
  df = pd.read_csv('https://raw.githubusercontent.com/Nithya-Vasudevan/Mobile-Price-Classification/master/train.csv')
  X = df.drop('price_range', axis=1).values
  y = df.price_range.values

  param_space = [
      space.Integer(3,15,name='max_depth'),
      space.Integer(100,600,name='n_extimators'),
      space.Real(0.01,1,name='max_features',prior='uniform'),
      space.Categorical(['gini','entropy'],name='criterion')
  ]
  param_names = [
      'max_depth',
      'n_estimators',
      'criterion',
      'max_features'
  ]
  optimization_function = partial(
      optimize,
      param_names=param_names,
      x=X,
      y=y
  )
  result = gp_minimize(
      optimization_function,
      dimensions=param_space,
      n_calls=15,
      n_random_starts=10,
      verbose=10
  )
  print(
      dict(
          param_names,
          result.x
      )
  )





Iteration No: 1 started. Evaluating function at random point.


InvalidParameterError: The 'n_estimators' parameter of RandomForestClassifier must be an int in the range [1, inf). Got {'max_depth': 8, 'n_estimators': 362, 'criterion': 0.5423734796105734, 'max_features': 'gini'} instead.