In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier

In [26]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=500,
                            n_informative=10, n_redundant=20,
                           random_state=0, shuffle=False)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

In [28]:
X.shape

(1000, 500)

In [29]:
y.shape

(1000,)

In [30]:
rf = RandomForestClassifier(n_estimators=200, oob_score=True)
rf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, oob_score=True)

In [31]:
rf.score(X_train, y_train)

1.0

In [32]:
rf.score(X_test, y_test)

0.7909090909090909

In [33]:
rf.oob_score_

0.7671641791044777

In [36]:
# Feature importances in descending order
sorted(rf.feature_importances_, reverse= True)

[0.04189898156468514,
 0.020371816495762217,
 0.01768694695638138,
 0.014787470033903323,
 0.013289686821434362,
 0.012550689727084197,
 0.012132875994581778,
 0.01139996017419799,
 0.01107634764486541,
 0.010533840022059163,
 0.009949878743591422,
 0.00989707820469669,
 0.008052211098514079,
 0.007693984989328645,
 0.006975477750899878,
 0.006552312518085298,
 0.0058021585804997125,
 0.005264627906053824,
 0.004738301185980468,
 0.0046835338043031355,
 0.004379986181316959,
 0.004196536620934017,
 0.0038961106091963084,
 0.0037025408296538272,
 0.0035825748707756368,
 0.003274400547899647,
 0.0032583161185926173,
 0.0031566180909988395,
 0.0030748999458168045,
 0.0030571396393248474,
 0.00300916206233448,
 0.0029856312692700827,
 0.0029619231238405337,
 0.0029131841943270177,
 0.0028673074987375117,
 0.0028410654350114577,
 0.002821642879095731,
 0.0027289697309703727,
 0.0027042422235835795,
 0.002685999656360909,
 0.0026010566548889907,
 0.0025656877854591986,
 0.0025050477700025568

In [44]:
# selecting top 20 feature indices
imp_features = np.argsort(rf.feature_importances_)[-20:]

In [47]:
imp_features

array([23, 15, 14, 24, 22,  5, 20, 29, 18, 19, 12, 25, 10, 27,  8,  0,  7,
        4, 16, 17])

In [48]:
# extracting data of 20 imp. features
X_new = X[:, imp_features]

In [49]:
X_new.shape

(1000, 20)

In [50]:
X_train, X_test, y_train, y_test = train_test_split(
...     X_new, y, test_size=0.33, random_state=42)

In [52]:
rf.fit(X_train,y_train)

RandomForestClassifier(n_estimators=200, oob_score=True)

In [53]:
rf.score(X_train, y_train)

1.0

In [54]:
rf.score(X_test, y_test)

0.8575757575757575

## VotingClassifier

In [70]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [71]:
m1 = KNeighborsClassifier()
m2 = LogisticRegression()
m3 = GaussianNB()
m4 = DecisionTreeClassifier()
m5 = BernoulliNB()

In [72]:
model = VotingClassifier([('knn',m1), ('lr',m2), ('mnb',m3), ('dt',m4),(
                            'bnb',m5)])

In [73]:
model.fit(X_train, y_train)

VotingClassifier(estimators=[('knn', KNeighborsClassifier()),
                             ('lr', LogisticRegression()),
                             ('mnb', GaussianNB()),
                             ('dt', DecisionTreeClassifier()),
                             ('bnb', BernoulliNB())])

In [74]:
model.score(X_train, y_train)

0.8462686567164179

In [75]:
model.score(X_test, y_test)

0.7696969696969697

## GradientBoosting

In [76]:
from sklearn.ensemble import GradientBoostingClassifier

In [77]:
gbc = GradientBoostingClassifier()

In [81]:
import xgboost

XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed (vcomp140.dll or libgomp-1.dll for Windows, libomp.dylib for Mac OSX, libgomp.so for Linux and other UNIX-like OSes). Mac OSX users: Run `brew install libomp` to install OpenMP runtime.
  * You are running 32-bit Python on a 64-bit OS
Error message(s): ['dlopen(/Users/mohit/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /Users/mohit/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: image not found']


In [82]:
! pip3 install xgboost



In [83]:
from xgboost import XGBClassifier

XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed (vcomp140.dll or libgomp-1.dll for Windows, libomp.dylib for Mac OSX, libgomp.so for Linux and other UNIX-like OSes). Mac OSX users: Run `brew install libomp` to install OpenMP runtime.
  * You are running 32-bit Python on a 64-bit OS
Error message(s): ['dlopen(/Users/mohit/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /Users/mohit/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: image not found']
