In [3]:
# @title Installing LIME
try:
  import lime
except:
  print("Installing LIME")
  !pip install lime

In [7]:
# @title Importing modules
from __future__ import print_function
import lime
#import sklearn
import numpy as np

import sklearn
import sklearn.ensemble
import sklearn.metrics
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier



In [8]:
# @title Retrieving newsgroups data
from sklearn.datasets import fetch_20newsgroups

In [9]:
# Restrict the 20 newsgroups corpus to the two categories being compared.
categories = ['sci.electronics', 'sci.space']

# Fetch the train split first, then the test split, with the same categories.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ('train', 'test')
)

# Short display names for the two classes.
class_names = ['electronics', 'space']

In [10]:
# @title Vectorizing
# Import the submodule explicitly: the notebook otherwise only imports
# `sklearn`, `sklearn.ensemble` and `sklearn.metrics`, so the attribute access
# below would depend on another import having loaded it as a side effect.
import sklearn.feature_extraction.text

# TF-IDF features; lowercase=False keeps the original casing of the tokens.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    lowercase=False)
# Fit the vectorizer on the training documents only, then reuse the fitted
# vectorizer to transform the test documents.
train_vectors = vectorizer.fit_transform(newsgroups_train.data)
test_vectors = vectorizer.transform(newsgroups_test.data)

In [11]:
# @title AutoML experiment: Score measurement variables
# Running best F1 score of the AutoML experiments. It starts at 0 and is
# overwritten by every experiment cell whose score is strictly greater, so
# re-run this cell before repeating the experiments.
best = 0 # best classifier score



In [12]:
# Name of the best classifier found so far. The string "None" (not the None
# object) is the sentinel for "no classifier selected yet"; the classifier
# selection cell compares against this exact string.
clf = "None" # best classifier name

In [13]:
# @title AutoML experiment: Random forest
# Seeded like the other experiments (random_state=1) so that the score is
# reproducible from one run to the next.
rf1 = sklearn.ensemble.RandomForestClassifier(n_estimators=500,
                                              random_state=1)
rf1.fit(train_vectors, newsgroups_train.target)
# Predicted labels for the test documents.
pred = rf1.predict(test_vectors)

In [14]:
# Show the first ten ground-truth labels of the test split.
newsgroups_test.target[:10]
# NOTE(review): a previously recorded result, kept below, differs from the
# output currently stored for this cell -- confirm which one is expected.
#array([0, 1, 1, 1, 0, 1, 1, 0, 0, 0])

array([0, 1, 0, 0, 0, 0, 1, 0, 0, 0])

In [15]:
# F1 score of the random forest predictions against the test labels
# (average='binary': the score of the positive class only).
score1 = sklearn.metrics.f1_score(
    y_true=newsgroups_test.target,
    y_pred=pred,
    average='binary',
)

In [17]:
# Record the random forest as the current leader only when it strictly
# beats the best score seen so far; otherwise just report its score.
if not score1 > best:
  print("Score of Random Forest ", score1)
else:
  best, clf = score1, "Random forest"
  print("Random forest has achieved the top score!", score1)


Score of Random Forest  0.9257294429708224


In [18]:
# @title AutoML experiment: Bagging
# Bagging ensemble of 500 k-nearest-neighbors estimators, each one trained on
# half of the samples and half of the features. Seeded like the other
# experiments (random_state=1) so that the random draws, and therefore the
# score, are reproducible.
rf2 = BaggingClassifier(KNeighborsClassifier(),
                        n_estimators=500, max_samples=0.5,
                        max_features=0.5, random_state=1)
rf2.fit(train_vectors, newsgroups_train.target)
pred = rf2.predict(test_vectors)
score2 = sklearn.metrics.f1_score(newsgroups_test.target,
                                  pred, average='binary')
# Update the running leader only on a strictly better score.
if score2 > best:
  best = score2
  clf = "Bagging"
  print("Bagging has achieved the top score!", score2)
else:
  print("Score of bagging", score2)

Score of bagging 0.6438356164383562


In [19]:
# @title AutoML experiment: Gradient boosting
# Seeded gradient boosting ensemble with 500 boosting stages.
rf3 = GradientBoostingClassifier(n_estimators=500, random_state=1)
rf3.fit(train_vectors, newsgroups_train.target)
pred = rf3.predict(test_vectors)
score3 = sklearn.metrics.f1_score(
    y_true=newsgroups_test.target, y_pred=pred, average='binary')

# Keep the running leader unless this model is strictly better.
if not score3 > best:
  print("Score of gradient boosting", score3)
else:
  best, clf = score3, "Gradient boosting"
  print("Gradient boosting has achieved the top score!", score3)

Score of gradient boosting 0.9009497964721846


In [20]:
# @title AutoML experiment: Decision tree
# Single seeded decision tree, fitted on the training vectors and scored
# with the binary F1 metric on the test split.
rf4 = DecisionTreeClassifier(random_state=1)
rf4.fit(train_vectors, newsgroups_train.target)
pred = rf4.predict(test_vectors)
score4 = sklearn.metrics.f1_score(
    y_true=newsgroups_test.target, y_pred=pred, average='binary')

In [21]:
# The decision tree becomes the leader only when it strictly improves on the
# best score recorded so far; otherwise its score is simply reported.
if not score4 > best:
  print("Score of decision tree", score4)
else:
  best, clf = score4, "Decision tree"
  print("Decision tree has achieved the top score!", score4)

Score of decision tree 0.80970625798212


In [22]:
# @title AutoML experiment: Extra trees
# Seeded extra-trees ensemble with 500 trees.
rf5 = ExtraTreesClassifier(random_state=1, n_estimators=500)
rf5.fit(train_vectors, newsgroups_train.target)
pred = rf5.predict(test_vectors)
score5 = sklearn.metrics.f1_score(
    y_true=newsgroups_test.target, y_pred=pred, average='binary')

# Keep the running leader unless this model is strictly better.
if not score5 > best:
  print("Score of extra trees", score5)
else:
  best, clf = score5, "Extra trees"
  print("Extra trees has achieved the top score!", score5)

Extra trees has achieved the top score! 0.9402390438247012


In [24]:
# @title AutoML experiment: Summary
# Recompute the winner from the five recorded scores instead of trusting the
# running `best` / `clf` variables: those are mutated by every experiment
# cell, so their values depend on the order (and number of times) the cells
# were executed.
candidates = [
    ("Random forest", score1),
    ("Bagging", score2),
    ("Gradient boosting", score3),
    ("Decision tree", score4),
    ("Extra trees", score5),
]
# max() returns the first maximal entry, i.e. the earliest experiment wins a
# tie, exactly like the strict `>` comparisons of the experiment cells.
top_name, top_score = max(candidates, key=lambda pair: pair[1])
if top_score > 0:
  best, clf = top_score, top_name
else:
  # No experiment scored above the initial value: keep the initial state
  # (score 0 and the "None" sentinel).
  best, clf = 0, "None"

print("The best model is", clf, "with a score of:", round(best, 5))
print("Scores:")
print("Random forest         :", round(score1, 5))
print("Bagging               :", round(score2, 5))
print("Gradient boosting     :", round(score3, 5))
print("Decision tree         :", round(score4, 5))
print("Extra trees           :", round(score5, 5))

The best model is Extra trees with a score of: 0.94024
Scores:
Random forest         : 0.92573
Bagging               : 0.64384
Gradient boosting     : 0.90095
Decision tree         : 0.80971
Extra trees           : 0.94024


In [28]:
# @title Activate the AutoML mode or
# choose a classifier in the dropdown list
AutoML = 'On' # @param ["On", "Off"]
dropdown = 'Gradient boosting' # @param ["Random forest", "Bagging", "Gradient boosting", "Decision tree", "Extra trees"]

if AutoML == "On":
  # AutoML mode: use the winner of the experiments. The "None" sentinel
  # means that no experiment produced a winner, in which case the decision
  # tree is the fallback. With AutoML off, the manual choice is left alone
  # (and `clf` is not required to exist).
  dropdown = "Decision tree" if clf == "None" else clf

# One factory per classifier name, so that only the selected model is built.
# Every model is seeded (random_state=1) to make the final fit reproducible.
model_factories = {
    "Random forest": lambda: sklearn.ensemble.RandomForestClassifier(
        n_estimators=500, random_state=1),
    "Bagging": lambda: BaggingClassifier(
        KNeighborsClassifier(), n_estimators=500,
        max_samples=0.5, max_features=0.5, random_state=1),
    "Gradient boosting": lambda: GradientBoostingClassifier(
        random_state=1, n_estimators=500),
    "Decision tree": lambda: DecisionTreeClassifier(random_state=1),
    "Extra trees": lambda: ExtraTreesClassifier(
        random_state=1, n_estimators=500),
}

# Fail loudly on an unknown name instead of leaving `rf` undefined or, worse,
# silently reusing a model left over from a previous run of this cell.
if dropdown not in model_factories:
  raise ValueError("Unknown classifier: %r (expected one of %s)"
                   % (dropdown, sorted(model_factories)))
rf = model_factories[dropdown]()

In [29]:
# Train the selected classifier on the TF-IDF training vectors; the fitted
# estimator is the value displayed by this cell.
rf.fit(X=train_vectors, y=newsgroups_train.target)

ExtraTreesClassifier(n_estimators=500, random_state=1)

In [30]:
# @title Prediction metrics
# Predict the test split with the selected classifier and display the
# binary F1 score of these predictions.
pred = rf.predict(test_vectors)
sklearn.metrics.f1_score(
    y_true=newsgroups_test.target,
    y_pred=pred,
    average='binary',
)

0.9402390438247012