In [None]:
!pip install seaborn

In [None]:
# Import all libraries
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load scikit-learn's bundled digits dataset and report the array shapes
mnist = datasets.load_digits()

# Feature matrix: the model input
X = mnist.data
print(X.shape)

# Label vector: the model output
Y = mnist.target
print(Y.shape)

In [None]:
# Show one raw example from the dataset

# First line of output: the input stored at index 0
# Second line of output: the label stored at index 0
print(mnist.data[0], mnist.target[0], sep="\n")

In [None]:
# Preprocess the data

# Wrap the inputs in a DataFrame (one row per sample)
X = pd.DataFrame(mnist.data)

# Wrap the labels in a Series, cast to int, then to a categorical dtype
y = (
    pd.Series(mnist.target)
    .astype('int')
    .astype('category')
)

# Preview the first 5 rows of the inputs, followed by the first 5 labels
print(X.head(), y.head(), sep="\n")

In [None]:
# Fetch training and testing data
# 30% is testing data and 70% is training data
# random_state pins the shuffle so the same split is produced on every run;
# without it each Restart & Run All yields different train/test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Train random forests of increasing size and print the test-set accuracy of each.
# fit = train and predict = test

# Number of trees to try, in the order the forests are trained
TREE_COUNTS = (10, 50, 100)

for n_trees in TREE_COUNTS:
    # A fixed random_state makes the forest deterministic for a given training set
    clf = RandomForestClassifier(n_estimators=n_trees, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Prefix each line with the tree count so the three results can be told apart
    print(f"n_estimators={n_trees}", "Accuracy: ", accuracy_score(y_test, y_pred))

# After the loop, clf and y_pred refer to the last forest trained (100 trees).

In [None]:
# Rank the features by the importance the fitted forest assigns to them,
# most important first; the Series index is the feature's column position.
feature_imp = (
    pd.Series(clf.feature_importances_)
    .sort_values(ascending=False)
)

# Display the 5 most significant features
feature_imp.head(5)

In [None]:
# Visualize the feature importance
%matplotlib inline

sns.barplot(x=round(feature_imp[:10], 4), y=feature_imp[:10].index)

plt.ylabel('Feature Importance Score')
# plt.xticks([])
plt.xticks(rotation=70)
plt.xlabel('Features')
plt.title("Visualizing Important Features")
plt.show()