In [2]:
import numpy as np
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [3]:
iris = datasets.load_iris()
X = iris.data
Y = iris.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [4]:
random_forest = RandomForestClassifier(criterion = "gini", 
                                       min_samples_leaf = 3, 
                                       min_samples_split = 2,
                                       max_depth=100,
                                       n_estimators=1000, 
                                       max_features='auto', 
                                       random_state=1, 
                                       n_jobs=-1)

In [5]:
random_forest.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=100, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [6]:
# Stats about the trees in random forest
n_nodes = []
max_depths = []

for ind_tree in random_forest.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

Average number of nodes 11
Average maximum depth 3


In [7]:
rf_predictions = random_forest.predict(X_test)

In [8]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, confusion_matrix
print("Precision:", precision_score(Y_test, rf_predictions, average='micro'))
print("Recall:",recall_score(Y_test, rf_predictions, average='micro'))
confusion_matrix(Y_test, rf_predictions)

Precision: 0.98
Recall: 0.98


array([[19,  0,  0],
       [ 0, 15,  0],
       [ 0,  1, 15]])

In [9]:
# estimator = random_forest.estimators_[4]

# from sklearn.tree import export_graphviz
# # Export as dot file
# export_graphviz(estimator, out_file='tree.dot', 
#                 feature_names = iris.feature_names,
#                 class_names = iris.target_names,
#                 rounded = True, proportion = False, 
#                 precision = 2, filled = True)

# # Convert to png using system command (requires Graphviz)
# from subprocess import call
# call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# # Display in jupyter notebook
# from IPython.display import Image
# Image(filename = 'tree.png')

In [10]:
# import matplotlib.pyplot as plt
# col = ['SepalLengthCm' ,'SepalWidthCm' ,'PetalLengthCm' ,'PetalWidthCm']
# fi = random_forest.feature_importances_
# fig, ax = plt.subplots() 
# width = 0.3 # the width of the bars 
# ind = np.arange(len(fi)) # the x locations for the groups
# ax.barh(ind, fi, width, color='green')
# ax.set_yticks(ind+width/10)
# ax.set_yticklabels(col, minor=False)
# plt.title('Feature importance in RandomForest Classifier')
# plt.xlabel('Relative importance')
# plt.ylabel('feature') 
# plt.figure(figsize=(5,5))
# fig.set_size_inches(6.5, 4.5, forward=True)
# fig.savefig('feature_importance.png')

In [12]:
import joblib
import bz2
with bz2.BZ2File('random_forest', 'w') as sfile:
    joblib.dump(random_forest, sfile)

In [11]:
print(X_test[0])
print(Y_test[0])

[6.1 2.8 4.7 1.2]
1


In [13]:
from sqlalchemy import create_engine

PG_HOST = 'database'
PG_PORT = '5432'
PG_USER = 'user'
PG_PASS = 'password'
PG_DB   = 'development'

con_str = ('postgresql://{username}:{password}@{host}:{port}/{dbname}'.format(
    username=PG_USER,
    password=PG_PASS,
    host=PG_HOST,
    port=PG_PORT,
    dbname=PG_DB
))
cnx = create_engine(con_str)

In [14]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [15]:
import pandas as pd
fns = ['sepal length', 'sepal width', 'petal length', 'petal width']
to_save = pd.concat([pd.DataFrame(iris.data, columns=fns), pd.DataFrame(iris.target, columns=['species'])], axis=1)

In [16]:
to_save.to_sql('iris', cnx, if_exists='replace')

In [17]:
n = np.array([1, 2, 3, 4])
n = np.append(n, 5)
x = ['sepal length', 'sepal width', 'petal length', 'petal width', 'species']
a = pd.DataFrame(n.reshape(-1, len(n)), columns=x)

In [18]:
a.to_sql('iris', cnx, if_exists="append")

In [19]:
a

Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,1,2,3,4,5


In [61]:
_a = pd.read_sql('iris', cnx).tail(1).iloc[0]
# _b = pd.read_sql_query('SELECT * FROM iris WHERE index = 150', cnx)
# _b
_a

index           0.0
sepal length    1.0
sepal width     2.0
petal length    3.0
petal width     4.0
species         5.0
Name: 150, dtype: float64

In [45]:
pd.read_sql('iris', cnx)[['sepal length', 'sepal width', 'petal length', 'petal width', 'species']][['sepal length', 'sepal width', 'petal length', 'petal width']]

Unnamed: 0,sepal length,sepal width,petal length,petal width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
149,5.9,3.0,5.1,1.8
150,1.0,2.0,3.0,4.0
