# Problem Definition
The goal is to predict gas consumption (in millions of gallons) in 48 US states from four predictors: petrol tax (in cents), per capita income (in dollars), paved highways (in miles) and the proportion of the population holding a driving license.

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

from sklearn.tree import export_graphviz

# import metrics to help evaluate the model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score

In [None]:
# Load the petrol consumption dataset from its CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../../../../data/other/petrol_consumption.csv')

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

In [None]:
# Split the frame by position: the first four columns become the feature
# matrix and the fifth column becomes the regression target.
# NOTE(review): assumes the CSV lists the four predictors first and the
# consumption column last -- confirm against the df.head() preview.
X = df.iloc[:, :4].values
y = df.iloc[:, 4].values

In [None]:
# Split the data into train and test sets.
# random_state is fixed (the same seed the forest uses) so that the split, and
# therefore every metric reported further down, is reproducible between runs.
# test_size is left at the scikit-learn default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Build a two-step pipeline: standardise the features with StandardScaler,
# then fit a RandomForestRegressor on the scaled values.
# The forest uses only two trees and a fixed seed.
model = Pipeline(
    steps=[
        ('StandardScalar', StandardScaler()),
        (
            'RandomForestRegressor',
            RandomForestRegressor(n_estimators=2, random_state=42),
        ),
    ],
)

In [None]:
# Fit the pipeline on the training split and predict the held-out test split.
# Pipeline.fit returns the fitted pipeline itself, so the two calls can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [None]:
# Report the hold-out error of the fitted pipeline.
# scikit-learn metrics take (y_true, y_pred) in that order. MAE and MSE are
# symmetric, so the printed values do not depend on the order, but following
# the documented signature keeps this cell correct if an asymmetric metric
# is added later.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)  # computed once, reused for the RMSE

print('Mean absolute error', mae)
print('Mean squared error', mse)
print('Root mean squared error', np.sqrt(mse))

In [None]:
# Take the fitted forest (second pipeline step) and pick its second tree,
# which is the one exported for visualisation.
forest = model.steps[1][1]
estimator = forest.estimators_[1]

In [None]:
# Write the selected tree to Graphviz DOT format. The split nodes are labelled
# with the names of the four DataFrame columns that X was built from, instead
# of the anonymous X[0]..X[3] placeholders.
export_graphviz(
    estimator,
    out_file='tree.dot',
    feature_names=list(df.columns[:4]),
    rounded=True,
    proportion=False,
    precision=2,
    filled=True,
)

# Convert to png using the Graphviz `dot` command (must be installed and on PATH).
# check=True raises CalledProcessError when the conversion fails, so a stale or
# missing tree.png is never displayed silently.
import subprocess
subprocess.run(
    ['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'],
    check=True,
)

# Display in jupyter notebook
from IPython.display import Image
Image(filename='tree.png')