# Random Forest Example using Mpg Dataset 

In [1]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
import pandas
import numpy
numpy.random.seed = 0



### Import Data 

In [2]:
data = pandas.read_csv("/users/danielcorcoran/desktop/github_repos/python_nb_visualization/seaborn_official_datasets/mpg.csv")

In [3]:
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


### Create feature matrix and target vector

In [4]:
y = data["cylinders"]
X = data[["mpg", "horsepower", "weight", "acceleration", "displacement"]]

In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 5 columns):
mpg             398 non-null float64
horsepower      392 non-null float64
weight          398 non-null int64
acceleration    398 non-null float64
displacement    398 non-null float64
dtypes: float64(4), int64(1)
memory usage: 15.6 KB


In [6]:
X["horsepower"] = X["horsepower"].fillna(X["horsepower"].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


### Split dataset into training and testing sets 

In [7]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X,
                                                y,
                                               random_state = 42,
                                               test_size = 0.20)

### Instantiate model using randomforestclassifier object

In [8]:
clf = RandomForestClassifier(n_jobs = 2,
                            random_state = 0)

### Fit model to training data

In [9]:
clf.fit(Xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Predict against Xtest set using model

In [10]:
prediction = clf.predict(Xtest)

### Show Accuracy Score

In [11]:
from sklearn.metrics import accuracy_score

In [12]:
accuracy_score(prediction, ytest)

0.95

### Confusion Matrix, predicted vs true labels

In [13]:
from sklearn.metrics import confusion_matrix
import seaborn
import matplotlib.pyplot as plt

matrix = confusion_matrix(ytest, prediction)

seaborn.heatmap(matrix.T, 
                square = True, 
                annot = True, 
                fmt = "d", 
                cmap = "BrBG",
                cbar = False,
                xticklabels = Xtrain.columns,
                yticklabels = Xtrain.columns)

plt.xlabel("True label")
plt.ylabel("Predicted label")

Text(113.922,0.5,'Predicted label')