# Random Forest Example using box_cleaned.csv

In [1]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
# train_test_split is imported from sklearn.model_selection: the older
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it fails on current releases.
from sklearn.model_selection import train_test_split
import pandas
import numpy
# Seed NumPy's global random number generator so runs are repeatable.
# seed is a function and has to be called; assigning a value to it would
# replace the function with an int and leave the generator unseeded.
numpy.random.seed(0)



### Import Data 

In [2]:
# Location of the cleaned dataset (absolute, machine-specific path).
csv_path = "/users/danielcorcoran/desktop/box_cleaned.csv"
# Read the whole CSV into a DataFrame used by the rest of the notebook.
data = pandas.read_csv(csv_path)

In [3]:
# Display the dimensions of the loaded frame as (rows, columns).
data.shape

(2728, 25)

In [4]:
# Preview the first rows of the loaded frame to sanity-check the columns.
data.head()

Unnamed: 0,age_A,age_B,height_A,height_B,reach_A,reach_B,weight_A,weight_B,won_A,won_B,...,kos_B,result,age_diff,height_diff,reach_diff,weight_diff,kd_diff_A,kd_diff_B,stance_A_southpaw,stance_B_southpaw
0,26.0,31.0,175.0,185.0,179.0,185.0,164.0,164.0,48,50,...,32.0,1,-5.0,-10.0,-6.0,0.0,47,48,0,0
1,25.0,29.0,175.0,174.0,179.0,180.0,155.0,155.0,46,31,...,19.0,1,-4.0,1.0,-1.0,0.0,45,28,0,0
2,23.0,31.0,175.0,175.0,179.0,188.0,155.0,155.0,43,19,...,12.0,1,-8.0,0.0,-9.0,0.0,42,18,0,0
3,21.0,40.0,175.0,174.0,179.0,180.0,154.0,154.0,39,46,...,39.0,1,-19.0,1.0,-1.0,0.0,39,39,0,0
4,21.0,32.0,175.0,180.0,179.0,188.0,154.0,154.0,38,33,...,28.0,1,-11.0,-5.0,-9.0,0.0,38,29,0,0


In [5]:
# Names of the columns used as model inputs.
feature_columns = [
    'height_B', 'reach_A', 'won_B', 'lost_A', 'lost_B', 'drawn_A',
    'kos_B', 'age_diff', 'kd_diff_A', 'kd_diff_B',
    'stance_A_southpaw', 'stance_B_southpaw',
]
# Feature matrix: only the selected columns, in the order listed above.
X = data[feature_columns]

# Target vector: the "result" column of the same frame.
y = data["result"]

### Split dataset into training and testing sets 

In [6]:
# Split features and target into train and test parts; random_state pins the
# shuffle so the same rows land in each part on every run.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42)

In [7]:
# Display the size of the held-out target vector.
ytest.shape

(682,)

### Instantiate the model using a `RandomForestClassifier` object

In [8]:
# Constructor settings for the forest: number of trees, number of parallel
# jobs, and a fixed seed so the fitted forest is repeatable.
forest_params = {"n_estimators": 100, "n_jobs": 2, "random_state": 0}
clf = RandomForestClassifier(**forest_params)

### Fit model to training data

In [9]:
# Train the classifier on the training split. fit's return value is the
# cell's last expression, so its repr is shown as the cell output.
clf.fit(Xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Predict on the Xtest set using the fitted model

In [10]:
# Predict a label for every row of the held-out feature set.
prediction = clf.predict(Xtest)

### Show Accuracy Score

In [11]:
from sklearn.metrics import accuracy_score

In [12]:
# Fraction of held-out rows whose predicted label matches the true label.
# accuracy_score takes (y_true, y_pred), so the ground-truth labels go first
# and the model's predictions second.
accuracy_score(ytest, prediction)

0.8695014662756598

In [13]:
# Show the first three rows of the full frame again for reference.
data.head(3)

Unnamed: 0,age_A,age_B,height_A,height_B,reach_A,reach_B,weight_A,weight_B,won_A,won_B,...,kos_B,result,age_diff,height_diff,reach_diff,weight_diff,kd_diff_A,kd_diff_B,stance_A_southpaw,stance_B_southpaw
0,26.0,31.0,175.0,185.0,179.0,185.0,164.0,164.0,48,50,...,32.0,1,-5.0,-10.0,-6.0,0.0,47,48,0,0
1,25.0,29.0,175.0,174.0,179.0,180.0,155.0,155.0,46,31,...,19.0,1,-4.0,1.0,-1.0,0.0,45,28,0,0
2,23.0,31.0,175.0,175.0,179.0,188.0,155.0,155.0,43,19,...,12.0,1,-8.0,0.0,-9.0,0.0,42,18,0,0
