In [12]:
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics, cross_validation, preprocessing
from sklearn.ensemble import RandomForestClassifier


## Import Data, Split Features from Categories, and Define Testing vs Training Sets

In [13]:
#Import Data From Feature Processing
data = pd.read_csv('category_vals', delimiter = '\t',index_col=0)
data = data.sample(frac=1).reset_index(drop=True) #Shuffle the rows of the data so that categories mix
data = data.replace([np.inf, -np.inf], np.nan) #convert infinities to nan
data = data.fillna(value=0) #remove NaNs from data set (set to 0)
size = data.shape 
Y_pos = size[1]-1#will use later when extracting the category from the data frame. This is the last column of the data frame
data_len = size[0]
#features = list(data)[1:-1]
    
#split data into X (features) and Y (categories)
X = data.iloc[:,1:-1] #removes the file name and the category from the features
Y = data.iloc[:,Y_pos]
X_scaled = preprocessing.scale(X)

#Convert X & Y to array from df
X = X_scaled
Y = Y.as_matrix()

# Reserve Last 500 as test, rest as training
train = data_len-500
#training set
Xtr = X[:train]
Ytr = Y[:train]
print("training size: " + str(len(Ytr)))
# testing set
Xte = X[train:]
Yte = Y[train:]
print("testing size: " + str(len(Yte)))



def round_sig(x, sig=2): #features extractor returns values with too much specificity for RandomForest, need to round w/out losing data (sort of like normalizing within only 1 column and not across all data rows)
    return round(x, sig-int(floor(log10(abs(x))))-1)

Xtr = np.around(Xtr, decimals=8)
Xte = np.around(Xte, decimals=8)

training size: 3734
testing size: 500


# Set up the Random Forest Classifier

In [14]:
# Create a classifier -instantiate classifier object
classifier = RandomForestClassifier(n_estimators=50)
# fit the classification model on training set
classifier.fit(Xtr, Ytr)
# make predictions for testing set
pred_rf = classifier.predict(Xte) 


In [15]:
#Evaluate
print("True Class / Predicted class")
print(np.vstack((Yte[0:10],pred_rf[0:10]))) # <- for just top 10
#print(np.vstack((Yte[0:10],pred_rf[0:10]))'...') #<- for Full List

# compute zero-one loss / score & confusion matrix
rf_01 = metrics.zero_one_loss(Yte, pred_rf) # zero-one loss
rf_01_score = metrics.accuracy_score(Yte, pred_rf) # zero-one score
rf_confmat = metrics.confusion_matrix(Yte, pred_rf) # conf mat

print("Zero-One Loss: " + str(rf_01))
print("Zero-One Score: " + str(rf_01_score))
#print("Confusion Matrix:")
#print("[i, j] is the # of objects truly in group i but predicted to be in group j")
#print(rf_confmat)

True Class / Predicted class
[['swan' 'conch' 'crab' 'octopus' 'penguin' 'horse' 'dog' 'airplanes'
  'camel' 'raccoon']
 ['hummingbird' 'skunk' 'elk' 'horse' 'owl' 'horse' 'goat' 'airplanes'
  'penguin' 'raccoon']]
Zero-One Loss: 0.712
Zero-One Score: 0.288


In [None]:
#My cl