In [35]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# Load the comma-separated letter-recognition file into a DataFrame and preview it
data = pd.read_csv('letter-recognition.data')
data.head()

Unnamed: 0,lettr,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [36]:
# Load the same file directly into a NumPy array

from numpy import genfromtxt

# The trailing [1:] drops the first parsed row (presumably the header line
# that read_csv used for column names above — confirm against the file).
data_F = genfromtxt(
    'letter-recognition.data',
    delimiter=',',
    dtype=None,
    encoding="UTF8",
)[1:]
data_F[0]
# Column 0 is kept as-is for the labels; the remaining columns are cast to
# int32 for the feature matrix.
y_F = data_F[:, 0]
X_F = data_F[:, 1:].astype('int32')


In [37]:
# Flag roughly 60% of the rows as training data. The draw comes from a seeded
# generator so that re-running the cell reproduces the same split; with the
# unseeded global RNG every run produced a different partition, so outputs of
# later cells saved from different runs could disagree about the split size.
data['is_train'] = np.random.default_rng(42).uniform(0, 1, len(data)) <= 0.60
data.head()

Unnamed: 0,lettr,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx,is_train
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8,False
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10,False
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9,False
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8,False
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10,True


In [38]:
# Partition the rows on the boolean 'is_train' flag and report both sizes
train = data[data['is_train']]
test = data[~data['is_train']]
print('Number of observations in the training data: ', len(train))
print('Number of observations in the testing data: ', len(test))

Number of observations in the training data:  11898
Number of observations in the testing data:  8102


In [22]:
# Feature columns, selected by position (columns 1 through 16 of `data`).
# Per the output displayed for this cell these are the 16 numeric attributes
# 'x-box' .. 'yegvx'; column 0 is presumably the 'lettr' label.
# NOTE(review): a positional slice silently picks the wrong columns if the
# column order ever changes — confirm, or select by name.
features = data.columns[1:17]
features

Index(['x-box', 'y-box', 'width', 'high', 'onpix', 'x-bar', 'y-bar', 'x2bar',
       'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy', 'y-ege', 'yegvx'],
      dtype='object')

In [23]:
# Training labels: the 'lettr' column of the training rows
y = train.loc[:, 'lettr']
y

0        T
1        I
2        D
5        S
6        B
        ..
19993    J
19994    T
19997    T
19998    S
19999    A
Name: lettr, Length: 11983, dtype: object

In [24]:
# Random forest configuration used throughout the notebook, fitted on the
# training rows. The fit call stays last so the cell displays the estimator.
clf = RandomForestClassifier(
    n_estimators=100,
    max_features="sqrt",
    max_depth=25,
    min_samples_split=5,
)
clf.fit(train.loc[:, features], y)

RandomForestClassifier(max_depth=25, max_features='sqrt', min_samples_split=5)

In [25]:
# Print the held-out feature matrix, then display the forest's prediction for each row
print(test.loc[:, features])
clf.predict(test.loc[:, features])

       x-box  y-box  width  high  onpix  x-bar  y-bar  x2bar  y2bar  xybar  \
3          7     11      6     6      3      5      9      4      6      4   
4          2      1      3     1      1      8      6      6      6      6   
9         11     15     13     9      7     13      2      6      2     12   
10         3      9      5     7      4      8      7      3      8      5   
12         4      9      6     7      6      7      8      6      2      6   
...      ...    ...    ...   ...    ...    ...    ...    ...    ...    ...   
19989      2      1      3     2      1      4     10      3      5     10   
19991      4      3      5     4      2      7      6      8      8      6   
19992      4      9      5     6      3      5      9      2     10     10   
19995      2      2      3     3      2      7      7      7      6      6   
19996      7     10      8     8      4      4      8      6      9     12   

       x2ybr  xy2br  x-ege  xegvy  y-ege  yegvx  
3          4 

array(['N', 'G', 'M', ..., 'E', 'D', 'C'], dtype=object)

In [58]:
# Sanity check: add up the class probabilities predicted for a single test row
# (index 1 of the predict_proba output); they are expected to total 1.
sum(clf.predict_proba(test[features])[1])

1.0

In [27]:
# Keep the test-set predictions for the evaluation cells that follow
preds = clf.predict(test.loc[:, features])
preds

array(['N', 'G', 'M', ..., 'E', 'D', 'C'], dtype=object)

In [28]:
# True labels of the held-out rows
y_test = test.loc[:, 'lettr']
y_test.head()

3     N
4     G
9     M
10    X
12    G
Name: lettr, dtype: object

In [29]:
# Confusion table: one row per actual letter, one column per predicted letter
pd.crosstab(
    y_test,
    preds,
    rownames=['Actual Letter'],
    colnames=['Predicted Letter'],
)

Predicted Letter,A,B,C,D,E,F,G,H,I,J,...,Q,R,S,T,U,V,W,X,Y,Z
Actual Letter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A,296,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
B,0,324,0,1,1,0,1,1,0,0,...,0,7,1,0,3,5,0,0,0,0
C,0,0,282,0,3,2,2,0,0,0,...,1,0,0,0,1,0,0,1,0,0
D,0,6,0,320,0,1,0,5,0,0,...,1,3,0,0,0,0,0,0,0,0
E,0,0,1,0,308,0,2,0,0,0,...,1,0,2,0,0,0,0,1,0,3
F,0,3,0,1,1,294,0,0,0,1,...,0,0,2,6,0,1,0,0,1,0
G,1,1,1,2,1,1,302,0,0,0,...,5,1,2,0,1,0,0,1,0,0
H,1,1,0,9,1,0,0,245,0,0,...,1,8,0,0,1,1,0,0,1,0
I,0,2,0,2,0,2,0,0,275,17,...,0,0,0,0,0,0,0,0,0,0
J,1,1,0,1,0,1,0,1,8,279,...,0,1,0,0,0,0,0,0,0,0


In [30]:
from sklearn.metrics import zero_one_loss

# Fit the same forest configuration ten times and collect the test-set
# zero-one loss of each fit, then report the mean and standard deviation.
# `clf` and `preds` are left bound to the last fit after the loop.
errors = []
for i in range(10):
    clf = RandomForestClassifier(
        n_estimators=100,
        max_features="sqrt",
        max_depth=25,
        min_samples_split=5,
    ).fit(train[features], y)
    preds = clf.predict(test[features])
    error = zero_one_loss(y_test, preds)
    errors += [error]

print("mean: %s, std: %s" % (np.mean(errors), np.std(errors)))

mean: 0.048921042784083824, std: 0.000961025834314982


In [31]:
# The 26 uppercase class labels, in alphabetical order
letters = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

In [32]:
from liblinear.liblinearutil import *
from sklearn import preprocessing
# Map the letter labels to integer codes; the prints show the learned classes
# and a couple of round-trip examples (letters -> codes, codes -> letters).
le = preprocessing.LabelEncoder().fit(y_F)
print(le.classes_)
print(le.transform(['A', 'C', 'Y', 'Z']))
print(le.inverse_transform([0, 0, 1, 2]))
# Integer-encoded labels for every row loaded into y_F
y_F_transformed = le.transform(y_F)
y_F_transformed

['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R'
 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z']
[ 0  2 24 25]
['A' 'A' 'B' 'C']


array([19,  8,  3, ..., 19, 18,  0])

In [44]:
def indicator_vector(class_index, nr_class):
    """Return a one-hot float vector of length ``nr_class``.

    The entry at position ``class_index`` is set to 1 and every other entry
    is 0. Indexing follows NumPy rules, so an out-of-range index raises
    ``IndexError``.
    """
    one_hot = np.zeros(nr_class)
    one_hot[class_index] = 1
    return one_hot

#iv = indicator_vector(25, 26)
#print(iv)

# The individual estimators of the forest currently bound to `clf`
decision_trees = clf.estimators_

def FI(X, decision_trees):
    """Collect every tree's predictions for every element of ``X``.

    ``X`` is an iterable of inputs, each passed unchanged to ``tree.predict``.
    The result is an array whose first axis runs over ``decision_trees`` and
    whose second axis runs over the elements of ``X``.
    """
    return np.array(
        [[tree.predict(batch) for batch in X] for tree in decision_trees]
    )

# FI receives the whole matrix as a single batch, so its result has shape
# (n_trees, 1, n_samples). Drop the batch axis and transpose to
# (n_samples, n_trees); the sizes come from the data instead of the
# previously hard-coded 100 and 20000.
tree_decisions = FI([X_F], decision_trees)[:, 0, :].T
print(tree_decisions.shape)
# One-hot encode every tree's vote with a single vectorised lookup: row k of
# the identity matrix is the indicator vector of class index k. This replaces
# a Python double loop over n_samples * n_trees elements and yields the same
# (n_samples, n_trees, n_classes) float array.
n_classes = len(clf.classes_)
tree_decisions = np.eye(n_classes)[tree_decisions.astype(int)]
print(tree_decisions)
# There must be exactly one feature row per encoded label (both lengths are
# the number of samples, not the number of trees).
assert len(tree_decisions) == len(y_F_transformed)
len(y_F_transformed)

(20000, 100)
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]

20000

In [47]:
# Flatten each sample's (n_trees, n_classes) one-hot block into a single
# feature row. The shape is derived from the array itself rather than the
# hard-coded (20000, 2600), so the cell keeps working if the number of
# samples, trees or classes changes, and is a no-op when re-run.
tree_decisions = tree_decisions.reshape(len(tree_decisions), -1)
print(tree_decisions.shape)

(20000, 2600)


In [49]:
# Import only the names this cell needs and alias liblinear's `train`.
# A wildcard import here rebinds the notebook's `train` DataFrame to the
# liblinear function, so re-running any cell that uses `train[features]`
# afterwards would fail.
# NOTE(review): the earlier `from liblinear.liblinearutil import *` in this
# notebook still has the same shadowing effect — consider fixing it as well.
from liblinear.liblinearutil import problem, parameter, train as liblinear_train
# prob  = problem(np.array(y), np.array(train[features]))
# Labels are the integer-encoded letters; features are the flattened one-hot
# votes of the individual trees.
# NOTE(review): tree_decisions is built from all of X_F, which includes the
# rows the forest was fitted on, and no hold-out is kept for this model —
# confirm this is intended.
prob  = problem(y_F_transformed, tree_decisions)
param = parameter('-s 2 -c 0.01')
m = liblinear_train(prob, param)
#best_C, best_p, best_rate = train(y, x, '-C -s 0') # best_p is only for -s 11

In [None]:
#nr_feature =  m.get_nr_feature()
#nr_class = m.get_nr_class()
#print(nr_class)
#class_labels = model_.get_labels()
#is_prob_model = model_.is_probability_model()
#is_regression_model = model_.is_regression_model()
#[W, b] = m.get_decfun()
#print(W)
#print(b)