In [74]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# reading csv files
data =  pd.read_csv('letter-recognition.data', sep=",")
data.head()

Unnamed: 0,lettr,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [75]:
data['is_train'] = np.random.uniform(0, 1, len(data)) <= 0.60
data.head()

Unnamed: 0,lettr,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx,is_train
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8,True
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10,False
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9,True
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8,True
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10,True


In [76]:
train, test = data[data['is_train'] == True], data[data['is_train'] == False]
print('Number of observations in the training data: ', len(train))
print('Number of observations in the testing data: ', len(test))

Number of observations in the training data:  11990
Number of observations in the testing data:  8010


In [77]:
features = data.columns[1:17]
features

Index(['x-box', 'y-box', 'width', 'high', 'onpix', 'x-bar', 'y-bar', 'x2bar',
       'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy', 'y-ege', 'yegvx'],
      dtype='object')

In [78]:
y = train['lettr']
y

0        T
2        D
3        N
4        G
5        S
        ..
19988    D
19989    P
19992    E
19993    J
19998    S
Name: lettr, Length: 11990, dtype: object

In [79]:
clf = RandomForestClassifier(n_estimators=100, max_features="sqrt", max_depth=25, min_samples_split=5)
clf.fit(train[features], y)

RandomForestClassifier(max_depth=25, max_features='sqrt', min_samples_split=5)

In [80]:
print(test[features])
clf.predict(test[features])

       x-box  y-box  width  high  onpix  x-bar  y-bar  x2bar  y2bar  xybar  \
1          5     12      3     7      2     10      5      5      4     13   
6          4      2      5     4      4      8      7      6      6      7   
7          1      1      3     2      1      8      2      2      2      8   
8          2      2      4     4      2     10      6      2      6     12   
9         11     15     13     9      7     13      2      6      2     12   
...      ...    ...    ...   ...    ...    ...    ...    ...    ...    ...   
19994      5      8      7     7      7      7      9      4      8      7   
19995      2      2      3     3      2      7      7      7      6      6   
19996      7     10      8     8      4      4      8      6      9     12   
19997      6      9      6     7      5      6     11      3      7     11   
19999      4      9      6     6      2      9      5      3      1      8   

       x2ybr  xy2br  x-ege  xegvy  y-ege  yegvx  
1          3 

array(['I', 'B', 'A', ..., 'C', 'T', 'A'], dtype=object)

In [81]:
clf.predict_proba(test[features])[0:5]

array([[0.02142857, 0.00533333, 0.044     , 0.02416667, 0.07602381,
        0.0075    , 0.06440476, 0.004     , 0.27102381, 0.01078571,
        0.005     , 0.13233333, 0.00166667, 0.00125   , 0.03309524,
        0.00875   , 0.10666667, 0.02690476, 0.05316667, 0.002     ,
        0.01733333, 0.0025    , 0.        , 0.0165    , 0.        ,
        0.06416667],
       [0.        , 0.98214286, 0.        , 0.        , 0.        ,
        0.        , 0.015     , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.00142857, 0.        , 0.        , 0.00142857, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.982     , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.018     , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
      

In [82]:
preds = clf.predict(test[features])
preds

array(['I', 'B', 'A', ..., 'C', 'T', 'A'], dtype=object)

In [83]:
y_test = test['lettr']
y_test.head()

1    I
6    B
7    A
8    J
9    M
Name: lettr, dtype: object

In [84]:
pd.crosstab(y_test, preds, rownames=['Actual Letter'], colnames=['Predicted Letter'])

Predicted Letter,A,B,C,D,E,F,G,H,I,J,...,Q,R,S,T,U,V,W,X,Y,Z
Actual Letter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A,309,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,2,0
B,0,301,0,1,1,0,0,1,0,0,...,0,3,1,0,1,1,0,2,0,0
C,0,0,279,0,5,0,3,1,0,0,...,2,0,0,0,0,0,0,0,0,0
D,0,6,0,280,0,0,0,6,0,0,...,1,0,2,0,0,0,0,1,0,0
E,0,1,1,0,311,1,3,0,0,0,...,2,0,4,0,0,0,0,2,0,2
F,0,7,0,1,3,270,0,0,0,0,...,0,0,0,8,0,0,0,0,1,0
G,0,2,1,3,1,0,298,1,0,0,...,5,0,0,0,0,1,0,0,0,0
H,0,2,0,6,1,0,2,261,0,0,...,1,9,1,0,0,0,0,0,0,0
I,0,1,0,0,0,2,0,0,250,14,...,1,0,0,1,0,0,0,0,0,1
J,0,2,0,0,3,0,0,2,7,274,...,0,0,3,1,0,0,0,0,0,1


In [88]:
from sklearn.metrics import zero_one_loss

errors = []
for i in range(10):
    clf = RandomForestClassifier(n_estimators=100, max_features="sqrt", max_depth=25, min_samples_split=5)
    clf.fit(train[features], y)
    preds = clf.predict(test[features])
    error = zero_one_loss(y_test, preds)
    errors.append(error)

print("mean: " + str(np.mean(errors)) + ", std: "+ str(np.std(errors)))

mean: 0.05134831460674157, std: 0.0008282151455991087


In [89]:
letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

In [90]:
decision_trees = clf.estimators_
decision_trees

[DecisionTreeClassifier(max_depth=25, max_features='sqrt', min_samples_split=5,
                        random_state=901567334),
 DecisionTreeClassifier(max_depth=25, max_features='sqrt', min_samples_split=5,
                        random_state=1620232537),
 DecisionTreeClassifier(max_depth=25, max_features='sqrt', min_samples_split=5,
                        random_state=402340577),
 DecisionTreeClassifier(max_depth=25, max_features='sqrt', min_samples_split=5,
                        random_state=1366184396),
 DecisionTreeClassifier(max_depth=25, max_features='sqrt', min_samples_split=5,
                        random_state=584664964),
 DecisionTreeClassifier(max_depth=25, max_features='sqrt', min_samples_split=5,
                        random_state=75167182),
 DecisionTreeClassifier(max_depth=25, max_features='sqrt', min_samples_split=5,
                        random_state=450815483),
 DecisionTreeClassifier(max_depth=25, max_features='sqrt', min_samples_split=5,
                

In [91]:
from liblinear.liblinearutil import *

ModuleNotFoundError: No module named 'liblinear'