In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# For machine learning 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('../dataset/all_yes_no.csv')

In [3]:
df.head()

Unnamed: 0,Thumb1,Index1,Middle1,Ring1,Pinky1,Palm1,AX1,AY1,AZ1,GX1,...,Ring2,Pinky2,Palm2,AX2,AY2,AZ2,GX2,GY2,GZ2,Word
0,789,728,665,545,871,781,-40.29,12.5,0.05,-0.17,...,546,873,620,-5.16,29.1,90.2,-0.51,-0.17,0.66,j
1,763,739,637,632,716,484,-3.64,3.38,-2.07,0.08,...,633,718,491,-2.7,8.9,3.3,0.05,0.96,-0.45,e
2,777,757,691,584,699,771,-5.47,0.02,0.69,-0.09,...,587,701,743,0.6,-2.19,3.12,-0.14,0.98,-0.07,m
3,784,894,842,822,709,572,0.18,1.41,-7.79,-0.06,...,826,710,532,5.73,4.58,6.15,-0.05,0.98,-0.17,w
4,826,890,741,593,689,876,9.63,-5.19,3.6,-0.73,...,594,689,895,38.54,-16.79,1.64,-0.75,0.63,-0.08,k


In [4]:
len(df)

5888

In [5]:
df['Word'].value_counts()

yes    384
e      338
eh     337
r      290
no     290
m      289
d      289
x      289
o      289
s      289
i      289
t      289
y      288
n      288
j      287
z      215
l       96
g       96
f       96
c       96
u       96
v       96
b       96
p       96
h       95
k       95
q       95
w       95
Name: Word, dtype: int64

In [6]:
df_X = df.drop(["Word"], axis=1)

In [7]:
df_y = df["Word"]

In [8]:
# Creating training and testing data
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.051)

In [9]:
print("Training data size : {} \nTesting data size : {}".format(len(X_train), len(X_test)))

Training data size : 5587 
Testing data size : 301


In [10]:
print("Train data shape : {} \nTest Data shape : {} ".format(X_train.shape, X_test.shape))

Train data shape : (5587, 24) 
Test Data shape : (301, 24) 


In [11]:
X_train.head()

Unnamed: 0,Thumb1,Index1,Middle1,Ring1,Pinky1,Palm1,AX1,AY1,AZ1,GX1,...,Middle2,Ring2,Pinky2,Palm2,AX2,AY2,AZ2,GX2,GY2,GZ2
5374,794,727,663,583,853,739,-5.14,63.08,93.35,-0.46,...,681,606,872,293,-18.07,-17.37,-53.77,-0.48,0.82,-0.09
5069,794,743,620,567,671,838,-4.2,0.11,-0.44,-0.07,...,625,562,670,835,-3.22,5.08,8.07,0.04,0.98,-0.37
3134,795,764,665,545,696,599,-5.13,1.21,0.98,-0.22,...,665,542,698,581,-3.0,-3.12,2.6,-0.17,0.94,0.06
3815,810,771,640,491,848,471,-3.24,2.1,-1.78,-0.04,...,642,492,853,467,2.86,4.5,5.69,-0.06,0.97,-0.35
2809,782,772,664,532,680,737,-2.21,2.05,0.1,-0.16,...,664,532,680,737,-2.18,0.83,3.94,-0.16,0.98,-0.31


In [12]:
print("Total unique elements in training data are : {}".format(len(y_train.value_counts())))

Total unique elements in training data are : 28


In [13]:
print("Training data for each alphabets are :\n{}".format(y_train.value_counts()))

Training data for each alphabets are :
yes    366
eh     319
e      317
no     278
d      278
j      278
r      277
y      276
m      275
i      275
n      274
o      273
t      272
x      270
s      266
z      199
q       95
f       95
u       93
v       93
l       92
b       91
g       91
k       91
p       91
w       88
c       87
h       87
Name: Word, dtype: int64


In [14]:
print("Total unique elements in test data are : {}".format(len(y_test.value_counts())))

Total unique elements in test data are : 27


In [15]:
print("Training data for each alphabets are :\n{}".format(y_test.value_counts()))

Training data for each alphabets are :
s      23
e      21
x      19
yes    18
eh     18
t      17
z      16
o      16
n      14
i      14
m      14
r      13
y      12
no     12
d      11
c       9
j       9
h       8
w       7
p       5
g       5
b       5
k       4
l       4
u       3
v       3
f       1
Name: Word, dtype: int64


In [16]:
rf = RandomForestClassifier(n_estimators=50, random_state=30)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=30, verbose=0,
                       warm_start=False)

In [17]:
rf.score(X_train, y_train)

1.0

In [18]:
rf.score(X_test, y_test)

1.0

#### Achieved training score of 100% and test score of 99.66%

In [19]:
y_test_predicted = rf.predict(X_test)

In [20]:
print("Classification Report \n ", classification_report(y_test, y_test_predicted))

Classification Report 
                precision    recall  f1-score   support

           b       1.00      1.00      1.00         5
           c       1.00      1.00      1.00         9
           d       1.00      1.00      1.00        11
           e       1.00      1.00      1.00        21
          eh       1.00      1.00      1.00        18
           f       1.00      1.00      1.00         1
           g       1.00      1.00      1.00         5
           h       1.00      1.00      1.00         8
           i       1.00      1.00      1.00        14
           j       1.00      1.00      1.00         9
           k       1.00      1.00      1.00         4
           l       1.00      1.00      1.00         4
           m       1.00      1.00      1.00        14
           n       1.00      1.00      1.00        14
          no       1.00      1.00      1.00        12
           o       1.00      1.00      1.00        16
           p       1.00      1.00      1.00         5
  

### This model is good enough for all the cases of all the alphabets and yes no cases


In [21]:
# Using pickle to save model for later use
import pickle5 as pickle

In [22]:
# Saving the model 
pickle.dump(rf, open('../models/alphabets_and_yes_no', 'wb'))