In [24]:
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split
from collections import defaultdict
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB

import os
for dirname, _, filenames in os.walk('./input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./input/Dyt-tablet.csv
./input/Dyt-desktop.csv


In [6]:
desktopData= pd.read_csv("./input/Dyt-desktop.csv", index_col=0, na_values=['(NA)'])
tabletData = pd.read_csv("./input/Dyt-tablet.csv", index_col=0, na_values=['(NA)'])

In [9]:
def SeparateColumns(dataSetName):
    columns = defaultdict(list)
    with open(dataSetName, 'r') as f:
        reader = csv.reader(f, delimiter=';')
        headers = next(reader)
        column_nums = range(len(headers)) 
        for row in reader:
            for i in column_nums:
            
                columns[headers[i]].append(row[i])
    return dict(columns)

In [10]:
def cleanData(data) :
    for col in data.columns.values:
        data[col] = data[col].astype('string')
    #----------
    for col in data.columns.values:
        data[col] = data[col].astype('float',errors = 'ignore')
    #-----------
    data['Gender']=data.Gender.map({'Male': 1, 'Female': 2})
    data['Dyslexia']=data.Dyslexia.map({'No': 0, 'Yes': 1})
    data['Nativelang']=data.Nativelang.map({'No': 0, 'Yes': 1})
    data['Otherlang']=data.Otherlang.map({'No': 0, 'Yes': 1})

In [12]:
columns = SeparateColumns('./input/Dyt-desktop.csv')
desktopData=pd.DataFrame.from_dict(columns)

desktopData

Unnamed: 0,Gender,Nativelang,Otherlang,Age,Clicks1,Hits1,Misses1,Score1,Accuracy1,Missrate1,...,Score31,Accuracy31,Missrate31,Clicks32,Hits32,Misses32,Score32,Accuracy32,Missrate32,Dyslexia
0,Male,No,Yes,7,10,10,0,10,1,0,...,0,0,0,17,2,0,2,0.117647,0,No
1,Female,Yes,Yes,13,12,12,0,12,1,0,...,4,0.114286,0,26,2,2,2,0.0769231,0.0769231,Yes
2,Female,No,Yes,7,6,6,0,6,1,0,...,4,0.114286,0,26,1,3,1,0.0384615,0.115385,No
3,Female,No,Yes,7,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,No
4,Female,No,Yes,8,4,4,0,4,1,0,...,1,25,0.05,26,2,2,2,0.0769231,0.0769231,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3639,Male,No,No,10,7,7,0,7,1,0,...,2,0.67,0.33,4,1,3,1,0.25,0.75,Yes
3640,Female,No,Yes,15,9,9,0,9,1,0,...,3,0.75,0.25,4,2,2,2,0.5,0.5,No
3641,Female,No,Yes,15,11,11,0,11,1,0,...,3,0.6,0.4,4,2,2,2,0.5,0.5,No
3642,Female,No,Yes,15,10,10,0,10,1,0,...,3,0.75,0.25,4,3,1,3,0.75,0.25,No


In [13]:
cleanData(desktopData)

desktopData.head()

Unnamed: 0,Gender,Nativelang,Otherlang,Age,Clicks1,Hits1,Misses1,Score1,Accuracy1,Missrate1,...,Score31,Accuracy31,Missrate31,Clicks32,Hits32,Misses32,Score32,Accuracy32,Missrate32,Dyslexia
0,1,0,1,7.0,10.0,10.0,0.0,10.0,1.0,0.0,...,0.0,0.0,0.0,17.0,2.0,0.0,2.0,0.117647,0.0,0
1,2,1,1,13.0,12.0,12.0,0.0,12.0,1.0,0.0,...,4.0,0.114286,0.0,26.0,2.0,2.0,2.0,0.076923,0.076923,1
2,2,0,1,7.0,6.0,6.0,0.0,6.0,1.0,0.0,...,4.0,0.114286,0.0,26.0,1.0,3.0,1.0,0.038462,0.115385,0
3,2,0,1,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
4,2,0,1,8.0,4.0,4.0,0.0,4.0,1.0,0.0,...,1.0,25.0,0.05,26.0,2.0,2.0,2.0,0.076923,0.076923,0


In [14]:
columns = SeparateColumns('./input/Dyt-tablet.csv')
tabletData=pd.DataFrame.from_dict(columns)
tabletData.replace(["NULL"], np.nan, inplace = True)

tabletData

Unnamed: 0,Gender,Nativelang,Otherlang,Age,Clicks1,Hits1,Misses1,Score1,Accuracy1,Missrate1,...,Score31,Accuracy31,Missrate31,Clicks32,Hits32,Misses32,Score32,Accuracy32,Missrate32,Dyslexia
0,Male,Yes,No,7,6,6,0,6,1,0,...,,,,,,,,,,No
1,Female,Yes,No,7,7,7,0,7,1,0,...,,,,,,,,,,No
2,Female,Yes,No,7,6,6,0,6,1,0,...,,,,,,,,,,No
3,Male,Yes,No,7,5,5,0,5,1,0,...,,,,,,,,,,No
4,Male,Yes,No,7,8,6,2,8,0.75,0.25,...,,,,,,,,,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1390,Male,Yes,No,17,13,13,0,13,1,0,...,35,0.11428571428571,0,26,4,0,26,0.15384615384615,0,No
1391,Female,Yes,Yes,17,9,9,0,9,1,0,...,35,0.11428571428571,0,26,4,0,26,0.15384615384615,0,No
1392,Male,Yes,Yes,17,10,10,0,10,1,0,...,35,0.11428571428571,0,27,3,2,27,0.11111111111111,0.074074074074074,No
1393,Female,Yes,Yes,17,11,11,0,11,1,0,...,35,0.11428571428571,0,26,4,0,26,0.15384615384615,0,No


In [15]:
cleanData(tabletData)

tabletData.head()

Unnamed: 0,Gender,Nativelang,Otherlang,Age,Clicks1,Hits1,Misses1,Score1,Accuracy1,Missrate1,...,Score31,Accuracy31,Missrate31,Clicks32,Hits32,Misses32,Score32,Accuracy32,Missrate32,Dyslexia
0,1,1,0,7.0,6.0,6.0,0.0,6.0,1.0,0.0,...,,,,,,,,,,0
1,2,1,0,7.0,7.0,7.0,0.0,7.0,1.0,0.0,...,,,,,,,,,,0
2,2,1,0,7.0,6.0,6.0,0.0,6.0,1.0,0.0,...,,,,,,,,,,0
3,1,1,0,7.0,5.0,5.0,0.0,5.0,1.0,0.0,...,,,,,,,,,,0
4,1,1,0,7.0,8.0,6.0,2.0,8.0,0.75,0.25,...,,,,,,,,,,0


In [16]:
stateOfNUll= tabletData.isnull().any()
i = 0
for state in stateOfNUll : 
    if(state):  
        tabletData[stateOfNUll.index[i]].fillna(round(tabletData[stateOfNUll.index[i]].mean() , 4), inplace=True)
    i = i + 1    

tabletData

Unnamed: 0,Gender,Nativelang,Otherlang,Age,Clicks1,Hits1,Misses1,Score1,Accuracy1,Missrate1,...,Score31,Accuracy31,Missrate31,Clicks32,Hits32,Misses32,Score32,Accuracy32,Missrate32,Dyslexia
0,1,1,0,7.0,6.0,6.0,0.0,6.0,1.00,0.00,...,46.8333,0.386300,0.5439,52.51,2.7851,7.9719,52.51,0.970900,2.225400,0
1,2,1,0,7.0,7.0,7.0,0.0,7.0,1.00,0.00,...,46.8333,0.386300,0.5439,52.51,2.7851,7.9719,52.51,0.970900,2.225400,0
2,2,1,0,7.0,6.0,6.0,0.0,6.0,1.00,0.00,...,46.8333,0.386300,0.5439,52.51,2.7851,7.9719,52.51,0.970900,2.225400,0
3,1,1,0,7.0,5.0,5.0,0.0,5.0,1.00,0.00,...,46.8333,0.386300,0.5439,52.51,2.7851,7.9719,52.51,0.970900,2.225400,0
4,1,1,0,7.0,8.0,6.0,2.0,8.0,0.75,0.25,...,46.8333,0.386300,0.5439,52.51,2.7851,7.9719,52.51,0.970900,2.225400,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1390,1,1,0,17.0,13.0,13.0,0.0,13.0,1.00,0.00,...,35.0000,0.114286,0.0000,26.00,4.0000,0.0000,26.00,0.153846,0.000000,0
1391,2,1,1,17.0,9.0,9.0,0.0,9.0,1.00,0.00,...,35.0000,0.114286,0.0000,26.00,4.0000,0.0000,26.00,0.153846,0.000000,0
1392,1,1,1,17.0,10.0,10.0,0.0,10.0,1.00,0.00,...,35.0000,0.114286,0.0000,27.00,3.0000,2.0000,27.00,0.111111,0.074074,0
1393,2,1,1,17.0,11.0,11.0,0.0,11.0,1.00,0.00,...,35.0000,0.114286,0.0000,26.00,4.0000,0.0000,26.00,0.153846,0.000000,0


In [17]:
cols_with_missing = [col for col in tabletData.columns if tabletData[col].isnull().any()]

# Drop columns desktop data
reduced_desktopData = desktopData.drop(cols_with_missing, axis=1)

# Drop columns tablet data
reduced_tabletData = tabletData.drop(cols_with_missing, axis=1)

In [18]:
commonalityColumns = ['Gender','Nativelang','Otherlang','Age' , 'Dyslexia']
for i in  range(30):
    if((i>=0 and i<12) or (i>=13 and i<17) or i==21 or i==22 or i==29):
        commonalityColumns.append('Clicks'+str(i+1))
        commonalityColumns.append('Hits'+str(i+1))
        commonalityColumns.append('Misses'+str(i+1))
        commonalityColumns.append('Score'+str(i+1))
        commonalityColumns.append('Accuracy'+str(i+1))
        commonalityColumns.append('Missrate'+str(i+1))
    
reduced_desktopData=reduced_desktopData.loc[:,commonalityColumns]
reduced_tabletData=reduced_tabletData.loc[:,commonalityColumns]

In [19]:
y=reduced_desktopData['Dyslexia']
X=reduced_desktopData.loc[:, reduced_desktopData.columns != 'Dyslexia']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [21]:
#----RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train , y_train)
y_pred = rfc.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8894430590191189


In [22]:
yTest=reduced_tabletData['Dyslexia']
XTest=reduced_tabletData.loc[:, reduced_tabletData.columns != 'Dyslexia']

In [23]:
rfc2 = RandomForestClassifier()
rfc2.fit(X_train , y_train)
y_pred = rfc2.predict(XTest)
print("Accuracy:",metrics.accuracy_score(yTest, y_pred))

Accuracy: 0.8939068100358423


In [66]:
from sklearn.neighbors import KNeighborsClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [67]:
knn=KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)
y2_pred=knn.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y2_pred))

Accuracy: 0.8902743142144638


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [68]:
yTest=reduced_tabletData['Dyslexia']
XTest=reduced_tabletData.loc[:, reduced_tabletData.columns != 'Dyslexia']

In [69]:
knn2 = KNeighborsClassifier(n_neighbors=6)
knn2.fit(X_train , y_train)
y2_pred = knn2.predict(XTest)
print("Accuracy:",metrics.accuracy_score(yTest, y2_pred))

Accuracy: 0.8924731182795699


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [70]:
from sklearn.ensemble import AdaBoostClassifier


In [71]:
adb=AdaBoostClassifier()
adb.fit(X_train, y_train)
y_pred=adb.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9027431421446384


In [74]:
from sklearn.svm import SVC


In [76]:
svc=SVC(kernel='poly', degree=1, gamma='auto')
svc.fit(X_train, y_train)
y_pred=svc.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8877805486284289


In [77]:
from sklearn.neural_network import MLPClassifier


In [78]:
mlp=MLPClassifier()
mlp.fit(X_train, y_train)
y_pred=mlp.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8994181213632585


In [84]:
from sklearn.linear_model import LogisticRegression 
lr=LogisticRegression()
lr.fit(X_train, y_train)
y_pred=lr.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8977556109725686


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [85]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_pred=dtc.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8262676641729011


In [86]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda=LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_pred=lda.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8977556109725686


In [91]:
from sklearn.ensemble import VotingClassifier
# 1) naive bias = mnb
# 2) logistic regression =lr
# 3) random forest =rf
# 4) support vector machine = svm
evc=VotingClassifier(estimators=[('adb', adb),('lr',lr),('rfc',rfc),('svc',svc)],voting='hard')
evc.fit(X_train, y_train)
y_pred=evc.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8886118038237739


In [107]:
from sklearn.neural_network import MLPClassifier
classifier = MLPClassifier(hidden_layer_sizes=(8,8,8),activation='logistic',solver='adam',max_iter=500)
classifier.fit(X_train, y_train)



# Predicting the Test set results
y_pred = classifier.predict(X_test)



# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)


#Interpretation:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

#ACCURACY SCORE
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred)*100)

              precision    recall  f1-score   support

           0       0.92      0.94      0.93      1068
           1       0.43      0.35      0.39       135

    accuracy                           0.88      1203
   macro avg       0.68      0.65      0.66      1203
weighted avg       0.86      0.88      0.87      1203

87.53117206982543




In [112]:
from sklearn.neural_network import MLPClassifier
classifier = MLPClassifier(hidden_layer_sizes=(8,8,8),activation='tanh',solver='adam',max_iter=500)
classifier.fit(X_train, y_train)



# Predicting the Test set results
y_pred = classifier.predict(X_test)



# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)


#Interpretation:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

#ACCURACY SCORE
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred)*100)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      1068
           1       0.47      0.31      0.37       135

    accuracy                           0.88      1203
   macro avg       0.69      0.63      0.65      1203
weighted avg       0.87      0.88      0.87      1203

88.2793017456359


In [113]:
from sklearn.neural_network import MLPClassifier
classifier = MLPClassifier(hidden_layer_sizes=(8,8,8),activation='relu',solver='adam',max_iter=500)
classifier.fit(X_train, y_train)



# Predicting the Test set results
y_pred = classifier.predict(X_test)



# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)


#Interpretation:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

#ACCURACY SCORE
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred)*100)

              precision    recall  f1-score   support

           0       0.91      0.96      0.94      1068
           1       0.49      0.29      0.36       135

    accuracy                           0.89      1203
   macro avg       0.70      0.63      0.65      1203
weighted avg       0.87      0.89      0.87      1203

88.69492934330839


In [118]:
from sklearn.neural_network import MLPClassifier
classifier = MLPClassifier(hidden_layer_sizes=(8,8,8),activation='identity',solver='adam',max_iter=500)
classifier.fit(X_train, y_train)



# Predicting the Test set results
y_pred = classifier.predict(X_test)



# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)


#Interpretation:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

#ACCURACY SCORE
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred)*100)

              precision    recall  f1-score   support

           0       0.91      0.99      0.95      1068
           1       0.68      0.24      0.35       135

    accuracy                           0.90      1203
   macro avg       0.80      0.61      0.65      1203
weighted avg       0.89      0.90      0.88      1203

90.19118869492935
