# STEP1: processing raw eye data

In [1]:
import pandas as pd
import numpy as np
import string
import re
from nltk.stem import LancasterStemmer
import pickle

In [6]:
eye = pd.read_csv("../resource/table.txt",sep="\t")

In [2]:
def textProc():
    with open("../resource/all.txt") as f:
        contents = f.readlines()
        f.close()
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    result = []
    stm=LancasterStemmer()
    for line in contents:
        newLine = []
        line = re.sub(r"\d+\.?\d*",'numchain',line)
        line = regex.sub('',line)
        line = re.sub("ur",'your',line)
        line = line.strip().split()
        for item in line:
            newLine.append(stm.stem(item))
        result.append(newLine)
    return result

In [45]:
def matchEyeData(lemTxt,eye):
    seqDict = {'Time to First Fixation_': '0', 'Fixations Before_':'1','First Fixation Duration_':'2','Fixation Duration_':'3','Total Fixation Duration_':'4','Fixation Count_':'5','Fixation Count (Include Zeros)_':'6','Visit Duration_':'7','Total Visit Duration_':'8','Total Visit Duration (Include Zeros)_':'9','Visit Count_':'10','Visit Count (Include Zeros)_':'11'}
    wordsSet = set([item for sublist in lemTxt for item in sublist])
    result = []
    for i in range(3,eye.shape[0]):
        arr = np.zeros((50,len(wordsSet)*12))
        df = pd.DataFrame(arr,columns=[word+str(i) for word in wordsSet for i in range(12)])
        for j in range(1,len(eye.columns)):
            col = eye.columns[j]
            if not col.endswith('Mean'):
                continue
            info = col.split('_')
            rect = info[2]
            if rect == 'Rectangle':
                continue
            textNum = int(info[1].split()[0])-1
            rectNum = int(rect.split()[1])-1
            try:
                coWord = lemTxt[textNum][rectNum]
            except:
#                 print("error"+str(textNum))
                continue
            for key in seqDict.keys():
                if col.startswith(key):
                    colName = coWord+seqDict[key]
                    df.iloc[textNum][colName] = eye.iloc[5,j]
                    break
        result.append(df)
#         break
    return result

In [46]:
lemTxt = textProc()
eyeMatched = matchEyeData(lemTxt,eye)

In [49]:
with open('../resource/eyeMatched.pkl', 'wb') as f:
    pickle.dump(eyeMatched, f)

# STEP2: Prepare eye-tracking data for ml

In [70]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [58]:
with open('../resource/eyeMatched.pkl', 'rb') as f:
    eyeMatched = pickle.load(f)

In [73]:
finalData = pd.concat(eyeMatched)
eye_label = pd.read_pickle('../resource/raw_data')['label'].to_numpy().astype('int').tolist()*len(eyeMatched)

data = finalData.to_numpy()
label = np.array(eye_label)

In [82]:
def runIt(data,label,splits = 3, lowResource = False):
    clfs = [LogisticRegression(), LinearSVC(),KNeighborsClassifier()]
    skf = StratifiedKFold(n_splits=3)
    X = data
    y = label
    for clf in clfs:
        print(clf)
        for  train_index,test_index in skf.split(data,label):
            if lowResource:
                tmp = test_index
                test_index = train_index
                train_index = tmp
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train,y_train)
            y_pred = clf.predict(X_test)
            print (accuracy_score(y_test,y_pred))
            print (confusion_matrix(y_test,y_pred))


In [83]:
# single person
runIt(eyeMatched[0].to_numpy(),pd.read_pickle('../resource/raw_data')['label'].to_numpy().astype('int'))




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
0.5294117647058824
[[2 7]
 [1 7]]
0.7058823529411765
[[4 4]
 [1 8]]
0.6875
[[3 5]
 [0 8]]
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
0.5882352941176471
[[5 4]
 [3 5]]
0.6470588235294118
[[4 4]
 [2 7]]
0.6875
[[4 4]
 [1 7]]
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
0.47058823529411764
[[0 9]
 [0 8]]
0.5294117647058824
[[0 8]
 [0 9]]
0.4375
[