In [32]:
import os
import pandas as pd
import numpy as np
from sklearn import clone
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
from sklearn.model_selection import KFold, cross_validate, cross_val_predict, LeaveOneGroupOut, GroupKFold, StratifiedGroupKFold, GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score
from sklearn import tree
from sklearn.ensemble import RandomTreesEmbedding
from matplotlib import pyplot as plt
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import json

def task_encoding(task):
    task_dict = {'shape':1, 'size':2, 'orientation':3, 'hue':4, 'brightness':5}
    encoded_task = task_dict[task]
    return encoded_task

def similarity_encoding(similarity):
    if '#' in similarity:
        similarity = similarity.split('#')[1]
    similarity = list(map(int,similarity.split('-')))
    difference = abs(similarity[0]-similarity[1])
    return difference

class MakeDataDouble:
    def __init__(self, path, amount, random_state=1) -> None:
        data = pd.read_csv(path)
        data['Task Encoding'] = data.apply(lambda x: task_encoding(x['task']), axis=1)
        data['Similarity Encoding'] = data.apply(lambda x: similarity_encoding(x['level']), axis=1)
        data = data.dropna(axis=0)
        data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
        self.exceptlist = ['bbx_x1','bbx_x2','bbx_y1','bbx_y2','task','level','participant','shape_target','shape_distractor',
                           'set_size','target_size','distractor_size','target_color_b','target_color_g','target_color_r',
                           'distractor_color_b','distractor_color_g','distractor_color_r','target_orientation','distractor_orientation']
        self.data = self.stack_rows(data, amount)
        self.data = self.data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    def set_domain_except(self, *args):
        self.exceptlist = self.exceptlist + list(args)

    def take_x(self):
        x_data = self.data.loc[:,~self.data.columns.isin(self.exceptlist)]
        return x_data

    def take_y(self):
        return self.data['participant']
    
    def stack_rows(self, data, amount):
        participants = list(range(1,14))
        total_dataframe = pd.DataFrame()
        for participant in participants:
            this_data = data.loc[data['participant']==participant].reset_index()
            total_amount = this_data.index.size
            block_size = total_amount//amount
            dataframe_block = pd.DataFrame()
            for i in range(amount):
                small_block = this_data.iloc[i*block_size:(i+1)*block_size].reset_index()
                small_block = small_block.loc[:,~small_block.columns.isin(self.exceptlist)]
                dataframe_block = pd.concat([dataframe_block, small_block], axis=1)
            dataframe_block['participant'] = participant
            total_dataframe = pd.concat([total_dataframe, dataframe_block], ignore_index=True)
        return total_dataframe
        
    def get_data(self):
        return self.data
    


Unnamed: 0,level_0,index,x1,y1,x2,y2,x3,y3,x4,y4,...,x59,y59,x60,y60,gaze_hit,cnt_x,cnt_y,Task Encoding,Similarity Encoding,participant
0,315,4257,939.0,560.0,939.0,560.0,939.0,560.0,942.0,576.0,...,840.0,399.0,812.0,393.0,0,695,275,2,2,11
1,323,4535,987.0,563.0,982.0,564.0,982.0,574.0,978.0,557.0,...,677.0,511.0,682.0,515.0,0,675,255,1,1,4
2,368,4553,1007.0,505.0,1011.0,508.0,1009.0,511.0,1010.0,515.0,...,809.0,472.0,812.0,475.0,1,798,486,2,2,1
3,79,1104,1282.0,1143.0,1282.0,1143.0,1282.0,1143.0,1282.0,1143.0,...,1053.0,507.0,1051.0,509.0,0,675,825,1,1,6
4,253,3309,967.0,473.0,972.0,476.0,972.0,469.0,967.0,477.0,...,1212.0,479.0,1210.0,481.0,1,1245,635,5,2,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6491,406,5479,978.0,456.0,976.0,446.0,979.0,449.0,972.0,473.0,...,860.0,301.0,861.0,302.0,1,866,632,2,2,2
6492,195,2717,965.0,445.0,965.0,420.0,963.0,410.0,922.0,440.0,...,747.0,362.0,735.0,373.0,1,685,402,3,2,11
6493,482,6188,980.0,552.0,978.0,555.0,980.0,554.0,977.0,553.0,...,872.0,656.0,874.0,655.0,0,1230,810,3,1,8
6494,235,2804,897.0,673.0,897.0,673.0,901.0,638.0,904.0,618.0,...,758.0,287.0,760.0,289.0,0,1223,451,2,2,1


In [None]:
class ML_Result:
    def __init__(self, data) -> None:
        self.data = data

    def get_data(self):
        return self.data
    
    def plot(self, task, participant=None):
        # print(participant)
        clf_names = ['ZeroR', 'DecisionTree', 'kNN', 'NaiveBayes', 'SVM', 'LogisticRegression', 'AdaBoost', 'RandomForest']
        if participant!=None:
            this_data = self.data[task][participant]
            plot_data = pd.DataFrame()
            for clf in clf_names:
                data = pd.DataFrame({"Task":task,"Participant":participant,"Model": clf, "Mean Accuracy":this_data[clf]['average acc']}, index=[0])
                # print(data)
                plot_data = pd.concat([plot_data, data], ignore_index=True)
        else:
            this_data = self.data[task]
            plot_data = pd.DataFrame()
            for clf in clf_names:
                data = pd.DataFrame({"Task":task,"Model": clf, "Mean Accuracy":this_data[clf]['average acc']}, index=[0])
                # print(data)
                plot_data = pd.concat([plot_data, data], ignore_index=True)
                participant = "Whole"
        zeror_value = plot_data.loc[0,'Mean Accuracy']
        plt.figure(figsize=(13,5))
        plots = sns.barplot(data = plot_data, x="Model", y="Mean Accuracy")
        plots.set_title(f"{participant} in task time : {task}")
        plots.set_ylim(0,1)
        plots.hlines(zeror_value, 0, 7, colors='black', linestyles="--")
        plt.show()
        # plt.savefig(os.path.join('ml-results', f"all_task_{task}_p_{participant}.png"))
        # plt.close()

In [33]:
def average(data):
    return round(sum(data)/len(data))

def mlanalysis(x_data, y_data):
    warnings.filterwarnings('ignore')
    base_model = DummyClassifier(strategy='most_frequent', random_state=0)  # ZeroR
    svc_model = SVC()  # SVM
    knn_model = KNeighborsClassifier()  # k-Nearest Neighbors
    lr_model = LogisticRegression(C=1, random_state=0)  # Logistic Regression
    dt_model = DecisionTreeClassifier()  # Decision Tree
    rf_model = RandomForestClassifier(random_state=0)  # Random Forest
    ab_model = AdaBoostClassifier()  # AdaBoost
    nb_model = GaussianNB()  # Naive Bayse
    
    clf_names = ['ZeroR', 'DecisionTree', 'kNN', 'NaiveBayes', 'SVM', 'LogisticRegression', 'AdaBoost', 'RandomForest']
    classifiers = [base_model, dt_model, knn_model, nb_model, svc_model, lr_model, ab_model, rf_model]
    results = {}
    for n, clf in enumerate(classifiers):
        print("==================================")
        print(clf_names[n])
        clf2 = clone(clf)
        X = x_data.to_numpy()
        Y = y_data.to_numpy()
        kf = KFold(n_splits=10)
        cm_added = np.zeros((13,13))
        f1 = []
        precision = []
        accuracy = []
        for i, (train, test) in enumerate(kf.split(X)):
            train_x = X[train]
            train_y = Y[train]
            test_x = X[test]
            test_y = Y[test]
            clf2.fit(train_x, train_y)
            y_pred = clf2.predict(test_x)
            y_true = test_y
            f1.append(f1_score(y_true, y_pred, average=None).tolist())
            precision.append(precision_score(y_true, y_pred, average=None).tolist())
            accuracy.append(accuracy_score(y_true, y_pred).tolist()) 
            # print(confusion_matrix(y_true, y_pred))
        results[clf_names[n]] = {'precision': precision, 'accuracy': accuracy, 'f1':f1}
        
    return results    

In [34]:
path = 'data/blue_rawdata_task1.csv'
mydata = MakeDataDouble(path, 2)
mydata.get_data()

x = mydata.take_x()
y = mydata.take_y()
results = mlanalysis(x,y)

ZeroR
DecisionTree
kNN
NaiveBayes
SVM
LogisticRegression
AdaBoost


KeyboardInterrupt: 