In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from sktime.utils.load_data import load_from_tsfile_to_dataframe
import os
import sys
import glob
import seaborn as sns
from pathlib import Path
from IPython.display import display, HTML
import math
from scipy import stats
import random
import statsmodels.api as sm
from contextlib import redirect_stdout
from colorama import init
# init()
from termcolor import colored, cprint

%matplotlib inline

# Notebook-wide display settings: show up to 100 rows / columns / sequence
# items, use a wide console, and render floats with four decimals.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.width = 2000
pd.options.display.float_format = lambda value: '%.4f' % value
pd.options.display.max_seq_items = 100

# Arrays with more than 1000 elements are printed in summarised form.
np.set_printoptions(threshold=1000)

In [7]:
def get_datasets_in_path(path, exclude="."):
    """List the names of the directories matched by ``path + "*/"``.

    ``path`` is used as a plain string prefix for the glob pattern, so it
    should normally end with a path separator.

    Args:
        path: Prefix of the glob pattern (usually a directory ending in "/").
        exclude: Directory names starting with this prefix are left out.

    Returns:
        A list of directory names (not full paths), in the order returned
        by ``glob.glob``.
    """
    datasets = []
    for dir_path in glob.glob(path + "*/"):
        # The match ends with a separator, so dirname() drops that and
        # basename() yields the directory's own name.
        dir_name = os.path.basename(os.path.dirname(dir_path))
        if dir_name.startswith(exclude):
            continue
        datasets.append(dir_name)
    return datasets

# Verification and fixes for CSV files

## Split a single invalid row in exp files

In [103]:

def split_invalid_csv_row2(line, columns, verbose = True, num_columns = 27):
    """Split one physical line that contains several fused exp-file rows.

    The line is split on commas that are outside double quotes. The first
    ``num_columns`` values form the first row; every following group of
    ``num_columns - 1`` values forms another row whose first column (the
    test key) is missing because it is fused onto the last column (the exp
    file name) of the previous row. For every row whose last column does
    not end with "csv", the file name is rebuilt as
    ``<text before the first '.'>.train.exp.csv`` and the last 8 characters
    of that column are taken as the test key of the next row.

    Args:
        line: The raw text line to split.
        columns: Ignored; the line is always re-split. Kept so existing
            callers that pass it keep working.
        verbose: When True, print intermediate values with colours.
        num_columns: Number of columns of a complete row (default 27).

    Returns:
        A dict mapping the row index (0, 1, ...) to that row's list of
        column strings. Rows are repaired in place so each one ends up with
        ``num_columns`` entries.

    Raises:
        ValueError: If ``num_columns`` is smaller than 2.
        Exception: If a row has an unexpected number of columns, if a row
            lacks its first column and no key is available from the
            previous row, or if the first column of such a row does not
            contain "-" (i.e. does not look like the timestamp column).
    """
    import re

    if num_columns < 2:
        raise ValueError(f"num_columns must be at least 2, got {num_columns}")

    if verbose: cprint(line, 'blue')

    # Split on commas that are followed by an even number of double quotes,
    # i.e. commas that are not inside a quoted field.
    columns = re.split(r',(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)', line)
    if verbose: print(f"num columns: {len(columns)}")

    # Every row after the first one has lost one comma (and therefore one
    # column) because its key is glued to the previous row's last column.
    merged_row_width = num_columns - 1

    valid_lines = {}

    # iteration 1: populate columns
    current_row_num = 0
    valid_lines[current_row_num] = []
    for i, c in enumerate(columns):
        valid_lines[current_row_num].append(c)

        # Index 0 belongs to the first row, so boundaries fall on every
        # non-zero multiple of merged_row_width.
        if i % merged_row_width == 0 and i != 0:
            current_row_num = current_row_num + 1
            valid_lines[current_row_num] = []

    # remove any empty rows
    if len(valid_lines[current_row_num]) == 0:
        del valid_lines[current_row_num]

    # iteration 2: fix columns near edges
    prev_test_key = None
    for i, (k, tmp_line) in enumerate(valid_lines.items()):

        last_coloumn = tmp_line[-1].strip()
        if verbose: cprint(last_coloumn, 'red')

        # just check if last column is valid, if not extract key and fix it
        if last_coloumn.endswith("csv"):
            # all good: nothing is fused onto the file name
            file_name = last_coloumn
            test_key = None
        else:
            # fix file name
            splitted_file_name = last_coloumn.split(".")
            file_name = splitted_file_name[0].strip() + ".train.exp.csv"
            tmp_line[-1] = file_name

            # get key: the trailing 8 characters of the fused value
            test_key = splitted_file_name[-1].strip()[-8:].strip()

        if verbose: cprint(" ----> " + file_name, 'red', end = '')
        if verbose: cprint(" ---- " + str(test_key), 'red', end = '')
        if verbose: cprint(" || " + str(prev_test_key), 'red')

        if len(tmp_line) == num_columns:
            # have all columns
            pass
        elif len(tmp_line) == merged_row_width:
            # missing the first column, so use key from the previous row
            if "-" in tmp_line[0]:
                # if first col is a timestamp it has "-"
                if prev_test_key is not None:
                    tmp_line.insert(0, prev_test_key)
                else:
                    cprint(tmp_line, 'red')
                    raise Exception("Missing first column with prev_test_key == None:")
            else:
                cprint(tmp_line, 'red')
                raise Exception(f"First column is missing but its not in date format: {tmp_line[0]}")

        else:
            cprint(tmp_line, 'red')
            raise Exception(f"Number of columns != {merged_row_width} or {num_columns} for row {i}, found {len(tmp_line)}")

        prev_test_key = test_key
        if verbose: cprint(tmp_line, 'yellow')

    return valid_lines


tmp_line = '30e3597b,2022-02-17-011773,AtrialFibrillation,15,15,3,2,640,msm_i_id,0.5333333333333333,0.2,msm,i,id,"MSM[,c=2.08,dep=false,|d|=2,lpI=1.0,lpD=2.0,adj=true,]","MSM[,c=2.08,dep=false,|d|=2,lpI=1.0,lpD=2.0,adj=true,]",0.127674692,1.825698631,6463564,false,2,false,"{0,1}",52,7472822f,out/rev1/test/fold0/AtrialFibrillation/AtrialFibrillation-train-id-fold0.best.exp.csv,AtrialFibrillation-msm-i-p_0_99.train.exp.cf39594bf510,2022-02-17-011773,AtrialFibrillation,15,15,3,2,640,msm_d_id,0.5333333333333333,0.26666666666666666,msm,d,id,"MSM[,c=0.964,dep=true,|d|=2,lpI=1.0,lpD=2.0,adj=true,]","MSM[,c=0.964,dep=true,|d|=2,lpI=1.0,lpD=2.0,adj=true,]",0.619903853,8.152757145,6463564,false,2,true,"{0,1}",48,07a98739,out/rev1/test/fold0/AtrialFibrillation/AtrialFibrillation-train-id-fold0.best.exp.csv,AtrialFibrillation-msm-d-p_0_99.train.exp.csv'
# tmp_line = 'b9d29382,2022-02-17-013425,Cricket,108,72,12,6,1197,lcss_d_id,1.0,0.9861111111111112,lcss,d,id,"LCSS[,w=67,wp=0.05555555555555555,e=0.28876816422646734,ed=0.28876817161870416,0.28876807654331454,0.2887682064461225,0.28876813841996885,0.2887681622180289,0.28876823011258235,dep=true,|d|=6,lpI=1.0,lpD=2.0,adj=true,]","LCSS[,w=67,wp=0.05555555555555555,e=0.28876816422646734,ed=0.28876817161870416,0.28876807654331454,0.2887682064461225,0.28876813841996885,0.2887681622180289,0.28876823011258235,dep=true,|d|=6,lpI=2.0,lpD=2.0,adj=true,]",1.228763908,2.468852892,6463564,false,6,true,"{0,1,2,3,4,5}",21,93ef8398,out/rev1/test/fold0/Cricket/Cricket-train-id-fold0.best.exp.csv,Cricket-lcss-d-p_0_99.train.exp.csvbb11e20e4,2022-02-17-013425,Cricket,108,72,12,6,1197,msm_i_id,0.9907407407407408,0.9861111111111112,msm,i,id,"MSM[,c=0.892,dep=false,|d|=6,lpI=1.0,lpD=2.0,adj=true,]","MSM[,c=0.892,dep=false,|d|=6,lpI=1.0,lpD=2.0,adj=true,]",22.48207987,46.487638166,6463564,false,6,false,"{0,1,2,3,4,5}",46,8786b90e,out/rev1/test/fold0/Cricket/Cricket-train-id-fold0.best.exp.csv,Cricket-msm-i-p_0_99.train.exp.csv22829862e,2022-02-17-013425,Cricket,108,72,12,6,1197,msm_d_id,1.0,1.0,msm,d,id,"MSM[,c=7.84,dep=true,|d|=6,lpI=1.0,lpD=2.0,adj=true,]","MSM[,c=7.84,dep=true,|d|=6,lpI=1.0,lpD=2.0,adj=true,]",108.935763,211.931426612,6463564,false,6,true,"{0,1,2,3,4,5}",68,e6440eb3,out/rev1/test/fold0/Cricket/Cricket-train-id-fold0.best.exp.csv,Cricket-msm-d-p_0_99.train.exp.csv986f2d855,2022-02-17-013425,Cricket,108,72,12,6,1197,twe_i_id,0.9722222222222222,0.9861111111111112,twe,i,id,"TWE[,n=1.0E-5,l=0.088888889,dep=false,|d|=6,lpI=1.0,lpD=2.0,adj=true,]","TWE[,n=1.0E-5,l=0.088888889,dep=false,|d|=6,lpI=1.0,lpD=2.0,adj=true,]",28.209427171,52.615102415,6463564,false,6,false,"{0,1,2,3,4,5}",8,4d0ad6cc,out/rev1/test/fold0/Cricket/Cricket-train-id-fold0.best.exp.csv,Cricket-twe-i-p_0_99.train.exp.csv7613a153a,2022-02-17-013425,Cricket,108,72,12,6,1197,twe_d_id,0.9629629629629628,0.97222
22222222222,twe,d,id,"TWE[,n=1.0E-5,l=0.1,dep=true,|d|=6,lpI=1.0,lpD=2.0,adj=true,]","TWE[,n=1.0E-5,l=0.1,dep=true,|d|=6,lpI=2.0,lpD=2.0,adj=true,]",31.897895311,48.939642628,6463564,false,6,true,"{0,1,2,3,4,5}",9,1af93ecc,out/rev1/test/fold0/Cricket/Cricket-train-id-fold0.best.exp.csv,Cricket-twe-d-p_0_99.train.exp.csv85edd49ac,2022-02-17-013425,Cricket,108,72,12,6,1197,wddtw_i_id,0.9537037037037036,0.9583333333333334,wddtw,i,id,"WDDTW[,g=0.0,WDTW[,g=0.0,dep=false,|d|=6,lpI=1.0,lpD=2.0,adj=true,],]","WDDTW[,g=0.0,WDTW[,g=0.0,dep=false,|d|=6,lpI=1.0,lpD=2.0,adj=true,],]",23.416549689,45.293672535,6463564,false,6,false,"{0,1,2,3,4,5}",0,63084f97,out/rev1/test/fold0/Cricket/Cricket-train-id-fold0.best.exp.csv,Cricket-wddtw-i-p_0_99.train.exp.csv59ef4e7b5,2022-02-17-013425,Cricket,108,72,12,6,1197,wddtw_d_id,0.8148148148148148,0.75,wddtw,d,id,"WDDTW[,g=0.0,WDTW[,g=0.0,dep=true,|d|=6,lpI=1.0,lpD=2.0,adj=true,],]","WDDTW[,g=0.0,WDTW[,g=0.0,dep=true,|d|=6,lpI=2.0,lpD=2.0,adj=true,],]",11.900992472,24.785574748,6463564,false,6,true,"{0,1,2,3,4,5}",0,1b133c16,out/rev1/test/fold0/Cricket/Cricket-train-id-fold0.best.exp.csv,Cricket-wddtw-d-p_0_99.train.exp.csv724e98e1a,2022-02-17-013425,Cricket,108,72,12,6,1197,wdtw_i_id,1.0,0.9861111111111112,wdtw,i,id,"WDTW[,g=0.0,dep=false,|d|=6,lpI=1.0,lpD=2.0,adj=true,]","WDTW[,g=0.0,dep=false,|d|=6,lpI=1.0,lpD=2.0,adj=true,]",13.464573066,24.517223357,6463564,false,6,false,"{0,1,2,3,4,5}",0,fc99a75d,out/rev1/test/fold0/Cricket/Cricket-train-id-fold0.best.exp.csv,Cricket-wdtw-i-p_0_99.train.exp.csc9640684f3,2022-02-17-013425,Cricket,108,72,12,6,1197,wdtw_d_id,1.0,1.0,wdtw,d,id,"WDTW[,g=0.0,dep=true,|d|=6,lpI=1.0,lpD=2.0,adj=true,]","WDTW[,g=0.0,dep=true,|d|=6,lpI=2.0,lpD=2.0,adj=true,]",10.01358353,21.030246658,6463564,false,6,true,"{0,1,2,3,4,5}",0,8147efa4,out/rev1/test/fold0/Cricket/Cricket-train-id-fold0.best.exp.csv,Cricket-wdtw-d-p_0_99.train.exp.csv'
tmp_line = 'daf53ee6,2022-02-17-060616,StandWalkJump,12,15,3,4,2500,msm_i_id,0.3333333333333333,0.2,msm,i,id,"MSM[,c=3.52,dep=false,|d|=4,lpI=1.0,lpD=2.0,adj=true,]","MSM[,c=3.52,dep=false,|d|=4,lpI=1.0,lpD=2.0,adj=true,]",2.090399923,32.035791847,6463564,false,4,false,"{0,1,2,3}",56,f090e456,out/rev1/test/fold3/StandWalkJump/StandWalkJump-train-id-fold3.best.exp.csv,StandWalkJump-msm-i-p_0_99.train.exp.cd5429fcc6d1,2022-02-17-060616,StandWalkJump,12,15,3,4,2500,msm_d_id,0.4166666666666667,0.13333333333333333,msm,d,id,"MSM[,c=0.02875,dep=true,|d|=4,lpI=1.0,lpD=2.0,adj=true,]","MSM[,c=0.02875,dep=true,|d|=4,lpI=1.0,lpD=2.0,adj=true,]",8.806052864,136.043979083,6463564,false,4,true,"{0,1,2,3}",5,c32d2999,out/rev1/test/fold3/StandWalkJump/StandWalkJump-train-id-fold3.best.exp.csv,StandWalkJump-msm-d-p_0_99.train.exp.c91a7405c3a4,2022-02-17-060616,StandWalkJump,12,15,3,4,2500,twe_i_id,0.3333333333333333,0.26666666666666666,twe,i,id,"TWE[,n=1.0E-4,l=0.077777778,dep=false,|d|=4,lpI=1.0,lpD=2.0,adj=true,]","TWE[,n=1.0E-4,l=0.077777778,dep=false,|d|=4,lpI=1.0,lpD=2.0,adj=true,]",1.757084294,28.126779655,6463564,false,4,false,"{0,1,2,3}",17,75176edc,out/rev1/test/fold3/StandWalkJump/StandWalkJump-train-id-fold3.best.exp.csv,StandWalkJump-twe-i-p_0_99.train.exp.c2c2e9d8ba2e,2022-02-17-060616,StandWalkJump,12,15,3,4,2500,twe_d_id,0.4166666666666667,0.06666666666666667,twe,d,id,"TWE[,n=1.0E-5,l=0.011111111,dep=true,|d|=4,lpI=1.0,lpD=2.0,adj=true,]","TWE[,n=1.0E-5,l=0.011111111,dep=true,|d|=4,lpI=1.0,lpD=2.0,adj=true,]",1.846849106,29.323220497,6463564,false,4,true,"{0,1,2,3}",1,3d1d153a,out/rev1/test/fold3/StandWalkJump/StandWalkJump-train-id-fold3.best.exp.csv,StandWalkJump-twe-d-p_0_99.train.exp.c716448995d1,2022-02-17-060616,StandWalkJump,12,15,3,4,2500,wddtw_i_id,0.5,0.4666666666666667,wddtw,i,id,"WDDTW[,g=0.86,WDTW[,g=0.86,dep=false,|d|=4,lpI=1.0,lpD=2.0,adj=true,],]","WDDTW[,g=0.86,WDTW[,g=0.86,dep=false,|d|=4,lpI=1.0,lpD=2.0,adj=true,],]",1.407979667,23.705508046,6463564
,false,4,false,"{0,1,2,3}",86,108f48ad,out/rev1/test/fold3/StandWalkJump/StandWalkJump-train-id-fold3.best.exp.csv,StandWalkJump-wddtw-i-p_0d439d9d8,2022-02-182703a9b,2022-02-17-060616,StandWalkJump,12,15,3,4,2500,wddtw_d_id,0.5,0.4,wddtw,d,id,"WDDTW[,g=0.86,WDTW[,g=0.86,dep=true,|d|=4,lpI=1.0,lpD=2.0,adj=true,],]","WDDTW[,g=0.86,WDTW[,g=0.86,dep=true,|d|=4,lpI=1.0,lpD=2.0,adj=true,],]",1.098460876,13.829707204,6463564,false,4,true,"{0,1,2,3}",86,d71c891d,out/rev1/test/fold3/StandWalkJump/StandWalkJump-train-id-fold3.best.exp.csv,StandWalkJump-wddtw-d-p_0_99.train.exp.cs13b2172274,2022-02-17-060616,StandWalkJump,12,15,3,4,2500,wdtw_i_id,0.5,0.4,wdtw,i,id,"WDTW[,g=0.73,dep=false,|d|=4,lpI=1.0,lpD=2.0,adj=true,]","WDTW[,g=0.73,dep=false,|d|=4,lpI=1.0,lpD=2.0,adj=true,]",0.973075033,16.763320935,6463564,false,4,false,"{0,1,2,3}",73,1d3ac602,out/rev1/test/fold3/StandWalkJump/StandWalkJump-train-id-fold3.best.exp.csv,StandWalkJump-wdtw-i-p_0_99.train.exp.csv'


# Run the splitter on the sample line defined above and list each recovered
# (row index, columns) pair.
tmp = split_invalid_csv_row2(tmp_line, None, verbose=True)

for i, t in enumerate(tmp.items()):
    print(f"{i}: {t}")


[34mdaf53ee6,2022-02-17-060616,StandWalkJump,12,15,3,4,2500,msm_i_id,0.3333333333333333,0.2,msm,i,id,"MSM[,c=3.52,dep=false,|d|=4,lpI=1.0,lpD=2.0,adj=true,]","MSM[,c=3.52,dep=false,|d|=4,lpI=1.0,lpD=2.0,adj=true,]",2.090399923,32.035791847,6463564,false,4,false,"{0,1,2,3}",56,f090e456,out/rev1/test/fold3/StandWalkJump/StandWalkJump-train-id-fold3.best.exp.csv,StandWalkJump-msm-i-p_0_99.train.exp.cd5429fcc6d1,2022-02-17-060616,StandWalkJump,12,15,3,4,2500,msm_d_id,0.4166666666666667,0.13333333333333333,msm,d,id,"MSM[,c=0.02875,dep=true,|d|=4,lpI=1.0,lpD=2.0,adj=true,]","MSM[,c=0.02875,dep=true,|d|=4,lpI=1.0,lpD=2.0,adj=true,]",8.806052864,136.043979083,6463564,false,4,true,"{0,1,2,3}",5,c32d2999,out/rev1/test/fold3/StandWalkJump/StandWalkJump-train-id-fold3.best.exp.csv,StandWalkJump-msm-d-p_0_99.train.exp.c91a7405c3a4,2022-02-17-060616,StandWalkJump,12,15,3,4,2500,twe_i_id,0.3333333333333333,0.26666666666666666,twe,i,id,"TWE[,n=1.0E-4,l=0.077777778,dep=false,|d|=4,lpI=1.0,lpD=2.0,adj=

Exception: Missing first column with prev_test_key == None:

## Fix an exp file

In [70]:
def fix_exp_file(file_name, num_columns = 27, save = False, overwrite = False, verbose = False):
    '''
    Fixes invalid .exp.csv files.
    last modified on: 2-3-2022

    Every line is split on commas outside double quotes and handled
    according to its column count:
      * more than ``num_columns``: passed to ``split_invalid_csv_row2`` and
        replaced by the rows it returns,
      * exactly ``num_columns``: kept unchanged,
      * a bare newline: dropped,
      * fewer than ``num_columns``: dropped and the file is flagged invalid.

    Files are only written when ``save`` is True AND at least one invalid
    line was found, so valid files keep their timestamps.

    Args:
        file_name: Path of the exp file, as a string.
        num_columns: Number of columns of a valid row.
        save: Write the repaired lines to disk.
        overwrite: With ``save``, replace ``file_name`` itself (after
            copying it to ``file_name + ".bck.csv"`` if that backup does
            not exist yet); otherwise write ``file_name + ".fixed.csv"``.
        verbose: Print every line with a colour that reflects its status.

    Returns:
        The path of the file holding the usable content: ``file_name``
        unless a separate ``.fixed.csv`` file was written.

    Raises:
        Exception: If a repaired row does not have ``num_columns`` columns,
            or if ``split_invalid_csv_row2`` cannot repair a line.
    '''

    import re
    from shutil import copyfile
    from pathlib import Path

    # Matches commas followed by an even number of double quotes, i.e.
    # commas that are not inside a quoted field. Compiled once for all lines.
    column_splitter = re.compile(r',(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)')

    # To avoid modifying the timestamps of valid files by overwriting them
    is_file_invalid = False

    # ------------------------ read file lines into a list
    with open(file_name, "r") as file:
        original_lines = file.readlines()

    modified_lines = []
    for i, line in enumerate(original_lines):
        columns = column_splitter.split(line)

        if len(columns) > num_columns:
            # Invalid row found, too many columns
            is_file_invalid = True
            if verbose: cprint(str(i) + ":" + line, 'yellow')
            new_lines = split_invalid_csv_row2(line, columns, verbose = False)
            for j, l in enumerate(new_lines.values()):
                if len(l) != num_columns:
                    raise Exception(f"Number of columns doesnt match after fixing them: Unknown Error.... {l}")
                # Normalise surrounding whitespace so each repaired row
                # ends with exactly one newline.
                fixed_line = ",".join(l).strip() + "\n"
                modified_lines.append(fixed_line)
                if verbose: cprint(str(i) + "." + str(j) + ":" + fixed_line, 'green')

        elif len(columns) == num_columns:
            # Assume that this row is valid
            if verbose: cprint(str(i) + ":" + line, 'blue')
            modified_lines.append(line)

        elif line == "\n":
            # Empty line: dropped without flagging the file as invalid
            pass

        else:
            # Invalid row found, too few columns, ignore this row
            is_file_invalid = True
            if verbose: cprint(str(i) + ":" + line, 'yellow')

    # ------------------------ write the list of modified lines to a file
    new_file_name = file_name
    if save and is_file_invalid:
        if overwrite:
            bck_file = Path(file_name + ".bck.csv")
            if not bck_file.is_file():
                copyfile(file_name, file_name + ".bck.csv")
            else:
                cprint(f"backup file {bck_file} already exists", 'red')
        else:
            new_file_name = file_name + ".fixed.csv"

        with open(new_file_name, "w") as file:
            file.writelines(modified_lines)

        # Extra backups, written only the first time so later runs never
        # replace the earliest snapshot.
        extra_backups = (
            (".original.csv", original_lines),
            (".modified.csv", modified_lines),
        )
        for suffix, lines in extra_backups:
            if not Path(file_name + suffix).is_file():
                with open(file_name + suffix, "w") as file:
                    file.writelines(lines)

    return new_file_name

# Dry run on a single exp file: save = False, so nothing is written, and
# verbose = True prints every line of the file.
_tmp_file_name = 'E:/git/experiments/knn/10-2-2022/m3/i1d2-norm/test//fold1/ArticularyWordRecognition/ArticularyWordRecognition-test-d.test.exp.csv'

fix_exp_file(
    _tmp_file_name,
    save=False,
    overwrite=False,
    verbose=True,
)


[34m0:testKey,timestamp,dataset,trainSize,testSize,classes,dimensions,length,classifier,trainAccuracy,testAccuracy,name,dependency,groupBy,measure,trainMeasure,trainTime,testTime,seed,normalize,noDims,useDependentDims,dimensionsToUse,paramID,iterationKey,bestFile,expFile
[0m
[34m1:ab491fcd,2022-02-16-223353,ArticularyWordRecognition,275,300,25,9,144,ddtwf_d_d,0.3454545454545454,0.36333333333333334,ddtwf,d,d,"DDTW[,w=144,DTW[,w=144,wp=-1.0,dep=true,|d|=9,lpI=1.0,lpD=2.0,adj=true,],]","DDTW[,w=144,DTW[,w=144,wp=-1.0,dep=true,|d|=9,lpI=1.0,lpD=2.0,adj=true,],]",1.564130853,5.009464767,6463564,false,9,true,"{0,1,2,3,4,5,6,7,8}",100,ac02370e,out/rev1/test/fold1/ArticularyWordRecognition/ArticularyWordRecognition-train-d-fold1.best.exp.csv,ArticularyWordRecognition-ddtwf-d-p_0_99.train.exp.csv
[0m
[34m2:0cfb5fa3,2022-02-16-223353,ArticularyWordRecognition,275,300,25,9,144,ddtwr_d_d,0.3527272727272727,0.36,ddtwr,d,d,"DDTW[,w=15,DTW[,w=15,wp=0.1,dep=true,|d|=9,lpI=1.0,lpD=2.0,adj=true,],]

'E:/git/experiments/knn/10-2-2022/m3/i1d2-norm/test//fold1/ArticularyWordRecognition/ArticularyWordRecognition-test-d.test.exp.csv'

## Check folders for exp file issues

In [162]:

def verify_test_csv_files(input_dir, dataset, fold = 0, group = 'i', rethrow = True, save = False, overwrite = False):
    """Repair one test exp file and check its keys against the count file.

    The exp file ``{input_dir}/fold{fold}/{dataset}/{dataset}-test-{group}.test.exp.csv``
    is first passed through ``fix_exp_file``. Its ``testKey`` values are
    then compared, as sets, with the ``testKey`` values of the matching
    ``.test.count.csv`` file. If they differ, every exp key is cut down to
    its last 8 characters and the comparison is repeated.

    Args:
        input_dir: Root directory containing the ``fold<N>`` folders.
        dataset: Dataset name (folder name and file-name prefix).
        fold: Fold number.
        group: Group tag used in the file name (e.g. 'i', 'd', 'id', 'b').
        rethrow: Re-raise errors when True; otherwise print them in red.
            Parser errors are always re-raised.
        save: Write repaired content to disk.
        overwrite: Allow the original exp file to be replaced. When False,
            key fixes are written to ``<exp file>.fixed.csv`` instead.

    Raises:
        pandas.errors.ParserError: If a CSV file cannot be parsed.
        Exception: If the key sets still differ after the key fix (only
            when ``rethrow`` is True), or any error from the steps above.
    """

    from pandas.errors import ParserError
    import traceback

    # Keys are hexadecimal-looking strings; read them as text so values such
    # as "12345678" or "1234e567" are not turned into numbers.
    key_dtype = {'testKey': str}

    try:
        file_name = f"{input_dir}/fold{fold}/{dataset}/{dataset}-test-{group}.test.exp.csv"
        print(f"Verifying Exp File: {file_name}")
        new_file = fix_exp_file(file_name, save = save, overwrite = overwrite)

        df_exp = pd.read_csv(new_file, dtype = key_dtype)

        # Collect the distinct keys of the count file
        count_file_name = f"{input_dir}/fold{fold}/{dataset}/{dataset}-test-{group}.test.count.csv"
        df_count = pd.read_csv(count_file_name, dtype = key_dtype)
        count_keys = list(df_count['testKey'].unique())

        # Fix key length issue: keep only the last 8 characters of each key
        # (a key that is already 8 characters long is unchanged).
        if set(df_exp['testKey']) != set(count_keys):
            df_exp['testKey'] = df_exp['testKey'].str[-8:]
            if save:
                # fix_exp_file returns the original path when the file
                # needed no repair; do not replace it unless allowed.
                if not overwrite and new_file == file_name:
                    new_file = file_name + ".fixed.csv"
                df_exp.to_csv(new_file, index=False)
                df_exp = pd.read_csv(new_file, dtype = key_dtype)

        # Fail if the keys in the exp file do not match the keys in the count file
        if set(df_exp['testKey']) != set(count_keys):
            cprint(list(df_exp['testKey']), 'yellow')
            cprint(count_keys, 'red')
            raise Exception("Mismatch between keys in the count file and exp file")

    except ParserError as e:
        cprint(e, 'red')
        cprint(traceback.format_exc(), 'blue')
        raise
    except Exception as e:
        if rethrow:
            raise
        else:
            print(colored(e, 'red'))


# Locations of the train / test outputs for the selected configuration.
config = "i1d2-norm"
prod_indir = f"E:/git/experiments/knn/10-2-2022/m3/{config}/train/"
prod_outdir = f"E:/git/experiments/knn/10-2-2022/m3/{config}/test/"

# Datasets found on disk for fold 0; replaced right below by an explicit list.
# (To restrict a run, use e.g. datasets = ['ArticularyWordRecognition'].)
datasets = get_datasets_in_path(prod_indir + "/fold0/")
datasets = [
    'ArticularyWordRecognition',
    'AtrialFibrillation',
    'BasicMotions',
    'Cricket',
    'DuckDuckGeese',
    'Epilepsy',
    'ERing',
    'EthanolConcentration',
    'FingerMovements',
    'HandMovementDirection',
    'Handwriting',
    'Heartbeat',
    'Libras',
    'LSST',
    'NATOPS',
    'PEMS-SF',
    'PenDigits',
    'PhonemeSpectra',
    'RacketSports',
    'SelfRegulationSCP1',
    'SelfRegulationSCP2',
    'StandWalkJump',
    'UWaveGestureLibrary',
]

print(",".join(datasets))

# Folds to process; use list(range(0, 10)) to cover all of them.
folds = [9]

# Repair and verify every (fold, dataset, group) exp file in place.
for f in folds:
    for d in datasets:
        for g in ("i", "d", "id", "b"):
            verify_test_csv_files(
                prod_outdir,
                d,
                fold=f,
                group=g,
                rethrow=True,
                save=True,
                overwrite=True,
            )

ArticularyWordRecognition,AtrialFibrillation,BasicMotions,Cricket,DuckDuckGeese,Epilepsy,ERing,EthanolConcentration,FingerMovements,HandMovementDirection,Handwriting,Heartbeat,Libras,LSST,NATOPS,PEMS-SF,PenDigits,PhonemeSpectra,RacketSports,SelfRegulationSCP1,SelfRegulationSCP2,StandWalkJump,UWaveGestureLibrary
Verifying Exp File: E:/git/experiments/knn/10-2-2022/m3/i1d2-norm/test//fold9/ArticularyWordRecognition/ArticularyWordRecognition-test-i.test.exp.csv
Verifying Exp File: E:/git/experiments/knn/10-2-2022/m3/i1d2-norm/test//fold9/ArticularyWordRecognition/ArticularyWordRecognition-test-d.test.exp.csv
Verifying Exp File: E:/git/experiments/knn/10-2-2022/m3/i1d2-norm/test//fold9/ArticularyWordRecognition/ArticularyWordRecognition-test-id.test.exp.csv
Verifying Exp File: E:/git/experiments/knn/10-2-2022/m3/i1d2-norm/test//fold9/ArticularyWordRecognition/ArticularyWordRecognition-test-b.test.exp.csv
Verifying Exp File: E:/git/experiments/knn/10-2-2022/m3/i1d2-norm/test//fold9/AtrialFi

## Verify a count file against its exp file

In [None]:

def verify_count_file(input_dir, dataset, test_train  = "test" , fold = 0, group = 'i', rethrow = True, save = False, overwrite = False):
    """Check that an exp file and its count file contain the same test keys.

    Reads ``{dataset}-{test_train}-{group}.test.exp.csv`` and
    ``{dataset}-{test_train}-{group}.test.count.csv`` from
    ``{input_dir}/fold{fold}/{dataset}/`` and compares their ``testKey``
    values as sets. Nothing is written to disk.

    Args:
        input_dir: Root directory containing the ``fold<N>`` folders.
        dataset: Dataset name (folder name and file-name prefix).
        test_train: Tag placed between the dataset name and the group in
            the file names.
        fold: Fold number.
        group: Group tag used in the file name.
        rethrow: Re-raise errors when True; otherwise print them in red.
        save: Accepted but not used by this function.
        overwrite: Accepted but not used by this function.

    Raises:
        Exception: If the key sets differ or another error occurs, and
            ``rethrow`` is True. CSV parser errors are only printed,
            regardless of ``rethrow``.
    """

    from pandas.errors import ParserError
    import traceback

    # Keys are hexadecimal-looking strings; read them as text so values such
    # as "12345678" or "1234e567" are not turned into numbers.
    key_dtype = {'testKey': str}

    try:
        file_name = f"{input_dir}/fold{fold}/{dataset}/{dataset}-{test_train}-{group}.test.exp.csv"
        print(f"Verifying Exp File: {file_name}")

        df_exp = pd.read_csv(file_name, dtype = key_dtype)
        exp_keys = df_exp['testKey']

        # check if all keys are in the count file
        count_file_name = f"{input_dir}/fold{fold}/{dataset}/{dataset}-{test_train}-{group}.test.count.csv"
        df_count = pd.read_csv(count_file_name, dtype = key_dtype)
        count_keys = list(df_count['testKey'].unique())

        if set(exp_keys) != set(count_keys):
            cprint(list(exp_keys), 'yellow')
            cprint(count_keys, 'red')
            raise Exception(f"Mismatch between keys in the count file and exp file \n{count_file_name}")

    except ParserError as e:
        # Parser errors are reported but not propagated, so a batch run
        # continues with the next file.
        cprint(e, 'red')
        cprint(traceback.format_exc(), 'yellow')
    except Exception as e:
        if rethrow:
            raise
        else:
            print(colored(e, 'red'))


# Locations of the train / test outputs for the selected configuration.
config = "i1d2-norm"
prod_indir = f"E:/git/experiments/knn/10-2-2022/m3/{config}/train/"
prod_outdir = f"E:/git/experiments/knn/10-2-2022/m3/{config}/test/"

# Every dataset that has a folder under fold 0 of the train output.
# (To restrict a run, use e.g. datasets = ['Cricket'] or datasets[0:5].)
datasets = get_datasets_in_path(prod_indir + "/fold0/")

print(",".join(datasets))

# Folds to process; use list(range(0, 9)) for a wider run.
folds = [0]

# Compare exp and count keys for every (fold, dataset, group) combination.
for f in folds:
    for d in datasets:
        for g in ("i", "d", "id", "b"):
            verify_count_file(
                prod_outdir,
                d,
                fold=f,
                group=g,
                rethrow=True,
                save=True,
                overwrite=True,
            )