In [None]:
def csvGet(file):
    """Read a comma-separated source twice: once with inferred types, once as floats.

    Both reads skip the first line of ``file`` and take the field names from
    the line that follows (``skip_header=1`` together with ``names=True``).

    Parameters
    ----------
    file
        Passed unchanged as the first argument of ``np.genfromtxt``. It is
        read twice, so it has to be something that can be read again
        (presumably a file path -- confirm against the caller).

    Returns
    -------
    tuple
        ``(str_data, flt_data)``: the read with ``dtype=None`` and
        ``encoding=None`` (per-column type inference), followed by the read
        with ``dtype=float``.
    """
    # Options common to both passes, so the two reads cannot drift apart.
    shared_options = {"delimiter": ",", "skip_header": 1, "names": True}

    inferred_table = np.genfromtxt(file, dtype=None, encoding=None, **shared_options)
    float_table = np.genfromtxt(file, dtype=float, **shared_options)

    return inferred_table, float_table

In [None]:
def sanityCheck1(str_data, column_title, row_number):
    """Print one cell of ``str_data`` so a loaded table can be eyeballed.

    Parameters
    ----------
    str_data
        Container indexed first by ``column_title`` and then by ``row_number``.
    column_title : str
        Column key; it is concatenated to a string, so it must be a ``str``.
    row_number
        Row index inside the selected column.

    Returns
    -------
    None
    """
    print("Column: " + column_title)
    print(f"Row: {row_number!s}")

    # Look the cell up only after the two header lines have been printed.
    cell = str_data[column_title][row_number]
    print(f"Cell value: {cell!s}")

In [None]:
def retainRows(flt_data, str_data, column, category):
    """Keep only the rows whose ``column`` value in ``str_data`` equals ``category``.

    Parameters
    ----------
    flt_data, str_data
        Two row-aligned tables; the same row selection is applied to both.
    column
        Key of the ``str_data`` column to compare, or ``None``.
    category
        Value a row must have in ``column`` to be kept, or ``None``.

    Returns
    -------
    tuple
        ``(filtered_flt_data, filtered_str_data)``. When both ``column`` and
        ``category`` are ``None`` no filtering happens and copies of the two
        inputs are returned.
    """
    filtering_disabled = column is None and category is None
    if filtering_disabled:
        return flt_data.copy(), str_data.copy()

    # One selection mask, applied to both tables so they stay row-aligned.
    keep = str_data[column] == category
    return flt_data[keep], str_data[keep]

In [None]:
def sanityCheck2(prev_data, filtered_data):
    """Print the row counts before and after filtering, and their difference.

    Parameters
    ----------
    prev_data, filtered_data
        Objects exposing ``.shape``; ``shape[0]`` is reported as the number
        of rows.

    Returns
    -------
    None
    """
    rows_before = prev_data.shape[0]
    print(f"Previous number of rows: {rows_before!s}")

    rows_after = filtered_data.shape[0]
    print(f"Updated number of rows: {rows_after!s}")

    print(f"{rows_before - rows_after!s} rows removed.")

In [None]:
def getColumns(filtered_flt_data, filtered_str_data, y_label, stimulus_label, subjectID_label):
    """Extract the target, stimulus and subject-ID columns plus their unique values.

    Parameters
    ----------
    filtered_flt_data
        Table from which the ``y_label`` column is taken.
    filtered_str_data
        Table from which the stimulus and subject-ID columns are taken.
    y_label, stimulus_label, subjectID_label
        Column keys.

    Returns
    -------
    tuple
        ``(y_col, stimulus_col, subID_col, object_list, subID_list)`` where
        the last two are ``np.unique`` of the stimulus and subject-ID columns.
    """
    target = filtered_flt_data[y_label]
    stimuli = filtered_str_data[stimulus_label]
    subjects = filtered_str_data[subjectID_label]

    # The unique lists are derived from the columns already pulled out above.
    return target, stimuli, subjects, np.unique(stimuli), np.unique(subjects)

In [None]:
def getFeatures(filtered_flt_data, feature_list):
    """Build a plain array from the selected columns of ``filtered_flt_data``.

    Parameters
    ----------
    filtered_flt_data
        Table indexed by ``feature_list``; the selection must offer
        ``.tolist()``.
    feature_list
        Column key(s) to select.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the selection's ``tolist()`` output.
    """
    selected = filtered_flt_data[feature_list]

    # Going through Python lists turns the per-row records into ordinary rows.
    rows = selected.tolist()

    return np.array(rows)

In [None]:
def binarify(y_col, subID_col, median):
    """Turn ``y_col`` into 0/1 labels by thresholding at a median.

    Parameters
    ----------
    y_col
        Values to binarise.
    subID_col
        Subject ID of every row of ``y_col``; only used when
        ``median == 'subject'``.
    median : {'subject', 'overall'}
        ``'subject'`` delegates to ``binarify_subject_median`` and
        ``'overall'`` delegates to ``binarify_overall_median``.

    Returns
    -------
    The binary vector produced by the selected helper.

    Raises
    ------
    ValueError
        If ``median`` is neither ``'subject'`` nor ``'overall'``.
    """
    if median == 'overall':
        return binarify_overall_median(y_col)

    if median == 'subject':
        # np.unique orders the first-occurrence indices by sorted ID value,
        # not by row position. The helper slices y_col between consecutive
        # indices, so they must be handed over in ascending order.
        _, first_rows = np.unique(subID_col, return_index=True)
        return binarify_subject_median(y_col, np.sort(first_rows))

    # Previously this fell through to an undefined name and raised NameError.
    raise ValueError(
        f"median must be 'subject' or 'overall', got {median!r}"
    )

In [None]:
def binarify_subject_median(y_col, subject_indices):
    """Binarise ``y_col`` segment by segment, each against its own median.

    ``subject_indices`` holds the row at which each segment starts. A segment
    runs from its start index up to the next start index, and the last one
    runs to the end of ``y_col``. Within a segment a value becomes 1 when it
    is strictly greater than the segment median, otherwise 0.

    Parameters
    ----------
    y_col
        One-dimensional values to binarise.
        NOTE(review): assumes every subject's rows are contiguous -- confirm.
    subject_indices
        Start row of each segment. Sorted internally, so any order is
        accepted.

    Returns
    -------
    numpy.ndarray
        Integer 0/1 array made of the binarised segments in row order; an
        empty integer array when ``subject_indices`` is empty.
    """
    values = np.asarray(y_col)

    # Slice boundaries only make sense in ascending order; unsorted starts
    # would produce empty or overlapping segments.
    starts = np.sort(np.asarray(subject_indices, dtype=int))
    if starts.size == 0:
        return np.array([], dtype=int)

    # Each segment ends where the next begins; the last ends with the data.
    ends = list(starts[1:]) + [len(values)]

    pieces = []
    for start, end in zip(starts, ends):
        segment = values[start:end]
        pieces.append((segment > np.median(segment)).astype(int))

    # A single concatenate keeps the integer dtype (a float accumulator would
    # upcast the labels) and avoids re-copying the output on every iteration.
    return np.concatenate(pieces)

In [None]:
def binarify_overall_median(y_col):
    """Binarise ``y_col`` against the median of the whole vector.

    Parameters
    ----------
    y_col
        Values to binarise.

    Returns
    -------
    numpy.ndarray
        Integer array holding 1 where the value is strictly greater than the
        overall median and 0 elsewhere.
    """
    threshold = np.median(y_col)
    above_threshold = np.greater(y_col, threshold)

    return above_threshold.astype(int)

In [None]:
def dataCheck(binary_y_col):
    """Print the percentage of 1s and of 0s in a binary label vector.

    Parameters
    ----------
    binary_y_col
        Vector of 0/1 labels; its mean is taken as the share of 1s.

    Returns
    -------
    None
    """
    share_of_ones = float(np.mean(binary_y_col))

    # Round instead of truncating: 0.29 * 100 evaluates to 28.999999999999996
    # in floating point, which int() alone would report as 28%.
    per_ones = int(round(share_of_ones * 100))
    per_zeros = 100 - per_ones

    print(f"The y vector has {per_ones}% 1s, and {per_zeros}% zeros.")

    return None

In [None]:
def replaceMissingValues(X):
    """Fill the NaN entries of ``X`` using a mean-strategy ``SimpleImputer``.

    Parameters
    ----------
    X
        Feature matrix handed to ``SimpleImputer.fit_transform``.

    Returns
    -------
    The output of ``fit_transform`` for an imputer configured with
    ``missing_values=np.nan`` and ``strategy='mean'``.
    """
    mean_imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

    return mean_imputer.fit_transform(X)

In [None]:
def scale(replaced_X):
    """Fit a ``StandardScaler`` on ``replaced_X`` and return the transformed data.

    Parameters
    ----------
    replaced_X
        Feature matrix handed to ``StandardScaler.fit_transform``.

    Returns
    -------
    The output of ``fit_transform`` for a default-configured scaler.
    """
    return StandardScaler().fit_transform(replaced_X)

In [None]:
def sanityCheck3(scaled_X, outlier_fraction):
    """Print how many rows the requested outlier fraction corresponds to.

    Parameters
    ----------
    scaled_X
        Object exposing ``.shape``; ``shape[0]`` is used as the row count.
    outlier_fraction
        Fraction of rows to treat as outliers.

    Returns
    -------
    None
    """
    percent = outlier_fraction*100

    # The row count is truncated to a whole number by int().
    n_outliers = int(scaled_X.shape[0]*outlier_fraction)

    print(f"{percent} percent of the data is around {n_outliers} outliers.")

In [None]:
def displayOutliers(scaled_X, outlier_fraction):
    """Project the data with ``getPCA``, flag outliers with ``getMapping`` and plot them.

    Parameters
    ----------
    scaled_X
        Feature matrix passed to ``getPCA``.
    outlier_fraction
        Passed to ``getMapping`` together with the PCA output.

    Returns
    -------
    The third value returned by ``getMapping`` (the outlier row indices).
    """
    # Project, then split the projection into flagged and kept points.
    flagged_points, kept_points, flagged_rows = getMapping(
        getPCA(scaled_X), outlier_fraction
    )

    # Plot both groups.
    displayMapping(flagged_points, kept_points)

    return flagged_rows

In [None]:
def getPCA(scaled_X):
    """Reduce ``scaled_X`` to two components with ``PCA``.

    Parameters
    ----------
    scaled_X
        Feature matrix handed to ``PCA.fit_transform``.

    Returns
    -------
    The output of ``fit_transform`` for a ``PCA`` built with
    ``n_components=2``.
    """
    two_component_pca = PCA(n_components=2)

    return two_component_pca.fit_transform(scaled_X)

In [None]:
def getMapping(pca_components, outlier_fraction):
    """Split ``pca_components`` into outlier and non-outlier rows.

    An ``EllipticEnvelope`` built with ``contamination=outlier_fraction`` is
    fitted on ``pca_components``; rows it predicts as ``-1`` are treated as
    outliers.

    Parameters
    ----------
    pca_components
        Array of points, one per row.
    outlier_fraction
        Passed to ``EllipticEnvelope`` as ``contamination``.

    Returns
    -------
    tuple
        ``(pca_outliers, pca_no_outliers, anom_index)``: the flagged rows,
        the remaining rows, and the indices of the flagged rows.
    """
    # fit the envelope and label every row in one step
    labels = EllipticEnvelope(contamination=outlier_fraction).fit_predict(pca_components)

    # rows labelled -1 are the outliers
    is_outlier = labels == -1
    anom_index = np.nonzero(is_outlier)[0]

    # flagged rows on one side, every other row on the other
    flagged = pca_components[anom_index]
    remaining = pca_components[~is_outlier]

    return flagged, remaining, anom_index

In [None]:
def displayMapping(pca_outliers, pca_no_outliers):
    """Scatter-plot the non-outliers and, in red, the outliers on one 10x10 figure.

    Parameters
    ----------
    pca_outliers, pca_no_outliers
        Two-dimensional arrays; column 0 is plotted on x and column 1 on y.

    Returns
    -------
    None
    """
    # split both point sets into coordinates before any figure is created
    kept_x, kept_y = pca_no_outliers[:, 0], pca_no_outliers[:, 1]
    flagged_x, flagged_y = pca_outliers[:, 0], pca_outliers[:, 1]

    figure = plt.figure(figsize=(10, 10))
    axes = figure.add_subplot(111)

    # non-outliers are drawn first with the default colour, outliers after in red
    axes.scatter(kept_x, kept_y)
    axes.scatter(flagged_x, flagged_y, color='r')

In [None]:
def removeOutliers(outliers_indices, scaled_X, binary_y_col, y_col, stimulus_col, subID_col):
    """Delete the rows listed in ``outliers_indices`` from every given array.

    Parameters
    ----------
    outliers_indices
        Row indices to delete (along axis 0).
    scaled_X, binary_y_col, y_col, stimulus_col, subID_col
        Arrays from which those rows are removed.

    Returns
    -------
    tuple
        ``(processed_X, processed_bi_y, processed_y, processed_stim_col,
        processed_subID_col)``, in the same order as the inputs.
    """
    def drop_outlier_rows(values):
        # np.delete returns a new array; the input is left untouched.
        return np.delete(values, outliers_indices, 0)

    return (
        drop_outlier_rows(scaled_X),
        drop_outlier_rows(binary_y_col),
        drop_outlier_rows(y_col),
        drop_outlier_rows(stimulus_col),
        drop_outlier_rows(subID_col),
    )