## This notebook performs checks on the data.

In [None]:
import pandas as pd
import os
import glob
import hashlib
import sys

sys.path.insert(0,'..')
import local_paths

def get_folder_size(path):
    """Return the combined size in bytes of every file below *path*.

    Sub-folders are descended into, so the result covers the whole tree
    rooted at *path*. Entries that are neither files nor directories
    contribute nothing.
    """
    total_bytes = 0
    # Explicit work-list of directories still to be scanned.
    pending_dirs = [path]
    while pending_dirs:
        current_dir = pending_dirs.pop()
        for item in os.scandir(current_dir):
            if item.is_file():
                total_bytes += os.path.getsize(item)
            elif item.is_dir():
                pending_dirs.append(item)
    return total_bytes

def files_check(PATH, expected_count, expected_size):
    """Print a report when a folder's file count or size is not as expected.

    The count covers only the files located directly in PATH, while the size
    is the total over PATH and all of its sub-folders (see get_folder_size).
    Nothing is printed for a value that matches, and nothing is returned.

    Args:
        PATH: folder to inspect.
        expected_count: number of files expected directly inside PATH.
        expected_size: expected total size of PATH in bytes.
    """
    # The first triple produced by os.walk describes PATH itself;
    # its third element lists the files directly inside it.
    top_level = next(os.walk(PATH))
    n_files = len(top_level[2])
    total_bytes = get_folder_size(PATH)

    count_matches = n_files == expected_count
    if not count_matches:
        print(PATH, ": incorrect number of files.")
        print("n.o. files: ", n_files)
        print("n.o. files expected:", expected_count)

    size_matches = total_bytes == expected_size
    if not size_matches:
        print(PATH, ": incorrect size.")
        print("size is: ", total_bytes)
        print("expected size:", expected_size)
        
## Created by Candace Makeda Moore:
def create_hash_df(folder, file_extension):
    """Build a table of SHA-256 hashes for the files in a folder.

    Only files located directly in *folder* whose name ends with
    *file_extension* are hashed; sub-folders are not searched.

    Args:
        folder: directory containing the files to hash.
        file_extension: file name suffix to select, e.g. '.cnt' or '.bdf'.

    Returns:
        A pandas DataFrame with the columns 'file_name' (base name of the
        file) and 'hash' (hex-encoded SHA-256 digest), one row per file,
        ordered by file path. When no file matches, an empty DataFrame with
        the same two columns is returned.
    """
    # Read in 64 KiB chunks so a large file is never held in memory at once.
    BUF_SIZE = 65536

    # glob returns paths in an unspecified order; sorting keeps the resulting
    # table reproducible between runs and machines.
    files = sorted(glob.glob(os.path.join(folder, '*' + file_extension)))

    file_names = []
    hash_list = []
    for file in files:
        sha256 = hashlib.sha256()
        with open(file, 'rb') as f:
            # iter() with a sentinel stops as soon as read() returns b'' (EOF).
            for chunk in iter(lambda: f.read(BUF_SIZE), b''):
                sha256.update(chunk)
        file_names.append(os.path.basename(file))
        hash_list.append(sha256.hexdigest())

    # Building the frame from named columns also works for zero files, where
    # assigning a column name to an empty, column-less frame would raise.
    return pd.DataFrame({'file_name': file_names, 'hash': hash_list})

#### Create hashes

In [None]:
## DDP
# Write one hash table per entry of local_paths.DDP_dict
# (presumably age group -> folder of .cnt recordings; confirm in local_paths).
for age_group, path in local_paths.DDP_dict.items():
    df = create_hash_df(path, '.cnt')
    savepath = os.path.join(local_paths.hashes, 'DDP_' + str(age_group) + '.csv')
    df.to_csv(savepath, index = False)
    print(f"Hash file saved to {savepath}")

## ePod
# Same procedure for the .bdf files of the ePod dataset folder.
df = create_hash_df(local_paths.ePod_dataset, '.bdf')
savepath = os.path.join(local_paths.hashes, 'ePod.csv')
# index = False matches the DDP files above, so no unnamed index column is written.
df.to_csv(savepath, index = False)

#### Compare hash for each file

In [None]:
# Base names of the hash tables to verify. For each name, '<name>.csv' is
# compared against '<name>_local.csv' in the local_paths.hashes folder.
folders = ["ePod", "DDP_5", "DDP_11", "DDP_17", "DDP_23",
            "DDP_29", "DDP_35", "DDP_41", "DDP_47"]

for folder in folders:

    df_cloud = pd.read_csv(os.path.join(local_paths.hashes, folder + '.csv'))
    df_local = pd.read_csv(os.path.join(local_paths.hashes, folder + '_local.csv'))

    # Left merge: every file listed in '<name>.csv' is kept. A file missing
    # from the local table gets a NaN hash_y and is reported as a mismatch.
    # Files that appear only in the local table are not reported.
    df_merged = pd.merge(df_cloud, df_local, how = 'left', on = ['file_name'])
    df_merged = df_merged[['file_name', 'hash_x', 'hash_y']]
    df_merged.to_csv(os.path.join(local_paths.hashes, folder + '_merged.csv'), index = False)

    # Collect all mismatching files for this folder at once, so the summary
    # below reflects every row rather than only the last one inspected.
    mismatched = df_merged.loc[df_merged['hash_x'] != df_merged['hash_y'], 'file_name']
    num_errors = len(mismatched)
    for file_name in mismatched:
        print(file_name)
    if num_errors == 0:
        print(f"All files from {folder} have the correct hash.")

#### Find number of files in folder

In [None]:
import glob
import os
import local_paths
# Number of .bdf files located directly in the ePod dataset folder (the glob
# pattern does not descend into sub-folders). Being the last expression of the
# notebook cell, the count is the cell's displayed result.
len(glob.glob(os.path.join(local_paths.ePod_dataset, '*.bdf')))


#### Get size of folder and subfolders

In [None]:
# Folder to measure. The commented-out lines are alternative locations;
# enable exactly one assignment to `path`.
# path = os.path.join(os.path.expanduser('~'), "eegyolk")
# path = "/volume-ceph"
# path = "/eegyolk"
path = local_paths.DDP_epochs
# path = os.path.expanduser('~')

# get_folder_size returns bytes; dividing by 1024**3 converts to binary gigabytes.
size_GB = get_folder_size(path)/1024**3
print(f"The path: {path} \ncontains {round(size_GB, 2)} GB of data")

In [None]:
# Number of entries (files and sub-folders alike) directly inside the
# DDP_processed folder.
len(os.listdir(local_paths.DDP_processed))