In [1]:
import os
import numpy as np
import pandas as pd
import glob

In [2]:
def collect_data(regex_mask: str,
                 save_to: str,
                 new_name: str,
                 new_columns: list = None,
                 different_shapes: bool = False,
                 index_label: str = None):
    """Merge every ``.npy`` file matching a glob pattern into one CSV file.

    Parameters
    ----------
    regex_mask : str
        Pattern handed to ``glob.glob`` to select the input files. Despite the
        name this is a shell-style glob, not a regular expression.
    save_to : str
        Output directory; created if it does not exist yet.
    new_name : str
        Name of the output file without the ``.csv`` extension.
    new_columns : list, optional
        Column names assigned to the merged frame. When omitted (or empty) the
        default integer column labels are kept.
    different_shapes : bool, default False
        If True, every array is wrapped in its own DataFrame and the frames
        are merged with ``pd.concat``; each file then keeps its own 0-based
        row index in the output. If False, the raw arrays are stacked with
        ``np.concatenate`` and the rows get one continuous index.
    index_label : str, optional
        Header written for the index column of the CSV.

    Returns
    -------
    None
        The result is written to ``<save_to>/<new_name>.csv`` and a one-line
        summary is printed.

    Raises
    ------
    ValueError
        If no file matches ``regex_mask``.
    """
    # glob.glob yields matches in arbitrary, filesystem-dependent order. Sort
    # them so the row order of the CSV is reproducible; separate calls over the
    # same folders (e.g. features and their labels) depend on a stable order
    # to stay row-aligned.
    files = sorted(glob.glob(regex_mask))
    if not files:
        # Fail with the offending pattern instead of the opaque
        # "need at least one array to concatenate" raised further down.
        raise ValueError(f"No files match the pattern {regex_mask!r}")

    arrays = [np.load(f) for f in files]

    if different_shapes:
        # Per-file frames: pd.concat keeps each frame's own row index.
        data = pd.concat([pd.DataFrame(arr) for arr in arrays], axis=0)
    else:
        # Stack first, wrap once: rows are numbered continuously.
        data = pd.DataFrame(np.concatenate(arrays, axis=0))

    if new_columns:
        data.columns = new_columns

    os.makedirs(save_to, exist_ok=True)
    csv_path = os.path.join(save_to, new_name + '.csv')
    data.to_csv(csv_path, index_label=index_label)
    print(f"Collected shape = {data.shape}, saved at {csv_path}")

In [3]:
# Header names for the eight columns of the merged label CSVs.
labels_cols = [
    'Aggregated',
    '1', '2', '3', '4', '5', '6', '7',
]

# Merge the *.labels.npy files of every sub-folder of ./dataset into one CSV.
# different_shapes=True makes collect_data concatenate per-file DataFrames, so
# the 'Position' index column restarts at 0 for each input file.
collect_data(
    regex_mask='./dataset/*/*.labels.npy',
    save_to='./dataset/new',
    new_name='labels',
    new_columns=labels_cols,
    different_shapes=True,
    index_label='Position',
)

Collected shape = (120000, 8), saved at ./dataset/new/labels.csv


In [5]:
# Names of the sub-folders of ./dataset that are merged one by one later on.
folders = [
    "comcuc",
    "cowpig1",
    "eucdov",
    "eueowl1",
    "grswoo",
    "tawowl1",
]

# The description file holds one feature name per line; the names are used as
# column headers for the feature CSVs.
feature_names = []
with open("./description/feature_names.txt", "r") as file:
    feature_names.extend(file.read().splitlines())
print(f"{len(feature_names)} features are read")

548 features are read


In [6]:
# SLOW CELL: gathers all feature arrays into a single ~700 MB pandas DataFrame
# of shape (120_000, 548) and writes it to ./dataset/new/data.csv.
#
# NOTE: `[!labels]` is a glob character class, not a negated word. It matches
# any ONE character other than l, a, b, e or s, so `*.labels.npy` is skipped
# only because its stem ends in "s". A feature file whose stem ends in one of
# those five letters would be skipped as well -- check the row count.
collect_data(
    regex_mask='./dataset/*/*[!labels].npy',
    save_to='./dataset/new',
    new_name='data',
    new_columns=feature_names,
)

Collected shape = (120000, 548), saved at ./dataset/new/data.csv


In [7]:
# Write one features CSV and one labels CSV per sub-folder of ./dataset.
for folder in folders:
    # Features: every .npy whose stem does not end in l, a, b, e or s.
    # `[!labels]` is a single-character glob class, so this excludes
    # `*.labels.npy` only through the trailing "s" of "labels".
    collect_data(
        regex_mask='./dataset/' + folder + '/*[!labels].npy',
        save_to='./dataset/new',
        new_name=folder,
        new_columns=feature_names,
    )

    # Labels: merged as per-file DataFrames, so the 'Position' index column
    # restarts at 0 for each input file.
    collect_data(
        regex_mask='./dataset/' + folder + '/*.labels.npy',
        save_to='./dataset/new',
        new_name=folder + '_labels',
        new_columns=labels_cols,
        different_shapes=True,
        index_label='Position',
    )

Collected shape = (20000, 548), saved at ./dataset/new/comcuc.csv
Collected shape = (20000, 8), saved at ./dataset/new/comcuc_labels.csv
Collected shape = (20000, 548), saved at ./dataset/new/cowpig1.csv
Collected shape = (20000, 8), saved at ./dataset/new/cowpig1_labels.csv
Collected shape = (20000, 548), saved at ./dataset/new/eucdov.csv
Collected shape = (20000, 8), saved at ./dataset/new/eucdov_labels.csv
Collected shape = (20000, 548), saved at ./dataset/new/eueowl1.csv
Collected shape = (20000, 8), saved at ./dataset/new/eueowl1_labels.csv
Collected shape = (20000, 548), saved at ./dataset/new/grswoo.csv
Collected shape = (20000, 8), saved at ./dataset/new/grswoo_labels.csv
Collected shape = (20000, 548), saved at ./dataset/new/tawowl1.csv
Collected shape = (20000, 8), saved at ./dataset/new/tawowl1_labels.csv
