# Listing all the unique IDs

Run the notebook to execute it. Use Restart to run it again.

You'll be asked to select an Excel file that contains the mapping and configuration. Use the same file that was used for Remapping Datasets.

In [None]:
# Run to execute, use Restart to run again
import json
import os
import pandas as pd
from datetime import datetime
import generic_functions2 as gf

In [None]:
# Helper functions that make it possible to work with more than one conversion.xlsx

class StopExecution(Exception):
    """
    Exception raised to end the notebook run, e.g. when the user chooses to quit.

    ``_render_traceback_`` is overridden so that it returns nothing.
    """

    def _render_traceback_(self):
        # NOTE(review): presumably the notebook front end consults this hook
        # when it renders an error, so returning nothing keeps the stop
        # quiet — confirm against the IPython version in use.
        return None


def get_all_items(folder, type="folders"):
    """
    List the entries found directly inside ``folder``, reverse-sorted by name.

    With the default ``type="folders"`` only subfolders are returned.
    With ``type="files"`` only files are returned.
    Any other value of ``type`` gives an empty list.
    """
    entries = os.listdir(folder)

    # Select the os.path test that matches the requested kind of entry.
    if type == "folders":
        keep = os.path.isdir
    elif type == "files":
        keep = os.path.isfile
    else:
        return []

    selected = [name for name in entries if keep(folder + "/" + name)]
    selected.sort(reverse=True)
    return selected


def title_wrapper(func):
    """
    Decorate ``func`` so that its output is framed like a title.

    The returned wrapper prints a line of 90 ``*`` characters, calls ``func``
    with the arguments it received, prints a line of 90 ``-`` characters and
    finally returns whatever ``func`` returned.

    Used by print_title
    """
    # Imported locally so the decorator does not rely on the import cell.
    from functools import wraps

    @wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        print("*" * 90)
        result = func(*args, **kwargs)
        print("-" * 90)
        return result

    return wrapper


@title_wrapper
def print_title(text):
    """
    Print ``text`` as a title.

    The ``title_wrapper`` decorator supplies the lines printed before and
    after the text.
    """
    print(text)


def choose_retry(text):
    """
    Prompt the user for a choice and return exactly what was typed.

    ``text`` describes the valid range and is embedded in the prompt, which
    also offers Q(uit).
    """
    prompt = "Please choose Q(uit) or between " + text + ": "
    answer = input(prompt)
    return answer


def choose_dir_item(folder, type="folders", what="All"):
    """
    Let the user pick a subfolder or file from ``folder`` by number.

    The candidates come from ``get_all_items(folder, type)``. When ``what``
    is not ``"All"``, only names that contain ``what`` and do not start with
    ``~`` are offered. The candidates are printed as a numbered list, three
    per row, and the user is prompted until a valid number or ``q`` is
    entered.

    Returns the chosen name (without the folder prefix).

    Raises StopExecution when the user enters ``q``/``Q`` or when there is
    nothing to choose from.
    """
    all_items = get_all_items(folder, type)
    print_title("Choose source by number")

    if what != "All":
        all_items = [
            item
            for item in all_items
            if what in item and not item.startswith("~")
        ]

    # Without candidates the prompt could never be answered, so stop here
    # with an explanation instead of failing on an undefined counter.
    if not all_items:
        print_title(f"No matching {type} found in {folder}")
        raise StopExecution

    for number, item in enumerate(all_items, start=1):
        # Pad single-digit numbers so the names line up.
        padding = " " if number < 10 else ""
        print(f"[{number}]{padding} {item}", end="\t")
        if number % 3 == 0:
            print("")
    print("\n")

    total = len(all_items)
    while True:
        this_answer = choose_retry(str(1) + " and " + str(total))
        if this_answer.strip().lower() == "q":
            print_title("Quiting the notebook run on instruction of the user")
            raise StopExecution
        try:
            choice = int(this_answer)
        except ValueError:
            # Not a number and not "q": ask again.
            continue
        if 0 < choice <= total:
            return all_items[choice - 1]

In [None]:
# Ask the user which .xlsx file in the current directory holds the
# configuration and mapping.
folder = "./"
print("\n\n")
print_title("Select the Excel containing all the configuration & mapping")
conversion_excel = choose_dir_item(folder, type="files", what="xlsx")

In [None]:
# Load the "Settings" sheet of the selected workbook, indexed by its "Item" column.
sheet_name = "Settings"
settings_table = pd.read_excel(conversion_excel, sheet_name=sheet_name, index_col="Item")

# read settings: each value is taken from the "Variable" column of its row
source_dir = settings_table.loc["source_dir", "Variable"]
converted_dir = settings_table.loc["converted_dir", "Variable"]
source_file = settings_table.loc["source_filename", "Variable"]
source_separator = settings_table.loc["source_separator", "Variable"]
converted_file = settings_table.loc["converted_filename", "Variable"]
converted_separator = settings_table.loc["converted_separator", "Variable"]
id_variable = settings_table.loc["id_variable", "Variable"]

# read data
# Every column is read as text. The configured source separator is used when
# it is a non-empty string; otherwise the pandas default (comma) applies, so
# a blank setting behaves as before.
df = pd.read_csv(
    source_dir + "/" + source_file,
    sep=source_separator if isinstance(source_separator, str) and source_separator else ",",
    index_col=None,
    low_memory=False,
    dtype="string",
)

In [None]:
# Collect the distinct values of the configured ID column into a one-column table.
df_unique_ids = pd.DataFrame(
    data={"UniqueIDs": df[id_variable].unique()},
)

In [None]:
stamp = f"{datetime.now():%Y%m%d-%H%M%S}"

# not always there will be ids, so the export stays best-effort: a failure is
# reported to the user instead of being silently discarded.
output_written = False
try:
    file_name = source_dir + f"/{stamp}_" + converted_file + "_UNIQUE_IDS"
    df_unique_ids.to_csv(file_name + ".csv", sep=converted_separator, index=False)
    output_written = True
except Exception as error:  # top-level cell boundary: report, do not crash
    print_title(f"Could not write the unique IDs file: {error!r}")

input("Enter to continue....")
print("\n" * 2)
# Only a CSV is written here, and only claim success when it was written.
if output_written:
    print_title(
        f"All done, the output (CSV) can be found timestamped in: {source_dir}"
    )
else:
    print_title("Finished without writing an output file.")