In [2]:
from collections import defaultdict, Counter
from graphviz import Digraph
import os

# Show which files sit in the working directory
# (os.listdir() called without an argument lists the current directory)
print(os.listdir())

['Wrangling.ipynb', '.ipynb_checkpoints', 'data_old_files.txt', 'data_files.txt', 'folder_structure', 'folder_structure.pdf']


First we will visualize the old data structure:

In [3]:
# Listing file that read_file_paths() loads below; each of its lines is treated as one file path.
file_paths_txt = "data_old_files.txt"  # Adjust as needed

def read_file_paths(file_path):
    """Return the lines of the text file at ``file_path`` as a list of strings.

    Leading and trailing whitespace (including the newline) is stripped from
    every line; blank lines are kept and come back as empty strings.
    """
    with open(file_path, 'r') as listing:
        return [raw_line.strip() for raw_line in listing]

# Load the whole listing into memory as a list of stripped path strings.
file_paths = read_file_paths(file_paths_txt)

In [4]:
from graphviz import Digraph
from collections import defaultdict

# Function to group file names by the folder that contains them
def create_folder_structure(file_paths):
    """Group file names by their containing folder.

    Parameters
    ----------
    file_paths : iterable of str
        '/'-separated file paths. Surrounding whitespace is stripped from
        each entry. Entries that are empty after stripping (e.g. blank lines
        in a listing file) are skipped, so they do not show up as a nameless
        file inside a nameless folder.

    Returns
    -------
    collections.defaultdict
        Maps each folder path (everything before the last '/'; the empty
        string for a path that has no folder part) to the list of file names
        found in it, in input order.
    """
    folder_dict = defaultdict(list)

    for file_path in file_paths:
        cleaned = file_path.strip()
        if not cleaned:
            # A blank entry is not a file; skip it instead of recording '' in folder ''.
            continue
        # rpartition splits on the LAST '/', giving (folder, separator, file name).
        folder_path, _, file_name = cleaned.rpartition('/')
        folder_dict[folder_path].append(file_name)

    return folder_dict

# Function to generate the folder structure graph
def create_graph(folder_dict):
    """Build a Graphviz digraph of the folder hierarchy with per-folder file counts.

    Parameters
    ----------
    folder_dict : mapping of str -> list
        Folder path ('/'-separated) mapped to the file names it contains,
        as produced by ``create_folder_structure``.

    Returns
    -------
    graphviz.Digraph
        One node per folder path prefix and one edge per parent -> child
        relation. Folders that are keys of ``folder_dict`` are labelled
        ``"<name> (<n> files)"``; purely intermediate folders are labelled
        with their name only.
    """
    dot = Digraph(comment='Folder Structure')
    dot.attr(rankdir='TB', ranksep='2', nodesep='0.5')  # Set to vertical orientation and adjust spacing

    # Pass 1: folder hierarchy. Many folders share the same ancestors, so each
    # parent -> child link is remembered and emitted only once; otherwise the
    # rendered graph would contain one parallel edge per descendant folder.
    seen_links = set()
    for folder in folder_dict:
        subfolders = folder.split('/')
        for i in range(1, len(subfolders)):
            parent = '/'.join(subfolders[:i])
            child = '/'.join(subfolders[:i+1])
            if (parent, child) in seen_links:
                continue
            seen_links.add((parent, child))
            dot.node(child, label=subfolders[i])
            dot.edge(parent, child)

    # Pass 2: file-count labels. They are emitted after every plain label so
    # that a folder which holds files AND has subfolders keeps its count
    # (a later node statement for the same name replaces the earlier label).
    for folder, files in folder_dict.items():
        folder_name = folder.split('/')[-1]
        file_count = len(files)
        dot.node(folder, label=f"{folder_name} ({file_count} files)")

    return dot

# Load the listing to visualise and render its folder tree as a PDF.
# The shared read_file_paths() helper is reused instead of repeating the
# open/strip logic inline. This replaces whatever `file_paths` held before.
# NOTE(review): the surrounding text announces the *old* data structure, but the
# listing loaded here is "data_files.txt" - confirm which listing is intended.
file_paths = read_file_paths("data_files.txt")

folder_dict = create_folder_structure(file_paths)
graph = create_graph(folder_dict)
graph.attr(dpi='300')  # Set DPI to 300 for high resolution
graph.render('folder_structure', format='pdf')  # Save as PDF

'folder_structure.pdf'

This worked okay, but the data set is quite large and its structure is complicated. It will likely be more practical to simply work out which files have already been transferred to the new data structure and which have not:

In [5]:
# Listing files for the old and the new data tree. Both paths are relative,
# so they are resolved against the notebook's working directory.
data_old_path = "data_old_files.txt"
data_path = "data_files.txt"

# Read file names from text files
def read_file_names(file_path):
    """Return the bare file name of every path listed in a text file.

    Parameters
    ----------
    file_path : str
        Text file with one '/'-separated path per line.

    Returns
    -------
    list of str
        The last path component of each non-blank line, in file order.
        Blank or whitespace-only lines are skipped so they do not turn into
        empty file names (which would otherwise be tallied as duplicates of
        each other by any later counting step).
    """
    file_names = []
    with open(file_path, 'r') as f:
        for line in f:
            path = line.strip()
            if not path:
                continue
            # Everything after the last '/' is the file name; a path without
            # any '/' is returned unchanged.
            file_names.append(path.rsplit('/', 1)[-1])

    return file_names

# Bare file names (directory part dropped) from each listing. These are the
# unfiltered lists; the .DS_Store filter is applied to them in a later cell.
data_old_files1 = read_file_names(data_old_path)
data_files1 = read_file_names(data_path)

In [6]:
# Worked example of how a file name is taken from a path:
# split on '/', keep only the last component, and join that single piece back into a string.
path_to_file = 'this/is/my/path/until/my/file'
'/'.join(path_to_file.strip().split('/')[-1:])

'file'

In [7]:
# Function to filter out .DS_Store files
def filter_out_ds_store(file_list):
    """Return the entries of ``file_list`` that are not macOS ``.DS_Store`` artefacts.

    An entry is dropped when it ends with ``.DS_Store`` or ``._.DS_Store``;
    the order of the remaining entries is preserved and the input list is
    left untouched.
    """
    ds_store_suffixes = ('.DS_Store', '._.DS_Store')
    kept = []
    for name in file_list:
        if name.endswith(ds_store_suffixes):
            continue
        kept.append(name)
    return kept

# Drop the .DS_Store entries from both listings
data_old_files = filter_out_ds_store(data_old_files1)
data_files = filter_out_ds_store(data_files1)

# Tally how often each file name occurs in the old listing
file_name_counts_old = Counter(data_old_files)

# Keep only the names that occur at least twice
duplicate_file_names_old = {
    file_name: count
    for file_name, count in file_name_counts_old.items()
    if count >= 2
}

# How many distinct names are repeated, and how many surplus copies exist
# (all occurrences of repeated names minus one "original" per name)
num_duplicate_file_names_old = len(duplicate_file_names_old)
total_duplicates_old = sum(duplicate_file_names_old.values()) - num_duplicate_file_names_old

print(f"Number of duplicate file names in data_old_files: {num_duplicate_file_names_old}")
print(f"Total number of duplicate occurrences in data_old_files: {total_duplicates_old}")

# Optional: list every repeated name with its number of occurrences
for file_name, count in duplicate_file_names_old.items():
    print(f"{file_name}: {count} occurrences")

Number of duplicate file names in data_old_files: 84
Total number of duplicate occurrences in data_old_files: 261
ensemble1.nc: 3 occurrences
ensemble10.nc: 3 occurrences
ensemble11.nc: 3 occurrences
ensemble12.nc: 3 occurrences
ensemble13.nc: 3 occurrences
ensemble2.nc: 3 occurrences
ensemble3.nc: 3 occurrences
ensemble4.nc: 3 occurrences
ensemble5.nc: 3 occurrences
ensemble6.nc: 3 occurrences
ensemble7.nc: 3 occurrences
ensemble8.nc: 3 occurrences
ensemble9.nc: 3 occurrences
out_0_ensemble1.nc: 5 occurrences
out_0_ensemble10.nc: 5 occurrences
out_0_ensemble11.nc: 5 occurrences
out_0_ensemble12.nc: 5 occurrences
out_0_ensemble13.nc: 5 occurrences
out_0_ensemble2.nc: 5 occurrences
out_0_ensemble3.nc: 5 occurrences
out_0_ensemble4.nc: 5 occurrences
out_0_ensemble5.nc: 5 occurrences
out_0_ensemble6.nc: 5 occurrences
out_0_ensemble7.nc: 5 occurrences
out_0_ensemble8.nc: 5 occurrences
out_0_ensemble9.nc: 5 occurrences
out_1_ensemble1.nc: 5 occurrences
out_1_ensemble10.nc: 5 occurrences
out

For now we will remove the duplicates from data_old_files. We can then easily automate the transfer of files with non-duplicate names and deal with the more difficult duplicates later.

In [9]:
# Collapse repeated names to one entry each: set() removes the repeats and
# list() turns the result back into a list (its order is arbitrary).
# NOTE(review): a name that occurs several times in data_old is still present
# here, once - it is not excluded. Confirm this is intended if ambiguous names
# are meant to be handled separately.
unique_data_old_files = list(set(data_old_files))

In [10]:
# Let's see which (of our non-duplicate) files have already been transferred and which have not
# Helper that makes the entries hashable so the two listings can be compared as sets
def convert_to_set_of_tuples(file_list):
    """Return a set holding ``tuple(entry)`` for every entry of ``file_list``.

    Each entry is converted with ``tuple()``: a list becomes a tuple of its
    items, while a string becomes a tuple of its individual characters.
    Repeated entries collapse into a single set member.
    """
    return {tuple(entry) for entry in file_list}

# Turn both name lists into sets (each name is stored as a tuple of its characters)
data_old_set = convert_to_set_of_tuples(unique_data_old_files)
data_set = convert_to_set_of_tuples(data_files)

# Names present in the old listing with no counterpart in the new one
files_only_in_old = data_old_set.difference(data_set)

# Names present in both the old and the new listing
files_in_both = data_old_set.intersection(data_set)

In [11]:
# Summarise the comparison between the old and the new listing.
num_transferred = len(files_in_both)
num_only_in_old = len(files_only_in_old)

print(str(num_transferred) + ' files have been transferred over from data_old to data.')
print(str(num_only_in_old) + ' files exist only in data_old.')
print('data_old contains ' + str(len(unique_data_old_files)) + ' total files (with unique names).')
# Sanity check: the two groups should add up to the total printed above.
# The sum is built from the computed counts rather than hard-coded numbers,
# so it stays correct when the listings change.
print(f"{num_transferred} + {num_only_in_old} = {num_transferred + num_only_in_old}")

6874 files have been transferred over from data_old to data.
13550 files exist only in data_old.
data_old contains 20424 total files (with unique names).
6874 + 13550 = 20424


Now we want to consider how to move all of these old files into data. To make sure we aren't missing any files, let's check that, among the file names appearing in both data_old and data, none occurs more than once in data (i.e., that we are not assuming 

In [21]:
# Check whether any file name shared by data and data_old occurs more than once in data
# Count how often each name occurs in the new listing
data_file_name_counts = Counter(data_files)

# files_in_both stores every name as a tuple of characters (that is what
# convert_to_set_of_tuples produces for strings), so it cannot be compared with
# the plain strings in data_files directly - that comparison is always empty.
# Join each entry back into a string first (a no-op if it already is one).
shared_file_names = {''.join(name) for name in files_in_both}

# Shared names that are ambiguous because data holds several files with that name
common_file_names = {name for name in shared_file_names if data_file_name_counts[name] > 1}

# Number of common file names
num_common_file_names = len(common_file_names)

print(f"Number of recurrent names in both data and data_old: {num_common_file_names}")


Number of recurrent names in both data and data_old: 0


In [17]:
This means that none of 

20424