# Automating File Movement Based on ID Matching

In [1]:
import os
import shutil
import pandas as pd

# 1 - Defining the File-Moving Function

The code cell below defines a function named `move_files_based_on_ids`, which moves files from a source directory to a target directory when their names contain one of the IDs listed in a TSV file. Each step and parameter is explained in the function's docstring and inline comments.

## Parameters:

**ids_file_path:** The path to the TSV file containing the IDs of interest. This file must have a column named `HudAlphaID`.

**source_dir:** The directory where the files currently reside.

**target_dir:** The destination directory where the files should be moved.

## Process:

- The function starts by reading the TSV file using pandas to get the unique HudAlphaID values.

- It then iterates over these IDs, and for each ID, it checks every file in the source directory to see if the file name contains the ID.

- If a match is found, the function moves the file from the source directory to the target directory using shutil.move.
  
- It keeps a count of the moved files and prints the names of the files as they are moved.

- Finally, it prints a summary of the total number of files moved.

In [5]:
def move_files_based_on_ids(ids_file_path, source_dir, target_dir):
    """
    Move files from source directory to target directory based on IDs listed in a TSV file.

    An entry of source_dir is moved when its name contains one of the IDs as a
    substring. Matching is done ID by ID, in the order the IDs first appear in
    the TSV file.

    Parameters:
    - ids_file_path: Path to the TSV file containing IDs of interest. It must
      have a column named "HudAlphaID"; missing or blank values in that column
      are ignored.
      This file is produced in the notebook Patient_selection.ipynb. This notebook is stored in the folder "S2_PatientSelection"

    - source_dir: Directory where files are located.
    - target_dir: Directory to move the files to. It is created if it does not exist.

    Returns:
    - None. The name of every moved file and a final summary are printed.
    """
    # 1 - Read IDs of interest from the TSV file using Pandas.
    # dtype=str keeps all-digit IDs as text: the substring test below only
    # works with strings, and a numeric column would raise a TypeError.
    ids_df = pd.read_csv(ids_file_path, sep='\t', usecols=['HudAlphaID'], dtype=str)

    # 2 - Clean the "HudAlphaID" series: drop missing values, strip surrounding
    # whitespace and keep each ID once (first occurrence wins, order preserved).
    id_series = ids_df['HudAlphaID'].dropna().str.strip()
    # An empty string is a substring of every file name, so it must never be
    # used as an ID (it would move the whole directory).
    ids_of_interest = [
        id_of_interest
        for id_of_interest in id_series.drop_duplicates().tolist()
        if id_of_interest
    ]

    # 3 - List the source directory once instead of once per ID.
    # Listing happens before the target directory is created so that a missing
    # source directory fails without leaving an empty target behind.
    remaining_files = os.listdir(source_dir)
    os.makedirs(target_dir, exist_ok=True)

    # 4 - Initialize a counter for the moved files
    moved_files_count = 0

    # 5 - Iterate over IDs of interest. For each ID, every entry that is still
    # in the source directory is checked; matching entries are moved and the
    # others are kept for the next ID.
    for id_of_interest in ids_of_interest:
        not_moved = []
        for file_name in remaining_files:
            # If the current ID is found in the file name
            if id_of_interest in file_name:
                source_path = os.path.join(source_dir, file_name)
                target_path = os.path.join(target_dir, file_name)
                # Move the file
                shutil.move(source_path, target_path)
                print(f'Moved: {file_name}')
                moved_files_count += 1
            else:
                not_moved.append(file_name)
        remaining_files = not_moved

    # Print summary
    print(f"Done moving files. Total files moved: {moved_files_count}.")

# 2 - Running the function

In [4]:
# Example usage: point the three variables below at your own data before running.
# Data4Docker is the directory that will be mounted to Docker.
target_dir = '/Volumes/LaCie/PPMI_RNASEQDATA/Data4Docker/PD_RNAseq_CountData'
source_dir = '/Volumes/LaCie/PPMI_RNASEQDATA/IR3/counts'
ids_file_path = '/Volumes/LaCie/PPMI_RNASEQDATA/Data4Docker/output-hud_alpha_ids/unique_hud_alpha_ids.tsv'  # TSV file with the IDs of interest

# Move every entry of source_dir whose name contains one of the IDs into target_dir.
move_files_based_on_ids(
    ids_file_path=ids_file_path,
    source_dir=source_dir,
    target_dir=target_dir,
)

Moved: PPMI-Phase2-IR3.40242.BL.0003451967.5104-SL-2821.featureCounts.GencodeV29.txt
Moved: PPMI-Phase2-IR3.40010.BL.0003451958.5104-SL-2823.featureCounts.GencodeV29.txt
Moved: PPMI-Phase2-IR3.3658.V02.PP0018-1171.5104-SL-2826.featureCounts.GencodeV29.txt
Moved: PPMI-Phase2-IR3.92490.V02.0000377639.5104-SL-2827.featureCounts.GencodeV29.txt
Moved: PPMI-Phase2-IR3.40898.V06.0001238258.5104-SL-2829.featureCounts.GencodeV29.txt
Moved: PPMI-Phase2-IR3.58138.BL.0003255318.5104-SL-2830.featureCounts.GencodeV29.txt
Moved: PPMI-Phase2-IR3.41141.V06.0003285588.5104-SL-2831.featureCounts.GencodeV29.txt
Moved: PPMI-Phase2-IR3.50159.V06.0003031490.5104-SL-2833.featureCounts.GencodeV29.txt
Moved: PPMI-Phase2-IR3.40898.BL.PP0041-7466.5104-SL-2837.featureCounts.GencodeV29.txt
Moved: PPMI-Phase2-IR3.50901.V02.0000373958.5104-SL-2841.featureCounts.GencodeV29.txt
Moved: PPMI-Phase2-IR3.40922R.V06.0001241184.5104-SL-2843.featureCounts.GencodeV29.txt
Moved: PPMI-Phase2-IR3.41442.V06.0003109390.5104-SL-2844