# CSV Randomizer

This notebook shuffles the data rows of a CSV file. The header row stays in place, and the result is written to a new file so the original file is left untouched.

**Features:**
- Reads CSV file and preserves header
- Randomly shuffles data rows
- Outputs to new file with 'random_' prefix
- Maintains original file integrity
- Progress reporting

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import os
from datetime import datetime
import random

## Configuration

Set your input file path and configuration options here.

In [2]:
# Configuration
INPUT_FILE = 'Dataset/top20_entries_across_journals.csv'
SEED = None  # Integer for a reproducible shuffle; None for a different order on every run

# Seed both the stdlib and NumPy generators, but only when a seed was chosen
if SEED is None:
    print("Using random seed (results will vary each run)")
else:
    random.seed(SEED)
    np.random.seed(SEED)
    print(f"Random seed set to: {SEED}")

Using random seed (results will vary each run)


## Randomization Function

Main function that handles the CSV randomization process.

In [3]:
def randomize_csv(input_file, seed=None):
    """
    Shuffle the data rows of a CSV file and write them to a new file.

    The output is written next to the input, with a ``random_`` prefix on
    the file name. The input file is only read. A file that already exists
    under the output name is overwritten.

    Every field is read as a plain string with NA detection disabled, so
    field values round-trip unchanged. With pandas' default type inference
    an integer column containing blanks would be rewritten as floats
    (``123`` -> ``123.0``), and literal text such as ``NA`` or ``null``
    would be written back as an empty field.

    Args:
        input_file (str): Path to the input CSV file
        seed (int, optional): Random seed for reproducible results

    Returns:
        str: Path to the output file, or None if any step failed (the
        error is printed rather than raised).
    """
    try:
        # Check if input file exists
        if not os.path.exists(input_file):
            raise FileNotFoundError(f"Input file not found: {input_file}")

        print(f"Reading CSV file: {input_file}")

        # Read everything as text; the rows are only reordered, never
        # interpreted, so type inference could only alter the data.
        df = pd.read_csv(input_file, dtype=str, keep_default_na=False)
        original_rows = len(df)

        print(f"Original file contains {original_rows} rows (excluding header)")

        # sample(frac=1) returns a new, fully shuffled frame, so no
        # explicit copy of the input frame is needed beforehand.
        df_shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)

        # Generate output filename: same directory, 'random_' prefix.
        # os.path.join with an empty directory yields the bare file name.
        directory, filename = os.path.split(input_file)
        output_file = os.path.join(directory, f"random_{filename}")

        # Save the shuffled data
        df_shuffled.to_csv(output_file, index=False)

        print(f"Randomized file saved as: {output_file}")
        print(f"Successfully shuffled {original_rows} rows")

        # Verify the output by re-reading it with the same parsing options
        df_verify = pd.read_csv(output_file, dtype=str, keep_default_na=False)
        if len(df_verify) == original_rows:
            print("✓ Verification passed: Row count matches original")
        else:
            print("⚠ Warning: Row count mismatch!")

        return output_file

    except Exception as e:
        # Best-effort notebook helper: report the problem and signal
        # failure through the None return value that callers test for.
        print(f"Error occurred: {str(e)}")
        return None

## Execute Randomization

Run the randomization process on the specified CSV file.

In [4]:
# Execute the randomization: banner, timestamp, then the shuffle itself
print("=" * 60, "CSV RANDOMIZER", "=" * 60, sep="\n")
print(f"Timestamp: {datetime.now():%Y-%m-%d %H:%M:%S}")
print()

output_file = randomize_csv(INPUT_FILE, SEED)

# randomize_csv returns None on failure, so a falsy result means it failed
print()
if not output_file:
    print("=" * 60, "RANDOMIZATION FAILED", "=" * 60, sep="\n")
else:
    print("=" * 60, "RANDOMIZATION COMPLETE", "=" * 60, sep="\n")
    print(f"Input file:  {INPUT_FILE}")
    print(f"Output file: {output_file}")
    print()
    print("Original file has been preserved.")

CSV RANDOMIZER
Timestamp: 2025-11-10 15:12:17

Reading CSV file: Dataset/top20_entries_across_journals.csv
Original file contains 104056 rows (excluding header)
Randomized file saved as: Dataset/random_top20_entries_across_journals.csv
Successfully shuffled 104056 rows
✓ Verification passed: Row count matches original

RANDOMIZATION COMPLETE
Input file:  Dataset/top20_entries_across_journals.csv
Output file: Dataset/random_top20_entries_across_journals.csv

Original file has been preserved.


## Verification and Statistics

Optional verification and comparison between original and randomized files.

In [5]:
# Verification function
def compare_files(original_file, randomized_file):
    """
    Report whether two CSV files hold the same rows, ignoring row order.

    Prints the row counts, whether the column lists match, the result of
    comparing both files after sorting by every column, and the first
    three rows of each file. Any exception is caught and printed.

    Args:
        original_file (str): Path to original CSV file
        randomized_file (str): Path to randomized CSV file
    """
    def _in_canonical_order(frame):
        # Sort on every column so row order no longer matters
        ordered = frame.sort_values(by=frame.columns.tolist())
        return ordered.reset_index(drop=True)

    try:
        source_df = pd.read_csv(original_file)
        shuffled_df = pd.read_csv(randomized_file)

        print("FILE COMPARISON")
        print("-" * 40)
        print(f"Original rows:    {len(source_df)}")
        print(f"Randomized rows:  {len(shuffled_df)}")
        print(f"Columns match:    {list(source_df.columns) == list(shuffled_df.columns)}")

        # Same rows in a different order compare equal once both are sorted
        data_preserved = _in_canonical_order(source_df).equals(
            _in_canonical_order(shuffled_df)
        )
        print(f"Data preserved:   {data_preserved}")

        verdict = (
            "✓ All data successfully preserved and randomized"
            if data_preserved
            else "⚠ Warning: Data may have been lost or corrupted"
        )
        print(verdict)

        # Short preview of each file for a visual check
        for label, frame in (("original", source_df), ("randomized", shuffled_df)):
            print(f"\nFirst 3 rows of {label} file:")
            print(frame.head(3).to_string())

    except Exception as e:
        print(f"Comparison failed: {str(e)}")

# Compare only when randomization returned a path that still exists on disk
if not output_file or not os.path.exists(output_file):
    print("Cannot perform comparison - randomization was not successful")
else:
    print()
    compare_files(INPUT_FILE, output_file)


FILE COMPARISON
----------------------------------------
Original rows:    104056
Randomized rows:  104056
Columns match:    True
Data preserved:   True
✓ All data successfully preserved and randomized

First 3 rows of original file:
                             DOI        PMID        PMCID    Year  Citation_Count                                                                                                                                                                                           Title                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

## Utilities

Additional utility functions for file management.

In [None]:
# Utility functions
def list_random_files(directory="Dataset"):
    """Print every entry in *directory* whose name starts with 'random_'.

    Each entry is shown with its size in bytes and last-modified time.
    A missing directory or an empty result is reported with a message;
    any exception is caught and printed.
    """
    try:
        if not os.path.exists(directory):
            print(f"Directory not found: {directory}")
            return

        matches = [entry for entry in os.listdir(directory) if entry.startswith('random_')]

        if not matches:
            print(f"No random files found in {directory}")
            return

        print(f"Random files found in {directory}:")
        for position, entry in enumerate(matches, start=1):
            # One stat call supplies both the size and the modification time
            info = os.stat(os.path.join(directory, entry))
            stamp = datetime.fromtimestamp(info.st_mtime).strftime('%Y-%m-%d %H:%M:%S')
            print(f"  {position}. {entry} ({info.st_size} bytes, modified: {stamp})")

    except Exception as e:
        print(f"Error listing files: {str(e)}")

def clean_random_files(directory="Dataset", confirm=True):
    """Remove all regular files with 'random_' prefix (use with caution!).

    Args:
        directory (str): Directory to clean.
        confirm (bool): When True, ask for a typed 'yes' on stdin before
            deleting anything.

    Returns:
        None. Progress, the number of files deleted, and any errors are
        printed rather than returned or raised.
    """
    try:
        if not os.path.exists(directory):
            print(f"Directory not found: {directory}")
            return

        # Restrict to regular files: os.remove() cannot delete a directory,
        # so a 'random_*' sub-directory must not be offered for deletion.
        random_files = sorted(
            f for f in os.listdir(directory)
            if f.startswith('random_') and os.path.isfile(os.path.join(directory, f))
        )

        if not random_files:
            print(f"No random files found in {directory}")
            return

        print(f"Found {len(random_files)} random files:")
        for file in random_files:
            print(f"  - {file}")

        if confirm:
            response = input("\nAre you sure you want to delete these files? (yes/no): ")
            # Tolerate stray whitespace around the typed answer
            if response.strip().lower() != 'yes':
                print("Operation cancelled.")
                return

        deleted_count = 0
        for file in random_files:
            file_path = os.path.join(directory, file)
            try:
                os.remove(file_path)
            except OSError as e:
                # One undeletable file must not stop the remaining deletions
                print(f"Could not delete {file}: {e}")
                continue
            deleted_count += 1
            print(f"Deleted: {file}")

        print(f"\nSuccessfully deleted {deleted_count} files.")

    except Exception as e:
        print(f"Error cleaning files: {str(e)}")

# Uncomment to list existing random files
# list_random_files()

# Uncomment to clean up random files (BE CAREFUL!)
# clean_random_files()