# Get XLS Files BIFF Version with XLRD

In [15]:
import os
import xlrd
from collections import Counter

# Directory containing .xls files
data_dir = "../data/train_big"

# Dictionary to store counts of BIFF versions
biff_versions = Counter()

# Counter for files with no BIFF version information
no_version_files = 0

# List to log files causing warnings
warning_files = []

# Dictionary to track filenames for each non-BIFF8 version
non_biff8_files = {}

# Loop through each .xls file in the directory
for filename in os.listdir(data_dir):
    if filename.endswith(".xls"):  # Process only .xls files
        file_path = os.path.join(data_dir, filename)
        try:
            # Open the workbook using xlrd
            workbook = xlrd.open_workbook(file_path, on_demand=True)
            
            # Check for BIFF version
            version = workbook.biff_version
            
            # Increment the counter for this BIFF version
            biff_versions[version] += 1
            
            # Log filenames for non-BIFF8 versions
            if version != 80:  # BIFF8 corresponds to 80
                if version not in non_biff8_files:
                    non_biff8_files[version] = []
                non_biff8_files[version].append(filename)
        except Exception as e:
            # Log the file that caused the exception
            no_version_files += 1
            warning_files.append((filename, str(e)))  # Save file name and exception
        finally:
            # Close the workbook if opened
            if 'workbook' in locals():
                workbook.release_resources()

# Print metrics
print("BIFF Version Counts:")
for version, count in biff_versions.items():
    print(f"  BIFF{version // 10}: {count} files")  # Format BIFF version

print(f"\nFiles with no BIFF version information: {no_version_files}")

if warning_files:
    print("\nFiles causing warnings:")
    for file, warning in warning_files:
        print(f"  {file}: {warning}")

if non_biff8_files:
    print("\nFiles with Non-BIFF8 Formats:")
    for version, files in non_biff8_files.items():
        print(f"\n  BIFF{version // 10}:")
        for file in files:
            print(f"    {file}")


*** Setting on_demand to False.
*** Setting on_demand to False.
*** Setting on_demand to False.
BIFF Version Counts:
  BIFF8: 392 files
  BIFF7: 11 files
  BIFF4: 3 files
  BIFF5: 3 files

Files with no BIFF version information: 0

Files with Non-BIFF8 Formats:

  BIFF7:
    mark_taylor_000_1_1.pst.91.xls
    darrell_schoolcraft_000_1_1_1.pst.533.xls
    sara_shackleton_000_1_2.pst.239.xls
    lindy_donoho_000_1_1_1.pst.110.xls
    sara_shackleton_000_1_1_1.pst.53.xls
    theresa_staab_000_1_1.pst.104.xls
    sara_shackleton_000_1_1_1.pst.54.xls
    shelley_corman_000_1_1.pst.55.xls
    richard_sanders_001_1_1_1.pst.11.xls
    david_delainey_000_1_1_1.pst.20.xls
    mark_taylor_000_1_1.pst.174.xls

  BIFF4:
    shelley_corman_000_1_1.pst.28.xls
    dutch_quigley_000_1_1.pst.213.xls
    benjamin_rogers_000_1_1.pst.72.xls

  BIFF5:
    kevin_hyatt_000_1_1.pst.62.xls
    benjamin_rogers_000_1_1.pst.71.xls
    dutch_quigley_000_1_1.pst.1.xls


## Count Total Files in a Directory

In [6]:
import os

# Directory path
data_dir = "../data/enron_clean"

# Count the total number of files
total_files = len([file for file in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, file))])

# Print the total count
print(f"Total number of files in '{data_dir}': {total_files}")


Total number of files in '../data/enron_clean': 945


## List Files in a Directory with Recursive Search

In [45]:
import os

# Define the relative path to the target directory
target_directory = '../data2/'

# Initialize a dictionary to store subdirectory names and file counts
subdirectory_file_counts = {}

# Walk through the target directory and its subdirectories
for root, dirs, files in os.walk(target_directory):
    # Skip the top-level directory itself (if needed)
    if root == target_directory:
        continue
    
    # Get the name of the current subdirectory
    subdirectory_name = os.path.basename(root)
    
    # Count the number of files in the current subdirectory
    file_count = len(files)
    
    # Store the subdirectory name and file count in the dictionary
    subdirectory_file_counts[subdirectory_name] = file_count

# Sort the dictionary by subdirectory name in ascending order
sorted_file_counts = sorted(subdirectory_file_counts.items())

# Display the sorted results
for subdirectory, count in sorted_file_counts:
    print(f"{subdirectory}: {count} files")


.ipynb_checkpoints: 1 files
csv: 2 files
enron: 1253 files
enron (1): 624 files
enron_clean: 945 files
infer_small: 4 files
massive: 3 files
one_bold_small: 10 files
test: 2 files
test_big: 283 files
test_big (1): 183 files
test_big (2): 100 files
test_medium: 173 files
test_medium (1): 50 files
test_micro: 23 files
test_micro (1): 6 files
test_small: 92 files
test_small (1): 25 files
test_teeny: 3 files
test_tiny: 12 files
train2: 871 files
train_big: 799 files
train_double: 1247 files
train_medium: 1196 files
train_medium (1): 400 files
train_micro: 159 files
train_micro (1): 50 files
train_small: 634 files
train_small (1): 200 files
train_teeny: 25 files
train_tiny: 100 files
val_big: 312 files
val_big (1): 212 files
val_big (2): 100 files
val_medium: 187 files
val_medium (1): 50 files
val_micro: 22 files
val_micro (1): 6 files
val_small: 93 files
val_small (1): 25 files
val_teeny: 3 files
val_tiny: 12 files


# Clean Directories for Duplicates

big (800/100/100) > medium (400/50/50) > small (200/25/25) > tiny (100/12/12) > micro (50/6/6) > teeny (25/3/3)



In [15]:
import os

# Define the relative path to the target directory
target_directory = '../data2/val_teeny'

# Walk through the target directory and its subdirectories
for root, dirs, files in os.walk(target_directory):
    # Iterate through all files in the current directory
    for file_name in files:
        # Check if the file ends with '(1)'
        if file_name.endswith('(1).xlsx'):
            # Generate the corresponding non-duplicate file name
            base_name_with_ext = file_name.replace('(1)', '').strip()
            
            # Normalize file names for comparison
            base_name_normalized = os.path.normcase(base_name_with_ext)
            
            # Check if the non-duplicate file exists in the same directory
            if any(os.path.normcase(f) == base_name_normalized for f in files):
                # Generate the full path of the duplicate file
                duplicate_file_path = os.path.join(root, file_name)
                
                # Delete the duplicate file
                os.remove(duplicate_file_path)
                print(f"Deleted duplicate: {duplicate_file_path}")
            else:
                print(f"Original file not found for: {file_name}")

# Initialize a counter for the total number of files
file_count = 0

# List all remaining files for debugging purposes
print("\nRemaining files in the directory:")
for root, dirs, files in os.walk(target_directory):
    for file_name in files:
        # Print each file's full path
        file_path = os.path.join(root, file_name)
        print(file_path)
        # Increment the file counter
        file_count += 1

# Print the total number of files
print(f"\nTotal number of files in '{target_directory}': {file_count}")


Deleted duplicate: ../data2/val_teeny/Table1-d_far(1).xlsx
Original file not found for: Results_2520Sheet_2520Test_25203_2520WRR(1).xlsx
Original file not found for: 40_2520Lakes_2520Central_2520UnmixO_2520SciHub_2520-_2520Published(1).xlsx

Remaining files in the directory:
../data2/val_teeny/40%2520Lakes%2520Central%2520UnmixO%2520SciHub%2520-%2520Published.xlsx
../data2/val_teeny/Results%2520Sheet%2520Test%25203%2520WRR.xlsx
../data2/val_teeny/Results_2520Sheet_2520Test_25203_2520WRR(1).xlsx
../data2/val_teeny/40_2520Lakes_2520Central_2520UnmixO_2520SciHub_2520-_2520Published(1).xlsx
../data2/val_teeny/Table1-d_far.xlsx

Total number of files in '../data2/val_teeny': 5



List count of files

big (800/100/100) > medium (400/50/50) > small (200/25/25) > tiny (100/12/12) > micro (50/6/6) > teeny (25/3/3)

In [64]:
import os

# Define the directories to count files in
directories = [
    '../data2/enron',
    '../data2/enron_clean',
]

# Iterate through the directories
for directory in directories:
    # Initialize a counter for the total number of files
    file_count = 0
    
    # Check if the directory exists
    if os.path.exists(directory):
        # Walk through the directory and its subdirectories
        for root, dirs, files in os.walk(directory):
            file_count += len(files)  # Count the files in the current directory
    else:
        print(f"Directory '{directory}' does not exist.")
        continue
    
    # Print the total number of files for this directory
    print(f"Total number of files in '{directory}': {file_count}")


Total number of files in '../data2/enron': 624
Total number of files in '../data2/enron_clean': 478


In [60]:
import os

# Define the directories to count files in
directories = [
    '../data2/val_big',
    '../data2/test_big',
    '../data2/train_big'
]

# Iterate through the directories
for directory in directories:
    # Initialize a counter for the total number of files
    file_count = 0
    
    # Check if the directory exists
    if os.path.exists(directory):
        # Walk through the directory and its subdirectories
        for root, dirs, files in os.walk(directory):
            file_count += len(files)  # Count the files in the current directory
    else:
        print(f"Directory '{directory}' does not exist.")
        continue
    
    # Print the total number of files for this directory
    print(f"Total number of files in '{directory}': {file_count}")


Total number of files in '../data2/val_big': 100
Total number of files in '../data2/test_big': 100
Total number of files in '../data2/train_big': 799


In [63]:
import os

# Define the directory containing the duplicates
target_directory = '../data2/enron_clean'

# Walk through the target directory
for root, dirs, files in os.walk(target_directory):
    for file_name in files:
        # Check if the file ends with '(1).xls'
        if file_name.endswith('(1).xls'):
            # Generate the corresponding non-duplicate file name
            original_file_name = file_name.replace('(1)', '').strip()
            
            # Check if the non-duplicate version exists in the same directory
            if original_file_name in files:
                # Generate the full path of the duplicate file
                duplicate_file_path = os.path.join(root, file_name)
                
                # Delete the duplicate file
                os.remove(duplicate_file_path)
                print(f"Deleted duplicate: {duplicate_file_path}")
            else:
                print(f"No original found for: {file_name}")


Deleted duplicate: ../data2/enron_clean/cara_semperger_000_1_1.pst.40(1).xls
Deleted duplicate: ../data2/enron_clean/scott_neal_000_1_1.pst.778(1).xls
Deleted duplicate: ../data2/enron_clean/darrell_schoolcraft_000_1_1_1.pst.523(1).xls
Deleted duplicate: ../data2/enron_clean/scott_neal_000_1_1.pst.705(1).xls
Deleted duplicate: ../data2/enron_clean/teb_lokey_000_1_1.pst.18(1).xls
Deleted duplicate: ../data2/enron_clean/darrell_schoolcraft_000_1_1_1.pst.211(1).xls
Deleted duplicate: ../data2/enron_clean/scott_neal_000_1_1.pst.766(1).xls
Deleted duplicate: ../data2/enron_clean/tracy_geaccone_000_1_1.pst.269(1).xls
Deleted duplicate: ../data2/enron_clean/james_steffes_000_1_1.pst.165(1).xls
Deleted duplicate: ../data2/enron_clean/richard_ring_000_1_1.pst.163(1).xls
Deleted duplicate: ../data2/enron_clean/scott_neal_000_1_1.pst.802(1).xls
Deleted duplicate: ../data2/enron_clean/darrell_schoolcraft_000_1_1_1.pst.431(1).xls
Deleted duplicate: ../data2/enron_clean/chris_stokley_000_1_1.pst.173