# Setup

## Imports

In [7]:
# Standard-library helpers: importlib to hot-reload project modules while
# iterating on them, sys/os to extend the import path.
import importlib
import sys
import os

# Make the parent of the current working directory importable so that the
# `utils` and `classes` packages resolve. This depends on where the kernel was
# started, not on where the notebook file lives.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Import the project modules themselves (not their contents) so that they can
# be passed to importlib.reload below.
from utils import selfutil, parseutil
from classes import SpreadsheetDataLoader

# Reload each module so that edits made on disk since the kernel started are
# picked up. importlib.reload only accepts module objects, so at this point
# `SpreadsheetDataLoader` must refer to the `classes.SpreadsheetDataLoader`
# submodule rather than the class of the same name.
importlib.reload(parseutil)
importlib.reload(selfutil)
importlib.reload(SpreadsheetDataLoader)

# Pull the (freshly reloaded) helper functions into the notebook namespace
from utils.parseutil import process_xls, process_xlsx, process_csv, process_spreadsheet
from utils.selfutil import get_vocabulary

# Rebind the name `SpreadsheetDataLoader` from the module to the class defined
# inside it; from here on the name refers to the class.
from classes.SpreadsheetDataLoader import SpreadsheetDataLoader

## Directory and Vocab

Create the vocabulary object and set up the directories used in the rest of the notebook.

In [9]:
# Folder that holds the training spreadsheets
data_dir = '../data/train/'

# get_vocabulary hands back the vocabulary object together with the list of file paths
spreadsheet_vocab, file_paths = get_vocabulary(data_dir)

# Summary: vocabulary size on the first line, number of files on the second
print(
    f'\n\nVocabulary size: {len(spreadsheet_vocab._word2idx)}',
    f'Files Processed: {len(file_paths)}',
    sep='\n',
)

ValueError: train/ NOT FOUND

Now set up the dictionaries and helper functions used to test our tensors.

In [3]:
import itertools

def showtensor(y_tok):
    """Print the ``y_tok`` entry of a fixed set of spreadsheet cells, grouped by check.

    Args:
        y_tok: Object indexable as ``y_tok[row, col]``; each probed entry is
            formatted into the printed line.

    The probed cells are named in A1 notation. Their zero-based ``(row, col)``
    index is derived from that name: the row number minus one gives ``row`` and
    the position of the column letter in the alphabet gives ``col``.
    """
    # Cells to probe for every check, in the order they are printed
    probes = (
        ('Type Key', (
            'I1', 'H1', 'C11', 'D19', 'F24',
            'H14', 'H13', 'G16', 'G20', 'G21',
            'C7', 'C8', 'D2', 'G15', 'B22',
            'G18', 'H17', 'H18', 'A1', 'G14',
        )),
        ('Fill Key', ('A1', 'B2', 'C14', 'G14')),
        ('Align Key', ('A1', 'B1', 'C1', 'D1')),
        ('Block Merge', (
            'D15', 'D16', 'E16', 'F16', 'D17',
            'E17', 'F17', 'D18', 'E18', 'F18',
            'D19', 'E19', 'F19', 'D20', 'E20', 'F20',
        )),
        ('Horizontal Merge', ('G12', 'H12')),
        ('Vertical Merge', ('G13', 'G14')),
        ('Outside Merge', ('H13', 'H14', 'G15', 'H15')),
    )

    def to_index(cell):
        # All probed cells use a single column letter followed by a 1-based row number
        return int(cell[1:]) - 1, ord(cell[0]) - ord('A')

    for label, cells in probes:
        print(f"\n{label} Results:")
        for cell in cells:
            row, col = to_index(cell)
            print(f"Cell {cell} at ({row}, {col}): {y_tok[row, col]}")
		
def showdiff(y_tok, y_new, mode='all'):
    """Print a side-by-side view of two ``y`` tensors at a fixed set of cells.

    Args:
        y_tok: First tensor-like object, indexed as ``[row, col]`` and ``[row, col, k]``.
        y_new: Second tensor-like object, indexed the same way.
        mode: ``'all'`` prints the full ``[row, col]`` entry of both inputs on
            separate lines. Any other value prints one summary line per cell
            showing channels 2 and -1 of ``y_tok`` next to channels 2, 3 and
            -2, -1 of ``y_new``.
    """
    # Probed cells per check: A1-style name -> zero-based (row, col)
    probes = [
        ('Type Key', {
            'I1': (0, 8), 'H1': (0, 7), 'C11': (10, 2), 'D19': (18, 3), 'F24': (23, 5),
            'H14': (13, 7), 'H13': (12, 7), 'G16': (15, 6), 'G20': (19, 6), 'G21': (20, 6),
            'C7': (6, 2), 'C8': (7, 2), 'D2': (1, 3), 'G15': (14, 6), 'B22': (21, 1),
            'G18': (17, 6), 'H17': (16, 7), 'H18': (17, 7), 'A1': (0, 0), 'G14': (13, 6),
        }),
        ('Fill Key', {
            'A1': (0, 0), 'B2': (1, 1), 'C14': (13, 2), 'G14': (13, 6),
        }),
        ('Align Key', {
            'A1': (0, 0), 'B1': (0, 1), 'C1': (0, 2), 'D1': (0, 3),
        }),
        ('Block Merge', {
            'D15': (14, 3), 'D16': (15, 3), 'E16': (15, 4), 'F16': (15, 5),
            'D17': (16, 3), 'E17': (16, 4), 'F17': (16, 5), 'D18': (17, 3),
            'E18': (17, 4), 'F18': (17, 5), 'D19': (18, 3), 'E19': (18, 4),
            'F19': (18, 5), 'D20': (19, 3), 'E20': (19, 4), 'F20': (19, 5),
        }),
        ('Horizontal Merge', {'G12': (11, 6), 'H12': (11, 7)}),
        ('Vertical Merge', {'G13': (12, 6), 'G14': (13, 6)}),
        ('Outside Merge', {
            'H13': (12, 7), 'H14': (13, 7), 'G15': (14, 6), 'H15': (14, 7),
        }),
    ]

    full_dump = mode == 'all'

    for label, cell_map in probes:
        print(f"\n######## {label} Results ########\n")

        for cell in cell_map:
            row, col = cell_map[cell]

            if not full_dump:
                # Compact view: selected channels of both tensors on one line
                print(f"{cell} at ({row}, {col}):\t\t"
                      f"[{y_tok[row, col, 2]}],[{y_new[row, col, 2]},{y_new[row, col, 3]}]  |  "
                      f"[{y_tok[row, col, -1]}],[{y_new[row, col, -2]},{y_new[row, col, -1]}]")
                continue

            # Full view: cell header, then the complete entry of each tensor
            print(f"{cell} at ({row}, {col}):")
            print(f"\t{y_tok[row, col]}")
            print(f"\t{y_new[row, col]}")

def comp(x_tok, x_new, y_tok, y_new):
    """Print whether two tokenisations of the same sheet disagree.

    Every element of ``x_tok`` is compared with the matching element of
    ``x_new``, and channel 0 of every ``y_tok[row, col]`` is compared with
    channel 0 of ``y_new[row, col]``. The remaining ``y`` channels are not
    looked at, so the two ``y`` inputs may have a different number of channels.

    A layout mismatch (full shape for ``x``, the first two axes for ``y``) is
    reported as a discrepancy. Indexing only by the shape of ``x_tok``/``y_tok``
    would otherwise raise ``IndexError`` when the other input is smaller, or
    silently ignore the extra cells when it is larger.

    Args:
        x_tok, x_new: 3-D tensor-like objects exposing ``.shape`` and
            ``[row, col, seq]`` indexing.
        y_tok, y_new: 3-D tensor-like objects exposing ``.shape`` and
            ``[row, col, channel]`` indexing.

    Returns:
        None. Prints ``"Discrepancy exists"`` or ``"No type, value discrepancy"``.
    """
    same_layout = (
        tuple(x_new.shape) == tuple(x_tok.shape)
        and tuple(y_new.shape[:2]) == tuple(y_tok.shape[:2])
    )
    if not same_layout:
        print("Discrepancy exists")
        return

    x_cells = itertools.product(
        range(x_tok.shape[0]), range(x_tok.shape[1]), range(x_tok.shape[2])
    )
    y_cells = itertools.product(range(y_tok.shape[0]), range(y_tok.shape[1]))

    # `or` short-circuits: the y scan only runs when every x element matched
    differs = (
        any(x_tok[row, col, seq] != x_new[row, col, seq] for row, col, seq in x_cells)
        or any(y_tok[row, col, 0] != y_new[row, col, 0] for row, col in y_cells)
    )
    print("Discrepancy exists" if differs else "No type, value discrepancy")
       

# Individual Functions


In [6]:
# Sample inputs, one per supported spreadsheet format
xls_file = 'train/parse_input.xls'
xlsx_file = 'train/parse_input.xlsx'
csv_file = 'train/parse_input.csv'

# Run each format-specific parser on its sample with the same size arguments
# (100, 100, 32) and print the probed cells of the resulting y_tok.
for parse_fn, sample_path in (
    (process_xls, xls_file),
    (process_xlsx, xlsx_file),
    (process_csv, csv_file),
):
    print("///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////")
    x_tok, y_tok = parse_fn(sample_path, spreadsheet_vocab, 100, 100, 32)
    showtensor(y_tok)

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

Type Key Results:
Cell I1 at (0, 8): tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Cell H1 at (0, 7): tensor([ 1,  0,  0,  2,  0, 12,  1,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0])
Cell C11 at (10, 2): tensor([ 3,  0,  2,  1,  2, 12,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,  0])
Cell D19 at (18, 3): tensor([13,  0,  2,  0,  2, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  2])
Cell F24 at (23, 5): tensor([ 4,  0,  0,  2,  2, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
Cell H14 at (13, 7): tensor([ 5,  0,  0,  2,  2, 12,  0,  0,  0,  0,  0,  9, 11,  0,  0,  0,  0])
Cell H13 at (12, 7): tensor([ 6,  0,  0,  2,  2, 12,  0,  0,  0,  0,  0,  3,  9,  0,  0,  0,  0])
Cell G16 at (15, 6): tensor([ 6,  0,  0,  2,  2, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
Cell G20 at (19, 6): tensor([ 6,  0,  0,  2,  2, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  

# Combined Function
The combined function (`process_spreadsheet`) processes a spreadsheet file and extracts both the data tensor (x_tok) and the metadata tensor (y_tok). It calls the relevant xlsx, xls, or csv function for the file, and if an exception occurs during processing it returns None. It will be called in SpreadsheetDataLoader to featurize the data.

In [7]:
import os
from tqdm import tqdm

# Collected (x_tok, y_tok) pairs, one per successfully processed file
results = []

for file_path in tqdm(file_paths):
    # Skip anything that is not a regular file (e.g. directories)
    if not os.path.isfile(file_path):
        continue

    # process_spreadsheet yields None when the file could not be handled
    result = process_spreadsheet(file_path, spreadsheet_vocab)
    if result is None:
        print(f"Failed to process {file_path} or unsupported file format.")
        continue

    x_tok, y_tok = result
    results.append((x_tok, y_tok))



 77%|███████▋  | 477/621 [01:08<00:07, 20.05it/s]

Failed to process data/input.xlsx or unsupported file format.


100%|██████████| 621/621 [01:28<00:00,  7.00it/s]


# SpreadSheetDataLoader

Now we can create our SpreadsheetDataLoader object, which also holds the x and y token tensors.

In [8]:
# Build the loader from the spreadsheet paths and the shared vocabulary.
# NOTE(review): presumably the constructor runs process_spreadsheet on every
# path (the notes above say it is used there to featurize) — confirm in
# classes/SpreadsheetDataLoader.
check_loader = SpreadsheetDataLoader(file_paths, spreadsheet_vocab)

Processing files: 100%|██████████| 621/621 [00:11<00:00, 55.91it/s]


In [9]:
# Loader size, followed by the shapes of the first x_tok / y_tok tensors it holds
print(
    f'Spreadsheets Processed: {len(check_loader)}',
    f'x_tok Tensor Shape: {check_loader.x_tok[0].shape}',
    f'y_tok Tensor Shape: {check_loader.y_tok[0].shape}',
    sep='\n',
)

Spreadsheets Processed: 620
x_tok Tensor Shape: torch.Size([100, 100, 32])
y_tok Tensor Shape: torch.Size([100, 100, 17])


In [3]:
import os
import xlrd
from collections import Counter

# Directory containing .xls files
data_dir = "../data/enron_clean"

# Number of workbooks seen per BIFF version, keyed by xlrd's `biff_version` value
biff_versions = Counter()

# Number of .xls files for which reading the BIFF version raised an exception
no_version_files = 0

# Loop through each entry in the directory
for filename in os.listdir(data_dir):
    print(filename)
    if filename.endswith(".xls"):  # Process only .xls files
        file_path = os.path.join(data_dir, filename)

        # Reset for every file. A `'workbook' in locals()` check stays true once
        # any earlier file has opened, so a failed open would release the
        # previous file's workbook a second time.
        workbook = None
        try:
            # Open the workbook using xlrd
            workbook = xlrd.open_workbook(file_path, on_demand=True)

            # Get the BIFF version
            version = workbook.biff_version

            # Increment the counter for this BIFF version
            biff_versions[version] += 1
        except Exception:
            # Best effort: any failure is counted rather than aborting the survey
            no_version_files += 1
        finally:
            # Release the workbook only if this iteration actually opened it
            if workbook is not None:
                workbook.release_resources()


# Print metrics
print("BIFF Version Counts:")
for version, count in biff_versions.items():
    # Integer division by 10 reduces the stored value to the major BIFF number
    print(f"  BIFF{version // 10}: {count} files")

print(f"\nFiles with no BIFF version information: {no_version_files}")

cara_semperger_000_1_1.pst.40(1).xls
scott_neal_000_1_1.pst.778(1).xls
darrell_schoolcraft_000_1_1_1.pst.523(1).xls
scott_neal_000_1_1.pst.705(1).xls
teb_lokey_000_1_1.pst.18(1).xls
darrell_schoolcraft_000_1_1_1.pst.211(1).xls
scott_neal_000_1_1.pst.766(1).xls
tracy_geaccone_000_1_1.pst.269(1).xls
michelle_lokay_000_1_2_1.pst.263.xls
james_steffes_000_1_1.pst.165(1).xls
richard_ring_000_1_1.pst.163(1).xls
mark_taylor_000_1_1.pst.91.xls
scott_neal_000_1_1.pst.704.xls
sara_shackleton_000_1_2.pst.107.xls
darrell_schoolcraft_000_1_1_1.pst.291.xls
darron_c_giron_002_1_1_1.pst.150.xls
scott_neal_000_1_1.pst.802(1).xls
darrell_schoolcraft_000_1_1_1.pst.431(1).xls
chris_stokley_000_1_1.pst.173(1).xls
scott_neal_000_1_1.pst.774(1).xls
darrell_schoolcraft_000_1_1_1.pst.452(1).xls
michelle_lokay_000_1_2_1.pst.129.xls
richard_ring_000_1_1.pst.171(1).xls
sara_shackleton_003_1_1.pst.77(1).xls
richard_ring_000_1_1.pst.168.xls
paul_y_barbo_000_1_1.pst.53.xls
scott_neal_000_1_1.pst.949(1).xls
darron_c_

In [4]:
# Re-print the BIFF survey summary on its own, without the per-file listing
print("BIFF Version Counts:")
for version in biff_versions:
    count = biff_versions[version]
    # Integer division by 10 reduces the stored value to the major BIFF number
    print(f"  BIFF{version // 10}: {count} files")

print(f"\nFiles with no BIFF version information: {no_version_files}")

BIFF Version Counts:
  BIFF8: 891 files
  BIFF7: 30 files
  BIFF4: 6 files
  BIFF5: 10 files

Files with no BIFF version information: 0
