In [1]:
import ast
import csv
import logging
import os
import pickle
import pprint
import sys
from collections import defaultdict
from pathlib import Path

from tqdm import tqdm

In [17]:
# Raise csv's per-field size cap to sys.maxsize (presumably some serialized
# fields exceed the module's default limit -- TODO confirm).
csv.field_size_limit(sys.maxsize)

# Destinations of the merged TSVs and the directory holding the input shards.
features_outfile = '../data/tsv/merged.features.tsv'
labels_outfile = '../data/tsv/merged.labels.tsv'
data_path = '../data/features'

# Path.glob yields entries in no guaranteed order; sort so every run visits the
# shards (and therefore emits the merged rows) in the same sequence.
features_files = sorted(Path(data_path).glob('features.tsv.*'))
labels_files = sorted(Path(data_path).glob('labels.tsv.*'))

In [3]:
class Error:
    """Collects the problems recorded against a single image id.

    The three attributes are plain lists that callers append to directly:
    ``dups``, ``write_errors`` and ``parse_errors``.
    """

    def __init__(self):
        self.dups = []
        self.write_errors = []
        self.parse_errors = []

    def __str__(self):
        """Return the three lists pretty-printed, one labelled section per line group."""
        sections = (
            ('dups: ', self.dups),
            ('write: ', self.write_errors),
            ('parse: ', self.parse_errors),
        )
        return '\n'.join(
            prefix + pprint.pformat(entries) for prefix, entries in sections
        )

    def get_error(self):
        """Return the names of the non-empty categories, ordered dups, write, parse."""
        categories = (
            ('dups', self.dups),
            ('write', self.write_errors),
            ('parse', self.parse_errors),
        )
        return [label for label, entries in categories if entries]

In [4]:
def validate(infiles, total=210000):
    """Scan tab-separated files and collect the problems found for each image id.

    Every row is expected to hold exactly two fields: the image id followed by
    a value that ``ast.literal_eval`` can parse. Problems are recorded on an
    ``Error`` object per id:

    * ``write_errors`` -- the row does not have exactly two fields (the whole
      row is stored). Blank rows have no id and are stored under the key ``''``.
    * ``parse_errors`` -- the second field cannot be parsed by
      ``ast.literal_eval`` (the raw field is stored).
    * ``dups`` -- the id was already seen in an earlier row (the repeated row
      is stored; the first occurrence is not).

    Args:
        infiles: iterable of paths to the TSV files to scan.
        total: expected number of rows; only sizes the progress bar.

    Returns:
        A tuple ``(image_ids, errors)``: the set of every id seen, and a
        ``defaultdict`` mapping each id with at least one problem to its
        ``Error``.
    """
    image_ids = set()
    errors = defaultdict(Error)

    with tqdm(total=total) as pbar:
        for infile in infiles:
            # newline='' is what the csv module asks for, so that it handles
            # line endings (including ones inside quoted fields) itself.
            with open(infile, newline='') as tsv_in_file:
                reader = csv.reader(tsv_in_file, delimiter='\t')

                for item in reader:
                    pbar.update(1)

                    # csv.reader returns [] for a blank line; there is no id
                    # to index, so file it under the empty-string key.
                    if not item:
                        errors[''].write_errors.append(item)
                        continue

                    image_id = item[0]

                    # Too many *or too few* fields. A one-field row previously
                    # raised IndexError from inside the except handler.
                    if len(item) != 2:
                        errors[image_id].write_errors.append(item)
                    # list not stored correctly
                    else:
                        try:
                            ast.literal_eval(item[1])
                        except Exception:
                            # Exception rather than a bare except, so Ctrl-C
                            # still interrupts a long scan.
                            errors[image_id].parse_errors.append(item[1])

                    # duplicates
                    if image_id in image_ids:
                        errors[image_id].dups.append(item)

                    image_ids.add(image_id)
    print(f"found {len(image_ids)} unique ids")
    print(f"found {len(errors)} errors")
    return image_ids, errors

# Labels

In [5]:
# Validate every labels shard: label_ids is the set of ids seen, label_errors
# maps each id that had a problem to its Error record.
label_ids, label_errors = validate(labels_files)

 96%|█████████▌| 201371/210000 [02:10<00:05, 1547.99it/s]

found 201368 unique ids
found 135 errors





In [6]:
# Cache the labels validation result on disk; the error mapping is stored as a
# plain dict.
labels_checkpoint = (label_ids, dict(label_errors))
with open('labels.pkl', 'wb') as labels_pkl:
    pickle.dump(labels_checkpoint, labels_pkl)

# Features

In [7]:
# Validate every features shard: feature_ids is the set of ids seen,
# feature_errors maps each id that had a problem to its Error record.
feature_ids, feature_errors = validate(features_files)

 96%|█████████▌| 201405/210000 [36:20<01:33, 92.36it/s] 

found 201405 unique ids
found 138 errors





In [8]:
# Cache the features validation result on disk; the error mapping is stored as
# a plain dict.
features_checkpoint = (feature_ids, dict(feature_errors))
with open('features.pkl', 'wb') as features_pkl:
    pickle.dump(features_checkpoint, features_pkl)

# Cleaned IDs

In [9]:
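# Optional: uncomment to reload the cached validation results instead of
# re-running validate(). Only unpickle files written by this notebook --
# pickle.load can execute arbitrary code from an untrusted file.
# with open('labels.pkl', 'rb') as f:
#     label_ids, label_errors = pickle.load(f)
# with open('features.pkl', 'rb') as f:
#     feature_ids, feature_errors = pickle.load(f)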

In [10]:
# Drop every id that has an entry in the corresponding error mapping.
cleaned_labels = label_ids.difference(label_errors)
cleaned_features = feature_ids.difference(feature_errors)

print(len(cleaned_labels), len(cleaned_features), sep='\n')

201233
201267


In [11]:
# Ids that are clean on both the labels side and the features side.
cleaned_ids = cleaned_labels & cleaned_features

print(len(cleaned_ids))

201230


In [20]:
# Cache the final set of usable ids on disk.
with open('cleaned_ids.pkl', 'wb') as cleaned_ids_pkl:
    pickle.dump(cleaned_ids, cleaned_ids_pkl)

# Merge TSV files

In [15]:
# Read the label row of every cleaned id into memory, keyed by image id, so
# the merge cell can look rows up by id while streaming the features shards.
labels = {}
with tqdm(total=210000) as pbar:
    for infile in labels_files:
        # newline='' is what the csv module asks for, so that it handles line
        # endings (including ones inside quoted fields) itself.
        with open(infile, newline='') as tsv_in_file:
            reader = csv.reader(tsv_in_file, delimiter='\t')
            for item in reader:
                # csv.reader returns [] for a blank line; skip it instead of
                # failing on item[0].
                if item and item[0] in cleaned_ids:
                    labels[item[0]] = item
                pbar.update(1)

 96%|█████████▌| 201371/210000 [00:23<00:00, 8642.45it/s]


In [16]:
# Sanity check: every key of the labels lookup is one of the cleaned ids.
assert all(label_id in cleaned_ids for label_id in labels)

In [19]:
# Merge the features shards into one TSV and write the matching label rows, in
# the same order, to a second TSV.
with tqdm(total=len(cleaned_ids)) as pbar:
    # Every csv file handle is opened with newline='' as the csv module asks,
    # so no extra newline translation is applied on read or write.
    # The scratch sink lives in the same `with`, so it is closed even when the
    # cell fails; os.devnull replaces the hard-coded "/dev/null".
    with open(features_outfile, 'w', newline='') as features_tsv, \
            open(labels_outfile, 'w', newline='') as labels_tsv, \
            open(os.devnull, 'w', newline='') as scratch:
        features_writer = csv.writer(features_tsv, delimiter='\t')
        labels_writer = csv.writer(labels_tsv, delimiter='\t')

        # Dry-run writer: both rows are serialized here first, so a row the
        # csv writer rejects is skipped in BOTH outputs and the two files stay
        # aligned line for line.
        dummy_writer = csv.writer(scratch, delimiter='\t')

        for infile in features_files:
            with open(infile, newline='') as tsv_in_file:
                reader = csv.reader(tsv_in_file, delimiter='\t')
                for item in reader:
                    # csv.reader returns [] for a blank line; nothing to merge.
                    if not item:
                        continue
                    image_id = item[0]
                    if image_id not in cleaned_ids:
                        continue
                    try:
                        label_row = labels[image_id]
                        dummy_writer.writerow(item)
                        dummy_writer.writerow(label_row)
                    except Exception as e:
                        tqdm.write(f'error for {image_id}, {str(e)}')
                        continue

                    # Check the pairing before anything is written, so a
                    # mismatch cannot leave a bad row pair in the outputs.
                    assert image_id == label_row[0]
                    features_writer.writerow(item)
                    labels_writer.writerow(label_row)
                    pbar.update(1)

 66%|██████▌   | 132342/201230 [42:27<22:06, 51.94it/s] 


OSError: [Errno 5] Input/output error