# Remove path name from photo ids in hdf files
(photo_id was inadvertently saved with the full path name in the first notebook.)

In [1]:
# Directory holding the HDF feature files. The trailing '/' is required:
# file paths are built by plain string concatenation (data_root + filename).
data_root = '/home/cathy/repos/kaggle-yelp/features/'

import numpy as np
import pandas as pd
import h5py

In [2]:
def remove_path_from_photo_id(filename):
    """
    Remove path from photo_id string, e.g. b'/mnt/data/train_photos/204149.jpg' -> b'204149.jpg'

    The 'photo_id' dataset of the file is replaced in place when at least one
    id still contains a '/'.  Files whose ids contain no '/' (already
    processed, or empty) are left untouched.

    INPUT:
    filename - string, name of hdf file, not including data_root path
    """

    # context manager closes the file even if reading/writing raises
    with h5py.File(data_root + filename, 'r+') as f:
        photo_id = np.copy(f['photo_id'])

        print('photo_id shape: ', photo_id.shape)
        print('first few photo ids: ', photo_id[0:3])

        # example string to split: b'/mnt/data/train_photos/204149.jpg'
        # np.char is the public namespace; np.core.defchararray is deprecated
        # in NumPy 2.x
        photo_id_split = np.char.rsplit(photo_id, b'/', maxsplit=1)

        # check every element (not just the first) so that an empty dataset
        # or a partially processed one does not raise IndexError
        if any(len(parts) == 2 for parts in photo_id_split):
            # last element of each split is photo_id.jpg, whether or not the
            # string contained a path
            photo_id_nopath = np.array([parts[-1] for parts in photo_id_split])

            del f['photo_id']
            f['photo_id'] = photo_id_nopath
        else:
            print('photo_id is already processed.')

In [3]:
# process all hdf files
import os

# os.listdir returns every directory entry (sub-directories, checkpoint
# folders, stray files); keep only the .h5 files so h5py.File is never
# asked to open something that is not an HDF file
filenames = [fname for fname in os.listdir(data_root) if fname.endswith('.h5')]
print(filenames)

for fname in filenames:
    remove_path_from_photo_id(fname)

['train_fc6_features.h5', 'test_prob_features.h5', 'train_fc7_features.h5', 'test_fc7_features.h5', 'train_prob_features.h5', 'test_fc6_features.h5']
photo_id shape:  (234842,)
first few photo ids:  [b'/mnt/data/train_photos/204149.jpg' b'/mnt/data/train_photos/52779.jpg'
 b'/mnt/data/train_photos/278973.jpg']
photo_id shape:  (237152,)
first few photo ids:  [b'/mnt/data/test_photos/317818.jpg' b'/mnt/data/test_photos/30679.jpg'
 b'/mnt/data/test_photos/455084.jpg']
photo_id shape:  (234842,)
first few photo ids:  [b'/mnt/data/train_photos/204149.jpg' b'/mnt/data/train_photos/52779.jpg'
 b'/mnt/data/train_photos/278973.jpg']
photo_id shape:  (237152,)
first few photo ids:  [b'/mnt/data/test_photos/317818.jpg' b'/mnt/data/test_photos/30679.jpg'
 b'/mnt/data/test_photos/455084.jpg']
photo_id shape:  (234842,)
first few photo ids:  [b'/mnt/data/train_photos/204149.jpg' b'/mnt/data/train_photos/52779.jpg'
 b'/mnt/data/train_photos/278973.jpg']
photo_id shape:  (237152,)
first few photo ids