In [7]:
import numpy as np
import pandas as pd
import h5py

# Directory that holds the image-feature .h5 files read by this notebook and
# receives the per-business feature CSVs it writes.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
DATA_DIR = r'D:\LICENTA\processed_data\size_224x224'

In [8]:
# Photo -> business mapping for the training set.
train_photo_to_biz = pd.read_csv('train_photo_to_biz_ids.csv')

# Business -> labels table. Rows with any missing value are dropped; the raw
# frame is kept so the number of dropped businesses can be computed instead
# of being hard-coded in the message below.
train_labels_raw = pd.read_csv('train.csv')
train_labels = train_labels_raw.dropna().copy()

# "1 2 5" -> (1, 2, 5): space-separated label ids become a sorted tuple of ints.
train_labels['labels'] = train_labels['labels'].apply(
    lambda x: tuple(sorted(int(t) for t in x.split())))
train_labels = train_labels.set_index('business_id')
biz_ids = train_labels.index.unique()

n_dropped = train_labels_raw['business_id'].nunique() - len(biz_ids)
print('Number of train business:', len(biz_ids), n_dropped, 'business dropped due to missing labels')

Number of train business: 1996 4 business dropped due to missing labels


In [9]:
# Read the full 'feature' dataset of the training-images file into an
# in-memory array so it stays usable after the HDF5 file is closed.
train_features_path = DATA_DIR + r'\train_images_vgg16_features.h5'
with h5py.File(train_features_path, 'r') as h5_file:
    train_images_features = np.array(h5_file['feature'])

In [10]:
%%time
from tqdm import tqdm

df = pd.DataFrame(columns=['business', 'label', 'feature_vector'])
index = 0

for biz in tqdm(biz_ids):
    label = train_labels.loc[biz]['labels']
    image_index = train_photo_to_biz[train_photo_to_biz['business_id'] == biz].index.tolist()
    
    features = train_images_features[image_index]
    mean_feature = list(np.mean(features, axis=0))
    
    df.loc[index] = [biz, label, mean_feature]
    index += 1
    
with open(DATA_DIR + r'\train_biz_vgg16_features.csv', 'w') as f:
    df.to_csv(f, index=False)

100%|█████████████████████████████████████████████████████████████████████████████| 1996/1996 [00:13<00:00, 142.70it/s]


Wall time: 15.9 s


In [11]:
# Sanity check: reload the per-business training CSV and preview it.
train_csv_path = DATA_DIR + r'\train_biz_vgg16_features.csv'
train_business = pd.read_csv(train_csv_path)
print(train_business.shape)
train_business.head(5)

(1996, 3)


Unnamed: 0,business,label,feature_vector
0,1000,"(1, 2, 3, 4, 5, 6, 7)","[28.877575, 33.530006, 46.876415, 34.705391, 2..."
1,1001,"(0, 1, 6, 8)","[36.042068, 38.030708, 65.95565, 38.702034, 25..."
2,100,"(1, 2, 4, 5, 6, 7)","[37.041508, 39.158905, 52.017677, 42.739239, 2..."
3,1006,"(1, 2, 4, 5, 6)","[33.927616, 30.564184, 73.064445, 39.443863, 1..."
4,1010,"(0, 6, 8)","[37.269249, 33.686077, 95.581299, 38.996761, 1..."


In [None]:
# Photo -> business mapping for the test set, and the distinct businesses in it.
test_photo_to_biz = pd.read_csv('test_photo_to_biz.csv')
biz_ids = test_photo_to_biz['business_id'].unique()

with h5py.File(DATA_DIR + r'\test_images_vgg16_features.h5', 'r') as f:
    raw_names = list(np.copy(f['photo_id']))
    # Depending on the h5py version and on how the dataset was stored, string
    # entries come back as bytes or as str; decode bytes so the str-based
    # split below works in both cases.
    image_filenames = [
        name.decode('utf-8') if isinstance(name, bytes) else name
        for name in raw_names
    ]
    # Keep only the last path component and cut its final 4 characters
    # (presumably a '.jpg'-style extension — confirm against the stored ids).
    image_filenames = [name.split('/')[-1][:-4] for name in image_filenames]
    image_features = np.copy(f['feature'])

print('Number of test business:', len(biz_ids))

In [None]:
import time

# Build one row per test business: (business id, mean image feature).

# Map each photo id string to its row in image_features once, instead of
# scanning image_filenames with list.index() for every photo (quadratic).
# setdefault keeps the first occurrence, which is what list.index() returned.
filename_to_row = {}
for row, name in enumerate(image_filenames):
    filename_to_row.setdefault(name, row)

# Rows are collected in a list and converted to a DataFrame once at the end.
records = []
t = time.time()

for biz in biz_ids:
    image_ids = test_photo_to_biz.loc[test_photo_to_biz['business_id'] == biz, 'photo_id'].tolist()
    # A photo id missing from the feature file raises KeyError here.
    image_index = [filename_to_row[str(x)] for x in image_ids]

    # Average the feature vectors of all photos belonging to this business.
    features = image_features[image_index]
    mean_feature = list(np.mean(features, axis=0))

    records.append({'business': biz, 'feature_vector': mean_feature})

    if len(records) % 1000 == 0:
        print(len(records), 'business processed  Time passed:', '{0:.1f}'.format(time.time() - t), 'sec')

df = pd.DataFrame(records, columns=['business', 'feature_vector'])

# Write next to the other outputs in DATA_DIR (the following check cell reads
# the file from there). The path is passed directly because pandas expects
# file objects to be opened with newline=''.
df.to_csv(DATA_DIR + r'\test_biz_vgg16_features.csv', index=False)

In [None]:
# Sanity check: reload the per-business test CSV and preview it.
test_csv_path = DATA_DIR + r'\test_biz_vgg16_features.csv'
test_business = pd.read_csv(test_csv_path)
print(test_business.shape)
test_business.head(5)