In [1]:
import numpy as np # linear algebra
import pandas as pd # csv I/O
import h5py # handle data that doesn't fit in memory

DATA_DIR = r'D:\LICENTA\processed_data\size_224x224'

MODEL_NAME = 'resnet50'

In [2]:
train_photo_to_biz = pd.read_csv('train_photo_to_biz_ids.csv')
train_labels = pd.read_csv('train.csv').dropna()
train_labels['labels'] = train_labels['labels'].apply(lambda x: tuple(sorted(int(t) for t in x.split())))
train_labels.set_index('business_id', inplace=True)
biz_ids = train_labels.index.unique()

print('Number of train business:', len(biz_ids), '4 business dropped due to missing labels')

Number of train business: 1996 4 business dropped due to missing labels


In [3]:
with h5py.File(DATA_DIR + r'\train_images_{name}_features.h5'.format(name=MODEL_NAME), 'r') as f:
    train_images_features = np.copy(f['feature'])

In [4]:
%%time
from tqdm import tqdm

df = pd.DataFrame(columns=['business', 'label', 'feature_vector'])
index = 0

for biz in tqdm(biz_ids):
    label = train_labels.loc[biz]['labels']
    image_index = train_photo_to_biz[train_photo_to_biz['business_id'] == biz].index.tolist()
    
    features = train_images_features[image_index]
    mean_feature = list(np.mean(features, axis=0))
    
    df.loc[index] = [biz, label, mean_feature]
    index += 1
    
with open(DATA_DIR + r'\train_biz_{name}_features.csv'.format(name=MODEL_NAME), 'w') as f:
    df.to_csv(f, index=False)

100%|█████████████████████████████████████████████████████████████████████████████| 1996/1996 [00:17<00:00, 113.13it/s]


Wall time: 25.2 s


In [5]:
# Check file content
train_business = pd.read_csv(DATA_DIR + r'\train_biz_{name}_features.csv'.format(name=MODEL_NAME))
print(train_business.shape)
train_business[0:5]

(1996, 3)


Unnamed: 0,business,label,feature_vector
0,1000,"(1, 2, 3, 4, 5, 6, 7)","[0.26928094, 0.67035854, 1.505075, 0.63246781,..."
1,1001,"(0, 1, 6, 8)","[0.061924133, 0.47727728, 1.2030253, 0.3298085..."
2,100,"(1, 2, 4, 5, 6, 7)","[0.28768438, 0.31944394, 2.1715095, 0.38561299..."
3,1006,"(1, 2, 4, 5, 6)","[0.44401026, 0.46946144, 1.987806, 0.19735378,..."
4,1010,"(0, 6, 8)","[0.10140131, 0.52415538, 3.7969804, 0.41625538..."


In [None]:
test_photo_to_biz = pd.read_csv('test_photo_to_biz.csv')
biz_ids = test_photo_to_biz['business_id'].unique()

with h5py.File(DATA_DIR + r'\test_images_{name}_features.h5'.format(name=MODEL_NAME), 'r') as f:
    image_filenames = list(np.copy(f['photo_id']))
    image_filenames = [name.split('/')[-1][:-4] for name in image_filenames]
    image_features = np.copy(f['feature'])
    
print('Number of test business:', len(biz_ids))

In [None]:
import time

df = pd.DataFrame(columns=['business', 'feature_vector'])
index = 0
t = time.time()

for biz in biz_ids:
    image_ids = test_photo_to_biz[test_photo_to_biz['business_id'] == biz]['photo_id'].tolist()  
    image_index = [image_filenames.index(str(x)) for x in image_ids]
    
    features = image_features[image_index]
    mean_feature = list(np.mean(features, axis=0))
    
    df.loc[index] = [biz, mean_feature]
    index += 1
    
    if index % 1000 == 0:
        print(index, 'business processed  Time passed:', '{0:.1f}'.format(time.time() - t), 'sec')
        
with open(DATA_DIR + r'\test_biz_{name}_features.csv'.format(name=MODEL_NAME), 'w') as f:
    df.to_csv(f, index=False)

In [None]:
# Check file content
test_business = pd.read_csv(DATA_DIR + r'\test_biz_{name}_features.csv'.format(name=MODEL_NAME))
print(test_business.shape)
test_business[0:5]