In [9]:
import numpy as np # linear algebra
import pandas as pd # csv I/O
import h5py # handle data that doesn't fit in memory

# Notebook-wide configuration.
# IMG_SIZE is used for both dimensions of the "size_<w>x<h>" directory name.
IMG_SIZE = 299
# MODEL_NAME is substituted into every feature-file name read or written below.
MODEL_NAME = 'xception'

# Directory holding the pre-computed feature files for the chosen image size.
DATA_DIR = r'D:\LICENTA\processed_data\size_{size1}x{size2}'.format(
    size1=IMG_SIZE,
    size2=IMG_SIZE,
)

In [10]:
# Photo -> business mapping for the training set.
train_photo_to_biz = pd.read_csv('train_photo_to_biz_ids.csv')

# Business-level labels. Rows containing any missing value are dropped; the
# number of dropped rows is measured instead of being hard-coded in the message.
train_labels_raw = pd.read_csv('train.csv')
train_labels = train_labels_raw.dropna()
n_dropped = len(train_labels_raw) - len(train_labels)

# Turn the whitespace-separated label string into a sorted tuple of ints and
# index the frame by business_id. Building a new frame (instead of assigning a
# column on the dropna() result and using inplace=True) keeps this cell
# idempotent on re-run.
train_labels = train_labels.assign(
    labels=train_labels['labels'].apply(lambda x: tuple(sorted(int(t) for t in x.split())))
).set_index('business_id')
biz_ids = train_labels.index.unique()

# NOTE(review): n_dropped counts rows; it equals the number of businesses only
# if train.csv has one row per business - confirm.
print('Number of train business:', len(biz_ids), n_dropped, 'business dropped due to missing labels')

Number of train business: 1996 4 business dropped due to missing labels


In [11]:
# Read the whole 'feature' dataset of the training HDF5 file into an in-memory
# NumPy array, so it can still be indexed after the file has been closed.
train_features_path = DATA_DIR + r'\train_images_{name}_features.h5'.format(name=MODEL_NAME)
with h5py.File(train_features_path, 'r') as features_file:
    train_images_features = np.copy(features_file['feature'])

In [38]:
%%time
from tqdm import tqdm

# Build one row per business: (business id, label tuple, mean feature vector).
# Rows are collected in a list and turned into a DataFrame once at the end;
# growing a DataFrame with `df.loc[index] = ...` reallocates on every insert.
records = []

for biz in tqdm(biz_ids):
    label = train_labels.loc[biz, 'labels']

    # Row labels of this business's photos in train_photo_to_biz, used directly
    # as row positions in train_images_features.
    # NOTE(review): assumes the HDF5 feature rows are stored in the same order
    # as train_photo_to_biz_ids.csv - confirm against the extraction step.
    image_index = train_photo_to_biz[train_photo_to_biz['business_id'] == biz].index.tolist()

    # Average the photo-level features to get one vector per business.
    features = train_images_features[image_index]
    mean_feature = list(np.mean(features, axis=0))

    records.append({'business': biz, 'label': label, 'feature_vector': mean_feature})

df = pd.DataFrame(records, columns=['business', 'label', 'feature_vector'])

# Hand the path to pandas instead of a text-mode handle: pandas requires file
# objects to be opened with newline='', otherwise line endings can be doubled
# on Windows.
df.to_csv(DATA_DIR + r'\train_biz_{name}_features.csv'.format(name=MODEL_NAME), index=False)

100%|█████████████████████████████████████████████████████████████████████████████| 1996/1996 [00:16<00:00, 122.63it/s]


Wall time: 23.8 s


In [39]:
# Sanity check: reload the CSV written above, print its shape and show the first rows.
train_biz_csv = DATA_DIR + r'\train_biz_{name}_features.csv'.format(name=MODEL_NAME)
train_business = pd.read_csv(train_biz_csv)
print(train_business.shape)
train_business.head(5)

(1996, 3)


Unnamed: 0,business,label,feature_vector
0,1000,"(1, 2, 3, 4, 5, 6, 7)","[0.98409426, 0.5011915, 0.80012208, 0.39773199..."
1,1001,"(0, 1, 6, 8)","[1.1223717, 0.39256608, 0.36077923, 1.2178262,..."
2,100,"(1, 2, 4, 5, 6, 7)","[1.0060079, 0.57109088, 0.84190941, 0.32770807..."
3,1006,"(1, 2, 4, 5, 6)","[1.1555356, 0.39558065, 0.63720721, 0.28982833..."
4,1010,"(0, 6, 8)","[0.98949295, 0.6631335, 0.60745454, 0.50947839..."


In [None]:
# Photo -> business mapping for the test set and the distinct business ids in it.
test_photo_to_biz = pd.read_csv('test_photo_to_biz.csv')
biz_ids = test_photo_to_biz['business_id'].unique()

# Copy both datasets into memory so they stay usable after the file is closed.
with h5py.File(DATA_DIR + r'\test_images_{name}_features.h5'.format(name=MODEL_NAME), 'r') as f:
    raw_photo_paths = np.copy(f['photo_id'])
    image_features = np.copy(f['feature'])

# h5py may hand string data back as bytes (always for fixed-length strings, and
# for variable-length strings since h5py 3); calling .split('/') on bytes raises
# TypeError, so decode to str first.
# The file name is the last path component with its final 4 characters removed.
# NOTE(review): presumably a 3-letter extension such as ".jpg" - confirm.
image_filenames = [
    (name.decode('utf-8') if isinstance(name, bytes) else str(name)).split('/')[-1][:-4]
    for name in raw_photo_paths
]

print('Number of test business:', len(biz_ids))

In [None]:
import time

# Map each file name to its first row in image_features once, instead of calling
# list.index() (a linear scan) for every photo of every business.
# setdefault keeps the first occurrence, matching what list.index() returned.
# A photo id missing from the feature file now raises KeyError (previously
# ValueError from list.index()).
filename_to_row = {}
for row, filename in enumerate(image_filenames):
    filename_to_row.setdefault(filename, row)

# Rows are collected in a list and turned into a DataFrame once at the end;
# growing a DataFrame with `df.loc[index] = ...` reallocates on every insert.
records = []
index = 0
t = time.time()

for biz in biz_ids:
    image_ids = test_photo_to_biz[test_photo_to_biz['business_id'] == biz]['photo_id'].tolist()
    image_index = [filename_to_row[str(x)] for x in image_ids]

    # Average the photo-level features to get one vector per business.
    features = image_features[image_index]
    mean_feature = list(np.mean(features, axis=0))

    records.append({'business': biz, 'feature_vector': mean_feature})
    index += 1

    if index % 1000 == 0:
        print(index, 'business processed  Time passed:', '{0:.1f}'.format(time.time() - t), 'sec')

df = pd.DataFrame(records, columns=['business', 'feature_vector'])

# Hand the path to pandas instead of a text-mode handle: pandas requires file
# objects to be opened with newline='', otherwise line endings can be doubled
# on Windows.
df.to_csv(DATA_DIR + r'\test_biz_{name}_features.csv'.format(name=MODEL_NAME), index=False)

In [None]:
# Sanity check: reload the CSV written above, print its shape and show the first rows.
test_biz_csv = DATA_DIR + r'\test_biz_{name}_features.csv'.format(name=MODEL_NAME)
test_business = pd.read_csv(test_biz_csv)
print(test_business.shape)
test_business.head(5)