In [1]:
import numpy as np # linear algebra
import pandas as pd # csv I/O
import h5py # handle data that doesn't fit in memory

# Notebook-wide settings.
IMG_SIZE = 299            # side length used in the processed-data folder name
MODEL_NAME = 'xception'   # substituted into the feature file names read/written below

# Folder with the processed data for this image size (both placeholders get IMG_SIZE).
_DATA_DIR_TEMPLATE = r'D:\LICENTA\processed_data\size_{size1}x{size2}'
DATA_DIR = _DATA_DIR_TEMPLATE.format(size1=IMG_SIZE, size2=IMG_SIZE)

In [6]:
# Photo -> business mapping for the training set.
train_photo_to_biz = pd.read_csv('train_photo_to_biz_ids.csv')

# Business-level labels. Rows containing missing values are dropped, the
# space-separated label string is parsed into a sorted tuple of ints
# (e.g. "5 1 2" -> (1, 2, 5)) and the frame is indexed by business id.
# Built as one chain so the cell is idempotent and never mutates a frame in place.
train_labels_raw = pd.read_csv('train.csv')
train_labels = (
    train_labels_raw
    .dropna()
    .assign(labels=lambda frame: frame['labels'].apply(
        lambda x: tuple(sorted(int(t) for t in x.split()))))
    .set_index('business_id')
)
biz_ids = train_labels.index.unique()

# Report the real number of discarded rows instead of a hard-coded figure,
# so the message stays correct if the input file changes.
n_dropped = len(train_labels_raw) - len(train_labels)
print('Number of train business:', len(biz_ids), n_dropped, 'business dropped due to missing labels')

Number of train business: 1996 4 business dropped due to missing labels


In [7]:
# Read the 'feature' dataset of the train feature file into an in-memory
# NumPy array; the HDF5 file is closed as soon as the copy has been made.
train_features_file = DATA_DIR + r'\train_images_{name}_features_avg.h5'.format(name=MODEL_NAME)
with h5py.File(train_features_file, 'r') as f:
    train_images_features = np.copy(f['feature'])

In [8]:
%%time
from tqdm import tqdm

df = pd.DataFrame(columns=['business', 'label', 'feature_vector'])
index = 0

for biz in tqdm(biz_ids):
    label = train_labels.loc[biz]['labels']
    image_index = train_photo_to_biz[train_photo_to_biz['business_id'] == biz].index.tolist()
    
    features = train_images_features[image_index]
    mean_feature = list(np.mean(features, axis=0))
    
    df.loc[index] = [biz, label, mean_feature]
    index += 1
    
with open(DATA_DIR + r'\train_biz_{name}_features.csv'.format(name=MODEL_NAME), 'w') as f:
    df.to_csv(f, index=False)

100%|██████████████████████████████████████████████████████████████████████████████| 1996/1996 [00:20<00:00, 99.52it/s]


Wall time: 28.2 s


In [None]:
# Sanity check: reload the train CSV written above, report its shape and show the first rows.
train_csv_path = DATA_DIR + r'\train_biz_{name}_features.csv'.format(name=MODEL_NAME)
train_business = pd.read_csv(train_csv_path)
print(train_business.shape)
train_business.iloc[:5]

In [19]:
# Photo -> business mapping for the test set.
# NOTE: `biz_ids` is rebound here; from this cell on it holds the test
# business ids rather than the train ones.
test_photo_to_biz = pd.read_csv('test_photo_to_biz.csv')
biz_ids = test_photo_to_biz['business_id'].unique()

test_features_file = DATA_DIR + r'\test_images_{name}_features_avg.h5'.format(name=MODEL_NAME)
with h5py.File(test_features_file, 'r') as f:
    # Each stored photo_id is a bytes path: keep the part after the last
    # backslash and drop its final 4 bytes (presumably a '.jpg'-style
    # extension -- confirm against the HDF5 contents).
    image_filenames = [path.rsplit(b'\\', 1)[-1][:-4] for path in np.copy(f['photo_id'])]
    image_features = np.copy(f['feature'])

print('Number of test business:', len(biz_ids))

Number of test business: 10000


In [23]:
%%time
from tqdm import tqdm

df = pd.DataFrame(columns=['business', 'feature_vector'])
index = 0

for biz in tqdm(biz_ids):
    image_ids = test_photo_to_biz[test_photo_to_biz['business_id'] == biz]['photo_id'].tolist()
    image_index = [image_filenames.index(str(x).encode()) for x in image_ids]
    
    features = image_features[image_index]
    mean_feature = list(np.mean(features, axis=0))
    
    df.loc[index] = [biz, mean_feature]
    index += 1
        
with open(DATA_DIR + r'\test_biz_{name}_features.csv'.format(name=MODEL_NAME), 'w') as f:
    df.to_csv(f, index=False)

100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [1:35:30<00:00,  1.95it/s]


Wall time: 1h 36min 20s


In [24]:
# Sanity check: reload the test CSV written above, report its shape and show the first rows.
test_csv_path = DATA_DIR + r'\test_biz_{name}_features.csv'.format(name=MODEL_NAME)
test_business = pd.read_csv(test_csv_path)
print(test_business.shape)
test_business.iloc[:5]

(10000, 2)


Unnamed: 0,business,feature_vector
0,003sg,"[0.11907065, 0.073270835, 0.10785412, 0.073216..."
1,00er5,"[0.13801534, 0.069412991, 0.10356507, 0.093447..."
2,00kad,"[0.11502581, 0.073984087, 0.12335605, 0.070464..."
3,00mc6,"[0.11302663, 0.069561109, 0.10974503, 0.097833..."
4,00q7x,"[0.080705076, 0.051065292, 0.079456739, 0.1045..."
