In [1]:
import facenet

In [None]:
import tensorflow as tf
import numpy as np
import imageio
import cv2
import facenet
from matplotlib import pyplot as plt
import glob
import random
import pandas as pd
import os
from utils.image import resize_image_to_larger_dimension_and_pad
import shutil
from sklearn.metrics import confusion_matrix
import itertools

## spot check

In [None]:
# Side length, in pixels, of the square network input. cal_emb() reshapes each image to
# (-1, image_size, image_size, 3) without resizing it, so the crops on disk must already be
# image_size x image_size.
image_size = 256
modeldir = '/home/caffe/facenet/sku_triplet_500k.pb' #change to your model dir
image_name1 = '/datadrive/images/activelearning/ccna_add_train_reference_crop_test/1018120/tmp#110460.jpg' #change to your image name
image_name2 = '/datadrive/images/activelearning/ccna_add_train_reference_crop_test/1018440/tmp#216495.jpg' #change to your image name

In [None]:
# Read the first spot-check image as an RGB array and display it unmodified.
image1 = imageio.imread(image_name1, pilmode='RGB')
plt.imshow(image1)

In [None]:
# Run the first image through facenet.prewhiten and display the result.
# image1 is rebound here, so re-running this cell prewhitens an already prewhitened array.
# NOTE(review): prewhiten presumably standardises the pixel values, in which case imshow
# clips part of the range — confirm against the facenet module.
image1 = facenet.prewhiten(image1)
plt.imshow(image1)

In [None]:
# Read the second spot-check image as an RGB array and display it unmodified.
image2 = imageio.imread(image_name2, pilmode='RGB')
plt.imshow(image2)

In [None]:
# Run the second image through facenet.prewhiten and display the result.
# image2 is rebound here, so re-running this cell prewhitens an already prewhitened array.
image2 = facenet.prewhiten(image2)
plt.imshow(image2)

In [None]:
print('setting up facenet embedding')
# Load the model into a dedicated graph. Calling tf.Graph().as_default() without a `with`
# block only creates a context manager that is never entered, so the model would end up in
# the process-wide default graph and be imported again on every re-run of this cell.
facenet_graph = tf.Graph()
sess = tf.Session(graph=facenet_graph)
# Both the graph and the session are made current while loading, so load_model works
# whether it imports a frozen graph or restores a checkpoint through the default session.
with facenet_graph.as_default(), sess.as_default():
    facenet.load_model(modeldir)
# Tensors that cal_emb() feeds and fetches, looked up by name in the loaded graph.
images_placeholder = facenet_graph.get_tensor_by_name("input:0")
embeddings = facenet_graph.get_tensor_by_name("embeddings:0")
phase_train_placeholder = facenet_graph.get_tensor_by_name("phase_train:0")
# Width of one embedding vector, taken from the static shape of the output tensor.
embedding_size = embeddings.get_shape()[1]

print('facenet embedding is generated')

In [None]:
def cal_dist(emb_dir1, image_file1, emb_dir2, image_file2):
    """Return the Euclidean distance between two cached embeddings.

    Each embedding is read with load_emb from its own cache directory, so the
    two images may live in different embedding folders.

    Args:
        emb_dir1: cache directory holding the embedding of image_file1.
        image_file1: path of the first image (used to locate its embedding).
        emb_dir2: cache directory holding the embedding of image_file2.
        image_file2: path of the second image.

    Returns:
        The square root of the summed squared element-wise differences.
    """
    first = load_emb(emb_dir1, image_file1)
    second = load_emb(emb_dir2, image_file2)
    squared_diff = np.square(first - second)
    return np.sqrt(np.sum(squared_diff))

In [None]:
def cal_sku_embedding_mean(emb_dir,images_files):
    """Return the element-wise mean of the cached embeddings of images_files.

    Args:
        emb_dir: cache directory the embeddings are read from (via load_emb).
        images_files: iterable of image paths whose embeddings are averaged.

    Returns:
        A 1-D array of length embedding_size. For an empty images_files the
        mean of zero rows is taken, which yields NaN values.
    """
    loaded = [load_emb(emb_dir, image_file) for image_file in images_files]
    # Stack once instead of re-copying the growing array on every iteration.
    # The trailing zero-row block keeps the (n, embedding_size) layout, so an
    # empty input and a wrong embedding width behave as they did before.
    stacked = np.vstack(loaded + [np.empty((0, embedding_size))])
    return np.mean(stacked, axis=0)

In [None]:
def cal_emb(image_file):
    """Compute the embedding of a single image file with the loaded network.

    The file is read as RGB, passed through facenet.prewhiten and reshaped
    (not resized) into a batch of shape (-1, image_size, image_size, 3). The
    first row of the network output is copied into the result.

    Returns:
        A float64 array of shape (1, embedding_size).
    """
    pixels = facenet.prewhiten(imageio.imread(image_file, pilmode='RGB'))
    # NOTE(review): reshape does not validate the image size; an image whose pixel count is
    # a multiple of image_size*image_size would be split into several bogus "images".
    batch = pixels.reshape(-1, image_size, image_size, 3)
    feed = {images_placeholder: batch, phase_train_placeholder: False }
    result = np.zeros((1, embedding_size))
    result[0,:] = sess.run(embeddings, feed_dict=feed)[0]
    return result

In [None]:
def _emb_file_path(emb_dir, image_file):
    """Return (sku, npy_path) for the cached embedding of image_file.

    The SKU is the name of the image's parent folder. Only the final extension
    is stripped from the file name, so names that contain dots (for example
    'Red Bull Sugarfree16.9.png' and 'Red Bull Sugarfree16.9_resized.png')
    map to distinct cache files instead of being cut at the first dot.
    Images that differ only in their extension still share one cache file.
    """
    sku = image_file.split('/')[-2]
    file_name = image_file.split('/')[-1]
    stem = os.path.splitext(file_name)[0]
    return sku, os.path.join(emb_dir, sku, stem + '.npy')


def save_emb(emb_dir,image_file,skip=True):
    """Compute the embedding of image_file and cache it under emb_dir/<sku>/.

    Args:
        emb_dir: root directory of the embedding cache.
        image_file: image path of the form .../<sku>/<file name>.
        skip: when True (default) an existing cache file is left untouched and
            the network is not run; when False the embedding is recomputed and
            the cache file overwritten.
    """
    sku, emb_file = _emb_file_path(emb_dir, image_file)
    sku_cache_dir = os.path.join(emb_dir, sku)
    if not os.path.exists(sku_cache_dir):
        os.makedirs(sku_cache_dir)
    if not (skip and os.path.exists(emb_file)):
        emb = cal_emb(image_file)
        np.save(emb_file,emb)


def load_emb(emb_dir, image_file):
    """Load the cached embedding written by save_emb for image_file.

    Raises whatever np.load raises when the cache file does not exist, i.e.
    save_emb must have been called for the same (emb_dir, image_file) first.
    """
    _, emb_file = _emb_file_path(emb_dir, image_file)
    return np.load(emb_file)

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a short header and draw the confusion matrix on the current figure.

    Args:
        cm: square array of counts, rows = true label, columns = predicted label.
        classes: tick labels, one per row/column of cm, in the same order as
            the rows of cm (pass an ordered sequence, not a set).
        normalize: when True each row is divided by its sum, so cells show the
            fraction of that true class. Rows without any sample stay all-zero.
        title: figure title.
        cmap: matplotlib colormap used for the cells.
    """
    classes = list(classes)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A label that only occurs in the predictions has an all-zero row; dividing such a
        # row by 1 keeps it at zero instead of producing NaN cells and a NaN colour threshold.
        cm = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Cache directories for the two spot-check images.
emb_dir1 = '/datadrive/embs/ccna_embs1'
emb_dir2 = '/datadrive/embs/ccna_embs2'
save_emb(emb_dir1, image_name1)
save_emb(emb_dir2, image_name2)

# Mean embedding of one training SKU. The embeddings have to be cached first, because
# cal_sku_embedding_mean only reads them from emb_dir1.
images_files = glob.glob('/datadrive/images/activelearning/ccna_add_train_reference_crop/1018438/' + '/*')
for image_file in images_files:
    save_emb(emb_dir1, image_file)
cal_sku_embedding_mean(emb_dir1, images_files)

In [None]:
# `emb_dir` is only assigned further down the notebook, so on a fresh kernel it is undefined
# here; cache the two spot-check images in the directories defined for them instead.
save_emb(emb_dir1, image_name1)
save_emb(emb_dir2, image_name2)

In [None]:
# cal_dist needs the cache directory of each image in addition to the image path.
cal_dist(emb_dir1, image_name1, emb_dir2, image_name2)

## given an image from train, calculate its distance from other images in train

In [None]:
ref_image = '/datadrive/images/activelearning/ccna_add_train_reference_crop/1018120/tmp#796199.jpg'
train_dir = '/datadrive/images/activelearning/ccna_add_train_reference_crop'
# Embedding cache for the training crops (the training folder name with an "_emb" suffix).
ref_emb_dir = train_dir + '_emb'
sample_num = 5
random.seed(0)

# Draw up to sample_num crops from every SKU folder.
image_file_pool = []
for sku_dir in glob.glob(train_dir + '/*'):
    images_files = glob.glob(sku_dir + '/*')
    image_file_pool.extend(random.sample(images_files,min(sample_num, len(images_files))))

# cal_dist reads embeddings from the cache, so every image has to be cached before it is compared.
save_emb(ref_emb_dir, ref_image)
lst = []
for image_file in image_file_pool:
    save_emb(ref_emb_dir, image_file)
    dist = cal_dist(ref_emb_dir, ref_image, ref_emb_dir, image_file)
    sku = image_file.split('/')[-2]
    file_name = image_file.split('/')[-1]
    lst.append([file_name, sku, dist])
    print('{} {} {}'.format(file_name, sku, dist))

df = pd.DataFrame(lst, columns=['file_name','sku','distance'])
df.to_csv('train_train_check.csv',index=False)

## given an image from test, calculate its distance from other images in test

In [None]:
image_dir = '/datadrive/images/activelearning/ccna_add_train_reference_crop_test'
emb_dir = '/datadrive/images/activelearning/ccna_add_train_reference_crop_test_emb'

random.seed(0)
sample_num = 5

sku_mean_dict={}
db_file_pool = []
query_file_pool = []

for sku_dir in glob.glob(image_dir + '/*'):
    sku = sku_dir.split('/')[-1]
    images_files = glob.glob(sku_dir + '/*')
    # Up to sample_num crops per SKU form the database; one crop per SKU is used as query.
    image_files_sample = random.sample(images_files,min(sample_num, len(images_files)))
    # NOTE(review): the query is drawn from the same files as the database sample, so it can
    # also be a database entry (distance 0 to itself) — confirm this overlap is intended.
    query_file_sample = random.sample(images_files,min(1, len(images_files)))

    for image_file in image_files_sample + query_file_sample:
        save_emb(emb_dir, image_file)

    # The mean is computed from the cached embeddings, so the cache directory must be passed.
    sku_mean_dict[sku] = cal_sku_embedding_mean(emb_dir, image_files_sample)
    db_file_pool.extend(image_files_sample)
    query_file_pool.extend(query_file_sample)

In [None]:
# For every query crop: distance to each database crop and to each SKU mean, then the
# closest SKU under both criteria.
retrieval_list = []
for query_file in query_file_pool:
    print("retrieving for {} ...".format(query_file))
    dist_list = []
    sku_truth = query_file.split('/')[-2]
    query_file_name = query_file.split('/')[-1]
    query_emb = load_emb(emb_dir,query_file)

    for db_file in db_file_pool:
        # Query and database embeddings live in the same cache directory here.
        dist = cal_dist(emb_dir, query_file, emb_dir, db_file)
        sku = db_file.split('/')[-2]
        db_file_name = db_file.split('/')[-1]
        dist_sku_mean = np.sqrt(np.sum(np.square(sku_mean_dict[sku]-query_emb)))
        dist_list.append([db_file_name, sku, dist, dist_sku_mean])

    dist_df = pd.DataFrame(dist_list, columns=['db_file_name','sku','dist','dist_sku_mean'])
    dist_df.to_csv('/home/caffe/facenet_eval/test2test_query_{}_{}.csv'.format(sku_truth, query_file_name),index=False)
    closest_sku_by_dist = dist_df[dist_df['dist'] == min(dist_df['dist'])]['sku'].tolist()[0]
    closest_sku_by_dist2mean = dist_df[dist_df['dist_sku_mean'] == min(dist_df['dist_sku_mean'])]['sku'].tolist()[0]
    retrieval_list.append([query_file_name,sku_truth,closest_sku_by_dist,closest_sku_by_dist2mean])

retrieval_df = pd.DataFrame(retrieval_list,
                                columns=['query_file_name','sku_truth','closest_sku_by_dist','closest_sku_by_dist2mean'])
retrieval_df['by_dist_eval'] = retrieval_df['sku_truth'] == retrieval_df['closest_sku_by_dist']
retrieval_df['by_dist2mean_eval'] = retrieval_df['sku_truth'] == retrieval_df['closest_sku_by_dist2mean']
# Multiply by 1.0 before dividing so the ratio is a float even under integer division.
by_dist_accuracy = sum(retrieval_df['by_dist_eval'])*1.0/len(retrieval_df)
by_dist2mean_accuracy = sum(retrieval_df['by_dist2mean_eval'])*1.0/len(retrieval_df)

retrieval_df.to_csv('retrieval_eval.csv',index=False)

In [None]:
# Fraction of queries whose closest SKU equals the true SKU, for both criteria. The count is
# multiplied by 1.0 before the division so the ratio is a float even where `/` is integer division.
by_dist_accuracy = sum(retrieval_df['by_dist_eval'])*1.0/len(retrieval_df)
by_dist2mean_accuracy = sum(retrieval_df['by_dist2mean_eval'])*1.0/len(retrieval_df)

In [None]:
# Accuracy when the query is assigned the SKU of its single closest database crop.
by_dist_accuracy

In [None]:
# Accuracy when the query is assigned the SKU whose mean embedding is closest.
by_dist2mean_accuracy

## calculate the distance to sku mean for each crop and output sorted crops based on distance (intra-cluster distance)

In [None]:
image_dir = '/datadrive/images/activelearning/ccna_add_train_reference_crop_test'
sorted_image_dir = '/datadrive/images/activelearning/ccna_add_train_reference_crop_test_sorted'
sorted_eval = '/home/caffe/face_eval/sorted'
emb_dir = '/datadrive/images/activelearning/ccna_add_train_reference_crop_test_emb'

random.seed(0)

sku_mean_dict={}
query_file_pool = []

# Every crop of every SKU is used both for the SKU mean and as a query.
for sku_dir in glob.glob(image_dir + '/*'):
    sku = sku_dir.split('/')[-1]
    images_files = glob.glob(sku_dir + '/*')
    image_files_sample = images_files
    query_file_sample = images_files
    for image_file in image_files_sample:
        save_emb(emb_dir, image_file)
    # The mean is computed from the cached embeddings, so the cache directory must be passed.
    sku_mean_dict[sku] = cal_sku_embedding_mean(emb_dir, image_files_sample)
    query_file_pool.extend(query_file_sample)

In [None]:
# Distance of every crop to the mean embedding of its own SKU.
dist_list = []
for query_file in query_file_pool:
    print("retrieving for {} ...".format(query_file))
    sku_truth = query_file.split('/')[-2]
    query_file_name = query_file.split('/')[-1]
    query_emb = load_emb(emb_dir,query_file)
    dist_sku_mean = np.sqrt(np.sum(np.square(sku_mean_dict[sku_truth]-query_emb)))
    dist_list.append([query_file_name, sku_truth, dist_sku_mean])
dist_df = pd.DataFrame(dist_list, columns=['query_file_name','sku_truth','dist2sku_mean'])
# Rank crops inside each SKU from closest to farthest. method='first' gives every crop a
# distinct whole-number rank; the default 'average' gives tied crops a fractional rank that
# collapses to duplicate prefixes once it is converted with int() for the sorted file names.
dist_df['sorted_index'] = dist_df.groupby('sku_truth')['dist2sku_mean'].rank(method='first', ascending=True)

In [None]:
# Persist the per-crop distance to its SKU mean together with the within-SKU rank.
dist_df.to_csv('sku_intra_cluster_dist.csv',index=False)

In [None]:
# Remove any previous sorted copy so stale rank-prefixed files do not mix with this run.
# A missing directory (first run) is ignored instead of printing a shell error.
shutil.rmtree(sorted_image_dir, ignore_errors=True)

In [None]:
# Copy every crop into <sorted_image_dir>/<sku>/ with its zero-padded rank as file name
# prefix, so a plain directory listing shows the crops from closest to farthest.
for _, entry in dist_df.iterrows():
    sku_name = entry['sku_truth']
    crop_name = entry['query_file_name']
    target_dir = os.path.join(sorted_image_dir, sku_name)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    rank_prefix = '{:05d}'.format(int(entry['sorted_index']))
    shutil.copy(os.path.join(image_dir, sku_name, crop_name),
                os.path.join(target_dir, rank_prefix+'_'+crop_name))

## given a reference image from PMS, calculate its distance from other images in test

In [None]:
# Read the PMS reference image with OpenCV, pass it through
# resize_image_to_larger_dimension_and_pad with a target of (image_size, image_size) and
# pad_value=255, and write the result next to the original with a "_resized" suffix.
# NOTE(review): presumably the helper scales the longer side to image_size and pads the
# remainder with white — confirm in utils.image.
ref_image = '/datadrive/images/activelearning/ref_pms/1018120/Red Bull Sugarfree16.9.png'
image_ref = cv2.imread(ref_image)
image_ref = resize_image_to_larger_dimension_and_pad(image_ref, (image_size,image_size), pad_value=255)
cv2.imwrite('/datadrive/images/activelearning/ref_pms/1018120/Red Bull Sugarfree16.9_resized.png',image_ref)

In [None]:
ref_image = '/datadrive/images/activelearning/ref_pms/1018120/Red Bull Sugarfree16.9_resized.png'
image_dir = '/datadrive/images/activelearning/ccna_add_train_reference_crop_test'
# Embedding caches: one for the PMS reference image, one for the sampled test crops.
ref_pms_emb_dir = '/datadrive/images/activelearning/ref_pms_emb'
test_crop_emb_dir = image_dir + '_emb'
sample_num = 5
# Seed here so the sample does not depend on how much randomness earlier cells consumed.
random.seed(0)

# Draw up to sample_num crops from every SKU folder.
image_file_pool = []
for sku_dir in glob.glob(image_dir + '/*'):
    images_files = glob.glob(sku_dir + '/*')
    image_file_pool.extend(random.sample(images_files,min(sample_num, len(images_files))))

# cal_dist reads embeddings from the cache, so every image has to be cached before it is compared.
save_emb(ref_pms_emb_dir, ref_image)
lst = []
for image_file in image_file_pool:
    save_emb(test_crop_emb_dir, image_file)
    dist = cal_dist(ref_pms_emb_dir, ref_image, test_crop_emb_dir, image_file)
    sku = image_file.split('/')[-2]
    file_name = image_file.split('/')[-1]
    lst.append([file_name, sku, dist])
    print('{} {} {}'.format(file_name, sku, dist))

df = pd.DataFrame(lst, columns=['file_name','sku','distance'])
df.to_csv('ref_test_check.csv',index=False)

# TCCC Evaluation - Distance2Mean

* Generate embedding and save

In [None]:
## Put X% of the query images into the training (reference) set if needed

query_image_dir = '/datadrive/images/activelearning/tccc_fridge_image_crop'
ref_image_dir = '/datadrive/images/activelearning/tccc_reference_image_crop'

train_dir = '/datadrive/images/activelearning/tccc_train'
test_dir = '/datadrive/images/activelearning/tccc_test'
train_emb_dir = '/datadrive/images/activelearning/tccc_train_emb'
test_emb_dir = '/datadrive/images/activelearning/tccc_test_emb'

# Fraction (0..1) of each SKU's query crops that is moved into the training set.
sample_perc= 0


def _copy_as_jpg(source_file, dest_dir, prefix):
    """Copy source_file into dest_dir as '<prefix><name without extension>.jpg'.

    Only the final extension is removed, so two source files whose names differ
    after the first dot no longer overwrite each other.
    NOTE(review): the file content is copied unchanged; a non-JPEG source keeps its
    original encoding under the '.jpg' name — confirm the readers cope with that.
    """
    stem = os.path.splitext(source_file.split('/')[-1])[0]
    shutil.copy(source_file, os.path.join(dest_dir, prefix + stem + '.jpg'))


# Start from empty train/test folders; a missing folder (first run) is ignored.
# NOTE(review): train_emb_dir / test_emb_dir are not cleared here, and save_emb skips
# existing cache files by default — confirm stale embeddings cannot be picked up.
for stale_dir in (train_dir, test_dir):
    shutil.rmtree(stale_dir, ignore_errors=True)

# Every reference crop goes into the training set.
for sku_dir in glob.glob(ref_image_dir + '/*'):
    sku = sku_dir.split('/')[-1]
    if not os.path.exists(os.path.join(train_dir, sku)):
        os.makedirs(os.path.join(train_dir, sku))
    train_file_sample = glob.glob(sku_dir + '/*')
    for train_file in train_file_sample:
        _copy_as_jpg(train_file, os.path.join(train_dir, sku), 'train_')

# Query crops: sample_perc of them join the training set, the rest form the test set.
for sku_dir in glob.glob(query_image_dir + '/*'):
    sku = sku_dir.split('/')[-1]
    if not os.path.exists(os.path.join(train_dir, sku)):
        os.makedirs(os.path.join(train_dir, sku))
    if not os.path.exists(os.path.join(test_dir, sku)):
        os.makedirs(os.path.join(test_dir, sku))
    query_file_sample = glob.glob(sku_dir + '/*')
    train_file_sample = random.sample(query_file_sample,min(int(sample_perc*len(query_file_sample)), len(query_file_sample)))
    # Set lookup keeps the split linear in the number of crops.
    promoted = set(train_file_sample)
    test_file_sample = [image_file for image_file in query_file_sample if image_file not in promoted]
    for train_file in train_file_sample:
        # Promoted query crops keep the 'test_' prefix, which marks their origin.
        _copy_as_jpg(train_file, os.path.join(train_dir, sku), 'test_')
    for test_file in test_file_sample:
        _copy_as_jpg(test_file, os.path.join(test_dir, sku), 'test_')
                          


In [None]:
## generate embedding

# Image folders (one sub-folder per SKU) and the matching embedding caches.
query_image_dir = '/datadrive/images/activelearning/ccna_add_train_reference_crop_test'
ref_image_dir = '/datadrive/images/activelearning/ccna_add_train_reference_crop'
query_emb_dir = '/datadrive/images/activelearning/ccna_add_train_reference_crop_test_emb'
ref_emb_dir = '/datadrive/images/activelearning/ccna_add_train_reference_crop_emb'

# Alternative configuration: evaluate on the TCCC train/test split instead.
#query_image_dir = test_dir
#ref_image_dir = train_dir
#query_emb_dir = test_emb_dir
#ref_emb_dir = train_emb_dir

sku_mean_dict={}
db_file_pool = []
query_file_pool = []

# Query side: cache an embedding for every crop and remember the file.
for sku_dir in glob.glob(query_image_dir + '/*'):
    query_file_sample = glob.glob(sku_dir + '/*')
    for image_file in query_file_sample:
        save_emb(query_emb_dir, image_file)
    query_file_pool += query_file_sample

# Reference side: cache the embeddings, remember the files and store the per-SKU mean.
for sku_dir in glob.glob(ref_image_dir + '/*'):
    ref_file_sample = glob.glob(sku_dir + '/*')
    for image_file in ref_file_sample:
        save_emb(ref_emb_dir, image_file)
    db_file_pool += ref_file_sample
    sku = sku_dir.split('/')[-1]
    sku_mean_dict[sku] = cal_sku_embedding_mean(ref_emb_dir,ref_file_sample)

* Calculate distance to sku mean, classification and accuracy

In [None]:
# Load every reference embedding once; reading it again for each (query, reference) pair
# would repeat the same disk access for every query.
db_entries = [(db_file.split('/')[-1], db_file.split('/')[-2], load_emb(ref_emb_dir, db_file))
              for db_file in db_file_pool]

retrieval_list = []
for query_file in query_file_pool:
    print("retrieving for {} ...".format(query_file))
    dist_list = []
    sku_truth = query_file.split('/')[-2]
    query_file_name = query_file.split('/')[-1]
    query_emb = load_emb(query_emb_dir,query_file)

    # Distance from this query to each SKU mean, computed once per SKU.
    mean_dist_by_sku = {}
    for db_file_name, sku, db_emb in db_entries:
        # Same Euclidean distance cal_dist computes, on the already loaded arrays.
        dist = np.sqrt(np.sum(np.square(query_emb-db_emb)))
        if sku not in mean_dist_by_sku:
            mean_dist_by_sku[sku] = np.sqrt(np.sum(np.square(sku_mean_dict[sku]-query_emb)))
        dist_list.append([db_file_name, sku, dist, mean_dist_by_sku[sku]])

    dist_df = pd.DataFrame(dist_list, columns=['db_file_name','sku','dist','dist_sku_mean'])
    # Closest SKU by nearest reference crop and by nearest SKU mean (first match on ties).
    closest_sku_by_dist = dist_df[dist_df['dist'] == min(dist_df['dist'])]['sku'].tolist()[0]
    closest_sku_by_dist2mean = dist_df[dist_df['dist_sku_mean'] == min(dist_df['dist_sku_mean'])]['sku'].tolist()[0]
    retrieval_list.append([query_file_name,sku_truth,closest_sku_by_dist,closest_sku_by_dist2mean])

retrieval_df = pd.DataFrame(retrieval_list,
                                columns=['query_file_name','sku_truth','closest_sku_by_dist','closest_sku_by_dist2mean'])
retrieval_df['by_dist_eval'] = retrieval_df['sku_truth'] == retrieval_df['closest_sku_by_dist']
retrieval_df['by_dist2mean_eval'] = retrieval_df['sku_truth'] == retrieval_df['closest_sku_by_dist2mean']
by_dist_accuracy = sum(retrieval_df['by_dist_eval'])*1.0/len(retrieval_df)
by_dist2mean_accuracy = sum(retrieval_df['by_dist2mean_eval'])*1.0/len(retrieval_df)

retrieval_df.to_csv('tccc_classification_eval.csv',index=False)

In [None]:
# Fraction of queries classified correctly by nearest reference crop and by nearest SKU mean,
# recomputed from retrieval_df so the cell can be re-run without repeating the retrieval loop.
by_dist_accuracy = sum(retrieval_df['by_dist_eval'])*1.0/len(retrieval_df)
by_dist2mean_accuracy = sum(retrieval_df['by_dist2mean_eval'])*1.0/len(retrieval_df)

In [None]:
# Accuracy of assigning each query the SKU of its closest reference crop.
by_dist_accuracy

In [None]:
# Accuracy of assigning each query the SKU with the closest mean embedding.
by_dist2mean_accuracy

## Train SVM/KNN classifier

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC, NuSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# Labels are the SKU folder names; features are the cached embeddings, one row per image.
train_labels = np.array([db_file.split('/')[-2] for db_file in db_file_pool])

# Use the embedding width reported by the model instead of a hard-coded 128, so a model with
# a different embedding size does not fail on assignment.
train_emb = np.zeros((len(db_file_pool), embedding_size))
for i,db_file in enumerate(db_file_pool):
    train_emb[i] = load_emb(ref_emb_dir,db_file)

test_labels = np.array([query_file.split('/')[-2] for query_file in query_file_pool])

test_emb = np.zeros((len(query_file_pool), embedding_size))
for i,query_file in enumerate(query_file_pool):
    test_emb[i] = load_emb(query_emb_dir,query_file)

In [None]:
# Fit two classifiers on the reference embeddings: a 1-nearest-neighbour classifier with the
# Euclidean metric, and a linear SVM (C=50, balanced class weights, up to 100000 iterations).
# The random-forest alternative is left commented out.
knn = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
svc = LinearSVC(C=50,class_weight='balanced',max_iter=100000)
#rf = RandomForestClassifier(n_estimators=100)
knn.fit(train_emb, train_labels)
svc.fit(train_emb, train_labels)
#rf.fit(train_emb, train_labels)

In [None]:
# Accuracy of each fitted classifier on the query embeddings (true label = SKU folder name).
acc_knn = accuracy_score(test_labels, knn.predict(test_emb))
acc_svc = accuracy_score(test_labels, svc.predict(test_emb))
#acc_rf = accuracy_score(test_labels, rf.predict(test_emb))

In [None]:
# Accuracy of the 1-nearest-neighbour classifier.
acc_knn

In [None]:
# Accuracy of the linear SVM.
acc_svc

In [None]:
# Disabled: the random-forest fit and scoring lines are commented out, so acc_rf is never
# defined and evaluating it would raise NameError. Re-enable together with those lines.
#acc_rf

In [None]:
## plot confusion matrix
pred_labels = knn.predict(test_emb)
# Sorted union of true and predicted labels. A set has no defined order and would miss SKUs
# that occur only in the predictions, so the tick labels would not line up with the matrix.
class_names = np.unique(np.concatenate([test_labels, pred_labels]))
# Compute confusion matrix with rows/columns in exactly the order of class_names
cnf_matrix = confusion_matrix(test_labels, pred_labels, labels=class_names)
np.set_printoptions(precision=2)

In [None]:
# Raw-count confusion matrix of the 1-nearest-neighbour predictions on a large canvas.
plt.figure(figsize=(20, 20), dpi=80)
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

In [None]:
# Same confusion matrix with every row divided by its total (normalize=True).
plt.figure(figsize=(20, 20), dpi=80)
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

## Visualize Embedding

In [None]:
from sklearn.manifold import TSNE

# t-SNE starts from a random initialisation; a fixed random_state makes the two maps
# reproducible across runs. Train and test are embedded independently, so positions in one
# plot are not comparable with positions in the other.
train_tsne_emb = TSNE(n_components=2, random_state=0).fit_transform(train_emb)
test_tsne_emb = TSNE(n_components=2, random_state=0).fit_transform(test_emb)


In [None]:
# t-SNE map of the reference embeddings: one colour per SKU, SKU id written at the
# centroid of its points.
plt.figure(figsize=(20, 20), dpi=80)
for t in sorted(set(train_labels)):
    idx = train_labels == t
    sku_points = train_tsne_emb[idx]
    plt.scatter(sku_points[:, 0], sku_points[:, 1], label=t)
    plt.annotate(t, sku_points.mean(axis=0),
                 horizontalalignment='center', verticalalignment='center',
                 size=10, weight='bold', label=t)

plt.legend(bbox_to_anchor=(1, 1));

In [None]:
# t-SNE map of the query embeddings: one colour per SKU, SKU id written at the
# centroid of its points.
plt.figure(figsize=(20, 20), dpi=80)
for t in sorted(set(test_labels)):
    idx = test_labels == t
    sku_points = test_tsne_emb[idx]
    plt.scatter(sku_points[:, 0], sku_points[:, 1], label=t)
    plt.annotate(t, sku_points.mean(axis=0),
                 horizontalalignment='center', verticalalignment='center',
                 size=10, weight='bold', label=t)

plt.legend(bbox_to_anchor=(1, 1));

## PCA visualization

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit the projection once on the reference embeddings and apply that same projection to both
# sets. Two separate fits repeat the work and, when sklearn selects its randomized solver,
# can yield slightly different axes for the train and the test plot.
pca = PCA(n_components=2, random_state=0).fit(train_emb)
train_pca_emb = pca.transform(train_emb)

test_pca_emb = pca.transform(test_emb)


In [None]:
# PCA projection of the reference embeddings: one colour per SKU, SKU id written at the
# centroid of its points.
plt.figure(figsize=(20, 20), dpi=80)
for t in sorted(set(train_labels)):
    idx = train_labels == t
    sku_points = train_pca_emb[idx]
    plt.scatter(sku_points[:, 0], sku_points[:, 1], label=t)
    plt.annotate(t, sku_points.mean(axis=0),
                 horizontalalignment='center', verticalalignment='center',
                 size=10, weight='bold', label=t)

plt.legend(bbox_to_anchor=(1, 1));

In [None]:
# PCA projection of the query embeddings: one colour per SKU, SKU id written at the
# centroid of its points.
plt.figure(figsize=(20, 20), dpi=80)
for t in sorted(set(test_labels)):
    idx = test_labels == t
    sku_points = test_pca_emb[idx]
    plt.scatter(sku_points[:, 0], sku_points[:, 1], label=t)
    plt.annotate(t, sku_points.mean(axis=0),
                 horizontalalignment='center', verticalalignment='center',
                 size=10, weight='bold', label=t)

plt.legend(bbox_to_anchor=(1, 1));