In [None]:
import os
import pandas as pd
import numpy as np
import json
import random
from keras.layers.core import Flatten, Dropout
from keras.layers import Input, Dense, Lambda, Layer
from keras import backend as K
from keras import applications
from keras.models import Sequential, Model
from keras.optimizers import RMSprop, Adam
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.applications.resnet import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.resnet import ResNet152
from tensorflow.keras.models import load_model
from statistics import median

In [None]:
### set project variables

In [None]:
# Switch between the Google Colab (Drive-mounted) layout and the local layout.
use_colab=False

if not use_colab:
    # Local layout: dataset and workspace live in separate directories.
    dataset_directory = '/kuacc/users/asafa22/google-drive/wikihow'
    workspace = '/kuacc/users/asafa22/workspace/wikihow'
else:
    # Colab layout: Google Drive has to be mounted before the paths resolve.
    from google.colab import drive

    drive.mount('/content/drive')
    dataset_directory = '/content/drive/MyDrive/Colab/datasets/wikihow'
    workspace = '/content/drive/MyDrive/Colab/wikihow'

# Raw article/step description file shipped with the dataset.
article_file_path = os.path.join(dataset_directory, 'articles.json')

# Everything produced for the Siamese experiments sits under <workspace>/Siamese.
siamese_workspace = os.path.join(workspace, 'Siamese')
text_image_mapping_file_path = os.path.join(siamese_workspace, 'articles.csv')
word_vector_file_path = os.path.join(siamese_workspace, 'word2vec_gensim.csv')
siamese_model_directory = os.path.join(siamese_workspace, 'model')

# Note: this is a path relative to the current working directory.
dual_encoder_model_directory = os.path.join('dual_encoder')

# Size of the evaluation: number of query images, and wrong options per query.
numbder_of_test_samples = 3
number_of_negative_options = 3

In [None]:
### implement wikihow functions

In [None]:
def parse_wikihow_article_file(text_image_mapping_file_path):
    """Flatten the wikiHow article JSON file into one DataFrame row per step.

    The JSON file is expected to map each article title to an object with a
    'categories' list and a 'steps' list; every step provides 'step_id',
    'step' (the text) and 'image'.

    Parameters
    ----------
    text_image_mapping_file_path : str
        Path of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        Columns 'key', 'article', 'categories', 'step' and 'image', indexed by
        a running integer named 'id' (0, 1, 2, ... in file order).
        'key' is '<article>:<step_id>'; 'categories' is the article's category
        list joined with ', ' with every 'Category:' prefix removed.

    Raises
    ------
    KeyError
        If an article or step lacks one of the expected fields.
    """
    columns = ['key', 'article', 'categories', 'step', 'image']

    # The context manager closes the file even if parsing fails; JSON is UTF-8.
    with open(text_image_mapping_file_path, encoding='utf-8') as article_file:
        articles = json.load(article_file)

    # Collect plain tuples and build the frame once: appending with df.loc[...]
    # re-allocates the frame for every row.
    records = []
    for article, content in articles.items():
        categories_as_text = ", ".join(content['categories']).replace('Category:', '')
        for step in content['steps']:
            step_key = article + ":" + str(step['step_id'])
            records.append(
                (step_key, article, categories_as_text, step['step'], step['image'])
            )

    df = pd.DataFrame(records, columns=columns)
    df.index.name = 'id'
    return df

In [None]:
### read data

In [None]:
# Flatten the article file into one row per step and report how many rows were read.
dataset = parse_wikihow_article_file(article_file_path)
print('Number of data rows = ', dataset.shape[0])

Number of data rows =  31966


In [None]:
### create test sample dataframe

In [None]:
# Draw the evaluation rows at random; each row supplies a query image together
# with the step that belongs to it (the positive candidate).
test_samples = dataset.sample(n=numbder_of_test_samples)

In [None]:
### construct the random test data

In [None]:
# Construct the random-sample test set: every test pairs a sampled image with
# its own step (positive) and with steps drawn at random from the rest of the
# dataset (negatives).
candidate_fields = ['categories', 'article', 'step']
random_candidates = []

for sample_id, row in test_samples.iterrows():
    # Remove the positive row before sampling so it can never be drawn as one
    # of its own negatives.
    negative_step_df = dataset.drop(index=sample_id).sample(number_of_negative_options)

    random_candidates.append({
        'image': row['image'],
        'positive_candidate': {field: row[field] for field in candidate_fields},
        'negative_candidates': [
            {field: negative_row[field] for field in candidate_fields}
            for _, negative_row in negative_step_df.iterrows()
        ],
    })

print(random_candidates)

[{'image': 'images/Build_Glutes/11.jpg', 'positive_candidate': {'categories': 'Gluteus Muscle Workouts, Weights for Strength Training', 'article': 'Build Glutes', 'step': 'Eat the right kind of fat. You need a certain amount of healthy fats to maintain good health. Good fats are usually liquid at room temperature. Fats such as olive oil, flaxseed oil, and safflower oil are good. If a fat is solid at room temperature, such as butter, avoid it.[32]'}, 'negative_candidates': [{'categories': 'Cholesterol and Lipoproteins, Desserts and Sweets, Featured Articles', 'article': 'Enjoy Cholesterol‐Friendly Desserts', 'step': 'Visit a vegan bakery. To really indulge in cholesterol-friendly, vegan desserts, visit a vegan bakery to buy a plant-based treat. Vegan-friendly diet choices are rising in popularity, and with more people aiming to move away from meat and milk ingredients, vegan businesses are on the rise as a result. To find a vegan bakery near you, visit VegGuide.org, a community-based we

In [None]:
### construct the in-category test data

In [None]:
# Construct the in-category test set: negatives are steps of articles that
# share at least one category with the sampled (positive) step.
def _split_categories(categories_as_text):
    """Return the set of category names held in a comma-separated string.

    Names are stripped because the 'categories' column is joined with ', ';
    splitting on ',' alone leaves a leading space on every name but the first,
    which makes equal categories compare as different. Empty names are dropped
    so that rows without categories do not match each other.
    """
    return {name.strip() for name in categories_as_text.split(',') if name.strip()}


candidate_fields = ['categories', 'article', 'step']
category_candidates = []

for sample_id, row in test_samples.iterrows():
    # Parsed once per sample instead of once per dataset row.
    sample_categories = _split_categories(row['categories'])

    shares_category = dataset['categories'].apply(
        lambda text: not sample_categories.isdisjoint(_split_categories(text))
    )
    # The positive row shares categories with itself; take it out of the pool
    # so it cannot be offered as a negative.
    negative_pool = dataset.loc[shares_category].drop(index=sample_id, errors='ignore')
    # Small categories may hold fewer rows than requested: take what is there.
    negative_goal_df = negative_pool.sample(
        min(number_of_negative_options, len(negative_pool))
    )

    category_candidates.append({
        'image': row['image'],
        'positive_candidate': {field: row[field] for field in candidate_fields},
        'negative_candidates': [
            {field: negative_row[field] for field in candidate_fields}
            for _, negative_row in negative_goal_df.iterrows()
        ],
    })

print(category_candidates)

[{'image': 'images/Build_Glutes/11.jpg', 'positive_candidate': {'categories': 'Gluteus Muscle Workouts, Weights for Strength Training', 'article': 'Build Glutes', 'step': 'Eat the right kind of fat. You need a certain amount of healthy fats to maintain good health. Good fats are usually liquid at room temperature. Fats such as olive oil, flaxseed oil, and safflower oil are good. If a fat is solid at room temperature, such as butter, avoid it.[32]'}, 'negative_candidates': [{'categories': 'Gluteus Muscle Workouts, Leg Strengthening Exercises', 'article': 'Do a Glute Targeted Bulgarian Split Squat', 'step': 'Find your standing position. Your standing position should be near, but not touching, the bench. Facing away from the bench, utilize a mirror to find enough space between yourself and the bench to allow enough room to lift your foot backwards.\nThis space will look different for everyone as we all have different leg lengths and flexibility range.'}, {'categories': 'Gluteus Muscle Wor

In [None]:
### construct the in-article test data

In [None]:
# Construct the in-article test set: negatives are the other steps of the
# article that the sampled (positive) step belongs to.
candidate_fields = ['categories', 'article', 'step']
article_candidates = []

for sample_id, row in test_samples.iterrows():
    same_article = dataset.loc[dataset['article'] == row['article']]
    # The positive step is part of its own article; remove it so it is never
    # presented as a wrong answer.
    negative_pool = same_article.drop(index=sample_id, errors='ignore')
    # Short articles may have fewer remaining steps than requested: take what is there.
    negative_candidate_df = negative_pool.sample(
        min(number_of_negative_options, len(negative_pool))
    )

    article_candidates.append({
        'image': row['image'],
        'positive_candidate': {field: row[field] for field in candidate_fields},
        'negative_candidates': [
            {field: negative_row[field] for field in candidate_fields}
            for _, negative_row in negative_candidate_df.iterrows()
        ],
    })

In [None]:
# Construct the goal test set: candidates are whole articles (title plus
# categories) rather than single steps; negatives are other articles.
goal_candidates = []
unique_articles = dataset['article'].unique()

# Categories of the first row of every article, looked up once up front
# instead of filtering the whole frame for each negative.
first_rows = dataset.drop_duplicates(subset='article')
categories_by_article = dict(zip(first_rows['article'], first_rows['categories']))

for sample_id, row in test_samples.iterrows():
    article = row['article']
    other_articles = [candidate for candidate in unique_articles if candidate != article]

    # random.sample draws without replacement, so an article cannot appear
    # twice among the negatives; capping the count keeps this finite when
    # fewer other articles exist than negatives were requested.
    negative_articles = random.sample(
        other_articles, min(number_of_negative_options, len(other_articles))
    )

    goal_candidates.append({
        'image': row['image'],
        'positive_candidate': {'categories': row['categories'], 'article': article},
        'negative_candidates': [
            {
                'categories': categories_by_article[negative_article],
                'article': negative_article,
            }
            for negative_article in negative_articles
        ],
    })

In [None]:
### compute similarity scores with the Siamese model

In [None]:
def find_siamese_similarity(model, image_path, query_embeddings):
    """Score one image against every text embedding in `query_embeddings`.

    The image is loaded at 224x224, converted to an array and run through
    `preprocess_input`; it is then repeated once per query so the model gets
    one (text, image) pair per candidate.

    Parameters
    ----------
    model : object exposing `predict([text_batch, image_batch])`.
    image_path : path of the image file to load.
    query_embeddings : sequence of embeddings, each assignable to a row of
        length 512.

    Returns
    -------
    Whatever `model.predict` returns for the batch of pairs.
    """
    pair_count = len(query_embeddings)

    # Load and preprocess the image; the result has a leading batch axis of 1.
    pixels = load_img(image_path, target_size=(224, 224))
    pixels = preprocess_input(np.expand_dims(img_to_array(pixels), axis=0))

    # One text row per candidate.
    text_batch = np.zeros((pair_count, 512))
    for row_number, embedding in enumerate(query_embeddings):
        text_batch[row_number] = embedding

    # The same image for every candidate (broadcast over the first axis).
    image_batch = np.zeros((pair_count, 224, 224, 3))
    image_batch[:] = pixels

    return model.predict([text_batch, image_batch])

In [None]:
### Do the siamese test


In [None]:
def test_siamese_similarity(model, tests, key, article_file_path, query_embedding_file_path):    
    articles = pd.read_csv(article_file_path, header=0)
    embeddings = pd.read_csv(word_vector_file_path, header=None)
    
    for test in tests:
        text_embeddings = []
        image = test['image']
        image_path = os.path.join(dataset_directory,image)
        
        positive_candidate = test['positive_candidate']
        negative_candidates = test['negative_candidates']
        positive_candodidate_index = articles.loc[articles[key] == positive_candidate[key]].index[0]
        #print('positive_candodidate_index ', positive_candodidate_index)
        positive_candidate_embedding = embeddings.iloc[positive_candodidate_index]
        text_embeddings.append(positive_candidate_embedding)
        
        for negative_candidate in negative_candidates:
            negative_candodidate_index = articles.loc[articles[key] == negative_candidate[key]].index[0]
            #print('negative_candodidate_index ', negative_candodidate_index)
            negative_candodidate_embedding = embeddings.iloc[negative_candodidate_index]
            #print('negative_candodidate_embedding ', negative_candodidate_embedding)
            text_embeddings.append(negative_candodidate_embedding)
        
        predicts = find_siamese_similarity(model, image_path, text_embeddings)
        print(predicts)
        

In [None]:
# Load the saved Siamese model. compile=False skips compilation; the model is
# only used for prediction in this cell.
model = load_model(
    siamese_model_directory, custom_objects=None, compile=False, options=None
)
print('model loaded')

# Score the random-negative test set, matching candidates on their step text.
test_siamese_similarity(model, random_candidates, 'step', text_image_mapping_file_path, word_vector_file_path)

[[0.39048558]
 [0.39048558]
 [0.39048558]
 [0.39048558]]
[[0.39345193]
 [0.39345193]
 [0.39345193]
 [0.39345193]]
[[0.390279]
 [0.390279]
 [0.390279]
 [0.390279]]
