The notebook is used to compare the similarity between labels generated from Amazon Rekognition and the Vizwiz groundtruth captions using  

Method 1. TF-IDF representation and cosine similarity.  
Method 2. BoW representation and Jaccard similarity.  

In [None]:
import pandas as pd
import numpy as np
import statistics
import os
import json
import zipfile

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt') # for tokenization
nltk.download('stopwords') # for removing stopword
nltk.download('wordnet') # for lemmatization
stop_words = set(stopwords.words('english'))

pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Download image-labels data AWS Rekognition processed on validation images.  

In [None]:
file_path = '/content/drive/My Drive/Capstone Project/rekognition_val_outputs.csv'
aws_df=pd.read_csv(file_path)

In [None]:
aws_df.head()

Unnamed: 0,image_id,tokens
0,0,"['Computer', 'Electronics', 'Laptop', 'Pc', 'Screen', 'Computer Hardware', 'Hardware', 'Monitor']"
1,1,"['Food', 'Seasoning', 'Syrup']"
2,2,"['Publication', 'Book', 'Text', 'Animal', 'Canine', 'Dog', 'Mammal', 'Pet']"
3,3,['Toothpaste']
4,4,"['Electronics', 'Screen', 'Computer Hardware', 'Hardware', 'Monitor', 'TV']"


In [None]:
aws_df.shape

(7750, 2)

In [None]:
aws_df = aws_df.rename(columns={"tokens":"Labels"})

In [None]:
aws_df.head()

Unnamed: 0,image_id,Labels
0,0,"['Computer', 'Electronics', 'Laptop', 'Pc', 'Screen', 'Computer Hardware', 'Hardware', 'Monitor']"
1,1,"['Food', 'Seasoning', 'Syrup']"
2,2,"['Publication', 'Book', 'Text', 'Animal', 'Canine', 'Dog', 'Mammal', 'Pet']"
3,3,['Toothpaste']
4,4,"['Electronics', 'Screen', 'Computer Hardware', 'Hardware', 'Monitor', 'TV']"


Preprocess aws labels

In [None]:
def preprocess_text(text):
    # lowercase
    text = text.lower()

    # tokenize text
    tokens = word_tokenize(text)

    # remove stopwords
    stop_words = set(stopwords.words('english'))
    no_stopwords_tokens = [word for word in tokens if word not in stop_words]

    # lemmatize text
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in no_stopwords_tokens]

    text = ' '.join(lemmatized_tokens)

    return text

In [None]:
# apply text preprocessing
aws_df['preprocessed_text'] = aws_df['Labels'].apply(preprocess_text)

aws_df.head()

Unnamed: 0,image_id,Labels,preprocessed_text
0,0,"['Computer', 'Electronics', 'Laptop', 'Pc', 'Screen', 'Computer Hardware', 'Hardware', 'Monitor']","[ 'computer ' , 'electronics ' , 'laptop ' , 'pc ' , 'screen ' , 'computer hardware ' , 'hardware ' , 'monitor ' ]"
1,1,"['Food', 'Seasoning', 'Syrup']","[ 'food ' , 'seasoning ' , 'syrup ' ]"
2,2,"['Publication', 'Book', 'Text', 'Animal', 'Canine', 'Dog', 'Mammal', 'Pet']","[ 'publication ' , 'book ' , 'text ' , 'animal ' , 'canine ' , 'dog ' , 'mammal ' , 'pet ' ]"
3,3,['Toothpaste'],[ 'toothpaste ' ]
4,4,"['Electronics', 'Screen', 'Computer Hardware', 'Hardware', 'Monitor', 'TV']","[ 'electronics ' , 'screen ' , 'computer hardware ' , 'hardware ' , 'monitor ' , 'tv ' ]"


Load validation data from VizWiz

In [None]:
#extract annotations (val.json, train.json, test.json)
zip_files = ['/content/drive/MyDrive/Capstone Project/annotations.zip']

#destination folder for extracted files
destination_folder = '/content/'

#create the destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

#loop through each zip file
for zip_file in zip_files:
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        #extract the contents of the zip file to the destination folder
        zip_ref.extractall(destination_folder)

In [None]:
with open('annotations/val.json', 'r') as f:
    annotation_data = json.load(f)

annotation_data.keys()

dict_keys(['info', 'images', 'annotations'])

In [None]:
len(annotation_data['images'])

7750

In [None]:
annotation_data['annotations'][:2]

[{'caption': 'A computer screen shows a repair prompt on the screen.',
  'image_id': 23431,
  'is_precanned': False,
  'is_rejected': False,
  'id': 117155,
  'text_detected': True},
 {'caption': 'a computer screen with a repair automatically pop up',
  'image_id': 23431,
  'is_precanned': False,
  'is_rejected': False,
  'id': 117156,
  'text_detected': True}]

In [None]:
caption_list = []
id_list = []
for captions in annotation_data['annotations']:
    caption_list.append(captions['caption'])
    id_list.append(captions['image_id'])

In [None]:
viz_df = pd.DataFrame(data={'caption': caption_list, 'image_id': id_list})

In [None]:
viz_df.head()

Unnamed: 0,caption,image_id
0,A computer screen shows a repair prompt on the screen.,23431
1,a computer screen with a repair automatically pop up,23431
2,partial computer screen showing the need of repairs,23431
3,Part of a computer monitor showing a computer repair message.,23431
4,The top of a laptop with a blue background and dark blue text.,23431


In [None]:
# adjust image id and image name
viz_df['image_id'] = viz_df['image_id'] - 23431

In [None]:
viz_df.head(10)

Unnamed: 0,caption,image_id
0,A computer screen shows a repair prompt on the screen.,0
1,a computer screen with a repair automatically pop up,0
2,partial computer screen showing the need of repairs,0
3,Part of a computer monitor showing a computer repair message.,0
4,The top of a laptop with a blue background and dark blue text.,0
5,A person is holding a bottle that has medicine for the night time.,1
6,A bottle of medication has a white twist top.,1
7,night time medication bottle being held by someone,1
8,a person holding a small black bottle of NIGHT TIME,1
9,A bottle of what appears to be cough syrup held in hand.,1


In [None]:
# for each image, combine the five captions into a single list
viz_combined = viz_df.groupby('image_id')['caption'].apply(' '.join).reset_index()

viz_combined.rename(columns={'caption': 'combined_captions'}, inplace=True)

In [None]:
# apply preprocessing
viz_combined['preprocessed_text'] = viz_combined['combined_captions'].apply(preprocess_text)

In [None]:
viz_combined.head()

Unnamed: 0,image_id,combined_captions,preprocessed_text
0,0,A computer screen shows a repair prompt on the screen. a computer screen with a repair automatically pop up partial computer screen showing the need of repairs Part of a computer monitor showing a computer repair message. The top of a laptop with a blue background and dark blue text.,computer screen show repair prompt screen . computer screen repair automatically pop partial computer screen showing need repair part computer monitor showing computer repair message . top laptop blue background dark blue text .
1,1,A person is holding a bottle that has medicine for the night time. A bottle of medication has a white twist top. night time medication bottle being held by someone a person holding a small black bottle of NIGHT TIME A bottle of what appears to be cough syrup held in hand.,person holding bottle medicine night time . bottle medication white twist top . night time medication bottle held someone person holding small black bottle night time bottle appears cough syrup held hand .
2,2,"a white paper showing an image of black and brown dog A library book with pictures of two dogs on the cover on a wooden table. A book with a black and a tan dog walking down a snowy street. The book cover shows two dogs in the snow A book cover title Dog Years with an image of a black and brown dog walking up the street, on the left side it has a due date sticker from a library.","white paper showing image black brown dog library book picture two dog cover wooden table . book black tan dog walking snowy street . book cover show two dog snow book cover title dog year image black brown dog walking street , left side due date sticker library ."
3,3,"A white box is to the left of a blue box on a wooden table. A small rectangular red and white box next to a small rectangular blue box on a wooden surface. two boxes of medicine, one white and red and the other blue sitting on a table Two boxes that appear to contain medication or eye drops Two boxes of pharmaceutical products left in a table","white box left blue box wooden table . small rectangular red white box next small rectangular blue box wooden surface . two box medicine , one white red blue sitting table two box appear contain medication eye drop two box pharmaceutical product left table"
4,4,close up of a computer monitor that is powered on. A monitor has a message displayed on it. Pictured here is a screenshot that shows an error message from an app. Computer screen displaying an error saying the display driver is not supported by Zoom Text. a screenshot of someone's monitor that is having issues,close computer monitor powered . monitor message displayed . pictured screenshot show error message app . computer screen displaying error saying display driver supported zoom text . screenshot someone 's monitor issue


## Method 1: TF-IDF representation & Cosine similarity

Create a TF-IDF representation for preprocessed_text column in both aws_df and viz_combined dataframes.  

In [None]:
# define a tf-idf function

def create_tfidf(dataframe, column):
    tfidf_vectorizer = TfidfVectorizer()

    tfidf_matrix = tfidf_vectorizer.fit_transform(dataframe[column])

    feature_names = tfidf_vectorizer.get_feature_names_out()

    return tfidf_matrix, feature_names

In [None]:
tfidf_aws, features_aws = create_tfidf(aws_df, 'preprocessed_text')
tfidf_viz_combined, features_viz_combined = create_tfidf(viz_combined, 'preprocessed_text')

In [None]:
print("TF-IDF Representation (DataFrame aws_df):", tfidf_aws.toarray())
print("Shape of TF-IDF Representation (DataFrame aws_df):", tfidf_aws.toarray().shape)
print("TF-IDF Feature Names (DataFrame aws_df):", features_aws)

print("TF-IDF Representation (DataFrame viz_combined):", tfidf_viz_combined.toarray())
print("Shape of TF-IDF Representation (DataFrame viz_combined):", tfidf_viz_combined.toarray().shape)
print("TF-IDFFeature Names (DataFrame viz_combined):", features_viz_combined)

TF-IDF Representation (DataFrame aws_df): [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Shape of TF-IDF Representation (DataFrame aws_df): (7750, 864)
TF-IDF Feature Names (DataFrame aws_df): ['abyssinian' 'accessories' 'activity' 'adapter' 'adult' 'advertisement'
 'aftershave' 'agapanthus' 'aid' 'aircraft' 'airfield' 'airplane'
 'airport' 'alcohol' 'alloy' 'altar' 'aluminium' 'american' 'amplifier'
 'analog' 'anemone' 'angel' 'animal' 'ankle' 'apartment' 'apiaceae'
 'apple' 'appliance' 'applique' 'aquarium' 'aquatic' 'arbour'
 'archaeology' 'architecture' 'area' 'arena' 'arm' 'armchair' 'armor'
 'arrangement' 'arrow' 'arrowhead' 'art' 'ashtray' 'astragalus'
 'astronomy' 'avocado' 'baby' 'back' 'backpack' 'backyard' 'badge' 'bag'
 'ball' 'balloon' 'banana' 'bandage' 'banner' 'barefoot' 'baseball'
 'basketball' 'bass' 'bath' 'bathing' 'bathroom' 'bathtub' 'bead' 'beak'
 'beanie'

We can see from the shape that the dimensions of two tf-idf representations are different. Need to train with combined vocabulary in order to compute cosine similarity. If not, it arises incompatible dimension value error.

In [None]:
def create_full_voc_tfidf(dataframe1, dataframe2, column1, column2):

  full_voc = dataframe1[column1].to_list() + dataframe2[column2].to_list()

  tfidf_vectorizer = TfidfVectorizer()
  tfidf_vectorizer.fit(full_voc)

  tfidf_matrix_1 = tfidf_vectorizer.transform(dataframe1[column1])
  tfidf_matrix_2 = tfidf_vectorizer.transform(dataframe2[column2])

  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()


  return tfidf_matrix_1, tfidf_matrix_2, tfidf_feature_names


In [None]:
tfidf_full_aws, tfidf_full_viz_combined, tfidf_full_feature = create_full_voc_tfidf(aws_df, viz_combined, 'preprocessed_text','preprocessed_text')

In [None]:
print("TF-IDF Representation (DataFrame aws_df):", tfidf_full_aws.toarray())
print("Shape of TF-IDF Representation (DataFrame aws_df):", tfidf_full_aws.toarray().shape)

print("TF-IDF Representation (DataFrame viz_combined):", tfidf_full_viz_combined.toarray())
print("Shape of TF-IDF Representation (DataFrame viz_combined):", tfidf_full_viz_combined.toarray().shape)

print("Full TF-IDF Feature Names from both dataframes:", tfidf_full_feature)

TF-IDF Representation (DataFrame aws_df): [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Shape of TF-IDF Representation (DataFrame aws_df): (7750, 9546)
TF-IDF Representation (DataFrame viz_combined): [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Shape of TF-IDF Representation (DataFrame viz_combined): (7750, 9546)
Full TF-IDF Feature Names from both dataframes: ['00' '000' '00011420' ... 'zoomtext' 'zucca' 'zylo']


Compute Cosine Similarity Scores for corresponding images

In [None]:
# compute cosine similarity (the result is between each pair of images)
cosine_sim_matrix = cosine_similarity(tfidf_full_aws, tfidf_full_viz_combined)

# to get similarity scores for corresponding images, need to extract diagonal elements
corresponding_cosine_sim_scores = cosine_sim_matrix.diagonal()

In [None]:
for index, score in enumerate(corresponding_cosine_sim_scores):
    print(f"Cosine similarity between AWS Rekognition and VizWiz Groundtruth of image {index} is: {score:.3f}")

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
Cosine similarity between AWS Rekognition and VizWiz Groundtruth of image 2750 is: 0.000
Cosine similarity between AWS Rekognition and VizWiz Groundtruth of image 2751 is: 0.000
Cosine similarity between AWS Rekognition and VizWiz Groundtruth of image 2752 is: 0.063
Cosine similarity between AWS Rekognition and VizWiz Groundtruth of image 2753 is: 0.000
Cosine similarity between AWS Rekognition and VizWiz Groundtruth of image 2754 is: 0.000
Cosine similarity between AWS Rekognition and VizWiz Groundtruth of image 2755 is: 0.000
Cosine similarity between AWS Rekognition and VizWiz Groundtruth of image 2756 is: 0.045
Cosine similarity between AWS Rekognition and VizWiz Groundtruth of image 2757 is: 0.309
Cosine similarity between AWS Rekognition and VizWiz Groundtruth of image 2758 is: 0.116
Cosine similarity between AWS Rekognition and VizWiz Groundtruth of image 2759 is: 0.000
Cosine similarity between AWS Rekognition and VizWiz Groundtruth of image 2

In [None]:
print(f"The cosine similarity score of TF-IDF representation of labels from AWS Rekognition with groundtruth VizWiz captions in TF-IDF represeatation is {np.average(corresponding_cosine_sim_scores):.3f}")

The cosine similarity score of TF-IDF representation of labels from AWS Rekognition with groundtruth VizWiz captions in TF-IDF represeatation is 0.122


## Method 2: BoW representation & Jaccard similarity

In [None]:
# Bag of Words representation

def create_BoW(dataframe, column):
    vectorizer = CountVectorizer(binary=True)
    BoW_matrix = vectorizer.fit_transform(dataframe[column])

    feature_names = vectorizer.get_feature_names_out()

    return BoW_matrix, feature_names

In [None]:
BoW_aws, features_aws = create_BoW(aws_df, 'preprocessed_text')
BoW_viz_combined, features_viz_combined = create_BoW(viz_combined, 'preprocessed_text')

In [None]:
print("Shape of BoW Representation (DataFrame aws_df):", BoW_aws.toarray().shape)
print("Feature Names (DataFrame aws_df):", features_aws)

print("Shape of BoW Representation (DataFrame viz_combined):", BoW_viz_combined.toarray().shape)
print("Feature Names (DataFrame viz_combined):", features_viz_combined)

Shape of BoW Representation (DataFrame aws_df): (7750, 864)
Feature Names (DataFrame aws_df): ['abyssinian' 'accessories' 'activity' 'adapter' 'adult' 'advertisement'
 'aftershave' 'agapanthus' 'aid' 'aircraft' 'airfield' 'airplane'
 'airport' 'alcohol' 'alloy' 'altar' 'aluminium' 'american' 'amplifier'
 'analog' 'anemone' 'angel' 'animal' 'ankle' 'apartment' 'apiaceae'
 'apple' 'appliance' 'applique' 'aquarium' 'aquatic' 'arbour'
 'archaeology' 'architecture' 'area' 'arena' 'arm' 'armchair' 'armor'
 'arrangement' 'arrow' 'arrowhead' 'art' 'ashtray' 'astragalus'
 'astronomy' 'avocado' 'baby' 'back' 'backpack' 'backyard' 'badge' 'bag'
 'ball' 'balloon' 'banana' 'bandage' 'banner' 'barefoot' 'baseball'
 'basketball' 'bass' 'bath' 'bathing' 'bathroom' 'bathtub' 'bead' 'beak'
 'beanie' 'bear' 'bed' 'bedroom' 'beef' 'beer' 'bell' 'belt' 'bench'
 'berry' 'beverage' 'bicycle' 'binder' 'bird' 'birthday' 'black'
 'blackboard' 'blade' 'blanket' 'blazer' 'blonde' 'blouse' 'blow'
 'blueberry' 'boa

For BoW representatoin, same that we should train with the full vocabulary for both to avoid dimension mismatch in similarity computation.

In [None]:
def create_full_voc_bow(dataframe1, dataframe2, column1, column2):

  full_voc = dataframe1[column1].to_list() + dataframe2[column2].to_list()

  bow_vectorizer = CountVectorizer()
  bow_vectorizer.fit(full_voc)

  bow_matrix_1 = bow_vectorizer.transform(dataframe1[column1])
  bow_matrix_2 = bow_vectorizer.transform(dataframe2[column2])

  bow_feature_names = bow_vectorizer.get_feature_names_out()

  return bow_matrix_1, bow_matrix_2, bow_feature_names

In [None]:
bow_full_aws, bow_full_viz_combined, bow_full_feature = create_full_voc_bow(aws_df, viz_combined, 'preprocessed_text','preprocessed_text')

In [None]:
print("BoW Representation (DataFrame aws_df):", bow_full_aws.toarray())
print("Shape of BoW Representation (DataFrame aws_df):", bow_full_aws.toarray().shape)

print("BoW Representation (DataFrame viz_combined):", bow_full_viz_combined.toarray())
print("Shape of BoW Representation (DataFrame viz_combined):", bow_full_viz_combined.toarray().shape)

print("Full BoW Feature Names from both dataframes:", bow_full_feature)

BoW Representation (DataFrame aws_df): [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Shape of BoW Representation (DataFrame aws_df): (7750, 9546)
BoW Representation (DataFrame viz_combined): [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Shape of BoW Representation (DataFrame viz_combined): (7750, 9546)
Full BoW Feature Names from both dataframes: ['00' '000' '00011420' ... 'zoomtext' 'zucca' 'zylo']


Per documentation (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.jaccard_score.html)  

'micro':
Calculate metrics globally by counting the total true positives, false negatives and false positives.

'macro':
Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.

'weighted':
Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters ‘macro’ to account for label imbalance.

In [None]:
jaccard_similarities_weighted = []
for i in range(BoW_aws.shape[0]):
  ja_similarity_weighted = jaccard_score(bow_full_aws.toarray()[i], bow_full_viz_combined.toarray()[i], average='weighted')
  jaccard_similarities_weighted.append(ja_similarity_weighted)

In [None]:
print(f"The weighted Jaccard similarity score of labels from AWS Rekognition with groundtruth VizWiz captions in BoW represeatation is {statistics.mean(jaccard_similarities_weighted):.3f}")

The weighted Jaccard similarity score of labels from AWS Rekognition with groundtruth VizWiz captions in BoW represeatation is 0.997


In [None]:
jaccard_similarities_macro = []
for i in range(BoW_aws.shape[0]):
  ja_similarity_macro = jaccard_score(bow_full_viz_combined.toarray()[i], bow_full_aws.toarray()[i], average='macro')
  jaccard_similarities_macro.append(ja_similarity_macro)

In [None]:
print(f"The macro Jaccard similarity score of labels from AWS Rekognition with groundtruth VizWiz captions in BoW represeatation is {statistics.mean(jaccard_similarities_macro):.3f}")

The macro Jaccard similarity score of labels from AWS Rekognition with groundtruth VizWiz captions in BoW represeatation is 0.236


In [None]:
jaccard_similarities_micro = []
for i in range(BoW_aws.shape[0]):
  ja_similarity_micro = jaccard_score(bow_full_viz_combined.toarray()[i], bow_full_aws.toarray()[i], average='micro')
  jaccard_similarities_micro.append(ja_similarity_micro)

In [None]:
print(f"The micro Jaccard similarity score of labels from AWS Rekognition with groundtruth VizWiz captions in BoW represeatation is {statistics.mean(jaccard_similarities_macro):.3f}")

The micro Jaccard similarity score of labels from AWS Rekognition with groundtruth VizWiz captions in BoW represeatation is 0.236
