In [68]:
import ast
import os
import time

import numpy as np
import pandas as pd
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, CLIPModel

In [70]:
# Source table: airline reviews with image URL, ResNet50 label/score and BLIP caption columns.
IMAGE_CAPTION_CSV = "data/silver_20250322_Airline_Reviews_ImageCaption.csv"
df_images = pd.read_csv(IMAGE_CAPTION_CSV)
df_images

Unnamed: 0.1,Unnamed: 0,RowId,Airline Name,Review_Title,Review Date,Top Review Image Url,img_resnet50_score,img_resnet50_label,img_blip_caption
0,0,0,AB Aviation,"""pretty decent airline""",11th November 2019,https://www.airlinequality.com/wp-content/uplo...,5.190325,military aircraft,a man taking a picture of a plane
1,1,13,Adria Airways,"""Would not fly again""",30th June 2018,https://www.airlinequality.com/wp-content/uplo...,6.628997,mail bag,the green leather case is attached to the back...
2,2,108,Aegean Airlines,“claims to be pet-friendly”,27th October 2024,https://www.airlinequality.com/wp-content/uplo...,5.040598,jeep,a woman holding a dog in a cage
3,3,109,Aegean Airlines,“Great value for money!”,13th October 2024,https://www.airlinequality.com/wp-content/uplo...,7.355824,espresso,a cup of coffee on a table
4,4,110,Aegean Airlines,"""75 euro for 1 10 kg bag round trip""",4th October 2024,https://www.airlinequality.com/wp-content/uplo...,5.758941,desk,a computer desk with a monitor and a printer
...,...,...,...,...,...,...,...,...,...
1892,1892,24298,ZIPAIR,"""being periodically harassed and ridiculed""",19th September 2023,https://www.airlinequality.com/wp-content/uplo...,4.510304,photocopier,a woman sitting in a plane
1893,1893,24304,ZIPAIR,"""can highly recommend ZIPAIR""",23rd July 2023,https://www.airlinequality.com/wp-content/uplo...,4.862087,waffle iron,a cell phone holder on an airplane
1894,1894,24312,ZIPAIR,"""I heard back 3 months later""",18th May 2023,https://www.airlinequality.com/wp-content/uplo...,5.286357,assault rifle,a suitcase with a tag attached to it
1895,1895,24320,ZIPAIR,ZIPAIR customer review,21st April 2023,https://www.airlinequality.com/wp-content/uplo...,4.154579,plate,a person holding a fork and knife in a box


#### Restore previous embeddings from backup .npz file

In [None]:
# Restore previously computed CLIP embeddings from the .npz backup, if one exists.
# When no backup is present, `df_image_clip_preembed` is None and later cells
# recompute the embeddings from scratch.
EMBEDDING_BACKUP_FILE = 'data/20250322_Airline_Reviews_CLIPEmbeddings.npz'
if os.path.exists(EMBEDDING_BACKUP_FILE):
    # The backup cell stores 'ImageUrls' from a pandas string column, i.e. as an
    # object array, which NumPy pickles; np.load refuses to read such arrays unless
    # allow_pickle=True.
    # SECURITY: unpickling can execute arbitrary code -- only load backups that
    # were written by this notebook, never files from an untrusted source.
    # The context manager closes the underlying archive once the arrays are read.
    with np.load(EMBEDDING_BACKUP_FILE, allow_pickle=True) as npz_backup:
        df_image_clip_preembed = {key: npz_backup[key] for key in npz_backup.files}

    # Access arrays
    row_ids = df_image_clip_preembed['RowIDs']
    image_urls = df_image_clip_preembed['ImageUrls']
    image_embeddings = df_image_clip_preembed['ImageEmbeddings']
    caption_embeddings = df_image_clip_preembed['CaptionEmbeddings']
    resnet50label_embeddings = df_image_clip_preembed['Resnet50labelEmbeddings']

    print("row_ids:", row_ids)
    print("image_embeddings:", image_embeddings)
    print("caption_embeddings:", caption_embeddings)
    print("resnet50label_embeddings:", resnet50label_embeddings)
else:
    df_image_clip_preembed = None
    
#df_image_clip_preembed

#### CLIP Embed Images

In [72]:
# The model weights and the matching pre-processor are loaded from the same checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
processor = AutoProcessor.from_pretrained(CLIP_CHECKPOINT)
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)

In [74]:
def clip_embedding_image(img_url, row_id=None):
    """Download an image and return its L2-normalised CLIP image embedding.

    Args:
        img_url: URL of the image to download.
        row_id: Optional identifier; only printed alongside the URL for progress logging.

    Returns:
        The tensor produced by ``model.get_image_features`` for this single image,
        divided by its L2 norm along the last dimension, or ``None`` when the
        download, decoding or embedding step fails (the error is printed).
    """
    try:
        print(row_id, img_url)

        # The timeout stops one stalled connection from hanging the whole batch;
        # raise_for_status turns HTTP errors (404, 500, ...) into an explicit failure
        # instead of handing an error page to the image decoder.
        with requests.get(img_url, stream=True, timeout=30) as response:
            response.raise_for_status()
            image = Image.open(response.raw)
            image.load()  # decode fully before the connection is closed

        inputs = processor(images=image, return_tensors="pt")

        with torch.no_grad():  # Disable gradient computation for inference
            image_embeddings = model.get_image_features(**inputs)

        # Normalize the embeddings
        image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)

        # 1-second delay so as not to spam/DOS the source webserver
        time.sleep(1)

        return image_embeddings
    except Exception as e:
        # Report the URL that was actually requested (the parameter is `img_url`).
        print(f"Error processing image {img_url}: {e}")
        return None

In [75]:
# Smoke test: embed one image URL; the second argument is only echoed in the progress log.
clip_embedding_image('https://www.airlinequality.com/wp-content/uploads/2024/10/IMG_5873-500x500.jpeg', 13)

13 https://www.airlinequality.com/wp-content/uploads/2024/10/IMG_5873-500x500.jpeg


tensor([[-1.7717e-02, -4.6647e-02, -3.5812e-02, -7.7319e-03,  3.4006e-02,
         -2.7535e-02,  2.7931e-02,  4.3699e-02,  7.6066e-02,  1.6355e-02,
          4.6908e-02, -1.8241e-02,  1.2569e-02, -1.4163e-02,  6.1898e-02,
          2.5426e-04, -3.5482e-02,  4.8725e-03,  3.3883e-02, -4.0546e-03,
         -5.8251e-02,  1.5348e-02,  3.1560e-02, -1.5834e-02, -1.1362e-03,
          4.2453e-02,  8.5861e-02,  1.5496e-02,  2.3326e-04,  1.3941e-02,
         -1.4455e-02,  2.7522e-02,  7.1328e-03,  1.5114e-02, -2.6777e-02,
          6.0428e-03,  6.9937e-03, -1.8996e-02,  1.0673e-02,  9.2035e-02,
         -8.7404e-04, -4.4117e-02, -2.2013e-02, -6.5910e-02,  3.0903e-02,
         -6.3644e-02, -1.4575e-02,  6.8502e-03, -7.0876e-02, -3.5226e-02,
          2.1512e-02,  1.6209e-02, -3.8574e-04, -7.3284e-04,  7.6769e-03,
          3.4855e-02,  3.3632e-02,  2.1957e-02,  1.7400e-02,  1.5186e-02,
          3.8171e-02, -3.2899e-02,  2.0871e-03, -5.9106e-03, -1.2439e-03,
          1.3491e-02,  6.5164e-03,  3.

In [77]:
# Work on a copy: the cells below add embedding/similarity columns, and a plain
# assignment would only alias `df_images`, silently mutating the loaded table.
df_images_to_embed = df_images.copy()

In [80]:
# Disabled earlier attempt: wrapped in a bare string literal so none of it executes.
# The cell therefore only evaluates (and displays) the string itself.
'''
if df_image_clip_preembed is None:
    #df_images_to_embed['img_clip_embedding'] = df_images_to_embed['Top Review Image Url'].apply(clip_embedding_image)
    df_images_to_embed[['img_clip_embedding']] = df_images_to_embed.apply(
        lambda row: pd.Series(clip_embedding_image(row['Top Review Image Url'], row['RowId'])), axis=1
    )
else:
    df_images_to_embed = df_image_clip_preembed
'''

"\nif df_image_clip_preembed is None:\n    #df_images_to_embed['img_clip_embedding'] = df_images_to_embed['Top Review Image Url'].apply(clip_embedding_image)\n    df_images_to_embed[['img_clip_embedding']] = df_images_to_embed.apply(\n        lambda row: pd.Series(clip_embedding_image(row['Top Review Image Url'], row['RowId'])), axis=1\n    )\nelse:\n    df_images_to_embed = df_image_clip_preembed\n"

In [85]:
# With error handling to skip rows
def process_row(row):
    """Embed the image referenced by one row, never raising.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with 'Top Review Image Url'
            and 'RowId' entries.

    Returns:
        Whatever ``clip_embedding_image`` returns for the row's image URL, or
        ``None`` if the row cannot be processed.
    """
    try:
        return clip_embedding_image(row['Top Review Image Url'], row['RowId'])
    except Exception as e:
        # `.get` so that a missing 'RowId' cannot raise a second error in the handler.
        print(f"Error for RowId {row.get('RowId')}: {e}")
        # Same "missing" sentinel as clip_embedding_image (None), rather than a
        # one-element list that downstream code would mistake for an embedding.
        return None

In [87]:
if df_image_clip_preembed is None:
    # No backup available: download and embed every image row by row.
    df_images_to_embed['img_clip_embedding'] = df_images_to_embed.apply(process_row, axis=1)
else:
    # The backup rows must line up with the current table before they are attached.
    assert np.array_equal(row_ids, df_images_to_embed['RowId'].to_numpy()), \
        "RowIDs in the embedding backup do not match the loaded table"
    # The backup stores all embeddings stacked in one 2-D array. pandas cannot assign
    # a 2-D array to a single column, so split it into one (1, dim) tensor per row --
    # the same per-cell layout the freshly computed branch produces.
    df_images_to_embed['img_clip_embedding'] = [
        torch.as_tensor(vector).unsqueeze(0) for vector in image_embeddings
    ]

0 https://www.airlinequality.com/wp-content/uploads/2019/11/20191102_123305-500x500.jpg
13 https://www.airlinequality.com/wp-content/uploads/2018/06/IMG_1571-1-500x500.jpg
108 https://www.airlinequality.com/wp-content/uploads/2024/10/IMG_5873-500x500.jpeg
109 https://www.airlinequality.com/wp-content/uploads/2024/10/20241008_FRA-ATH-A3831_espresso-500x500.jpg
110 https://www.airlinequality.com/wp-content/uploads/2024/10/PXL_20241004_073123468-500x500.jpg
118 https://www.airlinequality.com/wp-content/uploads/2024/07/inbound8469499039478717598-500x500.jpg
130 https://www.airlinequality.com/wp-content/uploads/2024/05/IMG_20240518_103907_edit_2247906755096047-500x500.jpg
135 https://www.airlinequality.com/wp-content/uploads/2024/03/Aegean1-500x500.jpg
136 https://www.airlinequality.com/wp-content/uploads/2024/03/IMG_1475-500x500.jpeg
154 https://www.airlinequality.com/wp-content/uploads/2023/09/16956630467416937441037722578767-500x500.jpg
165 https://www.airlinequality.com/wp-content/uploa

In [101]:
# Inspect the image-embedding column (one embedding object per row).
df_images_to_embed['img_clip_embedding']

0       [[tensor(0.0359), tensor(-0.0289), tensor(0.00...
1       [[tensor(-0.0093), tensor(0.0119), tensor(0.00...
2       [[tensor(-0.0177), tensor(-0.0466), tensor(-0....
3       [[tensor(0.0164), tensor(0.0258), tensor(0.006...
4       [[tensor(0.0080), tensor(0.0126), tensor(-0.02...
                              ...                        
1892    [[tensor(-0.0173), tensor(0.0035), tensor(-0.0...
1893    [[tensor(0.0226), tensor(0.0434), tensor(0.008...
1894    [[tensor(-0.0166), tensor(0.0110), tensor(-0.0...
1895    [[tensor(-0.0196), tensor(0.0500), tensor(-0.0...
1896    [[tensor(-0.0250), tensor(0.0249), tensor(-0.0...
Name: img_clip_embedding, Length: 1897, dtype: object

#### CLIP Embed Image Captions

In [107]:
def clip_embedding_text(text, row_id=None):
    """Return the L2-normalised CLIP text embedding of a single string.

    Args:
        text: The text to embed.
        row_id: Optional identifier; accepted for signature parity with the image
            embedder and not used in the computation.

    Returns:
        The tensor produced by ``model.get_text_features`` for this one text,
        divided by its L2 norm along the last dimension, or ``None`` when the input
        is not a string or embedding fails (the error is printed).
    """
    # Missing captions/labels arrive from pandas as NaN (a float); skip them with a
    # clear message instead of letting the tokenizer fail on them.
    if not isinstance(text, str):
        print(f"Error processing text '{text}': expected a string, got {type(text).__name__}")
        return None

    try:
        # truncation=True clips over-long text to the tokenizer's maximum length, so a
        # long caption is embedded from its leading tokens instead of erroring in the model.
        inputs = processor(text=[text], return_tensors="pt", padding=True, truncation=True)

        with torch.no_grad():  # inference only
            text_embeddings = model.get_text_features(**inputs)

        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)

        return text_embeddings
    except Exception as e:
        print(f"Error processing text '{text}': {e}")
        return None

In [109]:
# With error handling to skip rows
def process_row(row):
    """Embed one row's BLIP caption text, never raising.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with 'img_blip_caption' and
            'RowId' entries.

    Returns:
        Whatever ``clip_embedding_text`` returns for the caption, or ``None`` if
        the row cannot be processed.
    """
    try:
        return clip_embedding_text(row['img_blip_caption'], row['RowId'])
    except Exception as e:
        # `.get` so that a missing 'RowId' cannot raise a second error in the handler.
        print(f"Error for RowId {row.get('RowId')}: {e}")
        # Same "missing" sentinel as clip_embedding_text (None), rather than a
        # one-element list that downstream code would mistake for an embedding.
        return None

In [111]:
if df_image_clip_preembed is None:
    # No backup available: embed every caption row by row.
    df_images_to_embed['img_caption_clip_embedding'] = df_images_to_embed.apply(process_row, axis=1)
else:
    # The backup rows must line up with the current table before they are attached.
    assert np.array_equal(row_ids, df_images_to_embed['RowId'].to_numpy()), \
        "RowIDs in the embedding backup do not match the loaded table"
    # The backup stores all embeddings stacked in one 2-D array. pandas cannot assign
    # a 2-D array to a single column, so split it into one (1, dim) tensor per row --
    # the same per-cell layout the freshly computed branch produces.
    df_images_to_embed['img_caption_clip_embedding'] = [
        torch.as_tensor(vector).unsqueeze(0) for vector in caption_embeddings
    ]

In [112]:
# Inspect the caption-embedding column (one embedding object per row).
df_images_to_embed['img_caption_clip_embedding']

0       [[tensor(0.0108), tensor(0.0084), tensor(-0.03...
1       [[tensor(-0.0371), tensor(0.0019), tensor(-0.0...
2       [[tensor(-0.0431), tensor(0.0096), tensor(-0.0...
3       [[tensor(0.0076), tensor(0.0134), tensor(-0.02...
4       [[tensor(-0.0374), tensor(-0.0204), tensor(0.0...
                              ...                        
1892    [[tensor(0.0109), tensor(-0.0081), tensor(-0.0...
1893    [[tensor(-0.0233), tensor(0.0724), tensor(-0.0...
1894    [[tensor(-0.0017), tensor(-0.0078), tensor(-0....
1895    [[tensor(-0.0472), tensor(0.0156), tensor(-0.0...
1896    [[tensor(0.0319), tensor(-0.0006), tensor(-0.0...
Name: img_caption_clip_embedding, Length: 1897, dtype: object

#### CLIP Embed ResNet50 Labels

In [114]:
# With error handling to skip rows
def process_row(row):
    """Embed one row's ResNet50 label text, never raising.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with 'img_resnet50_label' and
            'RowId' entries.

    Returns:
        Whatever ``clip_embedding_text`` returns for the label, or ``None`` if the
        row cannot be processed.
    """
    try:
        return clip_embedding_text(row['img_resnet50_label'], row['RowId'])
    except Exception as e:
        # `.get` so that a missing 'RowId' cannot raise a second error in the handler.
        print(f"Error for RowId {row.get('RowId')}: {e}")
        # Same "missing" sentinel as clip_embedding_text (None), rather than a
        # one-element list that downstream code would mistake for an embedding.
        return None

In [115]:
if df_image_clip_preembed is None:
    # No backup available: embed every ResNet50 label row by row.
    df_images_to_embed['img_resnet50label_clip_embedding'] = df_images_to_embed.apply(process_row, axis=1)
else:
    # The backup rows must line up with the current table before they are attached.
    assert np.array_equal(row_ids, df_images_to_embed['RowId'].to_numpy()), \
        "RowIDs in the embedding backup do not match the loaded table"
    # The backup stores all embeddings stacked in one 2-D array. pandas cannot assign
    # a 2-D array to a single column, so split it into one (1, dim) tensor per row --
    # the same per-cell layout the freshly computed branch produces.
    df_images_to_embed['img_resnet50label_clip_embedding'] = [
        torch.as_tensor(vector).unsqueeze(0) for vector in resnet50label_embeddings
    ]

In [116]:
# Inspect the ResNet50-label embedding column (one embedding object per row).
df_images_to_embed['img_resnet50label_clip_embedding']

0       [[tensor(0.0091), tensor(-0.0108), tensor(0.01...
1       [[tensor(0.0023), tensor(-0.0049), tensor(-0.0...
2       [[tensor(0.0616), tensor(-0.0362), tensor(0.02...
3       [[tensor(0.0070), tensor(-0.0354), tensor(-0.0...
4       [[tensor(-0.0403), tensor(-0.0093), tensor(0.0...
                              ...                        
1892    [[tensor(-0.0013), tensor(-0.0205), tensor(0.0...
1893    [[tensor(-0.0537), tensor(-0.0516), tensor(0.0...
1894    [[tensor(0.0050), tensor(-0.0014), tensor(-0.0...
1895    [[tensor(-0.0165), tensor(0.0253), tensor(0.00...
1896    [[tensor(0.0033), tensor(0.0196), tensor(-0.00...
Name: img_resnet50label_clip_embedding, Length: 1897, dtype: object

#### Backup image, caption and resnet50label CLIP embeddings to .npz file

In [118]:
def _stack_embeddings(column):
    """Stack one embedding per row into a single (n_rows, dim) float32 array.

    Rows without an embedding (``None`` or a list containing only ``None``) become
    a row of NaN, so row positions stay aligned with ``RowIDs`` and a single failed
    download no longer makes the whole backup fail.
    """
    def as_row(value):
        if value is None or (isinstance(value, list) and all(v is None for v in value)):
            return None
        if isinstance(value, torch.Tensor):
            value = value.detach().cpu().numpy()
        return np.asarray(value, dtype=np.float32).reshape(1, -1)

    rows = [as_row(value) for value in column]
    # Width is taken from the first available embedding (0 if every row is missing).
    dim = next((r.shape[1] for r in rows if r is not None), 0)
    nan_row = np.full((1, dim), np.nan, dtype=np.float32)
    return np.vstack([nan_row if r is None else r for r in rows])


row_ids = df_images_to_embed['RowId'].values
# Store URLs as fixed-width unicode instead of an object array: object arrays are
# pickled inside the archive and cannot be read back with np.load's default
# allow_pickle=False.
image_urls = df_images_to_embed['Top Review Image Url'].values.astype(str)
image_embeddings = _stack_embeddings(df_images_to_embed['img_clip_embedding'])
caption_embeddings = _stack_embeddings(df_images_to_embed['img_caption_clip_embedding'])
resnet50label_embeddings = _stack_embeddings(df_images_to_embed['img_resnet50label_clip_embedding'])

# Save to .npz
np.savez(
    EMBEDDING_BACKUP_FILE,
    RowIDs=row_ids,
    ImageUrls=image_urls,
    ImageEmbeddings=image_embeddings,
    CaptionEmbeddings=caption_embeddings,
    Resnet50labelEmbeddings=resnet50label_embeddings,
)

print(f"Embeddings saved to '{EMBEDDING_BACKUP_FILE}'.")

Embeddings saved to 'data/20250322_Airline_Reviews_CLIPEmbeddings.npz'.


#### Compute cosine similarities: CLIP embeddings of image vs BLIP caption vs RESNET50 label

In [139]:
def _as_embedding_tensor(embedding):
    """Coerce one embedding (tensor, array, list, or list-literal string) to a float32 tensor of shape (1, dim)."""
    if isinstance(embedding, str):
        # Embeddings read back from text arrive as list literals, e.g. "[[0.1, 0.2]]".
        embedding = ast.literal_eval(embedding)
    return torch.as_tensor(embedding, dtype=torch.float32).reshape(1, -1)


# Compare embedding1 with embedding2
def compute_similarity(embedding1, embedding2, row_id=None):
    """Return the cosine similarity between two single embeddings.

    Args:
        embedding1: One embedding as a tensor, NumPy array, (nested) list of
            floats, or a string containing such a list literal.
        embedding2: Same accepted forms as ``embedding1``.
        row_id: Optional identifier; not used in the computation.

    Returns:
        The cosine similarity as a Python float, or ``None`` if either embedding
        is ``None``.

    Raises:
        Exception: whatever the conversion raises for input that cannot be turned
            into a numeric tensor, or the shape error raised by torch when the two
            embeddings differ in length.
    """
    if embedding1 is None or embedding2 is None:
        return None

    # Both sides get the same coercion, so any mix of tensor/list/string input works
    # and 1-D vectors are handled as well as (1, dim) rows.
    vector1 = _as_embedding_tensor(embedding1)
    vector2 = _as_embedding_tensor(embedding2)

    similarity = torch.nn.functional.cosine_similarity(vector1, vector2, dim=-1)
    return similarity.item()  # Convert tensor to a scalar

In [141]:
# Compute similarity scores: CLIP embeddings of image vs BLIP caption with error handling to skip rows
def process_row(row):
    """Cosine similarity between one row's image and caption embeddings, never raising.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with 'img_clip_embedding',
            'img_caption_clip_embedding' and 'RowId' entries.

    Returns:
        Whatever ``compute_similarity`` returns for the two embeddings, or ``None``
        if the row cannot be processed.
    """
    try:
        return compute_similarity(row['img_clip_embedding'], row['img_caption_clip_embedding'], row['RowId'])
    except Exception as e:
        # `.get` so that a missing 'RowId' cannot raise a second error in the handler.
        print(f"Error for RowId {row.get('RowId')}: {e}")
        # None (not a one-element list) so the resulting column stays numeric,
        # with a missing value marking the failed row.
        return None

In [147]:
# Row-wise similarity between the image embedding and the BLIP-caption embedding,
# computed first and then attached as a new column.
img_vs_caption_scores = df_images_to_embed.apply(process_row, axis=1)
df_images_to_embed['img_vs_caption_similarity'] = img_vs_caption_scores

In [156]:
# Inspect the image-vs-caption similarity scores.
df_images_to_embed['img_vs_caption_similarity']

0       0.251976
1       0.324201
2       0.273312
3       0.293099
4       0.301425
          ...   
1892    0.306634
1893    0.303960
1894    0.308713
1895    0.292105
1896    0.296376
Name: img_vs_caption_similarity, Length: 1897, dtype: float64

In [149]:
# Compute similarity scores: CLIP embeddings of BLIP caption vs RESNET50 label with error handling to skip rows
def process_row(row):
    """Cosine similarity between one row's caption and ResNet50-label embeddings, never raising.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with
            'img_caption_clip_embedding', 'img_resnet50label_clip_embedding' and
            'RowId' entries.

    Returns:
        Whatever ``compute_similarity`` returns for the two embeddings, or ``None``
        if the row cannot be processed.
    """
    try:
        return compute_similarity(row['img_caption_clip_embedding'], row['img_resnet50label_clip_embedding'], row['RowId'])
    except Exception as e:
        # `.get` so that a missing 'RowId' cannot raise a second error in the handler.
        print(f"Error for RowId {row.get('RowId')}: {e}")
        # None (not a one-element list) so the resulting column stays numeric,
        # with a missing value marking the failed row.
        return None

In [151]:
# Row-wise similarity between the BLIP-caption embedding and the ResNet50-label
# embedding, computed first and then attached as a new column.
caption_vs_label_scores = df_images_to_embed.apply(process_row, axis=1)
df_images_to_embed['img_caption_vs_resnetlabel_similarity'] = caption_vs_label_scores

In [158]:
# Inspect the caption-vs-ResNet50-label similarity scores.
df_images_to_embed['img_caption_vs_resnetlabel_similarity']

0       0.738985
1       0.709402
2       0.569683
3       0.838396
4       0.798913
          ...   
1892    0.674276
1893    0.531253
1894    0.735511
1895    0.623484
1896    0.691259
Name: img_caption_vs_resnetlabel_similarity, Length: 1897, dtype: float64

#### Persist to csv file

In [154]:
EXPORT_COLUMNS = [
    'RowId', 'Airline Name', 'Review_Title', 'Review Date', 'Top Review Image Url',
    'img_resnet50_score', 'img_resnet50_label', 'img_blip_caption',
    'img_clip_embedding', 'img_caption_clip_embedding', 'img_resnet50label_clip_embedding',
    'img_vs_caption_similarity', 'img_caption_vs_resnetlabel_similarity',
]
EMBEDDING_COLUMNS = [
    'img_clip_embedding', 'img_caption_clip_embedding', 'img_resnet50label_clip_embedding',
]


def _embedding_to_list(embedding):
    """Convert a tensor/array embedding to nested Python lists; pass anything else (e.g. None) through."""
    if isinstance(embedding, (torch.Tensor, np.ndarray)):
        return embedding.tolist()
    return embedding


# Export a copy so the in-memory frame keeps its tensor-valued columns.
df_export = df_images_to_embed[EXPORT_COLUMNS].copy()
for embedding_column in EMBEDDING_COLUMNS:
    # Written as plain list literals ("[[0.1, ...]]") so the values can be parsed
    # back with ast.literal_eval; a tensor would be written as its "tensor(...)"
    # repr, which cannot be parsed that way.
    df_export[embedding_column] = df_export[embedding_column].map(_embedding_to_list)

# index=False: writing the index is what adds an extra "Unnamed: 0" column each
# time the file is read back and saved again.
df_export.to_csv("data/silver_20250322_Airline_Reviews_ImageCLIPembeddings.csv", index=False)