In [103]:
import numpy as np
from PIL import Image
import pandas as pd
import requests
from io import BytesIO
import torch
from torchvision import models, transforms
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import pickle
import ast
import math
from numpy.linalg import norm

Loading data from given CSV file

In [104]:
# Read the dataset from a CSV file located relative to the notebook's working directory.
data_path = 'A2_Data.csv'
data = pd.read_csv(data_path)
# Preview the first 10 rows to check which columns were loaded.
data.head(10)

Unnamed: 0.1,Unnamed: 0,Image,Review Text
0,3452,['https://images-na.ssl-images-amazon.com/imag...,Loving these vintage springs on my vintage str...
1,1205,['https://images-na.ssl-images-amazon.com/imag...,Works great as a guitar bench mat. Not rugged ...
2,1708,['https://images-na.ssl-images-amazon.com/imag...,We use these for everything from our acoustic ...
3,2078,['https://images-na.ssl-images-amazon.com/imag...,Great price and good quality. It didn't quite...
4,801,['https://images-na.ssl-images-amazon.com/imag...,I bought this bass to split time as my primary...
5,126,['https://images-na.ssl-images-amazon.com/imag...,"it's more on toy side than on instrument side,..."
6,1329,['https://images-na.ssl-images-amazon.com/imag...,Absolute BEST guitar hangers on the market... ...
7,325,['https://images-na.ssl-images-amazon.com/imag...,"Great nylon strings, just as expected. They wo..."
8,245,['https://images-na.ssl-images-amazon.com/imag...,I bought this stand for church because I been ...
9,1714,['https://images-na.ssl-images-amazon.com/imag...,Awesome stand!\n\nTip: The bottom part that su...


Collecting all image links

In [105]:
# Flatten the per-review image lists into a single list of URLs.
# Each cell of the second column holds the string form of a Python list
# (e.g. "['https://...']"), so it is parsed with ast.literal_eval, which only
# evaluates literals.
# The column is selected by position with .iloc: indexing a label-indexed row
# with a bare integer (`row[1]`) relies on a positional fallback that pandas
# has deprecated.
links = []
for link_list_repr in data.iloc[:, 1]:
    links.extend(ast.literal_eval(link_list_repr))

Defining transformations for pre-processing on images

In [106]:
# Per-channel mean/std used to normalise the RGB tensor (the values torchvision
# documents for its ImageNet-pretrained models).
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Resize, crop the central 224x224 region, convert to a tensor, then normalise.
preprocessing_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
transform = transforms.Compose(preprocessing_steps)

Defining the model used to extract image features. Here I use a pretrained ResNet50 with its final layer removed.

In [107]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained ResNet50 and keep every child module except the last one,
# so the network outputs a feature vector instead of class scores.
resnet = models.resnet50(pretrained = True)
feature_layers = list(resnet.children())[:-1]
model = torch.nn.Sequential(*feature_layers)
model.to(device)
# Inference mode (affects layers such as BatchNorm); also displays the module tree.
model.eval()



Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


Function to download an image given its link

In [108]:
def download_images(link, timeout=10):
    """Download the image at `link` and return it as an RGB PIL image.

    Args:
        link: URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up, so a single
            unresponsive host cannot stall the whole download loop.

    Returns:
        The decoded image converted to RGB, or None if the request or the
        decoding failed (the failure is printed, not raised).
    """
    try:
        # The request itself is inside the try block so that connection errors
        # and timeouts are reported like any other failed download.
        response = requests.get(link, timeout=timeout)
        # Turn HTTP error statuses into exceptions instead of trying to decode
        # an error page as an image.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGB')
        return img
    except Exception as e:
        print(f"Failed to get image from {link} due to error {e}")
        return None

In [109]:
# Download every linked image and key it by its URL.
# download_images returns None when a download fails; those links are skipped
# so that `images` only ever contains successfully decoded images.
images = {}
for link in links:
    image = download_images(link)
    if image is None:
        continue
    images[link] = image

Failed to get image from https://images-na.ssl-images-amazon.com/images/I/71F3npeHUDL._SY88.jpg due to error cannot identify image file <_io.BytesIO object at 0x000002BF65675A30>
Failed to get image from https://images-na.ssl-images-amazon.com/images/I/71wHUWncMGL._SY88.jpg due to error cannot identify image file <_io.BytesIO object at 0x000002BF65675A30>
Failed to get image from https://images-na.ssl-images-amazon.com/images/I/71B8OOE5N8L._SY88.jpg due to error cannot identify image file <_io.BytesIO object at 0x000002BF65AAE250>
Failed to get image from https://images-na.ssl-images-amazon.com/images/I/81SX3oAWbNL._SY88.jpg due to error cannot identify image file <_io.BytesIO object at 0x000002BF65AAF9C0>
Failed to get image from https://images-na.ssl-images-amazon.com/images/I/718niQ1GEwL._SY88.jpg due to error cannot identify image file <_io.BytesIO object at 0x000002BF65CE1FD0>
Failed to get image from https://images-na.ssl-images-amazon.com/images/I/61OboZT-kcL._SY88.jpg due to er

Function to transform an image and extract its features

In [111]:
def preprocess_and_extract_features(img):
    """Run `img` through the preprocessing pipeline and the feature extractor.

    Args:
        img: The image to process; it is passed directly to the notebook-level
            `transform`.

    Returns:
        A 1-D tensor on the CPU containing the flattened model output.
    """
    # Preprocess, add a leading batch dimension and move to the model's device.
    batch = transform(img).unsqueeze(0).to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        output = model(batch)

    # Flatten to one dimension and bring the result back to the CPU for storage.
    return output.view(-1).cpu()

In [112]:
# Extract a feature vector for every image that was downloaded successfully;
# entries whose image is None are left out.
images_features = {
    link: preprocess_and_extract_features(img)
    for link, img in images.items()
    if img is not None
}

Storing and loading the image features

In [113]:
# Persist the extracted image features (dict keyed by image link) to disk so
# they can be reloaded without downloading and processing the images again.
features_path = 'image_features.pkl'

with open(features_path, 'wb') as file:
    pickle.dump(images_features, file)

In [114]:
# Reload the image features written by the previous cell.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a file that this notebook produced itself, never one from an untrusted source.
features_path = 'image_features.pkl'

with open(features_path, 'rb') as file:
    image_features = pickle.load(file)

Loading all textual data from the reviews column into a list

In [115]:
# Collect the review text (third column) of every row, in row order.
# Missing reviews become empty strings so that `reviews` stays aligned with the
# rows of `data`.
# The column is selected by position with .iloc: indexing a label-indexed row
# with a bare integer (`row[2]`) relies on a positional fallback that pandas
# has deprecated.
reviews = ["" if pd.isna(text) else text for text in data.iloc[:, 2]]

Applying necessary text pre-processing on the textual data

In [116]:
# Lazily filled cache so the stop-word list is read and the lemmatizer built
# only once, instead of on every call to preprocess_text.
_TEXT_RESOURCES = {}

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _get_text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise a piece of text into a list of lemmatized tokens.

    Steps, in order: lowercase, delete punctuation characters, tokenize with
    nltk.word_tokenize, drop English stop words, lemmatize each remaining token.

    Args:
        text: The string to preprocess.

    Returns:
        A list of tokens in their original order (duplicates are kept).
    """
    # Lowercase, then strip punctuation in a single pass.
    text = text.lower().translate(_PUNCTUATION_TABLE)

    # Tokenizing text
    tokens = nltk.word_tokenize(text)

    # Remove stop words first, then lemmatize what is left.
    stop_words, lemmatizer = _get_text_resources()
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

In [None]:
# Tokenize every review; the i-th entry holds the tokens of the i-th review.
tokenized_docs = [preprocess_text(doc) for doc in reviews]

Defining functions for calculating term frequency and inverse document frequency

In [118]:
def compute_tf_per_document(document):
    """Compute the relative term frequency of every term in one document.

    Args:
        document: A sequence of tokens.

    Returns:
        A dict mapping each distinct term to its share of the document,
        accumulated as 1/len(document) per occurrence. An empty document
        yields an empty dict.
    """
    total_terms = len(document)
    frequencies = {}
    for token in document:
        running_total = frequencies.get(token, 0)
        # The division stays inside the loop so an empty document never divides by zero.
        frequencies[token] = running_total + 1 / float(total_terms)
    return frequencies

def compute_tf_all_documents(tokenized_docs):
    """Return one term-frequency dict per document, in the same order as the input."""
    return list(map(compute_tf_per_document, tokenized_docs))

def compute_idf(documents):
    """Compute the inverse document frequency of every term in the corpus.

    IDF(term) = log10(N / DF(term)), where N is the number of documents and
    DF(term) is the number of documents that contain the term at least once.

    Args:
        documents: A sequence of tokenized documents (each a sequence of terms).

    Returns:
        A dict mapping each term to its IDF score, in order of first
        appearance. An empty corpus yields an empty dict.
    """
    total_docs = len(documents)

    # Count, for each term, how many documents contain it.
    doc_freq = {}
    for document in documents:
        # dict.fromkeys de-duplicates while keeping first-seen order, so a term
        # repeated inside one document is still counted once for that document.
        for word in dict.fromkeys(document):
            doc_freq[word] = doc_freq.get(word, 0) + 1

    # Convert DF counts to IDF scores
    return {word: math.log10(total_docs / float(count)) for word, count in doc_freq.items()}


def compute_tf_idf(tf, idf):
    """Multiply each term's frequency by its IDF score.

    Args:
        tf: Dict of term -> term frequency for one document.
        idf: Dict of term -> IDF score; must contain every term in `tf`.

    Returns:
        A dict of term -> tf * idf with the same terms and order as `tf`.

    Raises:
        KeyError: If a term in `tf` is missing from `idf`.
    """
    return {term: frequency * idf[term] for term, frequency in tf.items()}

Computing tf-idf scores for all terms across the corpus

In [119]:
# Term frequencies per document and IDF over the whole corpus.
tf_scores_per_document = compute_tf_all_documents(tokenized_docs)
idf_dict = compute_idf(tokenized_docs)

# TF-IDF of every term within each document.
tf_idf_scores_per_document = [compute_tf_idf(tf, idf_dict) for tf in tf_scores_per_document]

# Sum each term's TF-IDF over all documents ...
global_tf_idf_scores = {}
for doc_scores in tf_idf_scores_per_document:
    for term, score in doc_scores.items():
        global_tf_idf_scores[term] = global_tf_idf_scores.get(term, 0) + score

# ... then divide by the number of documents to get the corpus-wide average.
num_documents = len(tf_idf_scores_per_document)
global_tf_idf_scores = {
    term: total / num_documents for term, total in global_tf_idf_scores.items()
}

Storing and loading the tf-idf scores

In [120]:
# Persist the averaged per-term TF-IDF scores to disk.
# Note that `text_features` holds the output file path here, not the scores.
text_features = 'tf_idf_scores.pkl'

with open(text_features, 'wb') as file:
    pickle.dump(global_tf_idf_scores, file)

In [121]:
# Reload the TF-IDF scores written by the previous cell.
# `text_features` starts as the file path and is then rebound to the loaded
# object, so after this cell it no longer refers to the path.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a file that this notebook produced itself, never one from an untrusted source.
text_features = 'tf_idf_scores.pkl'

with open(text_features, 'rb') as file:
    text_features = pickle.load(file)

Creating a *Query* class to handle all input queries of the form (ImageLink, Review)

In [122]:
class Query:
    """A query of the form (image link, review text).

    The image part is answered by ranking precomputed image feature vectors by
    cosine similarity to the query image's features.
    """

    def __init__(self, link, review):
        """Store the query's image URL and review text."""
        self.img_link = link
        self.text_review = review

    def get_query_link(self):
        """Return the query's image URL."""
        return self.img_link

    def get_query_review(self):
        """Return the query's review text."""
        return self.text_review

    def process_image_features(self):
        """Download the query image and extract its feature vector.

        Returns:
            The value returned by preprocess_and_extract_features for the
            downloaded image.

        Raises:
            ValueError: If the query image could not be downloaded, instead of
                passing None on to the preprocessing pipeline.
        """
        img = download_images(self.img_link)
        if img is None:
            raise ValueError(f"Could not download query image from {self.img_link}")
        return preprocess_and_extract_features(img)

    def most_similar_images(self, tensor_dict):
        """Rank stored images by cosine similarity to the query image.

        Args:
            tensor_dict: Dict mapping image link -> 1-D feature tensor.

        Returns:
            A list of (link, similarity) tuples sorted from most to least similar.
        """
        # Add the batch dimension once; cosine_similarity compares along dim 1.
        query_features = self.process_image_features().unsqueeze(0)
        similarities = {}
        for link, tensor in tensor_dict.items():
            similarity = torch.nn.functional.cosine_similarity(query_features, tensor.unsqueeze(0))
            similarities[link] = similarity.item()
        return sorted(similarities.items(), key=lambda x: x[1], reverse=True)

    def process_text_features(self):
        """Return the preprocessed tokens of the query review, sorted alphabetically."""
        return sorted(preprocess_text(self.text_review))

    def process_query(self, image_tensor_dict, text_reviews, global_tf_idf_scores, top_k=3):
        """Process the query and print the most similar images with their scores.

        Args:
            image_tensor_dict: Dict mapping image link -> 1-D feature tensor.
            text_reviews: Accepted for interface compatibility; not used here.
            global_tf_idf_scores: Accepted for interface compatibility; not used here.
            top_k: Number of matches to report (defaults to 3).

        Returns:
            None. Results are printed.
        """
        # Using image retrieval
        image_similarities = self.most_similar_images(image_tensor_dict)

        # Exclude the query image itself by its link. Dropping the first ranked
        # entry unconditionally would discard the best match whenever the query
        # image is not part of the stored features.
        matches = [
            (img_url, img_similarity)
            for img_url, img_similarity in image_similarities
            if img_url != self.img_link
        ][:top_k]

        print("USING IMAGE RETRIEVAL")
        image_cosine_scores = []
        for i, (img_url, img_similarity) in enumerate(matches):
            print(f"{i+1}) Image URL: {img_url}")
            print(f"Cosine similarity of images: {img_similarity}")
            image_cosine_scores.append(img_similarity)

        print("------------------------------------------")

        # Average cosine similarity over the reported matches (skipped when there are none).
        if image_cosine_scores:
            avg_image_cosine_score = sum(image_cosine_scores) / len(image_cosine_scores)
            print(f"Average cosine similarity of images: {avg_image_cosine_score}")

In [123]:
# Sample usage of Query class
query = Query("https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg",
              "I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring if there is a break.")

query.process_query(image_features, reviews, global_tf_idf_scores)

USING IMAGE RETRIEVAL
1) Image URL: https://images-na.ssl-images-amazon.com/images/I/71eH74UgOwL._SY88.jpg
Cosine similarity of images: 0.865452229976654
2) Image URL: https://images-na.ssl-images-amazon.com/images/I/61g0lol4mUL._SY88.jpg
Cosine similarity of images: 0.8546483516693115
3) Image URL: https://images-na.ssl-images-amazon.com/images/I/711kGbkdzEL._SY88.jpg
Cosine similarity of images: 0.8517411947250366
------------------------------------------
Average cosine similarity of images: 0.8572805921236674
