In [1]:
import os
import re
import cv2
import nltk
import string
import pickle
import random
import requests
import numpy as np
import pandas as pd
from math import log
from tqdm import tqdm
from io import BytesIO
from bs4 import BeautifulSoup
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Model
from PIL import Image, ImageEnhance, ImageOps
from tensorflow.keras.preprocessing import image
from nltk.stem import PorterStemmer, WordNetLemmatizer
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import pandas as pd
import os
import random
from PIL import Image, ImageEnhance
from io import BytesIO
import requests
from tqdm import tqdm

csv_path = '/content/drive/MyDrive/A2_Data.csv'
df = pd.read_csv(csv_path)

preprocess_folder = '/content/drive/MyDrive/A2_Pre'
os.makedirs(preprocess_folder, exist_ok=True)

def download_image(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return Image.open(BytesIO(response.content))
        else:
            return None
    except Exception as e:
        print(f"Error downloading image {url}: {e}")
        return None

def save_image(img, folder, filename):
    img_path = os.path.join(folder, filename)
    img.save(img_path)

def preprocess_image(img, row_idx, img_idx):
    # Altering contrast
    contrast_factor = random.uniform(0.5, 1.5)
    img_contrast = ImageEnhance.Contrast(img).enhance(contrast_factor)
    save_image(img_contrast, preprocess_folder, f'img_{row_idx+2}_{img_idx}_contrast_{contrast_factor}.jpg')

    # Resizing
    size = (299, 299)
    img_resized = img.resize(size)
    save_image(img_resized, preprocess_folder, f'img_{row_idx+2}_{img_idx}_resizing.jpg')

    # Random flips
    img_flipped = img.transpose(Image.FLIP_LEFT_RIGHT)
    save_image(img_flipped, preprocess_folder, f'img_{row_idx+2}_{img_idx}_random_flips.jpg')

    # Brightness
    brightness_factor = random.uniform(0.5, 1.5)
    img_brightness = ImageEnhance.Brightness(img).enhance(brightness_factor)
    save_image(img_brightness, preprocess_folder, f'img_{row_idx+2}_{img_idx}_brightness_{brightness_factor}.jpg')

    # Geometrical orientation
    angle = random.randint(0, 360)
    img_rotated = img.rotate(angle)
    save_image(img_rotated, preprocess_folder, f'img_{row_idx+2}_{img_idx}_orientation_{angle}.jpg')

def process_image(row):
    image_urls = [url.strip().strip("'\"") for url in row['Image'].strip('[]').split(',')]
    for idx, url in enumerate(image_urls):
        img = download_image(url)
        if img:
            preprocess_image(img, row.name, idx)

for index, row in tqdm(df.iterrows(), total=len(df)):
    process_image(row)


100%|██████████| 1000/1000 [04:32<00:00,  3.66it/s]


In [None]:
import os
import numpy as np
import pickle
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.models import Model

images_path = '/content/drive/MyDrive/A2_Pre'
image_files = [f for f in os.listdir(images_path) if f.endswith('.jpg')]

base_model = InceptionV3(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)

def extract_features(image_path):
    img = image.load_img(image_path, target_size=(299, 299))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    features = model.predict(img_array)
    return features.flatten()

def process_images(image_files, images_path):
    image_features = {}
    for image_file in image_files:
        image_path = os.path.join(images_path, image_file)
        features = extract_features(image_path)
        image_features[image_file] = features
    return image_features

image_features = process_images(image_files, images_path)

pickle_file_path = '/content/drive/MyDrive/image.pickle'
with open(pickle_file_path, 'wb') as pickle_file:
    pickle.dump(image_features, pickle_file)

print(f"Features extracted and saved to {pickle_file_path}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Features extracted and saved to /content/drive/MyDrive/image.pickle


In [3]:
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

input_csv_path = '/content/drive/MyDrive/A2_Data.csv'
output_csv_path = '/content/drive/MyDrive/A2_text.csv'

df = pd.read_csv(input_csv_path)

def preprocess_text(text):
    if pd.isna(text):
        return '', ''
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text()
    lowercase_text = clean_text.lower()
    return clean_text, lowercase_text

df[['Cleaned Text', 'Lowercase Text']] = df['Review Text'].apply(preprocess_text).apply(pd.Series)

def tokenize_text(text):
    if pd.isna(text):
        return []
    tokens = word_tokenize(text)
    return tokens

df['Tokens'] = df['Lowercase Text'].apply(tokenize_text)

df['Sentence'] = df['Tokens'].apply(lambda tokens: ' '.join(tokens))

df['Sentence'] = df['Sentence'].str.replace('[{}]'.format(string.punctuation), '')

stop_words = set(stopwords.words('english'))
df['Cleaned Sentence'] = df['Sentence'].apply(lambda sentence: ' '.join(word for word in sentence.split() if word.lower() not in stop_words))

porter_stemmer = PorterStemmer()
df['Stemmed Sentence'] = df['Cleaned Sentence'].apply(lambda sentence: ' '.join(porter_stemmer.stem(word) for word in sentence.split()))

lemmatizer = WordNetLemmatizer()
df['Lemmatized Sentence'] = df['Cleaned Sentence'].apply(lambda sentence: ' '.join(lemmatizer.lemmatize(word) for word in sentence.split()))

df.to_csv(output_csv_path, index=False)

print(f"Text pre-processing, tokenization, punctuation removal, stop words removal, stemming, and lemmatization complete. Output saved to {output_csv_path}.")


  soup = BeautifulSoup(text, 'html.parser')
  df['Sentence'] = df['Sentence'].str.replace('[{}]'.format(string.punctuation), '')


Text pre-processing, tokenization, punctuation removal, stop words removal, stemming, and lemmatization complete. Output saved to /content/drive/MyDrive/A2_text.csv.


In [4]:
import pandas as pd
from collections import Counter
from math import log

def calculate_tf(tokens):
    tf_counter = Counter(tokens)
    total_words = len(tokens)
    tf = {word: count / total_words for word, count in tf_counter.items()}
    return tf

def calculate_idf(docs, term):
    doc_count = sum(1 for doc in docs if term in doc)
    if doc_count == 0:
        return 0
    return log(len(docs) / doc_count)

def calculate_tfidf(tf, idf):
    return {word: tf[word] * idf[word] for word in tf}

# Assuming df and documents are defined as in the original code
df = pd.read_csv('/content/drive/MyDrive/A2_text.csv')
df['Lemmatized Sentence'] = df['Lemmatized Sentence'].fillna('')
documents = df['Lemmatized Sentence'].apply(lambda sentence: sentence.split()).tolist()

# Calculating unique words
unique_words = list(set(word for document in documents for word in document))

# Calculating IDF values
idf = {word: calculate_idf(documents, word) for word in unique_words}

# Calculating TF-IDF matrix
tfidf_matrix = []
for i, document in enumerate(documents):
    tf = calculate_tf(document)
    tfidf = calculate_tfidf(tf, idf)
    tfidf_matrix.append(tfidf)

# Creating a DataFrame from TF-IDF matrix
tfidf_df = pd.DataFrame(tfidf_matrix, columns=unique_words)

# Concatenating the TF-IDF DataFrame with the original DataFrame
df = pd.concat([df, tfidf_df], axis=1)

# Saving the output to a CSV file
output_tfidf_csv_path = '/content/drive/MyDrive/A2_TFIDF.csv'
df.to_csv(output_tfidf_csv_path, index=False)

print(f"TF-IDF calculation complete. Output saved to {output_tfidf_csv_path}.")


TF-IDF calculation complete. Output saved to /content/drive/MyDrive/A2_TFIDF.csv.


In [23]:
import pandas as pd
import pickle

# Load the CSV file into a pandas DataFrame
csv_file_path = '/content/drive/MyDrive/A2_TFIDF.csv'
df = pd.read_csv(csv_file_path)

# Save the DataFrame to a pickle file
pickle_file_path = '/content/drive/MyDrive/A2_TFIDF.pickle'
with open(pickle_file_path, 'wb') as pickle_file:
    pickle.dump(df, pickle_file)

print(f"DataFrame saved to pickle file: {pickle_file_path}")


DataFrame saved to pickle file: /content/drive/MyDrive/A2_TFIDF.pickle


In [22]:
import pandas as pd
import requests
from io import BytesIO
from PIL import Image
import numpy as np
import cv2
import re
from collections import Counter
from math import log
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import preprocess_input
import pickle

def preprocess_image_from_url(image_url, target_size=(299, 299)):
    response = requests.get(image_url)
    img = Image.open(BytesIO(response.content))
    img = img.resize(target_size)
    return img

def extract_features_from_image(img):
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    features = model.predict(img_array)
    return features.flatten()

def cosine_similarity(vector1, vector2):
    dot_product = np.dot(vector1, vector2)
    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    similarity = dot_product / (norm_vector1 * norm_vector2)
    return similarity

def find_similar_images(query_features, image_features, top_n=3):
    similarities = {}
    for filename, features in image_features.items():
        similarity = cosine_similarity(query_features, features)
        similarities[filename] = similarity
    sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    return sorted_similarities[:top_n]

def extract_numeric_value(filename):
    numeric_value = re.findall(r'\d+', filename)
    return int(numeric_value[0]) if numeric_value else None

def preprocess_user_image(image_url):
    response = requests.get(image_url)
    user_img = cv2.imdecode(np.frombuffer(response.content, np.uint8), -1)
    user_gray_img = cv2.cvtColor(user_img, cv2.COLOR_BGR2GRAY)
    return user_gray_img

def calculate_image_similarity_multiple(image_urls, user_image_url):
    max_similarity = -1
    for image_url in image_urls:
        image_url = image_url.strip(" '")
        response = requests.get(image_url)
        img = cv2.imdecode(np.frombuffer(response.content, np.uint8), -1)
        gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        similarity = cv2.matchTemplate(gray_img, user_image_url, cv2.TM_CCOEFF_NORMED)[0][0]
        if similarity > max_similarity:
            max_similarity = similarity
    return max_similarity

compo=[]

# Load the necessary models and data
base_model = InceptionV3(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)

pickle_file_path = '/content/drive/MyDrive/image.pickle'
with open(pickle_file_path, 'rb') as pickle_file:
    image_features = pickle.load(pickle_file)

# Input from the user
print("Image and Text Query Input:")
image_url = input("Enter the URL of the image: ")

# Preprocess the user's query image
query_image = preprocess_image_from_url(image_url)
query_features = extract_features_from_image(query_image)
similar_images = find_similar_images(query_features, image_features, top_n=10)

# Process similar images based on cosine similarity
highest_similarity = {}
unique_values_count = 0
for filename, similarity in similar_images:
    numeric_value = extract_numeric_value(filename)
    if numeric_value is not None and numeric_value not in highest_similarity:
        highest_similarity[numeric_value] = similarity
        unique_values_count += 1
        if unique_values_count == 3:
            break

# Load the CSV file with review data
csv_file_path = '/content/drive/MyDrive/A2_TFIDF.csv'
df = pd.read_csv(csv_file_path)

# Input from the user for review
query = input("Enter the Review in text: ")
query_tf = calculate_tf(query)
idf = calculate_idf(df['Review Text'])
query_tfidf = calculate_tfidf(query_tf, idf)

# Process images using image retrieval
print("---------------------------------------------------------------------------------------------------------------------")
print("USING IMAGE RETRIEVAL")
print("---------------------------------------------------------------------------------------------------------------------")

image_retrival_Score = 0
for numeric_value, similarity in highest_similarity.items():
    image_URL = df.iloc[numeric_value - 2]['Image']
    review_text = df.iloc[numeric_value - 2]['Review Text']

    if isinstance(review_text, str):
        review_tf = calculate_tf(review_text)
        review_tfidf = calculate_tfidf(review_tf, idf)
        intersection = sum(query_tfidf[word] * review_tfidf[word] for word in query_tfidf if word in review_tfidf)
        query_norm = np.sqrt(sum(val ** 2 for val in query_tfidf.values()))
        review_norm = np.sqrt(sum(val ** 2 for val in review_tfidf.values()))
        text_similarity = intersection / (query_norm * review_norm)
        print()
        print(f"{image_retrival_Score+1}) Image URL: {image_URL}")
        print(f"Review: {review_text}")
        print(f"Cosine similarity of images: {similarity}")
        print(f"Cosine similarity of text: {text_similarity}")
        Composite_similarity_score_image = (similarity+text_similarity) / 2
        print(f"Composite similarity score: {Composite_similarity_score_image}")
        compo.append((image_URL,review_text,Composite_similarity_score_image,similarity,text_similarity))
        print("---------------------------------------")
        image_retrival_Score += 1

# Process text using text retrieval
print("---------------------------------------------------------------------------------------------------------------------")
print("USING TEXT RETRIEVAL")
print("---------------------------------------------------------------------------------------------------------------------")

text_retrival_Score = 0
text_similarities = {}
for index, row in df.iterrows():
    review_text = row['Review Text']
    if isinstance(review_text, str):
        review_tf = calculate_tf(review_text)
        review_tfidf = calculate_tfidf(review_tf, idf)
        intersection = sum(query_tfidf[word] * review_tfidf[word] for word in query_tfidf if word in review_tfidf)
        query_norm = np.sqrt(sum(val ** 2 for val in query_tfidf.values()))
        review_norm = np.sqrt(sum(val ** 2 for val in review_tfidf.values()))
        text_similarity = intersection / (query_norm * review_norm)
        text_similarities[index] = text_similarity

sorted_text_similarities = sorted(text_similarities.items(), key=lambda x: x[1], reverse=True)[:3]
for index, similarity in sorted_text_similarities:
    image_URL = df.iloc[index]['Image']
    review_text = df.iloc[index]['Review Text']
    user_gray_img = preprocess_user_image(image_url)
    image_urls_unwrapped = image_URL.strip('][').split(', ')
    Text_image_similarities = calculate_image_similarity_multiple(image_urls_unwrapped, user_gray_img)
    print()
    print(f"{text_retrival_Score+1}) Image URL: {image_URL}")
    print(f"Review: {review_text}")
    print(f"Cosine similarity of images: {Text_image_similarities}")
    print(f"Cosine similarity of text: {similarity}")
    Composite_similarity_score_text = (similarity+Text_image_similarities) / 2
    print(f"Composite similarity score: {Composite_similarity_score_text}")
    compo.append((image_URL,review_text,Composite_similarity_score_text,Text_image_similarities,similarity))
    # print("---------------------------------------")
    text_retrival_Score += 1

compo_sort = sorted(compo,key=lambda x:x[2],reverse = True)
top_compo=compo_sort[:3]

Image and Text Query Input:
Enter the URL of the image: https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg
Enter the Review in text: Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
---------------------------------------------------------------------------------------------------------------------
USING IMAGE RETRIEVAL
---------------------------------------------------------------------------------------------------------------------

1) Image URL: ['https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg']
Review: I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring if there is a break.
Cosine similarity of images: 0.9922579526901245
Cosine similarity of text: 0.030926701547102746
Composite similarity s

In [21]:
print("Top 3 Composite Score")
print("------------------------------------------------------------------------------------------------------")

for url,review,score,I_s,T_s in top_compo:
  print(f"URL: {url}")
  print(f"Review: {review}")
  print(f"Composite Score: {score}")
  print(f"Image Similarity: {I_s}")
  print(f"Text Similarity: {T_s}")
  print("------------------------------------------------------------------------------------------------------")

Top 3 Composite Score
------------------------------------------------------------------------------------------------------
URL: ['https://images-na.ssl-images-amazon.com/images/I/51fyLunjnYL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/51zHQdv4n1L._SY88.jpg']
Review: This is a fine EQ.  It sounds good and I'm  thinking of getting another one  for right and left  speaker.
Composite Score: 0.9857497706708835
Image Similarity: 1.0
Text Similarity: 0.9714995413417672
------------------------------------------------------------------------------------------------------
URL: ['https://images-na.ssl-images-amazon.com/images/I/51fyLunjnYL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/51zHQdv4n1L._SY88.jpg']
Review: This is a fine EQ.  It sounds good and I'm  thinking of getting another one  for right and left  speaker.
Composite Score: 0.9809071317014622
Image Similarity: 0.9903147220611572
Text Similarity: 0.9714995413417672
---------------------------------