# 이미지 Feature vector를 활용해보기

- 여러 논문에서 소개된 것을 참고하여 pre-trained CNN으로 image features를 추출.
- image features를 비교하여 실제로 비슷한지 판단하고, rating정보와 함께 분석.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
path = '/content/drive/MyDrive/data/amazon_reviews'

## Load dataset

1. AMAZON_FASHION_5.json
2. All_Beauty_5.json
3. Luxury_Beauty_5.json

In [3]:
import os, json
import pandas as pd

In [4]:
def load_json(filename):
  data = []
  with open(os.path.join(path, filename), 'r', encoding='utf-8') as f:
      for l in f:
        data.append(json.loads(l.strip()))

  df = pd.DataFrame.from_dict(data)

  # get rows which contains image 
  df = df[~df['image'].isnull()]

  return df

In [5]:
fashion_df = load_json('AMAZON_FASHION_5.json')
print(fashion_df.shape)
print(fashion_df.head())

(106, 12)
     overall  verified  ... vote                                              image
164      5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
172      5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
179      5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
192      5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
197      5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...

[5 rows x 12 columns]


In [6]:
beauty_df = load_json('All_Beauty_5.json')
print(beauty_df.shape)
print(beauty_df.head())

(98, 12)
    overall  verified  ... vote                                              image
19      5.0      True  ...    5  [https://images-na.ssl-images-amazon.com/image...
20      5.0      True  ...    4  [https://images-na.ssl-images-amazon.com/image...
21      5.0      True  ...    4  [https://images-na.ssl-images-amazon.com/image...
34      1.0      True  ...    2  [https://images-na.ssl-images-amazon.com/image...
47      5.0      True  ...   20  [https://images-na.ssl-images-amazon.com/image...

[5 rows x 12 columns]


In [7]:
luxury_df = load_json('Luxury_Beauty_5.json')
print(luxury_df.shape)
print(luxury_df.head())

(617, 12)
     overall  verified  ... vote                                              image
68       5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
75       5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
86       5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
88       5.0      True  ...    9  [https://images-na.ssl-images-amazon.com/image...
104      5.0     False  ...    2  [https://images-na.ssl-images-amazon.com/image...

[5 rows x 12 columns]


## Download Images

In [8]:
from tqdm import tqdm
import requests

In [9]:
def download_images(path, df, category):
  folder_path = os.path.join(path, category)

  if not os.path.exists(folder_path):
    os.makedirs(folder_path)

  for index in tqdm(df.index):
    url_list = df['image'].loc[index]
    for url_index, url in enumerate(url_list):
      if not os.path.exists(os.path.join(folder_path, f'{str(index)}_{str(url_index)}.jpg')):
        img_data = requests.get(url).content
        with open(os.path.join(folder_path, f'{str(index)}_{str(url_index)}.jpg'), 'wb') as handler:
          handler.write(img_data)

  print(f'{category}: {len(df.index)} images downloaded or already exist...')

In [10]:
download_images(path, beauty_df, 'beauty')

100%|██████████| 98/98 [00:00<00:00, 3130.46it/s]

beauty: 98 images downloaded or already exist...





In [11]:
download_images(path, fashion_df, 'fashion')

100%|██████████| 106/106 [00:00<00:00, 1802.98it/s]

fashion: 106 images downloaded or already exist...





In [12]:
download_images(path, luxury_df, 'luxury')

100%|██████████| 617/617 [00:02<00:00, 287.96it/s]

luxury: 617 images downloaded or already exist...





## Use pre-trained CNN

In [13]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.autograd import Variable
from PIL import Image

In [14]:
# Load the pretrained model
model = models.resnet18(pretrained=True)
# Use the model object to select the desired layer
layer = model._modules.get('avgpool')

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

In [15]:
# Set model to evaluation mode
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [16]:
scaler = transforms.Scale((224, 224))
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
to_tensor = transforms.ToTensor()



In [17]:
def get_vector(image_name):
    # 1. Load the image with Pillow library
    img = Image.open(image_name)
    # 2. Create a PyTorch Variable with the transformed image
    t_img = Variable(normalize(to_tensor(scaler(img))).unsqueeze(0))
    # 3. Create a vector of zeros that will hold our feature vector
    #    The 'avgpool' layer has an output size of 512
    my_embedding = torch.zeros(512)
    # 4. Define a function that will copy the output of a layer
    def copy_data(m, i, o):
      my_embedding.copy_(o.data.reshape(o.data.size(1)))
    # 5. Attach that function to our selected layer
    h = layer.register_forward_hook(copy_data)
    # 6. Run the model on our transformed image
    model(t_img)
    # 7. Detach our copy function from the layer
    h.remove()
    # 8. Return the feature vector
    return my_embedding.cpu().detach().numpy()

In [18]:
# Test
category = 'beauty'
for image_file in os.listdir(os.path.join(path, category))[:2]:
  print(f"{image_file} feature vectors")
  print(get_vector(os.path.join(path, category+'/'+image_file)))

19_0.jpg feature vectors


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


[4.7817498e-01 1.4803204e-01 7.3766249e-01 1.7626870e-01 1.0427601e+00
 2.0582843e+00 4.3731141e-01 1.1788375e+00 4.7587356e-01 1.5432829e-01
 1.7027771e-01 6.0273188e-01 3.3801761e+00 6.4341664e-01 1.8992610e+00
 3.5708264e-01 1.3404369e-01 2.2832470e+00 3.3125648e-01 1.4945538e+00
 5.6086940e-01 2.8357738e-01 3.9763598e+00 4.5935050e-01 1.6845827e+00
 2.2359653e+00 9.4270879e-01 4.7583050e-01 3.7425122e-01 3.8774556e-01
 7.5579053e-01 5.8159262e-01 1.6831464e-01 4.8929077e-01 7.5774407e-01
 7.5356692e-01 1.1147516e+00 3.4227836e+00 5.2933991e-01 8.4244108e-01
 2.8930479e-01 7.3915070e-01 1.3036493e+00 2.8957934e+00 5.7692301e-01
 5.2707678e-01 1.0327747e+00 1.2108947e+00 1.6490760e+00 2.8376873e+00
 1.2509425e+00 9.6479946e-01 4.0623400e-01 2.7165284e+00 1.8427442e-01
 1.1143923e+00 1.0399740e+00 1.5313436e+00 1.5334581e+00 3.6911032e-01
 9.6344399e-01 1.0879428e+00 9.1176105e-01 7.2174090e-01 1.7561764e+00
 6.5195864e-01 8.3050537e-01 1.0654820e+00 2.7531805e-02 1.1963762e+00
 2.009

## Preprocess dataset

- Remove unnecessary columns
- Remove all other columns except `overall`, `reviewerID`, `asin`, `image` 
- Create new column with image filename

In [19]:
def add_image_filenames(category, df):
  # Remove unnecessary columns
  df = df[['overall','reviewerID', 'asin', 'image']]

  filenames = []
  for row_index in df.index:
    each_files = []
    for idx in range(len(df.loc[row_index]['image'])):
      each_files.append(os.path.join(path, category+'/'+f'{row_index}_{idx}.jpg'))
    filenames.append(each_files)

  # Add new column
  df.drop('image', axis=1, inplace=True)
  df['image_filename'] = list(filenames)

  return df

In [20]:
luxury_df = add_image_filenames('luxury', luxury_df)
beauty_df = add_image_filenames('beauty', beauty_df)
fashion_df = add_image_filenames('fashion', fashion_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Combine into one dataframe

- `luxury_df`, `fashion_df`, beauty_df

In [21]:
luxury_df.head()

Unnamed: 0,overall,reviewerID,asin,image_filename
68,5.0,A2BHOZILR7SY9,B000142FVW,[/content/drive/MyDrive/data/amazon_reviews/lu...
75,5.0,ACMSQCH1H7JZD,B000142FVW,[/content/drive/MyDrive/data/amazon_reviews/lu...
86,5.0,A2L77YQRAEA1YZ,B000142FVW,[/content/drive/MyDrive/data/amazon_reviews/lu...
88,5.0,A28W77RPDZK7AZ,B00014351Q,[/content/drive/MyDrive/data/amazon_reviews/lu...
104,5.0,A2IV70BWQBUF32,B00014351Q,[/content/drive/MyDrive/data/amazon_reviews/lu...


In [23]:
data_list = []
dataframe_list = [('luxury',luxury_df), ('beauty', beauty_df), ('fashion', fashion_df)]
for dataframe in dataframe_list:
  category = dataframe[0]
  df = dataframe[1]
  for index, row in df.iterrows():
    for filename in row['image_filename']:
      data_tuple = (category, row['overall'], row['reviewerID'], row['asin'], filename)
      data_list.append(data_tuple)

In [24]:
combined_df = pd.DataFrame(data=data_list,columns=['category', 'overall', 'reviewerID', 'asin', 'filename'])
print(combined_df.head())
print(combined_df.shape)

  category  ...                                           filename
0   luxury  ...  /content/drive/MyDrive/data/amazon_reviews/lux...
1   luxury  ...  /content/drive/MyDrive/data/amazon_reviews/lux...
2   luxury  ...  /content/drive/MyDrive/data/amazon_reviews/lux...
3   luxury  ...  /content/drive/MyDrive/data/amazon_reviews/lux...
4   luxury  ...  /content/drive/MyDrive/data/amazon_reviews/lux...

[5 rows x 5 columns]
(1368, 5)
