# 이미지 Feature vector를 활용해보기!

- 여러 논문에서 소개된것처럼 pre-trained CNN으로 image features를 추출해봅니다.
- image features를 비교하여 실제로 비슷한지 판단하고, rating정보와 함께 분석해봅니다.


In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
path = './../data/amazon_reviews'

## Load dataset

1. AMAZON_FASHION_5.json
2. All_Beauty_5.json
3. Luxury_Beauty_5.json

In [4]:
import os, json
import pandas as pd

In [6]:
def load_json(filename):
  data = []
  with open(os.path.join(path, filename), 'r', encoding='utf-8') as f:
      for l in f:
        data.append(json.loads(l.strip()))

  df = pd.DataFrame.from_dict(data)

  # get rows which contains image 
  df = df[~df['image'].isnull()]

  return df

In [7]:
fashion_df = load_json('AMAZON_FASHION_5.json')
print(fashion_df.shape)
print(fashion_df.head())

(106, 12)
     overall  verified   reviewTime      reviewerID        asin  \
164      5.0      True  04 18, 2018  A2YZERYQTLB8NG  B001IKJOLW   
172      5.0      True   04 7, 2018  A1CKPC88NHMYGR  B001IKJOLW   
179      5.0      True  03 22, 2018  A3KKVVAINMZF9D  B001IKJOLW   
192      5.0      True  02 27, 2018  A3TLWN2BRF1QH5  B001IKJOLW   
197      5.0      True  02 20, 2018  A3RNGBSBRJ3YAQ  B001IKJOLW   

                                                 style      reviewerName  \
164  {'Size:': ' 9.5 B(M) US', 'Color:': ' Black/Wh...             Lenci   
172  {'Size:': ' 11 B(M) US', 'Color:': ' Wolf Grey...     Cynthia Foyer   
179  {'Size:': ' 9 B(M) US', 'Color:': ' Blue Tint/...  Nadege Marcellus   
192  {'Size:': ' 8.5 B(M) US', 'Color:': ' Blue Tin...       Brittany C.   
197  {'Size:': ' 7.5 B(M) US', 'Color:': ' Wolf Gre...    Andrea L Hogan   

                                            reviewText  \
164  Best tennis shoes I've had all my life. Very c...   
172           

In [9]:
beauty_df = load_json('All_Beauty_5.json')
print(beauty_df.shape)
print(beauty_df.head())

(98, 12)
    overall  verified   reviewTime      reviewerID        asin  \
19      5.0      True  04 23, 2018   AX0ZEGHH0H525  B00006L9LC   
20      5.0      True  04 22, 2018  A1L0QECT7J93ZP  B00006L9LC   
21      5.0      True  04 21, 2018  A1VN560NNZQIR0  B00006L9LC   
34      1.0      True  03 27, 2018  A2V608ILSK1M5R  B00006L9LC   
47      5.0      True  02 23, 2018  A22V1MD93T2FW9  B00006L9LC   

                  style   reviewerName  \
19  {'Size:': ' Small'}         Aida A   
20  {'Size:': ' Small'}          Elena   
21  {'Size:': ' Small'}     Shablinska   
34  {'Size:': ' Small'}       CDART815   
47  {'Size:': ' Small'}  Heather Sharp   

                                           reviewText  \
19  Suffered from itchiness under my hair for coup...   
20  Got this product for me and  my daughter. I ca...   
21  Cleansing properties are above any praise! Sup...   
34  My product was not sealed and either used or s...   
47  I bought this for my husband. Hed been having ...   

In [8]:
luxury_df = load_json('Luxury_Beauty_5.json')
print(luxury_df.shape)
print(luxury_df.head())

(617, 12)
     overall  verified   reviewTime      reviewerID        asin  \
68       5.0      True   03 5, 2018   A2BHOZILR7SY9  B000142FVW   
75       5.0      True  01 13, 2018   ACMSQCH1H7JZD  B000142FVW   
86       5.0      True  10 17, 2017  A2L77YQRAEA1YZ  B000142FVW   
88       5.0      True   03 5, 2017  A28W77RPDZK7AZ  B00014351Q   
104      5.0     False  08 21, 2016  A2IV70BWQBUF32  B00014351Q   

                                           style   reviewerName  \
68             {'Color:': ' Tickle My France-y'}    MustangMary   
75                    {'Color:': ' Samoan Sand'}            KED   
86                    {'Color:': ' Samoan Sand'}           ELLE   
88   {'Color:': ' Crawfishin' for a Compliment'}         Angela   
104          {'Color:': ' Rich Girls & Po-Boys'}  Paige Sanders   

                                            reviewText  \
68   This is the neutral I was searching for this w...   
75   Very light color. I'm pale and it matches my s...   
86   Best 

## Download images

In [10]:
from tqdm import tqdm
import requests

In [11]:
def download_images(path, df, category):
  folder_path = os.path.join(path, category)

  if not os.path.exists(folder_path):
    os.makedirs(folder_path)

  for index in tqdm(df.index):
    url_list = df['image'].loc[index]
    for url_index, url in enumerate(url_list):
      if not os.path.exists(os.path.join(folder_path, f'{str(index)}_{str(url_index)}.jpg')):
        img_data = requests.get(url).content
        with open(os.path.join(folder_path, f'{str(index)}_{str(url_index)}.jpg'), 'wb') as handler:
          handler.write(img_data)

  print(f'{category}: {len(df.index)} images downloaded or already exist...')

In [12]:
download_images(path, beauty_df, 'beauty')

100%|██████████| 98/98 [00:14<00:00,  6.91it/s]

beauty: 98 images downloaded or already exist...





In [13]:
download_images(path, fashion_df, 'fashion')

100%|██████████| 106/106 [00:10<00:00,  9.78it/s]

fashion: 106 images downloaded or already exist...





In [14]:
download_images(path, luxury_df, 'luxury')

100%|██████████| 617/617 [04:30<00:00,  2.28it/s]

luxury: 617 images downloaded or already exist...





## Use pre-trained CNN

In [15]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.autograd import Variable
from PIL import Image

In [16]:
# Load the pretrained model
model = models.resnet18(pretrained=True)
# Use the model object to select the desired layer
layer = model._modules.get('avgpool')

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\user/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

In [17]:
# Set model to evaluation mode
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [21]:
scaler = transforms.Scale((224, 224))
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
to_tensor = transforms.ToTensor()

In [22]:
def get_vector(image_name):
    # 1. Load the image with Pillow library
    img = Image.open(image_name)
    # 2. Create a PyTorch Variable with the transformed image
    t_img = Variable(normalize(to_tensor(scaler(img))).unsqueeze(0))
    # 3. Create a vector of zeros that will hold our feature vector
    #    The 'avgpool' layer has an output size of 512
    my_embedding = torch.zeros(512)
    # 4. Define a function that will copy the output of a layer
    def copy_data(m, i, o):
      my_embedding.copy_(o.data.reshape(o.data.size(1)))
    # 5. Attach that function to our selected layer
    h = layer.register_forward_hook(copy_data)
    # 6. Run the model on our transformed image
    model(t_img)
    # 7. Detach our copy function from the layer
    h.remove()
    # 8. Return the feature vector
    return my_embedding.cpu().detach().numpy()

In [24]:
# Test
category = 'beauty'
for image_file in os.listdir(os.path.join(path, category))[:2]:
  print(f"{image_file} feature vectors")
  print(get_vector(os.path.join(path, category+'/'+image_file)))

1330_0.jpg feature vectors
[4.59707260e-01 4.80601370e-01 1.39175427e+00 8.23365450e-01
 9.03181911e-01 1.17401791e+00 2.28968835e+00 1.21745980e+00
 1.00720227e+00 1.87512845e-01 2.47374162e-01 7.89716780e-01
 1.51985228e+00 9.11718428e-01 1.47356331e-01 4.77697283e-01
 5.50643764e-02 1.16396749e+00 6.26493156e-01 1.20511258e+00
 4.01770651e-01 3.04142207e-01 2.15162373e+00 6.18596445e-04
 8.73278856e-01 4.15101111e-01 1.45628715e+00 4.61923145e-02
 4.48417544e-01 3.88405859e-01 2.23538709e+00 1.34194601e+00
 9.40538347e-01 4.17112857e-01 1.10655046e+00 2.27401182e-01
 1.27639794e+00 4.43568885e-01 1.79698598e+00 4.94024873e-01
 3.46500188e-01 9.58517194e-02 7.20613062e-01 1.84116757e+00
 1.55660486e+00 1.84616238e-01 1.32214189e+00 1.87435925e-01
 2.75410652e-01 4.54258285e-02 3.16850811e-01 9.46917951e-01
 1.24328983e+00 1.45313108e+00 8.99534583e-01 2.22507954e+00
 8.35970268e-02 9.91522968e-01 7.52461016e-01 2.29401612e+00
 4.85059261e-01 6.27349794e-01 2.89514780e-01 3.36589766e+

## Preprocess dataset

- Remove unnecessary columns
- Remove all other columns except `overall`, `reviewerID`, `asin`, `image` 
- Create new column with image filename

In [25]:
def add_image_filenames(category, df):
  # Remove unnecessary columns
  df = df[['overall','reviewerID', 'asin', 'image']]

  filenames = []
  for row_index in df.index:
    each_files = []
    for idx in range(len(df.loc[row_index]['image'])):
      each_files.append(os.path.join(path, category+'/'+f'{row_index}_{idx}.jpg'))
    filenames.append(each_files)

  # Add new column
  df.drop('image', axis=1, inplace=True)
  df['image_filename'] = list(filenames)

  return df


In [26]:
luxury_df = add_image_filenames('luxury', luxury_df)
beauty_df = add_image_filenames('beauty', beauty_df)
fashion_df = add_image_filenames('fashion', fashion_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


### Combine into one dataframe

- `luxury_df`, `fashion_df`, beauty_df

In [27]:
luxury_df.head()

Unnamed: 0,overall,reviewerID,asin,image_filename
68,5.0,A2BHOZILR7SY9,B000142FVW,[./../data/amazon_reviews\luxury/68_0.jpg]
75,5.0,ACMSQCH1H7JZD,B000142FVW,[./../data/amazon_reviews\luxury/75_0.jpg]
86,5.0,A2L77YQRAEA1YZ,B000142FVW,[./../data/amazon_reviews\luxury/86_0.jpg]
88,5.0,A28W77RPDZK7AZ,B00014351Q,"[./../data/amazon_reviews\luxury/88_0.jpg, ./...."
104,5.0,A2IV70BWQBUF32,B00014351Q,[./../data/amazon_reviews\luxury/104_0.jpg]


In [28]:
data_list = []
dataframe_list = [('luxury',luxury_df), ('beauty', beauty_df), ('fashion', fashion_df)]
for dataframe in dataframe_list:
  category = dataframe[0]
  df = dataframe[1]
  for index, row in df.iterrows():
    for filename in row['image_filename']:
      data_tuple = (category, row['overall'], row['reviewerID'], row['asin'], filename)
      data_list.append(data_tuple)

In [29]:
combined_df = pd.DataFrame(data=data_list,columns=['category', 'overall', 'reviewerID', 'asin', 'filename'])
print(combined_df.head())
print(combined_df.shape)

  category  overall      reviewerID        asin  \
0   luxury      5.0   A2BHOZILR7SY9  B000142FVW   
1   luxury      5.0   ACMSQCH1H7JZD  B000142FVW   
2   luxury      5.0  A2L77YQRAEA1YZ  B000142FVW   
3   luxury      5.0  A28W77RPDZK7AZ  B00014351Q   
4   luxury      5.0  A28W77RPDZK7AZ  B00014351Q   

                                   filename  
0  ./../data/amazon_reviews\luxury/68_0.jpg  
1  ./../data/amazon_reviews\luxury/75_0.jpg  
2  ./../data/amazon_reviews\luxury/86_0.jpg  
3  ./../data/amazon_reviews\luxury/88_0.jpg  
4  ./../data/amazon_reviews\luxury/88_1.jpg  
(1368, 5)


### Get image feature vectors

In [30]:
combined_df['image_vec'] = combined_df['filename'].apply(lambda x: get_vector(x))
combined_df.to_csv(os.path.join(path, 'image_dataset.csv'), sep='\t')

In [31]:
combined_df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec
0,luxury,5.0,A2BHOZILR7SY9,B000142FVW,./../data/amazon_reviews\luxury/68_0.jpg,"[0.5511113, 0.86447924, 0.101986535, 0.2385793..."
1,luxury,5.0,ACMSQCH1H7JZD,B000142FVW,./../data/amazon_reviews\luxury/75_0.jpg,"[1.5448722, 1.135253, 0.14166667, 0.29448533, ..."
2,luxury,5.0,A2L77YQRAEA1YZ,B000142FVW,./../data/amazon_reviews\luxury/86_0.jpg,"[0.9510973, 1.2491927, 0.17508696, 0.63649607,..."
3,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,./../data/amazon_reviews\luxury/88_0.jpg,"[0.86477274, 0.4211394, 0.17356645, 1.435113, ..."
4,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,./../data/amazon_reviews\luxury/88_1.jpg,"[1.2491896, 0.44804484, 1.3037115, 1.6371999, ..."


## K-means clustering

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import numpy as np

In [34]:
def check_vector(vector):
  return np.array([0.0 if str(x) == '' else float(x) for x in vector])[:512]

In [35]:
df = pd.read_csv(os.path.join(path, 'image_dataset.csv'), sep='\t', index_col=0,
                 converters={"image_vec": lambda x: x.strip("[]").replace('\n','').split(" ")})


In [36]:
df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec
0,luxury,5.0,A2BHOZILR7SY9,B000142FVW,./../data/amazon_reviews\luxury/68_0.jpg,"[5.51111281e-01, 8.64479244e-01, 1.01986535e-0..."
1,luxury,5.0,ACMSQCH1H7JZD,B000142FVW,./../data/amazon_reviews\luxury/75_0.jpg,"[1.54487216e+00, 1.13525295e+00, 1.41666666e-0..."
2,luxury,5.0,A2L77YQRAEA1YZ,B000142FVW,./../data/amazon_reviews\luxury/86_0.jpg,"[9.51097310e-01, 1.24919271e+00, 1.75086960e-0..."
3,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,./../data/amazon_reviews\luxury/88_0.jpg,"[8.64772737e-01, 4.21139389e-01, 1.73566446e-0..."
4,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,./../data/amazon_reviews\luxury/88_1.jpg,"[1.24918962e+00, 4.48044837e-01, 1.30371153e+0..."


In [37]:
df['image_vec'] = df['image_vec'].apply(lambda x: check_vector(x))

In [38]:
df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec
0,luxury,5.0,A2BHOZILR7SY9,B000142FVW,./../data/amazon_reviews\luxury/68_0.jpg,"[0.551111281, 0.864479244, 0.101986535, 0.2385..."
1,luxury,5.0,ACMSQCH1H7JZD,B000142FVW,./../data/amazon_reviews\luxury/75_0.jpg,"[1.54487216, 1.13525295, 0.141666666, 0.294485..."
2,luxury,5.0,A2L77YQRAEA1YZ,B000142FVW,./../data/amazon_reviews\luxury/86_0.jpg,"[0.95109731, 1.24919271, 0.17508696, 0.6364960..."
3,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,./../data/amazon_reviews\luxury/88_0.jpg,"[0.864772737, 0.421139389, 0.173566446, 1.4351..."
4,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,./../data/amazon_reviews\luxury/88_1.jpg,"[1.24918962, 0.448044837, 1.30371153, 1.637199..."


In [39]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1234)
print(train_df.shape)
print(test_df.shape)

(1094, 6)
(274, 6)


In [40]:
X_train = np.array([list(x) for x in train_df['image_vec'].values])

In [41]:
X_train[:3]

array([[0.77896273, 0.42345819, 0.40614778, ..., 0.3404167 , 0.75824237,
        0.12758844],
       [0.18728934, 0.81857044, 0.43446231, ..., 0.1557987 , 0.25018808,
        1.37156463],
       [0.03194912, 1.10710597, 0.90613908, ..., 0.03438348, 0.09282351,
        2.05421495]])

In [42]:
kmeans = KMeans(n_clusters=3, random_state=0).fit(X_train)

In [43]:
kmeans.labels_

array([2, 1, 0, ..., 2, 1, 2])

In [44]:
test_df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec
266,luxury,5.0,A24YNYNC6QJNBN,B002B4540O,./../data/amazon_reviews\luxury/13577_2.jpg,"[0.854530513, 1.85690045, 1.03045952, 0.185951..."
1088,beauty,5.0,A245UNW3PI53NG,B0009RF9DW,./../data/amazon_reviews\beauty/496_0.jpg,"[1.13625371, 0.0897669867, 0.442607284, 1.3276..."
488,luxury,4.0,A25QBCHO0KFT0P,B00B95PWYE,./../data/amazon_reviews\luxury/21446_4.jpg,"[0.0355344117, 1.64337778, 0.0617780983, 0.030..."
1087,beauty,5.0,A85ENSL5HBBZF,B0009RF9DW,./../data/amazon_reviews\beauty/495_0.jpg,"[0.909579039, 0.423226923, 1.81325483, 0.23797..."
1152,beauty,4.0,A25QBCHO0KFT0P,B0010ZBORW,./../data/amazon_reviews\beauty/3194_0.jpg,"[1.35672057, 0.251046032, 1.97370028, 0.212662..."


### Evaluation

In [45]:
# kmeans.predict([test_df['image_vec'].iloc[0]])
test_df['prediction'] = test_df['image_vec'].apply(lambda x: kmeans.predict([x])[0])
test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec,prediction
266,luxury,5.0,A24YNYNC6QJNBN,B002B4540O,./../data/amazon_reviews\luxury/13577_2.jpg,"[0.854530513, 1.85690045, 1.03045952, 0.185951...",1
1088,beauty,5.0,A245UNW3PI53NG,B0009RF9DW,./../data/amazon_reviews\beauty/496_0.jpg,"[1.13625371, 0.0897669867, 0.442607284, 1.3276...",1
488,luxury,4.0,A25QBCHO0KFT0P,B00B95PWYE,./../data/amazon_reviews\luxury/21446_4.jpg,"[0.0355344117, 1.64337778, 0.0617780983, 0.030...",0
1087,beauty,5.0,A85ENSL5HBBZF,B0009RF9DW,./../data/amazon_reviews\beauty/495_0.jpg,"[0.909579039, 0.423226923, 1.81325483, 0.23797...",1
1152,beauty,4.0,A25QBCHO0KFT0P,B0010ZBORW,./../data/amazon_reviews\beauty/3194_0.jpg,"[1.35672057, 0.251046032, 1.97370028, 0.212662...",2


In [47]:
test_df.groupby('category')['prediction'].count()

category
beauty      35
fashion     28
luxury     211
Name: prediction, dtype: int64

In [48]:
test_df.groupby('category').count()

Unnamed: 0_level_0,overall,reviewerID,asin,filename,image_vec,prediction
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
beauty,35,35,35,35,35,35
fashion,28,28,28,28,28,28
luxury,211,211,211,211,211,211


In [49]:
test_df.groupby('prediction').count()

Unnamed: 0_level_0,category,overall,reviewerID,asin,filename,image_vec
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,91,91,91,91,91,91
1,101,101,101,101,101,101
2,82,82,82,82,82,82


In [50]:
print(test_df[(test_df.prediction == 0) & (test_df.category == 'luxury')].shape)
print(test_df[(test_df.prediction == 1) & (test_df.category == 'luxury')].shape)
print(test_df[(test_df.prediction == 2) & (test_df.category == 'luxury')].shape)

(71, 7)
(73, 7)
(67, 7)


## K-Nearest Neighbors

In [51]:
from sklearn.neighbors import KNeighborsClassifier

In [52]:
neigh = KNeighborsClassifier(n_neighbors=3)

- `category`가 아닌 `overall`로 간단하게 평점 예측하기

In [53]:
y_train = train_df['overall'].values
y_train

array([4., 4., 5., ..., 5., 4., 5.])

In [54]:
neigh.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [55]:
test_df['prediction'] = test_df['image_vec'].apply(lambda x: neigh.predict([x])[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [56]:
test_df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec,prediction
266,luxury,5.0,A24YNYNC6QJNBN,B002B4540O,./../data/amazon_reviews\luxury/13577_2.jpg,"[0.854530513, 1.85690045, 1.03045952, 0.185951...",5.0
1088,beauty,5.0,A245UNW3PI53NG,B0009RF9DW,./../data/amazon_reviews\beauty/496_0.jpg,"[1.13625371, 0.0897669867, 0.442607284, 1.3276...",5.0
488,luxury,4.0,A25QBCHO0KFT0P,B00B95PWYE,./../data/amazon_reviews\luxury/21446_4.jpg,"[0.0355344117, 1.64337778, 0.0617780983, 0.030...",4.0
1087,beauty,5.0,A85ENSL5HBBZF,B0009RF9DW,./../data/amazon_reviews\beauty/495_0.jpg,"[0.909579039, 0.423226923, 1.81325483, 0.23797...",5.0
1152,beauty,4.0,A25QBCHO0KFT0P,B0010ZBORW,./../data/amazon_reviews\beauty/3194_0.jpg,"[1.35672057, 0.251046032, 1.97370028, 0.212662...",4.0


In [57]:
test_df[test_df.overall == test_df.prediction].count()

category      163
overall       163
reviewerID    163
asin          163
filename      163
image_vec     163
prediction    163
dtype: int64

In [58]:
len(test_df)

274

In [60]:
163/274

0.5948905109489051