In [1]:
import boto3
import json
import pandas as pd
from boto3 import client

In [2]:
# Get list of Rekognition responses in responses folder of s3 bucket

s3r = boto3.resource('s3')
bucket = s3r.Bucket('judge-a-book')
files_in_bucket = list(bucket.objects.all())

obj_list = [obj.key for obj in files_in_bucket if obj.key.startswith('responses/') and obj.key.endswith('json')]

In [3]:
# Extract label information from Rekognition responses
conn = client('s3')
res_list = []
for key in obj_list:
    response = conn.get_object(Bucket='judge-a-book', Key=key)
    text = response["Body"].read().decode()
    res_list.append((key, json.loads(text))) 

In [14]:
res_list

[('responses/0001642081.json',
  {'Labels': [{'Name': 'Text',
     'Confidence': 99.81629943847656,
     'Instances': [],
     'Parents': []},
    {'Name': 'Menu',
     'Confidence': 88.78952026367188,
     'Instances': [{'BoundingBox': {'Width': 0.9102855324745178,
        'Height': 0.9439492225646973,
        'Left': 0.03298455476760864,
        'Top': 0.032055750489234924},
       'Confidence': 85.45950317382812}],
     'Parents': [{'Name': 'Text'}]},
    {'Name': 'Label',
     'Confidence': 86.8148422241211,
     'Instances': [],
     'Parents': [{'Name': 'Text'}]},
    {'Name': 'Paper',
     'Confidence': 72.81261444091797,
     'Instances': [],
     'Parents': []},
    {'Name': 'Poster',
     'Confidence': 71.39962768554688,
     'Instances': [],
     'Parents': [{'Name': 'Advertisement'}]},
    {'Name': 'Advertisement',
     'Confidence': 71.39962768554688,
     'Instances': [],
     'Parents': []},
    {'Name': 'Flyer',
     'Confidence': 63.58919906616211,
     'Instances': []

In [8]:
# Compiles Rekognition label information into DataFrame compatible list format

label_list = []
no_label_list = []
for res in res_list:
    labels = res[1]['Labels']
    if len(labels) == 0:
        ISBN = res[0][10:-5]
        no_label_list.append(ISBN)
    else:
        for label in res[1]['Labels']:
            ISBN = res[0][10:-5]
            Name = label['Name']
            Confidence = label['Confidence']
            Instances = label['Instances']
            numInst = len(label['Instances'])
            label_list.append({'ISBN': ISBN, 
                               'Name': Name, 
                               'Confidence': Confidence, 
                               'Instances': Instances, 
                               'Num_Instances': numInst})
        
# Create DataFrame from label_list

df = pd.DataFrame(label_list)

In [10]:
no_labels = pd.DataFrame(no_label_list)

In [12]:
len(no_labels)

23

#### Theres 23 book covers that has no labels

In [18]:
df.head()

Unnamed: 0,ISBN,Name,Confidence,Instances,Num_Instances
0,1642081,Text,99.816299,[],0
1,1642081,Menu,88.78952,"[{'BoundingBox': {'Width': 0.9102855324745178,...",1
2,1642081,Label,86.814842,[],0
3,1642081,Paper,72.812614,[],0
4,1642081,Poster,71.399628,[],0


In [19]:
# Import goodreads' children's books dataframe

goodreads = pd.read_json('goodreads_books_children.json', lines=True)
goodreads.head()

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,1599150603,7,[],US,,"[{'count': '56', 'name': 'to-read'}, {'count':...",,False,4.13,B00DU10PUG,...,9.0,,2006.0,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,287141,46,278578,The Aeneid for Boys and Girls,The Aeneid for Boys and Girls
1,1934876569,6,[151854],US,,"[{'count': '515', 'name': 'to-read'}, {'count'...",,False,4.22,,...,3.0,,2009.0,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,6066812,98,701117,All's Fairy in Love and War (Avalon: Web of Ma...,All's Fairy in Love and War (Avalon: Web of Ma...
2,590417010,193,[],US,eng,"[{'count': '450', 'name': 'to-read'}, {'count'...",,False,4.43,B017RORXNI,...,9.0,,1995.0,https://www.goodreads.com/book/show/89378.Dog_...,https://images.gr-assets.com/books/1360057676m...,89378,1331,86259,Dog Heaven,Dog Heaven
3,915190575,4,[],US,,"[{'count': '8', 'name': 'to-read'}, {'count': ...",,False,4.29,,...,,,,https://www.goodreads.com/book/show/3209312-mo...,https://s.gr-assets.com/assets/nophoto/book/11...,3209312,11,3242879,"Moths and Mothers, Feathers and Fathers: A Sto...","Moths and Mothers, Feathers and Fathers: A Sto..."
4,1416904999,4,[],US,,"[{'count': '8', 'name': 'to-read'}, {'count': ...",,False,3.57,,...,6.0,,2005.0,https://www.goodreads.com/book/show/1698376.Wh...,https://s.gr-assets.com/assets/nophoto/book/11...,1698376,23,1695373,What Do You Do?,What Do You Do?


In [20]:
# Merge df with goodreads DataFrame

df1 = df.merge(goodreads, how='left', left_on='ISBN', right_on='isbn')
df1.head()

Unnamed: 0,ISBN,Name,Confidence,Instances,Num_Instances,isbn,text_reviews_count,series,country_code,language_code,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,1642081,Text,99.816299,[],0,1642081,4,[265777],US,,...,,,1970,https://www.goodreads.com/book/show/2657015-ba...,https://images.gr-assets.com/books/1307384593m...,2657015,32,2681793,"Ballet for Laura (Laura, #1)","Ballet for Laura (Laura, #1)"
1,1642081,Menu,88.78952,"[{'BoundingBox': {'Width': 0.9102855324745178,...",1,1642081,4,[265777],US,,...,,,1970,https://www.goodreads.com/book/show/2657015-ba...,https://images.gr-assets.com/books/1307384593m...,2657015,32,2681793,"Ballet for Laura (Laura, #1)","Ballet for Laura (Laura, #1)"
2,1642081,Label,86.814842,[],0,1642081,4,[265777],US,,...,,,1970,https://www.goodreads.com/book/show/2657015-ba...,https://images.gr-assets.com/books/1307384593m...,2657015,32,2681793,"Ballet for Laura (Laura, #1)","Ballet for Laura (Laura, #1)"
3,1642081,Paper,72.812614,[],0,1642081,4,[265777],US,,...,,,1970,https://www.goodreads.com/book/show/2657015-ba...,https://images.gr-assets.com/books/1307384593m...,2657015,32,2681793,"Ballet for Laura (Laura, #1)","Ballet for Laura (Laura, #1)"
4,1642081,Poster,71.399628,[],0,1642081,4,[265777],US,,...,,,1970,https://www.goodreads.com/book/show/2657015-ba...,https://images.gr-assets.com/books/1307384593m...,2657015,32,2681793,"Ballet for Laura (Laura, #1)","Ballet for Laura (Laura, #1)"


In [16]:
# Save df1 to a pickle

df1.to_pickle('BookLabels.pkl')

In [21]:
df1 = pd.read_pickle('BookLabels.pkl')

In [29]:
rbg_df = pd.read_csv('average_rbg.csv', index_col=0)

In [30]:
rbg_df.head()

Unnamed: 0,isbn,blue,green,red
0,1599901927,141.225092,177.505561,215.409007
1,8415594828,102.218585,190.058614,187.763558
2,515157635,63.026062,59.296692,59.767029
3,590819194,238.6267,238.833428,238.665763
4,395522781,195.211215,212.979706,219.304641


In [22]:
histograms_df = pd.read_csv('histograms.csv')

In [23]:
# Merge df1 with color histograms dataframe and save to pickle

df2 = df1.merge(histograms_df, how='left', left_on='ISBN', right_on='isbn')
df2.to_pickle('BookLabels_Histograms.pkl')

In [31]:
df3 = df2.merge(rbg_df, how='left', left_on='ISBN', right_on='isbn')

In [34]:
df3.shape

(66528, 136)

In [35]:
df3.to_pickle('BookLabels_Histograms_RBG.pkl')