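"""
artveeScraper.py

Scrapes artworks and their metadata (title, artist, category) from
https://artvee.com, writes the metadata to a csv and a json file, and uploads
the images and the json to an S3 bucket named "artvee".
"""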
import boto3
from botocore.exceptions import ClientError
from bs4 import BeautifulSoup
import csv
import json
import logging
import math
import os
import re
import requests


def create_bucket(bucket_name, s3, region=None):
    """
    Args:
        bucket_name (str): name of the bucket to create
        s3 (botocore.client.S3): s3 client used to create the bucket
        region (str): region in which to create the bucket; uses the client's
            default region when None
    Explanation:
        Creates the bucket, passing a LocationConstraint when a region is given
        Returns True on success and False (after logging the error) on a ClientError
    """
    try:
        if region is None:
            s3.create_bucket(Bucket=bucket_name)
        else:
            location = {'LocationConstraint': region}
            s3.create_bucket(Bucket=bucket_name,
                             CreateBucketConfiguration=location)
    except ClientError as e:
        logging.error(e)
        return False
    return True


def upload_file(file_name, bucket, s3, object_name=None):
    """
    Args:
        file_name (str): path of the local file to upload
        bucket (str): name of the destination bucket
        s3 (botocore.client.S3): s3 client through which the file is uploaded
        object_name (str): S3 object name; defaults to file_name when None
    Explanation:
        Uploads the file and returns True on success
        Logs the error and returns False on a ClientError
    """
    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name
    # Upload the file
    try:
        s3.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True
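
# A minimal usage sketch for the two S3 helpers above (hypothetical bucket and
# file names; assumes AWS credentials are already configured for boto3):
#
#   s3 = boto3.client('s3')
#   if create_bucket("example-bucket", s3, region="us-west-1"):
#       upload_file("example.txt", "example-bucket", s3, object_name="docs/example.txt")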


def create_json(csv_path, json_path):
    """
    Args:
        csv_path (str): file path for the csv
        json_path (str): file path for the json
    Explanation:
        Reads the csv and converts it to a dictionary keyed by artwork title
        Uses json.dumps() to serialize the data and write it to the json file
    """
    data = {}
    with open(csv_path, encoding='utf-8') as csvf:
        csv_reader = csv.DictReader(csvf)
        # Convert each row into a dictionary and add it to data
        for row in csv_reader:
            data[row['Title']] = row
    with open(json_path, 'w', encoding='utf-8') as jsonf:
        jsonf.write(json.dumps(data, indent=4))
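
# For illustration, a csv row such as (hypothetical values)
#   Title,Artist,Category
#   Water Lilies,Claude Monet,landscape
# becomes the following entry in the json, keyed by its Title:
#   "Water Lilies": {"Title": "Water Lilies", "Artist": "Claude Monet", "Category": "landscape"}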


def scrape_images(img_source, img_index, title, data_path, s3):
    """
    Args:
        img_source : list of the 'a' elements which link to the image download pages
        img_index (int): index of the current image among the 48 cards on the page
        title (str): name of the artwork, used in the file name
        data_path (str): directory where the image is temporarily stored
        s3 (botocore.client.S3): s3 client through which the image is uploaded
    Explanation:
        Follows the href in an element of img_source to the image download page
        Parses the download page with BeautifulSoup to get the direct link for the image
        Writes the image to file using requests and uploads it to s3
        Deletes the image from file after the upload (the with blocks close the streams)
    """
    img_dl_page = requests.get(img_source[img_index].get("href"))
    img_soup = BeautifulSoup(img_dl_page.content, "html.parser")
    img_link = img_soup.find("a", {"class": "prem-link gr btn btn-secondary dis snax-action snax-action-add-to-collection snax-action-add-to-collection-downloads"}).get("href")
    img_name = title + ".jpg"
    img_path = os.path.join(data_path, img_name)
    with open(img_path, "wb") as img_file:
        img_file.write(requests.get(img_link).content)
    with open(img_path, "rb") as s3_img:
        s3.upload_fileobj(s3_img, "artvee", img_name)
    os.remove(img_path)
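
# Note: requests.get(img_link).content above buffers the whole image in memory
# before writing it out. A streamed variant (a sketch, not what this scraper
# currently does) would download in chunks instead:
#
#   with requests.get(img_link, stream=True) as r:
#       with open(img_path, "wb") as img_file:
#           for chunk in r.iter_content(chunk_size=8192):
#               img_file.write(chunk)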


def scrape_meta_images(url, category, data_path, writer, s3):
    """
    Args:
        url (str): URL for the paginated category pages
        category (str): the category used in the url
        data_path (str): the path where the csv, json, and temporary images will be stored
        writer: csv writer that writes each scraped row to the csv
        s3 (botocore.client.S3): s3 client through which images are uploaded
    Explanation:
        Parses the page of 48 artworks and puts the cards, which contain the image and metadata, in a list
        Collects the links to each image's download page, to be used after scraping the metadata
        In each card, finds the title and artist and appends them to data
        Scrapes the image and uploads it
        Writes data to the csv and moves on to the next card
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    cards = soup.find_all("div", {"class": re.compile("product-grid-item product woodmart-hover-tiled*")})
    img_source = soup.find_all("a", {"class": "product-image-link linko"})
    img_index = 0
    for card in cards:
        data = []
        # Guard against a missing element/class (None) before calling get_text()
        title = card.find("h3", class_="product-title")
        if title is not None and title.find("a") is not None:
            title = title.get_text()
        else:
            title = "Untitled"
        data.append(title)
        artist_info = card.find("div", class_="woodmart-product-brands-links")
        if artist_info is not None:
            artist_info = artist_info.get_text()
        else:
            artist_info = "Unknown"
        data.append(artist_info)
        scrape_images(img_source, img_index, title, data_path, s3)
        data.append(category)
        writer.writerow(data)
        img_index += 1
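
# For a card whose markup contains (hypothetical, trimmed to the parts read above)
#   <h3 class="product-title"><a href="...">Starry Night</a></h3>
#   <div class="woodmart-product-brands-links">Vincent van Gogh</div>
# the row written to the csv for the "landscape" category would be:
#   ["Starry Night", "Vincent van Gogh", "landscape"]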


def count_pages(category):
    """
    Args:
        category (str): used in the url to find the page and its respective results
    Explanation:
        Parses the first page of a category
        Finds the number of results displayed on the page
        With 48 results displayed per page, divides by 48, rounding up for any remainder
        Returns the total number of pages to iterate through
    """
    url = "https://artvee.com/c/%s/page/1/?per_page=48" % category
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    # The count is displayed as e.g. "1234 results"; pull out the digits rather
    # than using str.strip("results"), which strips characters, not a substring
    result_text = soup.find("p", class_="woocommerce-result-count").text
    results = int(re.search(r"\d+", result_text).group())
    # Ceiling division: one extra page for a partial final page of results
    no_pages = math.ceil(results / 48)
    return no_pages
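
# Worked example (hypothetical count): a category with 1234 results yields
# math.ceil(1234 / 48) = 26 pages: 25 full pages of 48 plus one page for the
# remaining 34 artworks.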


if __name__ == "__main__":
    s3 = boto3.client('s3')
    create_bucket("artvee", s3, "us-west-1")
    data_path = ""
    csv_path = os.path.join(data_path, "artvee.csv")
    json_path = os.path.join(data_path, "artvee.json")
    if data_path == "":
        print("\nPlease assign a value to data_path\n")
    else:
        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            # Create csv writer and header row
            headers = ["Title", "Artist", "Category"]
            writer = csv.writer(f)
            writer.writerow(headers)
            # Artvee categorizes its works; these are the category slugs as they appear in the url
            categories = ["abstract", "figurative", "landscape", "religion", "mythology", "posters", "animals", "illustration", "fashion", "still-life", "historical", "botanical", "drawings", "japanese-art"]
            for category in categories:
                no_pages = count_pages(category)
                # Pagination
                for p in range(1, no_pages + 1):
                    print("Currently looking at: %s, page %d" % (category, p))
                    url = "https://artvee.com/c/%s/page/%d/?per_page=48" % (category, p)
                    scrape_meta_images(url, category, data_path, writer, s3)
        # Create the json after all data is written to the csv and upload it to the s3 bucket
        create_json(csv_path, json_path)
        with open(json_path, "rb") as s3_meta:
            s3.upload_fileobj(s3_meta, "artvee", "artveeMeta.json")
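
# To run (assumes AWS credentials for boto3 and write access to data_path):
#   1. Set data_path in the main block above, e.g. data_path = "artvee_data/"
#      (hypothetical path; the directory must exist).
#   2. python artveeScraper.py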