-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_artworks.py
98 lines (83 loc) · 5.68 KB
/
scrape_artworks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from googleapiclient.discovery import build
from google.oauth2 import service_account
# read line from the .txt file
lines = open('artstation_links_copy.txt').readlines()
options = Options()
options.page_load_strategy = 'normal'
driver = webdriver.Chrome(options = options)
# looping from the end of a list
for line in lines[::-1]:
url = line
driver.get(url)
try:
role_not_fixed = driver.find_element_by_css_selector('div.wrapper-main').text
role_fixed = role_not_fixed.split('\n')[1]
except:
role_fixed = None
try:
# looking for the word "at" and splitting text on that occurence.
word_at = ' at '
before, keyword, after = role_fixed.partition(word_at)
company_work_at = after.strip()
except:
company_work_at = None
try:
art_title = driver.find_element_by_css_selector('h1.h3').text
except:
art_title = None
try:
date_posted_not_fixed = driver.find_element_by_xpath('/html/body/div[3]/div[2]/div/div[1]/aside/div/small[1]/i').text
date_posted_fixed = date_posted_not_fixed.split(' ')[1:]
date_posted = ' '.join(date_posted_fixed)
except:
date_posted = None
try:
number_likes = driver.find_element_by_xpath('/html/body/div[3]/div[2]/div/div[1]/aside/div/ul/li[1]/as-likes/button/as-pluralize/span').text.replace(',', '')
except:
number_likes = '0'
try:
number_views = driver.find_element_by_xpath('/html/body/div[3]/div[2]/div/div[1]/aside/div/ul/li[2]/as-pluralize/span').text.replace(',', '')
except:
number_views = '0'
try:
number_comments = driver.find_element_by_xpath('/html/body/div[3]/div[2]/div/div[1]/aside/div/ul/li[3]/as-pluralize/span').text.replace(',', '')
except:
number_comments = '0'
try:
software_used_not_fixed = driver.find_element_by_css_selector('div.wrapper-main').text
word = 'Software Used'
# looking for the word "Software Used" and splitting text on that occurence.
before, keyword, after = software_used_not_fixed.partition(word)
# looking for any occurence using regex
pattern = re.findall('Blender|Photoshop|ZBrush|ScandyPro|Maya|Procreate|double shot espresso|Double Shot espresso|Double shot espresso|Nuke|Mari|Krakatoa|RealFlow|ArtFlow|Flash|Flame Painer|CrazyBump|Frost|FumeFX|Phoenix FD|Photoscan|ScansLibrary|Inkscape|GIMP|CRYENGINE|Frostbite Engine|MagicaVoxel|Black Magic Desing Fusion|Fusion 360|Coiffure|RealityCapture|FireAlpaca|Realtime Board|Dreams|Dreamweaver|Adobe Draw|Snowdrop|Final Draft|Hydra Renderer|zbursh|DragonBones|HDR Light Studio|CorelDRAW Graphics Studio|DragonFrame|SpeedTree|Illustrator Draw|Illust Studio|Adobe XD|Adobe Lightroom|Hair Strand Designer|trueSKY|Source Control|ArtRage|Sculptris|PFTrack|The Groove 3D|UNIGINE Engine|Character Creator|Acrobat|Gaea|Manga Studio|Game Textures|Sublime Text|RD-Textures|PolyBrush|Poliigon|InVision Draft|PureRef|Mischief|Yeti|Knald|TYFlow|UVLayout|Nomad Sculpt|Substance Painter|Substance Designer|FiberShop|Unreal Engine|After Effects|Lightroom|Octane Render|3DCoat|World Machine|3ds Max|V-Ray|Forest Pack|Ornatrix|Marvelous Designer|Arnold|Xgen|Cinema 4D|Mixer|Bridge|Marmoset Toolbag|Keyshot|RizomUV|Clarisse IFX|QuarkXPress|E-Cycles|iClone|X-Particles|SketchClub|RailClone|Cycles 4D|Qubicle|Sletchfab|Premiere|Lumion|Quixel|Quixel Mixer|Substance ALchemist|MightyBake|Modo|3Delight|AIR|Unity|Enscape|SOLIDWORKS Visualize|RenderMan|Verge3D|Thea Render|Mental Ray|Twinmotion|Maxwell Render|Redshift|Corona Renderer|Corona|Houdini|Mantra|Karma|formZ|Toolbag|LightWave 3D|Iray|LuxRender|YafaRay|FurryBall|FluidRay|Katana|Light Tracer Render|EEVEE|Cycles|LuxCoreRender|Raylectron|Radiance|Vegas Pro|SolidWorks|Rhinoceros|Moi3D|Moi|3dpeople|Python|PhotoLab 2|Shotgun|Photoshop Elements|Indigo Renderer|Gelato|Sketch App|SketchUp|AutoCAD|ArchiCAD|Mudbox|MudBox|Figma|Pixelmator|Zmodeler|R3DS Wrap|R3DS ZWRAP|TopoGun|Texturingxyz|xNormal|CLIP STUDIO PAINT|TVPaint|Megascans|Illustrator', after)
# joining finded occurenceses on the new line
software_used = '\n'.join(pattern)
except:
software_used = None
try:
tags_not_fixed = driver.find_element_by_css_selector('div.wrapper-main').text
tags_fixed = tags_not_fixed.split('\n')[::-1][1].split('#')[1:]
tags_final = '\n'.join(tags_fixed)
except:
tags_final = None
# Watch this Google Sheets Python API tutorial: https://bit.ly/3bZZxd6
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']
SERVICE_ACCOUNT_FILE = 'keys.json' # json keys from google cloud platform.
creds = None
creds = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes = SCOPES)
SAMPLE_SPREADSHEET_ID = 'YOUR_SPREADSHEET_ID'
service = build('sheets', 'v4', credentials = creds)
sheet = service.spreadsheets()
columns = [[role_fixed], [company_work_at], [date_posted], [number_views], [number_likes], [number_comments], [software_used], [tags_final], [art_title], [url]]
request = service.spreadsheets().values().append(spreadsheetId = SAMPLE_SPREADSHEET_ID,
range = "Main!A2", valueInputOption = "USER_ENTERED",
body = {"majorDimension": "COLUMNS",
"values": columns}).execute()
print(f'Writing: {request}')
# deleting each line after it scraped. Note there's could be a better solution
del lines[-1]
open('artstation_links_copy.txt', 'w').writelines(lines)
driver.quit()