In [None]:
# !pip install -U beautifulsoup4 cloudscraper pandas pillow
# !pip install -U cjm_parallel_utils

In [None]:
import random
from pathlib import Path
import json
import re
import queue
from functools import partial

from bs4 import BeautifulSoup
import cloudscraper

import pandas as pd
# Use fully qualified option names: the shorthand 'max_colwidth' is resolved by pattern matching
# and can stop working if a future pandas release adds another option with a similar name.
pd.set_option('display.max_colwidth', None)  # Do not truncate the contents of cells in the DataFrame
pd.set_option('display.max_rows', None)  # Display all rows in the DataFrame
pd.set_option('display.max_columns', None)  # Display all columns in the DataFrame

In [None]:
from cjm_parallel_utils.core import parallel

In [None]:
# Shared HTTP client: used for the listing-page request below and inside get_attributes().
scraper = cloudscraper.create_scraper()

In [None]:
# Download the "new photos" listing page and parse it so the newest photo ids can be extracted.
url = "https://www.pexels.com/new-photos/"
page_resp = scraper.get(url)
# Fail fast on an HTTP error (e.g. a blocked request); otherwise the parse below would
# silently produce a page with no photo ids.
page_resp.raise_for_status()
html_soup = BeautifulSoup(page_resp.text, 'html.parser')

In [None]:
target_tag = 'article'

# Take the first "photos/<digits>" reference found in each <article> element;
# a set removes ids that appear in more than one element.
unique_ids = set()
for article in html_soup.select(target_tag):
    match = re.search(r"photos/(\d+)", str(article))
    if match:
        unique_ids.add(match.group(1))

# Sort numerically (newest first). The ids are strings, and a plain string sort would
# misorder ids with different digit counts (e.g. '9999999' > '15509817').
img_ids = sorted(unique_ids, key=int, reverse=True)
img_ids

['15509817',
 '15509790',
 '15509771',
 '15509711',
 '15509710',
 '15509709',
 '15508999',
 '15508666',
 '15504968',
 '15504460',
 '15500431',
 '15500423']

In [None]:
# The newest photo is the one with the numerically largest id. Taking the max by integer
# value does not depend on how `img_ids` happens to be ordered.
if not img_ids:
    raise RuntimeError("No photo ids were found on the listing page; cannot determine the latest id.")
latest_img_id = max(img_ids, key=int)
latest_img_id

'15509817'

In [None]:
def get_tag_terms(data_dict):
    """Collect the search term of every tag attached to a photo.

    Args:
        data_dict: Parsed page JSON containing
            ``data_dict['props']['pageProps']['medium']['attributes']['tags']``,
            an iterable of dicts that each have a ``'search_term'`` key.

    Returns:
        A list with one ``'search_term'`` value per tag, in the original order.
    """
    attributes = data_dict['props']['pageProps']['medium']['attributes']
    terms = []
    for tag_entry in attributes['tags']:
        terms.append(tag_entry['search_term'])
    return terms

In [None]:
def get_medium_attributes(data_dict):
    """Build a one-row DataFrame from a photo's ``medium`` attributes.

    Args:
        data_dict: Parsed page JSON containing
            ``data_dict['props']['pageProps']['medium']['attributes']``.

    Returns:
        A ``pd.DataFrame`` with one column per attribute that is not in the
        drop list, plus a ``tags`` column whose single cell (row label ``0``)
        holds the list returned by ``get_tag_terms``.
    """
    medium_attr_df = pd.DataFrame.from_dict(data_dict['props']['pageProps']['medium']['attributes'], orient='index')
    drop_list = ['description', 'width', 'height', 'slug', 'status', 'created_at', 'updated_at', 'publish_at', 'feed_at', 'license', 'published', 'starred', 'user', 'tags', 'image', 'alt', 'donate_url', 'collection_ids', 'liked']
    # errors='ignore': an attribute that is absent from the payload has nothing to drop;
    # without it a single missing key raises KeyError and the whole photo is lost.
    medium_attr_df = medium_attr_df.drop(drop_list, errors='ignore').transpose()
    # Create the column with a placeholder first, then store the list in the single cell;
    # assigning the list directly as a column would be interpreted as one value per row.
    medium_attr_df['tags'] = ['']
    medium_attr_df.at[0, 'tags'] = get_tag_terms(data_dict)
    return medium_attr_df

In [None]:
def get_medium_detail_attributes(data_dict):
    """Build a one-row DataFrame from a photo's ``mediumDetails`` attributes.

    Args:
        data_dict: Parsed page JSON containing
            ``data_dict['props']['pageProps']['mediumDetails']['attributes']``.

    Returns:
        A ``pd.DataFrame`` with one row and one column per attribute that is
        not in the drop list.
    """
    medium_detail_attr_df = pd.DataFrame.from_dict(data_dict['props']['pageProps']['mediumDetails']['attributes'], orient='index')
    drop_list = ['copyright', 'created_at', 'fingerprint', 'updated_at', 'photographer', 'photographed_at', 'size', 'photo_id']
    # errors='ignore': an attribute that is absent from the payload has nothing to drop;
    # without it a single missing key raises KeyError and the whole photo is lost.
    return medium_detail_attr_df.drop(drop_list, errors='ignore').transpose()

In [None]:
def get_attributes(img_id, attr_q, missing_q):
    """Scrape one photo page and route the outcome to one of two queues.

    Requests ``https://www.pexels.com/photo/{img_id}`` with the module-level
    ``scraper`` and parses the first ``<script type="application/json">``
    payload. If the payload describes a photo with a title, a one-row
    DataFrame indexed by ``id`` is put on ``attr_q``; in every other case
    (no ``medium`` entry, no title, or any error while parsing or building
    the frame) ``img_id`` is put on ``missing_q``.

    Args:
        img_id: Photo id to look up.
        attr_q: Queue receiving the attribute DataFrame on success.
        missing_q: Queue receiving ``img_id`` when no attributes were obtained.

    Returns:
        None. Results are delivered through the queues.
    """
    url = f"https://www.pexels.com/photo/{img_id}"
    page_resp = scraper.get(url)
    html_soup = BeautifulSoup(page_resp.text, 'html.parser')
    try:
        # A page without the JSON script tag makes find() return None, and a `null`
        # payload makes json.loads() return None; both fail below and are handled
        # by the except clause, so every id ends up in exactly one queue.
        next_data_dict = json.loads(html_soup.find('script', type='application/json').string)
        page_props = next_data_dict['props']['pageProps']
        if 'medium' not in page_props or page_props['medium']['attributes']['title'] is None:
            missing_q.put(img_id)
            return
        medium_attr_df = get_medium_attributes(next_data_dict)
        medium_detail_attr_df = get_medium_detail_attributes(next_data_dict)
        attributes_df = pd.concat([medium_attr_df, medium_detail_attr_df], axis=1).set_index('id')
        attr_q.put(attributes_df)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate and a long scraping run can be interrupted.
        missing_q.put(img_id)

In [None]:
# attr_q collects the scraped attribute DataFrames; missing_q collects ids with no usable data.
attr_q, missing_q = queue.Queue(), queue.Queue()

In [None]:
# Candidate ids: the 1000 consecutive ids lying 1000 to 1999 below the newest id, in descending order.
newest_id = int(latest_img_id)
img_ids = list(range(newest_id - 1000, newest_id - 2000, -1))
len(img_ids)

1000

In [None]:
# Smoke-test the scraper on a single id before launching the full parallel run.
get_attributes(img_ids[0], attr_q, missing_q)

# Number of results in each queue: (scraped, missing).
print(attr_q.qsize(), missing_q.qsize())

if attr_q.qsize() > 0:
    attributes_df = pd.concat(list(attr_q.queue))
# Only a top-level trailing expression is rendered by the notebook; an expression inside
# the `if` body is evaluated and discarded, so the preview is placed here instead.
attributes_df.head() if attr_q.qsize() > 0 else None

0 1


In [None]:
# Start from fresh, empty queues so earlier results do not mix into the next run.
attr_q, missing_q = queue.Queue(), queue.Queue()

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so this cell raises
# the NameError shown in its output. Presumably a deliberate tripwire to halt "Run All"
# before the long parallel scrape below — confirm.
stop

NameError: name 'stop' is not defined

In [None]:
# Bind the two result queues so that `parallel` only has to supply the photo id.
scrape_photo = partial(get_attributes, attr_q=attr_q, missing_q=missing_q)
parallel(scrape_photo, arr=img_ids)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
# (number of scraped photos, number of ids recorded as missing)
attr_q.qsize(), missing_q.qsize()

(130, 870)

In [None]:
# Directory that receives the files written below; created (with parents) if it does not exist.
output_dir = Path("/mnt/980_1TB_2/Datasets/Pexels_New/")
output_dir.mkdir(exist_ok=True, parents=True)
output_dir

PosixPath('/mnt/980_1TB_2/Datasets/Pexels_New')

In [None]:
# Text file that will list the ids for which no attributes were obtained.
missing_ids_file_path = output_dir / "missing_img_ids-new.txt"
missing_ids_file_path

PosixPath('/mnt/980_1TB_2/Datasets/Pexels_New/missing_img_ids-new.txt')

In [None]:
# Write one missing id per line, overwriting any previous file.
with open(missing_ids_file_path, "w") as write_file:
    write_file.writelines(f"{img_id}\n" for img_id in list(missing_q.queue))

In [None]:
# JSON file that will hold the combined attribute table.
attributes_df_json_file = output_dir / 'attributes_df-new.json'
attributes_df_json_file

PosixPath('/mnt/980_1TB_2/Datasets/Pexels_New/attributes_df-new.json')

In [None]:
# Stack the per-photo one-row frames collected in attr_q into a single table.
scraped_frames = list(attr_q.queue)
attributes_df = pd.concat(scraped_frames)
attributes_df.head()

Unnamed: 0_level_0,title,aspect_ratio,main_color,colors,tags,adult,aperture,camera,focal_length,google_place_id,iso,latitude,longitude,manufacturer,medical,orientation,racy,shutter_speed,software,spoof,violence,location
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
15508762,January Sunset,1.333333,"[138, 98, 135]",[],[],very_unlikely,2.8,SM-A037F,3.38,,2000,,,samsung,very_unlikely,1,very_unlikely,0.069,,very_unlikely,very_unlikely,
15508755,Indian Mangoes,1.333333,"[104, 113, 84]",[],[],very_unlikely,2.8,SM-A037F,3.38,,125,,,samsung,very_unlikely,1,very_unlikely,0.003,,very_unlikely,very_unlikely,
15508756,Necklace,0.748016,"[133, 121, 107]",[],[],very_unlikely,2.2,Redmi 7A,3.83,,118,,,Xiaomi,very_unlikely,0,unlikely,0.02,pine-user 9 PKQ1.190319.001 V11.0.5.0.PCMINXM release-keys,very_unlikely,very_unlikely,
15508747,Gold necklace with earrings,0.748016,"[150, 141, 124]",[],[],very_unlikely,2.2,Redmi 7A,3.83,,100,,,Xiaomi,very_unlikely,0,unlikely,0.0058479532163742,pine-user 9 PKQ1.190319.001 V11.0.5.0.PCMINXM release-keys,very_unlikely,very_unlikely,
15508746,Little bee,1.333333,"[144, 131, 61]",[],"[animall, bee, grass, honeybees, insect, insect photography, naturephoto, naturephotography, springtime]",very_unlikely,5.4,COOLPIX L820,37.1,,250,,,NIKON,unlikely,1,very_unlikely,0.004,COOLPIX L820V1.0,unlikely,very_unlikely,


In [None]:
# Persist the combined attribute table as JSON at the path defined above.
attributes_df.to_json(path_or_buf=attributes_df_json_file)