# Imports

In [1]:
import os
from os import listdir
from os.path import isfile, join

import requests

import pandas as pd

# Source Files

In [2]:
# Source Params

cwd_path = os.getcwd()
rel_path = '../raw_data/Yoga-82/yoga_dataset_links'
my_path = os.path.join(cwd_path, rel_path)


files = [f'{my_path}/{f}' for f in listdir(my_path) if f.endswith('.txt')]

In [3]:
my_path

'/home/lscr/code/2023-q1-wagon/2023-q1-projects/YOGi/notebooks/../raw_data/Yoga-82/yoga_dataset_links'

In [4]:
for file in files:
    if ' ' in file:
        print(file)

/home/lscr/code/2023-q1-wagon/2023-q1-projects/YOGi/notebooks/../raw_data/Yoga-82/yoga_dataset_links/Sitting pose 1 (normal).txt
/home/lscr/code/2023-q1-wagon/2023-q1-projects/YOGi/notebooks/../raw_data/Yoga-82/yoga_dataset_links/Split pose.txt


In [5]:
# files[75:]

# Scraping

In [6]:
# Write Params

rel_save_path = '../raw_data/images_v2'
abs_save_root_path = os.path.join(cwd_path, rel_save_path)

TIMEOUT = 10

scrape_dict = {
    'class': [],
    'image_name': [],
    'url': [],
    'success': []
}

In [None]:
for i, file in enumerate(files, 1):
    pose_class = file.strip('.txt').split('/')[-1]
    print(f'Scraping {pose_class} - file {i} of {len(files)}')
    class_save_dir = os.path.join(abs_save_root_path, pose_class)
    
    if not os.path.exists(class_save_dir):
        os.makedirs(class_save_dir)
    
    # Open text file with links
    with open(file, 'r') as f:
        lines = [l.strip().split('\t') for l in f]
        
        fail_count = 0
        # Iterate through lines in file
        for ii, line in enumerate(lines, 1):
            # line[0] == 'class_dir/img.jpg'
            # line[1] == img_url
            image_save_path = os.path.join(abs_save_root_path, line[0])
            
            # http request
            try: # try-except for when a domain no longer exists
                response = requests.get(line[1], timeout=TIMEOUT)

                # success  
                if response.status_code == 200:
                    with open(image_save_path, 'wb') as f:
                        f.write(response.content)
                    # write to dict for info
                    scrape_dict['class'].append(pose_class)
                    scrape_dict['image_name'].append(line[0].split('/')[1])
                    scrape_dict['url'].append(line[1])
                    scrape_dict['success'].append(1)

                # failure
                else:
                    scrape_dict['class'].append(pose_class)
                    scrape_dict['image_name'].append(line[0].split('/')[1])
                    scrape_dict['url'].append(line[1])
                    scrape_dict['success'].append(0)
                    fail_count += 1
            
            # failure
            except: 
                scrape_dict['class'].append(pose_class)
                scrape_dict['image_name'].append(line[0].split('/')[1])
                scrape_dict['url'].append(line[1])
                scrape_dict['success'].append(0)
                fail_count += 1
                    
            # info
            if ii % 50 == 0:
                print(f'Attempted {ii} of {len(lines)} images in class {i}.')
            if ii == len(lines):
                print(f'Attempted to scrape {ii} images with {fail_count} broken links.')

print('Completed scraping!')

Scraping Reclining_Hand-to-Big-Toe_Pose_or_Supta_Padangusthasana_ - file 1 of 82
Attempted 50 of 265 images in class 1.
Attempted 100 of 265 images in class 1.
Attempted 150 of 265 images in class 1.
Attempted 200 of 265 images in class 1.
Attempted 250 of 265 images in class 1.
Attempted to scrape 265 images with 94 broken links.
Scraping Extended_Revolved_Triangle_Pose_or_Utthita_Trikonasana_ - file 2 of 82
Attempted 50 of 768 images in class 2.
Attempted 100 of 768 images in class 2.
Attempted 150 of 768 images in class 2.
Attempted 200 of 768 images in class 2.
Attempted 250 of 768 images in class 2.
Attempted 300 of 768 images in class 2.
Attempted 350 of 768 images in class 2.
Attempted 400 of 768 images in class 2.
Attempted 450 of 768 images in class 2.
Attempted 500 of 768 images in class 2.
Attempted 550 of 768 images in class 2.
Attempted 600 of 768 images in class 2.
Attempted 650 of 768 images in class 2.
Attempted 700 of 768 images in class 2.
Attempted 750 of 768 images 

In [None]:
df = pd.DataFrame(scrape_dict)

In [None]:
df.to_csv(f'{abs_save_root_path}/yoga-82_scraping.csv', index=False)

In [None]:
df.shape

In [None]:
num_success = df.success.sum()
per_success = df.success.sum() / df.shape[0]

print(f'Successfully scraped {num_success} images out of a total {df.shape[0]} images.{per_success * 100: 0.2f}% success rate.')

# Checking Each Image that has been downloaded

In [None]:
from PIL import Image

In [None]:
try:
    im = Image.open(f'{abs_save_root_path}/Akarna_Dhanurasana/0_151.jpg')
except IOError:
    print('Broken')

In [None]:
im.format

In [None]:
im.show()