# Imports

In [1]:
import os
from os import listdir
from os.path import isfile, join

import requests
from PIL import Image

import pandas as pd

# Load Master CSV

The CSV loaded below is the output of the previous script (the scraping step).

In [2]:
# Location of the scraped data, expressed relative to the notebook's working
# directory. The CSV loaded in the next cell is read from this directory.
# NOTE(review): the recorded output below shows '.../images_v2/', which does not
# match this value -- the cell was presumably edited after its last run; confirm
# which directory actually holds the scraping CSV.
image_rel_root_path = '../raw_data/scraped_images/'

# Anchor the relative path at the directory the kernel was started from.
cwd_path = os.getcwd()
image_root_path = os.path.join(cwd_path, image_rel_root_path)

# Display the resolved path as a sanity check.
image_root_path

'/home/lscr/code/2023-q1-wagon/2023-q1-projects/YOGi/notebooks/../raw_data/images_v2/'

In [3]:
# Load the CSV produced by the scraping step: one row per image, including the
# 'success' flag that records whether the download got a response.
# os.path.join is used instead of string concatenation because image_root_path
# already ends with a separator, so concatenation yields a doubled '/'.
df = pd.read_csv(os.path.join(image_root_path, 'yoga-82_scraping.csv'))

# Fail fast if the columns the validation loop relies on are missing.
assert {'class', 'image_name', 'success'} <= set(df.columns), 'missing expected columns'

df.shape

(28450, 4)

In [4]:
# Preview the first rows of the scraping CSV as loaded, before any validation columns are added.
df.head()

Unnamed: 0,class,image_name,url,success
0,Reclining_Hand-to-Big-Toe_Pose_or_Supta_Padang...,657.jpg,http://lesliesaglio.com/wp-content/uploads/201...,0
1,Reclining_Hand-to-Big-Toe_Pose_or_Supta_Padang...,643.jpg,https://s-media-cache-ak0.pinimg.com/736x/36/a...,1
2,Reclining_Hand-to-Big-Toe_Pose_or_Supta_Padang...,640.jpg,http://static.squarespace.com/static/5387efd9e...,1
3,Reclining_Hand-to-Big-Toe_Pose_or_Supta_Padang...,623.jpg,https://www.melissawest.com/wp-content/uploads...,1
4,Reclining_Hand-to-Big-Toe_Pose_or_Supta_Padang...,615.jpg,https://www.melissawest.com/wp-content/uploads...,1


# Check whether a file can be opened and, if so, what format it actually has

There is no point in checking images whose download link never returned a response (`success == 0`), so only rows with `success == 1` are opened.

In [6]:
# Scraped image source params
# NOTE: this rebinds image_rel_root_path / image_root_path, which were already
# set in the path-params cell above. Every later cell (single-image test,
# validation loop, CSV export) reads this new value, so re-running the CSV-load
# cell after this one would look for the CSV under this directory instead.

image_rel_root_path = '../raw_data/images_v2/' # Root directory of the downloaded images, relative to the working directory
image_root_path = os.path.join(cwd_path, image_rel_root_path)  # cwd_path comes from the earlier path-params cell

# Display the resolved path as a sanity check.
image_root_path

'/home/lscr/code/2023-q1-wagon/2023-q1-projects/YOGi/notebooks/../raw_data/images_v2/'

In [7]:
# Smoke test on a single, hardcoded image before running the full validation:
# report 'Broken' if Pillow cannot open the file, otherwise print the detected
# format whenever it is something other than JPEG.
try:
    im = Image.open(f'{image_root_path}/Akarna_Dhanurasana/0_4.jpg')
except IOError:
    print('Broken')
else:
    is_jpeg = im.format == 'JPEG'
    if not is_jpeg:
        print(f'.{im.format}')

In [8]:
# Render the sample image inline. A bare expression uses the image's rich
# notebook repr, so the picture is stored in the notebook output; Image.show()
# instead hands a temporary file to an external viewer, which shows nothing on
# a remote/headless kernel.
im

In [9]:
# Format Pillow detected for the sample image; the validation loop below compares this value against 'JPEG'.
im.format

'JPEG'

# Validation Script

In [10]:
# Add the two validation columns in one step:
#   valid     -- starts as a copy of 'success'; flipped to 0 later for files that cannot be opened
#   extension -- starts as 'broken'; overwritten later with the detected image format
df = df.assign(valid=df['success'], extension='broken')

In [11]:
# Validate every downloaded image by opening it with Pillow.
#
# For each row with success == 1:
#   * file opens      -> 'extension' is set to the format Pillow detected
#   * file won't open -> 'valid' is set to 0 and 'extension' stays 'broken'
# Rows with success != 1 are left untouched (they keep the defaults assigned
# when the 'valid' / 'extension' columns were created).
#
# NOTE: Image.open is lazy -- it identifies the file but does not decode the
# pixel data -- so a file that passes here can still be truncated. Call
# im.verify() inside the `with` block if a stricter check is wanted.
io_count = 0    # downloaded files that could not be opened
file_count = 0  # files that open but whose real format is not JPEG
for i, row in df.iterrows():
    # Nothing was downloaded for this row, so there is no file to inspect.
    if row['success'] != 1:
        continue

    abs_img_path = os.path.join(image_root_path, row['class'], row['image_name'])

    try:
        # The context manager closes the underlying file handle right away;
        # without it one handle per image stays open until garbage collection.
        with Image.open(abs_img_path) as im:
            detected_format = im.format
    except IOError:
        # Unreadable or unidentifiable file: mark the row as invalid.
        df.loc[i, 'valid'] = 0
        df.loc[i, 'extension'] = 'broken'
        io_count += 1
        continue

    # Files are expected to be JPEG; count the ones that are something else.
    if detected_format != 'JPEG':
        file_count += 1

    df.loc[i, 'extension'] = detected_format



In [12]:
# Make sure the 'valid' flag is stored as an integer column so it can be summed.
df['valid'] = df['valid'].astype('int')

In [21]:
# Headline counts: images downloaded, images that also open, and the gap
# between the two (downloaded but unreadable).
success_total = df['success'].sum()
valid_total = df['valid'].sum()
success_total, valid_total, success_total - valid_total

(21735, 19188, 2547)

In [22]:
# Consistency check: io_count (files that failed to open) should equal
# success.sum() - valid.sum() from the previous cell, because 'valid' is only
# ever changed from 1 to 0 when opening a file fails.
# file_count is the number of files that opened but were not JPEG.
file_count, io_count

(1430, 2547)

In [27]:
# Human-readable summary of the validation run.
print(f'{file_count} files are not .jpg and have the incorrect extension.')
print(f'{io_count} files are broken and can not be opened.')
print(f'{df.valid.sum()} out of {df.shape[0]} images are valid.')
# ':.2f' instead of ': 0.2f' -- the space in the old spec is a sign flag that
# printed a stray leading space before the percentage.
print(f'{df.valid.sum() / df.shape[0] * 100:.2f}% images are valid.')

1430 files are not .jpg and have the incorrect extension.
2547 files are broken and can not be opened.
19188 out of 28450 images are valid.
 67.44% images are valid.


In [19]:
# Preview the first rows again, now including the 'valid' and 'extension' columns filled in by the validation loop.
df.head()

Unnamed: 0,class,image_name,url,success,valid,extension
0,Reclining_Hand-to-Big-Toe_Pose_or_Supta_Padang...,657.jpg,http://lesliesaglio.com/wp-content/uploads/201...,0,0,broken
1,Reclining_Hand-to-Big-Toe_Pose_or_Supta_Padang...,643.jpg,https://s-media-cache-ak0.pinimg.com/736x/36/a...,1,1,JPEG
2,Reclining_Hand-to-Big-Toe_Pose_or_Supta_Padang...,640.jpg,http://static.squarespace.com/static/5387efd9e...,1,1,JPEG
3,Reclining_Hand-to-Big-Toe_Pose_or_Supta_Padang...,623.jpg,https://www.melissawest.com/wp-content/uploads...,1,1,JPEG
4,Reclining_Hand-to-Big-Toe_Pose_or_Supta_Padang...,615.jpg,https://www.melissawest.com/wp-content/uploads...,1,1,JPEG


In [17]:
# Class balance: number of valid images per pose class.
# Only rows with valid == 1 are kept, so a class with no valid images does not
# appear in the result at all.
valid_rows = df.loc[df['valid'] == 1, ['class', 'valid']]
valid_rows.groupby('class').count()

Unnamed: 0_level_0,valid
class,Unnamed: 1_level_1
Akarna_Dhanurasana,86
Bharadvaja's_Twist_pose_or_Bharadvajasana_I_,82
Boat_Pose_or_Paripurna_Navasana_,391
Bound_Angle_Pose_or_Baddha_Konasana_,264
Bow_Pose_or_Dhanurasana_,225
...,...
Wide-Legged_Forward_Bend_pose_or_Prasarita_Padottanasana_,259
Wild_Thing_pose_or_Camatkarasana_,178
Wind_Relieving_pose_or_Pawanmuktasana,156
Yogic_sleep_pose,80


In [18]:
# Persist the validated frame next to the images.
# os.path.join is used instead of string concatenation because image_root_path
# already ends with a separator, so concatenation yields a doubled '/'.
# NOTE(review): to_csv writes the row index as an extra unnamed first column by
# default; pass index=False if downstream readers should not see it -- confirm
# against whatever consumes this file before changing.
df.to_csv(os.path.join(image_root_path, 'yoga-82_validated.csv'))