In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm import trange, tqdm
from time import sleep
import json

def read_jsonl(filename):
    """Read a JSON Lines file and return its records as a list.

    Args:
        filename: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. an extra trailing newline) carry no record;
            # feeding them to json.loads would raise.
            if not line:
                continue
            data.append(json.loads(line))
    return data

def dict_to_jsonl(dicts, filename):
    """Write every item of ``dicts`` to ``filename`` as one JSON document per line.

    The file is overwritten, encoded as UTF-8, and non-ASCII characters are
    written verbatim instead of being escaped.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in dicts
        )
            
def check_valid_pic_url(url):
    """Tell whether ``url`` looks like a direct link to a JPEG/PNG picture.

    The check is purely textual and case-insensitive: the URL must start with
    ``http://`` or ``https://`` and end with ``.jpg``, ``.jpeg`` or ``.png``.

    Args:
        url: The candidate URL; ``None`` and the empty string are accepted.

    Returns:
        ``True`` or ``False`` (always a real bool, also for empty input).
    """
    if not url:
        return False
    lowered = url.lower()
    # str.startswith / str.endswith accept a tuple of alternatives.
    return lowered.startswith(('http://', 'https://')) and \
        lowered.endswith(('.jpg', '.jpeg', '.png'))
        
def get_soup(url, timeout=10) -> BeautifulSoup:
    """Download ``url`` and parse the response body with ``html.parser``.

    The request carries the ``over18=1`` cookie (presumably so the site's
    age-confirmation page is skipped - confirm against the target site).
    The HTTP status is not checked; whatever body comes back is parsed.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built from the response text.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    response = requests.get(url, headers={'cookie': 'over18=1;'}, timeout=timeout)
    return BeautifulSoup(response.text, "html.parser")

In [2]:
def convert_push_to_int(push_text):
    """Translate the text of a push counter into a number.

    Mapping:
        * decimal digits -> that integer
        * ``'爆'``        -> 100
        * ``'XX'``        -> -100
        * ``'X'`` followed by a digit ``n`` -> ``-10 * n``
        * anything else (including the empty string and a lone ``'X'``) -> 0

    Args:
        push_text: The counter text as a string.

    Returns:
        The integer score described above.
    """
    # isdecimal() only accepts characters that int() can actually parse.
    if push_text.isdecimal():
        return int(push_text)
    if push_text == '爆':
        return 100
    if push_text == 'XX':
        return -100
    # Slicing keeps this safe for a lone 'X' (no second character) and for
    # 'X' followed by a non-digit, both of which used to raise.
    if push_text.startswith('X') and push_text[1:2].isdecimal():
        return -10 * int(push_text[1])
    return 0

# Inclusive range of board index pages to crawl.
page_range = (3656, 3944)
base_url = 'https://www.ptt.cc'

# Articles are split by push score: strictly more than 35 counts as popular.
popular = []
not_popular = []

for page_num in trange(page_range[0], page_range[1] + 1):
    # Build the index URL from base_url so the host is defined in one place.
    page_url = f'{base_url}/bbs/Beauty/index{page_num}.html'
    soup = get_soup(page_url)
    for article in soup.find_all('div', 'r-ent'):
        title_div = article.find('div', 'title')
        title = title_div.text.strip()
        date = article.find('div', 'date').text.strip()

        push_text = x.text if (x := article.find('div', 'nrec').span) else ''
        push = convert_push_to_int(push_text)

        # 12/31 -> 1231, 1/01 -> 0101
        date = date.replace('/', '')
        date = date if len(date) == 4 else '0' + date

        # Trim the edges of the range: skip 12/31 entries on the first page
        # and stop at the first 01/01 entry on the last page.
        if page_num == page_range[0] and date == '1231':
            continue
        elif page_num == page_range[1] and date == '0101':
            break

        # Announcements and deleted posts are not collected.
        if title.startswith(('[公告]', 'Fw:[公告]', '(本文已被刪除)', '(已被')):
            continue

        # An entry without a link cannot be followed. Resolving the link only
        # after the filters (and guarding it) avoids subscripting None when
        # the title div has no <a> child.
        link = title_div.a
        if link is None or not link.get('href'):
            continue

        entry = {
            'date': date,
            'title': title,
            'url': base_url + link['href'],
            'push': push_text
        }
        if push > 35:
            popular.append(entry)
        else:
            not_popular.append(entry)

  0%|          | 0/289 [00:00<?, ?it/s]

100%|██████████| 289/289 [02:03<00:00,  2.33it/s]


In [3]:
def get_all_pic_urls(articles):
    """Collect the picture links found on every article page.

    Each item of ``articles`` must provide a ``'url'`` key. The page is
    fetched with ``get_soup`` and every ``<a href>`` accepted by
    ``check_valid_pic_url`` is kept. Duplicates are not removed.

    Returns:
        A flat list of picture URLs in the order they were encountered.
    """
    collected = []
    for entry in tqdm(articles):
        page = get_soup(entry['url'])
        for anchor in page.find_all('a'):
            href = anchor.get('href')
            if href and check_valid_pic_url(href):
                collected.append(href)
        # short pause between article requests
        sleep(0.1)

    return collected

In [4]:
# Pair each article group with the picture links found in its articles.
# The popular group is crawled first, then the not-popular one.
result = {
    group_name: {
        'articles': group_articles,
        'image_urls': get_all_pic_urls(group_articles)
    }
    for group_name, group_articles in (
        ('popular', popular),
        ('not_popular', not_popular),
    )
}

100%|██████████| 1036/1036 [09:41<00:00,  1.78it/s] 
100%|██████████| 4648/4648 [39:21<00:00,  1.97it/s]  


In [5]:
# delete duplicate urls
# dict.fromkeys keeps the first occurrence of every URL in its original
# position, so the saved list is identical from run to run (a set would
# reorder the URLs differently on every interpreter start).
for group in ('popular', 'not_popular'):
    result[group]['image_urls'] = list(dict.fromkeys(result[group]['image_urls']))

In [6]:
# Save the crawl result as indented, UTF-8 encoded JSON (non-ASCII kept as-is).
with open('pic_urls.json', mode='w', encoding='utf-8') as json_file:
    json.dump(result, json_file, indent=4, ensure_ascii=False)

In [7]:
# Restore the crawl result from the JSON file written by the previous step.
with open('pic_urls.json', encoding='utf-8') as json_file:
    result = json.loads(json_file.read())

In [8]:
# parallel downloader
import os
import requests
from concurrent.futures import ThreadPoolExecutor
from typing import List
import shutil

# drop the characters that are not allowed in file names: \ / : * ? " < > |
def remove_invalid_char(name):
    """Return ``name`` with all file-name-invalid characters deleted.

    The characters are removed, not substituted, so the result can be
    shorter than the input.
    """
    deletion_table = str.maketrans('', '', '\\/:*?\"<>|')
    return name.translate(deletion_table)

# Running total of downloads that raised an exception, across all calls.
error_count = 0


def download_images(urls: List[str], path: str, timeout: float = 30):
    """Download every URL in ``urls`` into the directory ``path`` in parallel.

    The file name is the last path segment of the URL with invalid characters
    removed. A downloaded file is deleted again when it is byte-identical to
    ``./not_exist.jpg`` (the placeholder for removed pictures) or when it is
    4096 bytes or smaller. Failures are printed and added to the module-level
    ``error_count``; they never abort the batch.

    NOTE(review): URLs that share the same last path segment write to the
    same file, so later downloads overwrite earlier ones.

    Args:
        urls: Picture URLs to fetch.
        path: Existing directory that receives the files.
        timeout: Seconds to wait on the server before a request is abandoned.

    Returns:
        None.
    """
    global error_count

    # Read the placeholder once instead of once per download. When it is
    # missing the comparison is skipped rather than failing every download.
    try:
        with open('./not_exist.jpg', 'rb') as placeholder_file:
            not_exist = placeholder_file.read()
    except OSError:
        not_exist = None

    def download(url):
        """Fetch one URL; return True when the attempt raised an error."""
        try:
            filename = remove_invalid_char(url.split('/')[-1])
            file_url = os.path.join(path, filename)

            # The context manager releases the connection even on failure.
            with requests.get(url, stream=True, timeout=timeout,
                              headers={'User-Agent': 'Mozilla/5.0'}) as response:
                # Do not store HTTP error pages as if they were pictures.
                response.raise_for_status()
                with open(file_url, 'wb') as out_file:
                    shutil.copyfileobj(response.raw, out_file)

            # delete if the downloaded image is the same as the not_exist image
            if not_exist is not None:
                with open(file_url, 'rb') as downloaded_file:
                    downloaded = downloaded_file.read()
                if not_exist == downloaded:
                    os.remove(file_url)
                    print(f'Not exist, url: {url}')
                    return False

            # check file size
            if os.path.getsize(file_url) <= 4096:
                os.remove(file_url)
                print(f'File too small, url: {url}')
                return False
        except Exception as e:
            print(f'Error: {e}, url: {url}')
            return True

        # delay to prevent from being banned
        sleep(10)
        return False

    max_workers = os.cpu_count() or 1

    # Leaving the with-block waits for every worker, so no explicit shutdown
    # is needed. Failures are summed here, in the calling thread, so the
    # shared counter is never updated concurrently.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        error_count += sum(executor.map(download, urls))

In [9]:
# Make sure both output folders exist before any download starts.
for out_dir in ('./popular_1', './not_popular_1'):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

# Popular pictures are downloaded first, then the not-popular ones.
for group, out_dir in (('popular', './popular_1'),
                       ('not_popular', './not_popular_1')):
    download_images(result[group]['image_urls'], out_dir)

Not exist, url: https://i.imgur.com/HyK2SD2.jpg
Not exist, url: https://i.imgur.com/IVQUDQC.jpg
Not exist, url: https://i.imgur.com/H9mIvDc.jpg
Not exist, url: https://i.imgur.com/mhXbMw4.jpg
File too small, url: https://d.img.vision/dddshay/010.jpg
Not exist, url: https://i.imgur.com/8l6XHPa.jpg
Not exist, url: https://i.imgur.com/BpcVHZR.jpg
Not exist, url: https://i.imgur.com/l39RL16.jpg
Not exist, url: https://i.imgur.com/UhNomUM.jpg
Not exist, url: https://i.imgur.com/VyXJrH7.jpg
Not exist, url: https://i.imgur.com/5Ojl5tU.jpg
Not exist, url: https://i.imgur.com/t56shvY.jpg
Not exist, url: https://i.imgur.com/1eP9qU6.jpg
Not exist, url: http://i.imgur.com/TPVZWn5.jpg
Not exist, url: https://i.imgur.com/JdCoCjB.jpg
Not exist, url: https://i.imgur.com/z0bj6Y8.jpg
Not exist, url: https://i.imgur.com/Op89Xwh.jpg
Not exist, url: https://i.imgur.com/pdVw9ra.jpg
Not exist, url: https://i.imgur.com/6VN2bgd.jpg
Not exist, url: https://i.imgur.com/lcAbgz2.jpg
Not exist, url: https://i.imgur

KeyboardInterrupt: 

In [11]:
# check if every image is valid
import os
from PIL import Image

def is_valid_image(img_path):
    """Return True when the file at ``img_path`` passes Pillow's ``verify()``.

    Any exception raised while opening or verifying the file is printed and
    reported as ``False``.

    Args:
        img_path: Path of the file to check.

    Returns:
        ``True`` if the file opened and verified without error, else ``False``.
    """
    try:
        # The context manager closes the underlying file right away, so no
        # handle is left open when the caller deletes an invalid file.
        with Image.open(img_path) as img:
            img.verify()
        return True
    except Exception as e:
        print(f'Error: {e}, img_path: {img_path}')
        return False
    
img_dirs = ['./popular_1', './not_popular_1']

# Walk both download folders and delete every file that fails validation.
for img_dir in img_dirs:
    for entry_name in os.listdir(img_dir):
        candidate = f'{img_dir}/{entry_name}'
        # sub-directories are left untouched; only plain entries are checked
        if not os.path.isdir(candidate) and not is_valid_image(candidate):
            os.remove(candidate)
            print(f'{candidate} is removed')

Error: cannot identify image file './popular_1/NPSZZ6kr.jpg', img_path: ./popular_1/NPSZZ6kr.jpg
./popular_1/NPSZZ6kr.jpg is removed
Error: cannot identify image file './popular_1/uJKZ8UG.jpg', img_path: ./popular_1/uJKZ8UG.jpg
./popular_1/uJKZ8UG.jpg is removed
Error: cannot identify image file './not_popular_1/013.jpg', img_path: ./not_popular_1/013.jpg
./not_popular_1/013.jpg is removed
Error: cannot identify image file './not_popular_1/cBQIIKW.jpg', img_path: ./not_popular_1/cBQIIKW.jpg
./not_popular_1/cBQIIKW.jpg is removed
Error: cannot identify image file './not_popular_1/7wxcMIA.jpg', img_path: ./not_popular_1/7wxcMIA.jpg
./not_popular_1/7wxcMIA.jpg is removed
Error: cannot identify image file './not_popular_1/urlelBY.jpg', img_path: ./not_popular_1/urlelBY.jpg
./not_popular_1/urlelBY.jpg is removed
Error: cannot identify image file './not_popular_1/003.jpg', img_path: ./not_popular_1/003.jpg
./not_popular_1/003.jpg is removed
Error: cannot identify image file './not_popular_1/

In [13]:
from torchvision import transforms
from torch.utils.data import Dataset
from PIL import Image
import numpy as np
import os

class CustomDataset(Dataset):
    """Image dataset built from folders of popular and not-popular pictures.

    Files found directly inside ``popular_dirs`` get label 1, files found
    directly inside ``not_popular_dirs`` get label 0. Sub-directories are
    ignored. Every sample is converted to RGB, turned into a tensor, resized
    to 224x224 and normalised with the mean/std given below.
    """

    def __init__(self, popular_dirs, not_popular_dirs):
        """Index the picture files of both groups.

        Args:
            popular_dirs: Iterable of directories holding label-1 pictures.
            not_popular_dirs: Iterable of directories holding label-0 pictures.
        """
        popular_pic_path = self._list_files(popular_dirs)
        not_popular_pic_path = self._list_files(not_popular_dirs)
        # pic_dirs holds file paths (not directories); labels is aligned with it.
        self.pic_dirs = popular_pic_path + not_popular_pic_path
        self.labels = [1] * len(popular_pic_path) + [0] * len(not_popular_pic_path)

        self.trans = transforms.Compose([transforms.ToTensor(),
                            transforms.Resize((224, 224)),
                            transforms.Normalize(mean = (0.485, 0.456, 0.406),
                                                 std = (0.229, 0.224, 0.225))
                            ])

    @staticmethod
    def _list_files(dirs):
        """Return ``'<dir>/<name>'`` for every non-directory entry of ``dirs``.

        Entries are sorted per directory because ``os.listdir`` returns them
        in arbitrary order; sorting keeps sample indices reproducible.
        """
        return [
            f'{directory}/{name}'
            for directory in dirs
            for name in sorted(os.listdir(directory))
            if not os.path.isdir(f'{directory}/{name}')
        ]

    def __len__(self):
        """Number of indexed picture files."""
        return len(self.pic_dirs)

    def __getitem__(self, idx):
        """Return ``(image_tensor, label)`` for the sample at ``idx``."""
        # convert() loads the pixel data, so the file can be closed right
        # after it instead of staying open until garbage collection.
        with Image.open(self.pic_dirs[idx]) as raw:
            rgb = raw.convert('RGB')
        img = self.trans(rgb)

        label = self.labels[idx]
        return img, label

In [14]:
import h5py
from tqdm import trange
def create_hdf5_dataset(dataset, output_file):
    """Materialise ``dataset`` into an HDF5 file with ``images`` and ``labels``.

    Each sample ``dataset[i]`` must yield ``(image, label)`` where ``image``
    offers ``.numpy()`` and fits a ``(3, 224, 224)`` float32 slot. Samples
    that raise while being loaded are reported and left out: the remaining
    samples are stored contiguously and both datasets are trimmed to the
    number actually written, so the file never contains an all-zero
    placeholder row with label 0 for a broken picture.

    Args:
        dataset: Indexable object supporting ``len()``.
        output_file: Path of the HDF5 file to create (overwritten).

    Returns:
        None.
    """
    total = len(dataset)
    with h5py.File(output_file, 'w') as f:
        # maxshape makes the datasets resizable so they can be trimmed below;
        # one chunk per image keeps single-sample reads cheap.
        images_dataset = f.create_dataset('images', shape=(total, 3, 224, 224),
                                          maxshape=(None, 3, 224, 224),
                                          chunks=(1, 3, 224, 224), dtype='float32')
        labels_dataset = f.create_dataset('labels', shape=(total,),
                                          maxshape=(None,), dtype='int32')

        written = 0
        for i in trange(total):
            try:
                image, label = dataset[i]
                images_dataset[written] = image.numpy()
                labels_dataset[written] = label
            except Exception as e:
                print(f'Error: {e}, idx: {i}')
                # pic_dirs only exists on datasets that expose their paths
                paths = getattr(dataset, 'pic_dirs', None)
                if paths is not None:
                    print(f'img_path: {paths[i]}')
                continue
            # advance only after both writes succeeded
            written += 1

        if written < total:
            images_dataset.resize(written, axis=0)
            labels_dataset.resize(written, axis=0)
            print(f'Skipped {total - written} unreadable samples, stored {written}')

In [15]:
# Index the downloaded pictures (popular -> label 1, not popular -> label 0)
# and store the transformed samples in a single HDF5 file.
dataset = CustomDataset(popular_dirs=['./popular_1'],
                        not_popular_dirs=['./not_popular_1'])

create_hdf5_dataset(dataset=dataset, output_file='custom_dataset.hdf5')

 21%|██        | 12295/58770 [12:58<34:17, 22.59it/s]  

Error: broken data stream when reading image file, idx: 12291
img_path: ./popular_1/2068.jpg


 32%|███▏      | 18678/58770 [19:33<27:15, 24.51it/s]  

Error: image file is truncated (4 bytes not processed), idx: 18678
img_path: ./not_popular_1/image014.jpg


100%|██████████| 58770/58770 [1:01:41<00:00, 15.88it/s]


In [16]:
import torch
class HDF5Dataset(Dataset):
    """Dataset view over an HDF5 file exposing ``images`` and ``labels``.

    The file is opened read-only in the constructor and stays open for the
    lifetime of the object.
    """

    def __init__(self, file_path):
        handle = h5py.File(file_path, 'r')
        self.file = handle
        self.images = handle['images']
        self.labels = handle['labels']
        self.length = len(self.images)

    def __len__(self):
        """Number of stored images."""
        return self.length

    def __getitem__(self, idx):
        """Return ``(image, one_hot_label)`` as PyTorch tensors."""
        # load the stored array and convert it to a PyTorch tensor
        image = torch.tensor(self.images[idx])
        # one-hot encode the class index: 0 -> [1, 0], 1 -> [0, 1]
        one_hot = np.eye(2)[self.labels[idx]]
        return image, torch.tensor(one_hot, dtype=torch.float32)

# Iterate over the dataset with a DataLoader
hdf5_dataset = HDF5Dataset('./custom_dataset.hdf5')