In [None]:

def get_production_info(section_element):
    """Extract production metadata from the ``<h4>`` heading of a section.

    The heading is read as a ``<span>`` holding the production type, an
    ``<a>`` holding the title, and an optional trailing text node holding the
    release years, e.g. ``(2010-2015)``, ``(2019-)`` or ``(1999)``.

    Args:
        section_element: Parsed element exposing ``find('h4')``; the heading
            must have ``span`` and ``a`` children.

    Returns:
        dict with keys ``type``, ``title``, ``start_year``, ``end_year``,
        ``ongoing`` and an empty ``media_list``. Years are 4-digit strings,
        or None when the heading carries no recognisable year.

    Raises:
        AttributeError: if the ``<h4>``, its ``<span>`` or its ``<a>`` is
            missing.
    """
    import re  # local import: this cell has no import statements of its own

    h4_element = section_element.find('h4')
    production_type = (
        h4_element.span.text.strip().lower().replace(":", "").capitalize()
    )
    title = h4_element.a.text.strip()

    # Only a trailing *text* node can hold the years. When the heading ends
    # with an element instead, there is no release info to parse.
    trailing = h4_element.contents[-1] if h4_element.contents else ""
    release_info = trailing.strip().strip('()') if isinstance(trailing, str) else ""

    start_year = None
    end_year = None
    ongoing = False

    # The range form is tested first because "YYYY-" is a prefix of it.
    range_match = re.search(r'(\d{4})-(\d{4})', release_info)  # "YYYY-YYYY"
    if range_match:
        start_year, end_year = range_match.groups()
    else:
        ongoing_match = re.search(r'(\d{4})-', release_info)  # "YYYY-"
        if ongoing_match:
            start_year = ongoing_match.group(1)
            ongoing = True
        else:
            year_match = re.search(r'^(\d{4})$', release_info)  # "YYYY"
            if year_match:
                start_year = year_match.group(1)

    return {
        "type": production_type,
        "title": title,
        "start_year": start_year,
        "end_year": end_year,
        "ongoing": ongoing,
        "media_list": []
    }


def get_media_info(next_sibling):
    """Describe the video or picture thumbnail found inside one element.

    Args:
        next_sibling: Parsed element to inspect; a falsy value yields None.

    Returns:
        dict with ``link_to_media``, ``eid``, ``image_source`` and ``type``
        (``"video"`` or ``"picture"``), or None when the argument is falsy or
        contains neither an ``a.video`` nor an ``a.picture`` anchor. ``eid``
        is always None for pictures; ``image_source`` is None when the anchor
        has no ``<img>``.
    """
    if not next_sibling:
        return None

    # A video anchor takes precedence over a picture anchor.
    for media_type in ('video', 'picture'):
        anchor = next_sibling.find('a', class_=media_type)
        if anchor:
            break
    else:
        return None

    # Read from the anchor that matched the class, not from whichever <a>
    # happens to come first in the element.
    thumbnail = anchor.find('img')
    return {
        "link_to_media": anchor.get('href'),
        "eid": anchor.get('eid') if media_type == 'video' else None,
        "image_source": thumbnail.get('src') if thumbnail is not None else None,
        "type": media_type
    }

In [None]:
import json
import os
import json
import re
import requests
from bs4 import BeautifulSoup

# NOTE(review): neither name is referenced anywhere in this cell; they look
# like leftovers copied from the crawl cells -- confirm before removing.
start_index = 1
end_index = 50


def get_production_info(production):
    """Extract production metadata from the ``<h4>`` heading of a section.

    The heading is read as a ``<span>`` holding the production type, an
    ``<a>`` holding the title, and an optional trailing text node holding the
    release years, e.g. ``(2010-2015)``, ``(2019-)`` or ``(1999)``.

    Args:
        production: Parsed ``<section>`` element whose ``<h4>`` has ``span``
            and ``a`` children.

    Returns:
        dict with keys ``type``, ``title``, ``start_year``, ``end_year``,
        ``ongoing`` and an empty ``media_list``. Years are 4-digit strings,
        or None when the heading carries no recognisable year.

    Raises:
        AttributeError: if the ``<h4>``, its ``<span>`` or its ``<a>`` is
            missing.
    """
    h4_element = production.find('h4')
    production_type = (
        h4_element.span.text.strip().lower().replace(":", "").capitalize()
    )
    title = h4_element.a.text.strip()

    # Only a trailing *text* node can hold the years. When the heading ends
    # with an element instead, there is no release info to parse.
    trailing = h4_element.contents[-1] if h4_element.contents else ""
    release_info = trailing.strip().strip('()') if isinstance(trailing, str) else ""

    start_year = None
    end_year = None
    ongoing = False

    # The range form is tested first because "YYYY-" is a prefix of it.
    range_match = re.search(r'(\d{4})-(\d{4})', release_info)  # "YYYY-YYYY"
    if range_match:
        start_year, end_year = range_match.groups()
    else:
        ongoing_match = re.search(r'(\d{4})-', release_info)  # "YYYY-"
        if ongoing_match:
            start_year = ongoing_match.group(1)
            ongoing = True
        else:
            year_match = re.search(r'^(\d{4})$', release_info)  # "YYYY"
            if year_match:
                start_year = year_match.group(1)

    return {
        "type": production_type,
        "title": title,
        "start_year": start_year,
        "end_year": end_year,
        "ongoing": ongoing,
        "media_list": []
    }


def get_media_info(media):
    """Describe the video or picture thumbnail inside one media tile.

    Args:
        media: Parsed tile element; a falsy value yields None.

    Returns:
        dict with ``link_to_media``, ``eid``, ``image_source`` and ``type``
        (``"video"`` or ``"picture"``), or None when ``media`` is falsy or
        contains neither an ``a.video`` nor an ``a.picture`` anchor. ``eid``
        is always None for pictures; ``image_source`` is None when the anchor
        has no ``<img>``.
    """
    if not media:
        return None

    # A video anchor takes precedence over a picture anchor.
    for media_type in ('video', 'picture'):
        anchor = media.find('a', class_=media_type)
        if anchor:
            break
    else:
        return None

    # Read from the anchor that matched the class, not from whichever <a>
    # happens to come first in the tile.
    thumbnail = anchor.find('img')
    return {
        "link_to_media": anchor.get('href'),
        "eid": anchor.get('eid') if media_type == 'video' else None,
        "image_source": thumbnail.get('src') if thumbnail is not None else None,
        "type": media_type
    }


def get_actress_portfolio(url, view_count, timeout=30):
    """Download one profile page and collect its header fields and media.

    Args:
        url: Address of the profile page.
        view_count: Stored unchanged under the ``view_count`` key.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the notebook indefinitely.

    Returns:
        dict with ``actress_name``, ``birthplace`` ("" when the banner link is
        absent), ``hashtag``, ``star_rating``, ``celeb_img_url``,
        ``view_count``, ``url`` and ``production_media_list`` (one entry per
        ``<section>``, as built by ``get_production_info``).

    Raises:
        requests.RequestException: on connection failure, timeout or an HTTP
            error status.
        AttributeError: when a required page element (``h1``, hashtag span,
            rating span, profile image, section heading) is missing.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The <h1> text is cut at the first '#'.
    actress_name = re.sub(r'#.*', '', soup.find('h1').text.strip()).strip()

    # Birthplace is optional: fall back to "" when the banner or its link is absent.
    banner = soup.find('div', class_='banner-info')
    banner_link = banner.find('a') if banner is not None else None
    birthplace = banner_link.text.strip() if banner_link is not None else ""

    hashtag = soup.find(
        'span', class_='tag-desktop').text.strip().replace("#", "")
    star_rating = soup.find('span', class_='rating-score').text.strip()
    celeb_img_url = soup.find(
        'img', class_="img-circle pull-right img-responsive celeb-img").get('src')

    production_media_list = []
    for production in soup.find_all('section'):
        production_info = get_production_info(production)

        # Media tiles are read from the element right after the section, and
        # only when that element is a <div>.
        media_container = production.find_next_sibling()
        if media_container and media_container.name == 'div':
            production_info["media_list"] = [
                get_media_info(media)
                for media in media_container.find_all(
                    'div', class_='col-lg-3 col-sm-4 col-xs-6 celebs-boxes albuma')
            ]

        production_media_list.append(production_info)

    return {
        "actress_name": actress_name,
        "birthplace": birthplace,
        "hashtag": hashtag,
        "star_rating": star_rating,
        "celeb_img_url": celeb_img_url,
        "view_count": view_count,
        "url": url,
        "production_media_list": production_media_list
    }


# print(get_actress_portfolio('https://www.aznude.com/view/celeb/e/estheracebo.html', view_count=0))

# Read the names from the text file, one per line, without surrounding whitespace.
# NOTE(review): presumably this file is the saved printout of the crawl cell,
# where failed entries carry a trailing "failed" -- confirm.
with open("actress_list.txt", "r") as file:
    names = [name.strip() for name in file]

# Number of entries per listing page used to turn a line number into a
# (page, position) pair.
ENTRIES_PER_PAGE = 22

# Map every failed name to its 0-based position and 0-based page.
names_dict = {}
for index, name in enumerate(names):
    if "failed" in name:
        key = name.replace("failed", "").strip()
        names_dict[key] = {
            "place": (index % ENTRIES_PER_PAGE),
            "page_index": (index // ENTRIES_PER_PAGE)
        }

folder_path = "E:\\VSCode\\aznude\\actress_portfolio"
actr_list_path = "E:\\VSCode\\aznude\\actrss"

json_contents = []

json_files = [file_name for file_name in os.listdir(
    folder_path) if file_name.endswith('.json')]


for file_name in json_files:
    file_path = os.path.join(folder_path, file_name)

    # json_files is already filtered by suffix; only directories need skipping.
    if os.path.isfile(file_path):
        with open(file_path, 'r') as json_file:
            data = json.load(json_file)

        # Each record is indexed as [relative_url, name].
        name = data[1]
        page_index = names_dict[name]["page_index"]
        place = names_dict[name]["place"]
        actrs_file_path = f"E:\\VSCode\\aznude\\actrss\\actress_portfolio_list_{page_index}.json"

        print(name, page_index+1, place)

        actrs_url = "https://aznude.com" + data[0]
        # view_count is a required argument; the record read above holds only
        # a URL and a name, so no count is available here.
        test = get_actress_portfolio(actrs_url, view_count=None)
        json_contents.append(data)

    # Only the first file is handled: the loop exits after one iteration.
    break

In [None]:
# Read the names from the text file, one per line, without surrounding whitespace.
with open("actress_list.txt", "r") as file:
    names = [name.strip() for name in file]

# Number of entries per listing page used to turn a line number into a
# (page, position) pair.
ENTRIES_PER_PAGE = 22

# Map every failed name to its 1-based position and 1-based page.
names_dict = {}
for index, name in enumerate(names):
    if "failed" in name:
        key = name.replace("failed", "").strip()
        page_offset, place_offset = divmod(index, ENTRIES_PER_PAGE)
        names_dict[key] = {
            "place": place_offset + 1,
            # Every page is shifted by one; the previous "0 -> 1, else
            # unchanged" rule gave the first two pages the same number.
            "page_index": page_offset + 1
        }

names_dict

In [None]:
import json
import re
import requests
from bs4 import BeautifulSoup

# NOTE(review): neither name is referenced anywhere in this cell; they look
# like leftovers copied from the crawl cells -- confirm before removing.
start_index = 1
end_index = 50


def get_production_info(production):
    """Extract production metadata from the ``<h4>`` heading of a section.

    The heading is read as a ``<span>`` holding the production type, an
    ``<a>`` holding the title, and an optional trailing text node holding the
    release years, e.g. ``(2010-2015)``, ``(2019-)`` or ``(1999)``.

    Args:
        production: Parsed ``<section>`` element whose ``<h4>`` has ``span``
            and ``a`` children.

    Returns:
        dict with keys ``type``, ``title``, ``start_year``, ``end_year``,
        ``ongoing`` and an empty ``media_list``. Years are 4-digit strings,
        or None when the heading carries no recognisable year.

    Raises:
        AttributeError: if the ``<h4>``, its ``<span>`` or its ``<a>`` is
            missing.
    """
    h4_element = production.find('h4')
    production_type = (
        h4_element.span.text.strip().lower().replace(":", "").capitalize()
    )
    title = h4_element.a.text.strip()

    # Only a trailing *text* node can hold the years. When the heading ends
    # with an element instead, there is no release info to parse.
    trailing = h4_element.contents[-1] if h4_element.contents else ""
    release_info = trailing.strip().strip('()') if isinstance(trailing, str) else ""

    start_year = None
    end_year = None
    ongoing = False

    # The range form is tested first because "YYYY-" is a prefix of it.
    range_match = re.search(r'(\d{4})-(\d{4})', release_info)  # "YYYY-YYYY"
    if range_match:
        start_year, end_year = range_match.groups()
    else:
        ongoing_match = re.search(r'(\d{4})-', release_info)  # "YYYY-"
        if ongoing_match:
            start_year = ongoing_match.group(1)
            ongoing = True
        else:
            year_match = re.search(r'^(\d{4})$', release_info)  # "YYYY"
            if year_match:
                start_year = year_match.group(1)

    return {
        "type": production_type,
        "title": title,
        "start_year": start_year,
        "end_year": end_year,
        "ongoing": ongoing,
        "media_list": []
    }


def get_media_info(media):
    """Describe the video or picture thumbnail inside one media tile.

    Args:
        media: Parsed tile element; a falsy value yields None.

    Returns:
        dict with ``link_to_media``, ``eid``, ``image_source`` and ``type``
        (``"video"`` or ``"picture"``), or None when ``media`` is falsy or
        contains neither an ``a.video`` nor an ``a.picture`` anchor. ``eid``
        is always None for pictures; ``image_source`` is None when the anchor
        has no ``<img>``.
    """
    if not media:
        return None

    # A video anchor takes precedence over a picture anchor.
    for media_type in ('video', 'picture'):
        anchor = media.find('a', class_=media_type)
        if anchor:
            break
    else:
        return None

    # Read from the anchor that matched the class, not from whichever <a>
    # happens to come first in the tile.
    thumbnail = anchor.find('img')
    return {
        "link_to_media": anchor.get('href'),
        "eid": anchor.get('eid') if media_type == 'video' else None,
        "image_source": thumbnail.get('src') if thumbnail is not None else None,
        "type": media_type
    }


def get_actress_portfolio(url, view_count, timeout=30):
    """Download one profile page and collect its header fields and media.

    Args:
        url: Address of the profile page.
        view_count: Stored unchanged under the ``view_count`` key.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the notebook indefinitely.

    Returns:
        dict with ``actress_name``, ``birthplace`` ("" when the banner link is
        absent), ``hashtag``, ``star_rating``, ``celeb_img_url``,
        ``view_count``, ``url`` and ``production_media_list`` (one entry per
        ``<section>``, as built by ``get_production_info``).

    Raises:
        requests.RequestException: on connection failure, timeout or an HTTP
            error status.
        AttributeError: when a required page element (``h1``, hashtag span,
            rating span, profile image, section heading) is missing.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The <h1> text is cut at the first '#'.
    actress_name = re.sub(r'#.*', '', soup.find('h1').text.strip()).strip()

    # Birthplace is optional: fall back to "" when the banner or its link is absent.
    banner = soup.find('div', class_='banner-info')
    banner_link = banner.find('a') if banner is not None else None
    birthplace = banner_link.text.strip() if banner_link is not None else ""

    hashtag = soup.find(
        'span', class_='tag-desktop').text.strip().replace("#", "")
    star_rating = soup.find('span', class_='rating-score').text.strip()
    celeb_img_url = soup.find(
        'img', class_="img-circle pull-right img-responsive celeb-img").get('src')

    production_media_list = []
    for production in soup.find_all('section'):
        production_info = get_production_info(production)

        # Media tiles are read from the element right after the section, and
        # only when that element is a <div>.
        media_container = production.find_next_sibling()
        if media_container and media_container.name == 'div':
            production_info["media_list"] = [
                get_media_info(media)
                for media in media_container.find_all(
                    'div', class_='col-lg-3 col-sm-4 col-xs-6 celebs-boxes albuma')
            ]

        production_media_list.append(production_info)

    return {
        "actress_name": actress_name,
        "birthplace": birthplace,
        "hashtag": hashtag,
        "star_rating": star_rating,
        "celeb_img_url": celeb_img_url,
        "view_count": view_count,
        "url": url,
        "production_media_list": production_media_list
    }


# Scrape a single profile and save it as JSON in the working directory.
anadearmas = get_actress_portfolio(
    url='https://www.aznude.com/view/celeb/a/anadearmas.html',
    view_count=0,
)
json_string = json.dumps(anadearmas)
with open('anadearmas.json', mode='w') as json_file:
    json_file.write(json_string)

In [None]:
import json
import re
import requests
from bs4 import BeautifulSoup

# First and last listing-page numbers to crawl; both ends are included
# because the crawl loop iterates range(start_index, end_index + 1).
start_index = 1
end_index = 50


def get_production_info(production):
    """Extract production metadata from the ``<h4>`` heading of a section.

    The heading is read as a ``<span>`` holding the production type, an
    ``<a>`` holding the title, and an optional trailing text node holding the
    release years, e.g. ``(2010-2015)``, ``(2019-)`` or ``(1999)``.

    Args:
        production: Parsed ``<section>`` element whose ``<h4>`` has ``span``
            and ``a`` children.

    Returns:
        dict with keys ``type``, ``title``, ``start_year``, ``end_year``,
        ``ongoing`` and an empty ``media_list``. Years are 4-digit strings,
        or None when the heading carries no recognisable year.

    Raises:
        AttributeError: if the ``<h4>``, its ``<span>`` or its ``<a>`` is
            missing.
    """
    h4_element = production.find('h4')
    production_type = (
        h4_element.span.text.strip().lower().replace(":", "").capitalize()
    )
    title = h4_element.a.text.strip()

    # Only a trailing *text* node can hold the years. When the heading ends
    # with an element instead, there is no release info to parse.
    trailing = h4_element.contents[-1] if h4_element.contents else ""
    release_info = trailing.strip().strip('()') if isinstance(trailing, str) else ""

    start_year = None
    end_year = None
    ongoing = False

    # The range form is tested first because "YYYY-" is a prefix of it.
    range_match = re.search(r'(\d{4})-(\d{4})', release_info)  # "YYYY-YYYY"
    if range_match:
        start_year, end_year = range_match.groups()
    else:
        ongoing_match = re.search(r'(\d{4})-', release_info)  # "YYYY-"
        if ongoing_match:
            start_year = ongoing_match.group(1)
            ongoing = True
        else:
            year_match = re.search(r'^(\d{4})$', release_info)  # "YYYY"
            if year_match:
                start_year = year_match.group(1)

    return {
        "type": production_type,
        "title": title,
        "start_year": start_year,
        "end_year": end_year,
        "ongoing": ongoing,
        "media_list": []
    }


def get_media_info(media):
    """Describe the video or picture thumbnail inside one media tile.

    Args:
        media: Parsed tile element; a falsy value yields None.

    Returns:
        dict with ``link_to_media``, ``eid``, ``image_source`` and ``type``
        (``"video"`` or ``"picture"``), or None when ``media`` is falsy or
        contains neither an ``a.video`` nor an ``a.picture`` anchor. ``eid``
        is always None for pictures; ``image_source`` is None when the anchor
        has no ``<img>``.
    """
    if not media:
        return None

    # A video anchor takes precedence over a picture anchor.
    for media_type in ('video', 'picture'):
        anchor = media.find('a', class_=media_type)
        if anchor:
            break
    else:
        return None

    # Read from the anchor that matched the class, not from whichever <a>
    # happens to come first in the tile.
    thumbnail = anchor.find('img')
    return {
        "link_to_media": anchor.get('href'),
        "eid": anchor.get('eid') if media_type == 'video' else None,
        "image_source": thumbnail.get('src') if thumbnail is not None else None,
        "type": media_type
    }


def get_actress_portfolio(url, view_count, timeout=30):
    """Download one profile page and collect its header fields and media.

    Args:
        url: Address of the profile page.
        view_count: Stored unchanged under the ``view_count`` key.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the crawl indefinitely.

    Returns:
        dict with ``actress_name``, ``birthplace`` ("" when the banner link is
        absent), ``hashtag``, ``celeb_img_url``, ``view_count``, ``url`` and
        ``production_media_list`` (one entry per ``<section>``, as built by
        ``get_production_info``).

    Raises:
        requests.RequestException: on connection failure, timeout or an HTTP
            error status.
        AttributeError: when a required page element (``h1``, hashtag span,
            profile image, section heading) is missing.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The <h1> text is cut at the first '#'.
    actress_name = re.sub(r'#.*', '', soup.find('h1').text.strip()).strip()

    # Birthplace is optional: fall back to "" when the banner or its link is absent.
    banner = soup.find('div', class_='banner-info')
    banner_link = banner.find('a') if banner is not None else None
    birthplace = banner_link.text.strip() if banner_link is not None else ""

    hashtag = soup.find('span', class_='tag-desktop').text.strip().replace("#", "")
    celeb_img_url = soup.find('img', class_="img-circle pull-right img-responsive celeb-img").get('src')

    production_media_list = []
    for production in soup.find_all('section'):
        production_info = get_production_info(production)

        # Media tiles are read from the element right after the section, and
        # only when that element is a <div>.
        media_container = production.find_next_sibling()
        if media_container and media_container.name == 'div':
            production_info["media_list"] = [
                get_media_info(media)
                for media in media_container.find_all(
                    'div', class_='col-lg-3 col-sm-4 col-xs-6 celebs-boxes albuma')
            ]

        production_media_list.append(production_info)

    return {
        "actress_name": actress_name,
        "birthplace": birthplace,
        "hashtag": hashtag,
        "celeb_img_url": celeb_img_url,
        "view_count": view_count,
        "url": url,
        "production_media_list": production_media_list
    }


# Crawl every listing page in [start_index, end_index] and write one JSON file
# per page with the portfolios that could be scraped. Each failure is also
# written to its own JSON file as [relative_url, name].
for index in range(start_index, end_index+1):
    popular_actress_url = f'https://www.aznude.com/browse/celebs/popular/{index}.html'
    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(popular_actress_url, timeout=30)
    soup_popular_actress = BeautifulSoup(response.text, 'html.parser')
    actress_div_list = soup_popular_actress.find_all(
        'div', class_="col-lg-2 col-md-3 col-sm-4 col-xs-6 story-thumbs celebs-boxes")
    actress_portfolio_list = []
    failed_actress_portfolio_list = []

    for actress in actress_div_list:
        actress_url = actress.find('a').get('href')
        actress_name = actress.find('h4').text.strip()
        try:
            view_count = actress.find('span').text.strip()
            actress_portfolio = get_actress_portfolio(
                url="https://aznude.com" + actress_url, view_count=view_count)
            actress_portfolio_list.append(actress_portfolio)
            print(actress_name)
        # Catch Exception rather than everything, so Ctrl-C (KeyboardInterrupt)
        # still stops the crawl instead of being logged as a failed entry.
        except Exception:
            failed_actress_portfolio_list.append([actress_url, actress_name])
            failed_json_string = json.dumps([actress_url, actress_name])
            with open(f'.\\failed_actress_list\\failed_{index}_{actress_name}.json', 'w') as failed_json_file:
                failed_json_file.write(failed_json_string)
            # Printed text is left exactly as it was: another cell looks for
            # the word "failed" in saved name lines.
            print(actress_name, "failed")

    json_string = json.dumps(actress_portfolio_list)
    with open(f'.\\actress_portfolio\\actress_portfolio_list_{index}.json', 'w') as json_file:
        json_file.write(json_string)

In [None]:
import json
import re
import requests
from bs4 import BeautifulSoup

# First and last listing-page numbers for the loop below, which iterates
# range(start_index, end_index + 1) but breaks after its first pass.
start_index = 1
end_index = 50


def get_production_info(production):
    """Extract production metadata from the ``<h4>`` heading of a section.

    The heading is read as a ``<span>`` holding the production type, an
    ``<a>`` holding the title, and an optional trailing text node holding the
    release years, e.g. ``(2010-2015)``, ``(2019-)`` or ``(1999)``.

    Args:
        production: Parsed ``<section>`` element whose ``<h4>`` has ``span``
            and ``a`` children.

    Returns:
        dict with keys ``type``, ``title``, ``start_year``, ``end_year``,
        ``ongoing`` and an empty ``media_list``. Years are 4-digit strings,
        or None when the heading carries no recognisable year.

    Raises:
        AttributeError: if the ``<h4>``, its ``<span>`` or its ``<a>`` is
            missing.
    """
    h4_element = production.find('h4')
    production_type = (
        h4_element.span.text.strip().lower().replace(":", "").capitalize()
    )
    title = h4_element.a.text.strip()

    # Only a trailing *text* node can hold the years. When the heading ends
    # with an element instead, there is no release info to parse.
    trailing = h4_element.contents[-1] if h4_element.contents else ""
    release_info = trailing.strip().strip('()') if isinstance(trailing, str) else ""

    start_year = None
    end_year = None
    ongoing = False

    # The range form is tested first because "YYYY-" is a prefix of it.
    range_match = re.search(r'(\d{4})-(\d{4})', release_info)  # "YYYY-YYYY"
    if range_match:
        start_year, end_year = range_match.groups()
    else:
        ongoing_match = re.search(r'(\d{4})-', release_info)  # "YYYY-"
        if ongoing_match:
            start_year = ongoing_match.group(1)
            ongoing = True
        else:
            year_match = re.search(r'^(\d{4})$', release_info)  # "YYYY"
            if year_match:
                start_year = year_match.group(1)

    return {
        "type": production_type,
        "title": title,
        "start_year": start_year,
        "end_year": end_year,
        "ongoing": ongoing,
        "media_list": []
    }


def get_media_info(media):
    """Describe the video or picture thumbnail inside one media tile.

    Args:
        media: Parsed tile element; a falsy value yields None.

    Returns:
        dict with ``link_to_media``, ``eid``, ``image_source`` and ``type``
        (``"video"`` or ``"picture"``), or None when ``media`` is falsy or
        contains neither an ``a.video`` nor an ``a.picture`` anchor. ``eid``
        is always None for pictures; ``image_source`` is None when the anchor
        has no ``<img>``.
    """
    if not media:
        return None

    # A video anchor takes precedence over a picture anchor.
    for media_type in ('video', 'picture'):
        anchor = media.find('a', class_=media_type)
        if anchor:
            break
    else:
        return None

    # Read from the anchor that matched the class, not from whichever <a>
    # happens to come first in the tile.
    thumbnail = anchor.find('img')
    return {
        "link_to_media": anchor.get('href'),
        "eid": anchor.get('eid') if media_type == 'video' else None,
        "image_source": thumbnail.get('src') if thumbnail is not None else None,
        "type": media_type
    }


def get_actress_portfolio(url, view_count, timeout=30):
    """Download one profile page and collect its header fields and media.

    Args:
        url: Address of the profile page.
        view_count: Stored unchanged under the ``view_count`` key.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the notebook indefinitely.

    Returns:
        dict with ``actress_name``, ``birthplace``, ``hashtag``,
        ``celeb_img_url``, ``view_count``, ``url`` and
        ``production_media_list`` (one entry per ``<section>``, as built by
        ``get_production_info``).

    Raises:
        requests.RequestException: on connection failure, timeout or an HTTP
            error status.
        AttributeError: when a required page element (``h1``, banner link,
            hashtag span, profile image, section heading) is missing.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The <h1> text is cut at the first '#'.
    actress_name = re.sub(r'#.*', '', soup.find('h1').text.strip()).strip()

    # No fallback here: a page without the banner link raises AttributeError
    # and is reported by the caller.
    # NOTE(review): a guarded variant defaulting to "" was commented out at
    # this spot; confirm the unguarded lookup is intended.
    birthplace = soup.find('div', class_='banner-info').find('a').text.strip()
    hashtag = soup.find(
        'span', class_='tag-desktop').text.strip().replace("#", "")
    celeb_img_url = soup.find(
        'img', class_="img-circle pull-right img-responsive celeb-img").get('src')

    production_media_list = []
    for production in soup.find_all('section'):
        production_info = get_production_info(production)

        # Media tiles are read from the element right after the section, and
        # only when that element is a <div>.
        media_container = production.find_next_sibling()
        if media_container and media_container.name == 'div':
            production_info["media_list"] = [
                get_media_info(media)
                for media in media_container.find_all(
                    'div', class_='col-lg-3 col-sm-4 col-xs-6 celebs-boxes albuma')
            ]

        production_media_list.append(production_info)

    return {
        "actress_name": actress_name,
        "birthplace": birthplace,
        "hashtag": hashtag,
        "celeb_img_url": celeb_img_url,
        "view_count": view_count,
        "url": url,
        "production_media_list": production_media_list
    }


# Debug run: scrape only the entry at position TARGET_IN_PAGE_INDEX of the
# first listing page and record the outcome.
# NOTE(review): this writes actress_portfolio_list_{index}.json with at most
# one entry, replacing any full page file already stored under that name --
# confirm that is intended.
TARGET_IN_PAGE_INDEX = 20

for index in range(start_index, end_index+1):
    # Kept on one line: a line break inside an f-string replacement field is
    # a syntax error before Python 3.12.
    popular_actress_url = f'https://www.aznude.com/browse/celebs/popular/{index}.html'
    # A timeout keeps a stalled connection from hanging the cell.
    response = requests.get(popular_actress_url, timeout=30)
    soup_popular_actress = BeautifulSoup(response.text, 'html.parser')
    actress_div_list = soup_popular_actress.find_all(
        'div', class_="col-lg-2 col-md-3 col-sm-4 col-xs-6 story-thumbs celebs-boxes")
    actress_portfolio_list = []
    failed_actress_portfolio_list = []

    for in_page_index, actress in enumerate(actress_div_list):
        if in_page_index != TARGET_IN_PAGE_INDEX:
            continue
        actress_url = actress.find('a').get('href')
        actress_name = actress.find('h4').text.strip()
        # Set before the try block so the failure record below never reads an
        # unbound or stale value when the <span> lookup itself fails.
        view_count = None
        try:
            view_count = actress.find('span').text.strip()
            actress_portfolio = get_actress_portfolio(
                url="https://aznude.com" + actress_url, view_count=view_count)
            actress_portfolio_list.append(actress_portfolio)
            print(actress_name)
        # Catch Exception rather than everything, so Ctrl-C still interrupts.
        except Exception:
            failed_stat = {
                "actress_url": "https://aznude.com" + actress_url,
                "actress_name": actress_name,
                "in_page_index": in_page_index,
                "page_index": index,
                "view_count": view_count,
            }
            print(failed_stat)
            failed_actress_portfolio_list.append(failed_stat)
            failed_json_string = json.dumps(failed_stat)
            with open(f'.\\failed_actress_list\\failed_{index}_{actress_name}.json', 'w') as failed_json_file:
                failed_json_file.write(failed_json_string)
            print(actress_name, "failed")

    json_string = json.dumps(actress_portfolio_list)
    with open(f'.\\actress_portfolio\\actress_portfolio_list_{index}.json', 'w') as json_file:
        json_file.write(json_string)
    # Only the first listing page is processed.
    break

In [5]:
import os, json

# Report how many entries each stored JSON file contains.
folder_path = "E:\\VSCode\\aznude\\actress_portfolio"

json_files = []
for file_name in os.listdir(folder_path):
    if file_name.endswith('.json'):
        json_files.append(file_name)


for file_name in json_files:
    file_path = os.path.join(folder_path, file_name)

    # Skip anything that is not a regular *.json file (e.g. a directory
    # whose name ends in .json).
    if not (os.path.isfile(file_path) and file_name.endswith('.json')):
        continue

    with open(file_path, 'r') as json_file:
        data = json.load(json_file)
        print(file_path, len(data))


E:\VSCode\aznude\actress_portfolio\actress_portfolio_list_1.json 22
E:\VSCode\aznude\actress_portfolio\actress_portfolio_list_10.json 22
E:\VSCode\aznude\actress_portfolio\actress_portfolio_list_11.json 22
E:\VSCode\aznude\actress_portfolio\actress_portfolio_list_12.json 22
E:\VSCode\aznude\actress_portfolio\actress_portfolio_list_13.json 22
E:\VSCode\aznude\actress_portfolio\actress_portfolio_list_14.json 22
E:\VSCode\aznude\actress_portfolio\actress_portfolio_list_15.json 22
E:\VSCode\aznude\actress_portfolio\actress_portfolio_list_16.json 22
E:\VSCode\aznude\actress_portfolio\actress_portfolio_list_17.json 22
E:\VSCode\aznude\actress_portfolio\actress_portfolio_list_18.json 22
E:\VSCode\aznude\actress_portfolio\actress_portfolio_list_19.json 22
E:\VSCode\aznude\actress_portfolio\actress_portfolio_list_2.json 22
E:\VSCode\aznude\actress_portfolio\actress_portfolio_list_20.json 22
E:\VSCode\aznude\actress_portfolio\actress_portfolio_list_21.json 22
E:\VSCode\aznude\actress_portfolio\a