In [6]:
#imports
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import re
import json
from joblib import Parallel, delayed

In [None]:
characters_page = requests.get("https://onepiece.fandom.com/wiki/List_of_Canon_Characters")
characters_page.content

In [None]:
soup = BeautifulSoup(characters_page.content, 'html.parser')
print(soup.prettify())

In [9]:
# SLOW VERSION
def getCharacterUrl(name):
    baseUrl = 'https://onepiece.fandom.com/wiki/'
    #check if name has spaces
    if (' ' in name):
        name = name.replace(' ', '_')
    newUrl = baseUrl + name
    return newUrl

def getCharacterInfo(url, name):
    # case sensitive regex match on month and 1-2 digit date
    bday_regex = "(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2})"
    character_page = requests.get(url)
    soup = BeautifulSoup(character_page.content, 'html.parser')
    sideBar = soup.find('aside')
    # only care about characters with birthdays
    if sideBar.find(attrs={"data-source": "birth"}):
        #birthday
        input_tag = sideBar.find(attrs={"data-source": "birth"})
        birthday_string = input_tag.find('div').contents[0]
        #birthday to month and day ints
        results = re.search(bday_regex, birthday_string)
        birth_day = int(results.group(2))
        birth_month = datetime.strptime(results.group(1), "%B").month
        # img url
        img = sideBar.find('img')['src']
        json_val = {'name': name, 'birthday': birthday_string, 'birth_month': birth_month ,'birth_day': birth_day, 'img_url': img }
        return json_val

In [5]:
# SLOW VERSION
# assumes first 2 tables are characters and the third is groups
tables = soup.find_all('table', limit=2);
json_array = []
for table in tables:
    rows = table.find_all('tr')
    #need to skip the row in thead
    print(len(rows), " characters to go")
    for i, row in enumerate(rows):
        if i > 0:
            tds = row.find_all('td')
            name = tds[1].text.strip()
            url = getCharacterUrl(name)
            val = getCharacterInfo(url, name)
            if (val is not None):
                json_array.append(val)
            if i%25==0:
                print(i," characters done")
print(json_array)


887  characters to go
25  characters done
50  characters done
75  characters done
100  characters done
125  characters done
150  characters done
175  characters done
200  characters done


KeyboardInterrupt: 

In [10]:
#testing out "fast version" with parallel processing

def processRow(row):
    tds = row.find_all('td')
    name = tds[1].text.strip()
    return getCharacterUrl(name)

tables = soup.find_all('table', limit=2);
json_array = []
all_rows = []
for table in tables:
    #need to skip the first row - thead
    new_rows = table.find_all('tr')
    del new_rows[0]
    all_rows = all_rows + new_rows
    
print(len(all_rows), " characters to go")
url_list = [processRow(row) for row in all_rows]

def getCharacterPage(url):
    return {"url": url, "page": requests.get(url)}

print("processing char pages")
all_character_pages = Parallel(n_jobs=10)(delayed(getCharacterPage)(url) for url in url_list)

print("done with pages")




1353  characters to go
processing char pages
done with pages


In [16]:
def getCharacterInfoFromPage(page,url):
    # case sensitive regex match on month and 1-2 digit date
    bday_regex = "(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2})"
    soup = BeautifulSoup(page.content, 'html.parser')
    sideBar = soup.find('aside')
    # only care about characters with birthdays
    if sideBar.find(attrs={"data-source": "birth"}):
        # have to rebuild name due to parallel processing
        name = sideBar.find(attrs={"data-source": "name"}).text
        #birthday
        input_tag = sideBar.find(attrs={"data-source": "birth"})
        birthday_string = input_tag.find('div').contents[0]
        #birthday to month and day ints
        results = re.search(bday_regex, birthday_string)
        birth_day = int(results.group(2))
        birth_month = datetime.strptime(results.group(1), "%B").month
        # img url
        img = sideBar.find('img')['src']
        json_val = {'name': name, 'birthday': birthday_string, 'birth_month': birth_month ,'birth_day': birth_day, 'img_url': img , 'wiki_url': url}
        return json_val
    
def toJson(char_page):
    return getCharacterInfoFromPage(char_page["page"],char_page["url"])

print("start")
    
#iterative
json_array = [toJson(page) for page in all_character_pages]
#parallel
#json_array = Parallel(n_jobs=10)(delayed(toJson)(page) for page in all_character_pages)
        
print("done with json")

start
done with json


In [17]:
#removing nulls
json_array = list(filter(lambda item: item is not None, json_array))

# Serializing json
json_object = json.dumps(json_array, indent=4)
print(json_object)
 

[
    {
        "name": "A O",
        "birthday": "January 15th",
        "birth_month": 1,
        "birth_day": 15,
        "img_url": "https://static.wikia.nocookie.net/onepiece/images/8/8c/A_O_Anime_Infobox.png/revision/latest/scale-to-width-down/350?cb=20160102105316",
        "wiki_url": "https://onepiece.fandom.com/wiki/A_O"
    },
    {
        "name": "Absalom",
        "birthday": "December 30th",
        "birth_month": 12,
        "birth_day": 30,
        "img_url": "https://static.wikia.nocookie.net/onepiece/images/5/56/Absalom_Anime_Infobox.png/revision/latest/scale-to-width-down/350?cb=20230101154942",
        "wiki_url": "https://onepiece.fandom.com/wiki/Absalom"
    },
    {
        "name": "Agyo",
        "birthday": "November 29th",
        "birth_month": 11,
        "birth_day": 29,
        "img_url": "https://static.wikia.nocookie.net/onepiece/images/1/11/Agyo_Anime_Infobox.png/revision/latest/scale-to-width-down/350?cb=20140707173009",
        "wiki_url": "https://

In [18]:
# Writing to sample.json
with open("one_piece_character_data.json", "w") as outfile:
    outfile.write(json_object)