In [1]:
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

from webdriver_manager.chrome import ChromeDriverManager

### Create driver

In [2]:
# Start a Chrome WebDriver session. The value returned by ChromeDriverManager().install()
# is handed to Service (presumably the path of a matching chromedriver binary — confirm).
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

In [3]:
# Point the browser at the wiki's "Story Arcs" page; the following cells search
# this page for the per-episode links.
page_url = "https://myheroacademia.fandom.com/wiki/Story_Arcs"
driver.get(page_url)

In [4]:
from selenium.webdriver.common.by import By

# One lookup per episode number 1..112: the <a> element whose title attribute is
# exactly "Episode N". The results are live WebElement handles, not URLs.
# NOTE(review): a missing episode presumably makes find_element raise and aborts
# the whole comprehension — confirm against the Selenium docs.
episodes = [driver.find_element(By.XPATH, f"//a[@title='Episode {i}']") for i in range(1, 113)]

In [5]:
# Peek at the first five located link elements.
episodes[:5]

[<selenium.webdriver.remote.webelement.WebElement (session="b777fb538698bbe02a86904311b43142", element="12b333f1-1f24-4cac-9374-2fade9b7398e")>,
 <selenium.webdriver.remote.webelement.WebElement (session="b777fb538698bbe02a86904311b43142", element="08032b2d-9ac3-4cad-aee8-13cdeca5aa6f")>,
 <selenium.webdriver.remote.webelement.WebElement (session="b777fb538698bbe02a86904311b43142", element="a2cdf930-2d32-4db7-bf88-a9eb40bdc851")>,
 <selenium.webdriver.remote.webelement.WebElement (session="b777fb538698bbe02a86904311b43142", element="5d32e79b-cb8c-48b6-9372-9056fb3eb7f9")>,
 <selenium.webdriver.remote.webelement.WebElement (session="b777fb538698bbe02a86904311b43142", element="9cbcc5fb-7a88-457a-847a-1b51bcd172a1")>]

In [6]:
# Sanity check: number of episode links collected (one per requested episode).
len(episodes)

112

In [7]:
# Visible text of the first episode link (the episode title, per the output below).
episodes[0].text

'Izuku Midoriya: Origin'

In [8]:
# URL the first episode link points to.
episodes[0].get_attribute('href')

'https://myheroacademia.fandom.com/wiki/Episode_1'

In [9]:
# Navigate to the first episode's own page.
# NOTE(review): after leaving the Story Arcs page the handles in `episodes` may no
# longer be usable (stale elements) — confirm before reusing them.
driver.get(episodes[0].get_attribute('href'))

In [10]:
# Find the <span id="Characters_in_Order_of_Appearance">, step up to its parent <h2>,
# then take a <ul> directly inside a <div> that follows that heading as a sibling.
lists_characters = driver.find_element(By.XPATH, "//span[@id='Characters_in_Order_of_Appearance']/parent::h2/following-sibling::div/ul")

In [11]:
# Direct <li> children of that list — one element per character entry.
characters = lists_characters.find_elements(By.XPATH, "./li")

In [12]:
# Number of character entries found for this episode.
len(characters)

13

In [13]:
# Build [episode, character, appearance type] rows for episode 1.
# An entry ending in "(Flashback)" or "(Mentioned)" is tagged with that label and
# has the annotation removed from the name; anything else counts as 'Direct'.
appearance_lst = []
for c in characters:
    print(c.text)
    text = c.text
    # Flashback is tested before Mentioned, as in an if/elif chain.
    for kind in ('Flashback', 'Mentioned'):
        if text.endswith(f'({kind})'):
            appearance_lst.append([1, text.replace(f' ({kind})', ''), kind])
            break
    else:
        # No recognised annotation suffix matched.
        appearance_lst.append([1, text, 'Direct'])

Izuku Midoriya
Tsubasa (Flashback)
Katsuki Bakugo
Giant Villain
Luminescent Baby (Flashback)
All Might
Death Arms
Backdraft
Kamui Woods
Mt. Lady
Sludge Villain
Inko Midoriya (Flashback)
Hisashi Midoriya (Mentioned)


In [14]:
# Inspect the parsed rows: [episode, character, type of appearance].
appearance_lst

[[1, 'Izuku Midoriya', 'Direct'],
 [1, 'Tsubasa', 'Flashback'],
 [1, 'Katsuki Bakugo', 'Direct'],
 [1, 'Giant Villain', 'Direct'],
 [1, 'Luminescent Baby', 'Flashback'],
 [1, 'All Might', 'Direct'],
 [1, 'Death Arms', 'Direct'],
 [1, 'Backdraft', 'Direct'],
 [1, 'Kamui Woods', 'Direct'],
 [1, 'Mt. Lady', 'Direct'],
 [1, 'Sludge Villain', 'Direct'],
 [1, 'Inko Midoriya', 'Flashback'],
 [1, 'Hisashi Midoriya', 'Mentioned']]

In [15]:
# Turn the row list into a DataFrame with one named column per row field,
# then display it as the cell output.
df = pd.DataFrame(appearance_lst, columns=['Episode', 'Character', 'Type of appearance'])
df

Unnamed: 0,Episode,Character,Type of appearance
0,1,Izuku Midoriya,Direct
1,1,Tsubasa,Flashback
2,1,Katsuki Bakugo,Direct
3,1,Giant Villain,Direct
4,1,Luminescent Baby,Flashback
5,1,All Might,Direct
6,1,Death Arms,Direct
7,1,Backdraft,Direct
8,1,Kamui Woods,Direct
9,1,Mt. Lady,Direct


In [16]:
# The full class implementation
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

class CharacterCrawler():
    """Collect per-episode character appearances from the My Hero Academia fandom wiki.

    Usage: create an instance, call ``crawl()`` to fill ``self.data``, then call
    ``export(path)`` to write the result as CSV.

    Attributes:
        from_episode: first episode number to crawl (inclusive).
        to_episode: last episode number to crawl (inclusive).
        data: ``None`` until a crawl completes, then a DataFrame with the columns
            'Episode', 'Character' and 'Type of appearance'.
    """

    def __init__(self, from_episode = 1, to_episode = 113):
        self.from_episode = from_episode
        self.to_episode = to_episode
        self.data = None

    @staticmethod
    def _split_appearance(text):
        """Split one list entry into ``(character name, type of appearance)``.

        An entry ending in a parenthesised annotation, e.g. ``"Tsubasa (Flashback)"``,
        yields the name without the annotation and the annotation text. Only the
        last parenthetical is treated as the annotation, so earlier parentheses
        stay part of the name. Entries without a trailing annotation are 'Direct'.
        """
        if text.endswith(')') and ' (' in text:
            name, _, kind = text.rpartition(' (')
            return name, kind[:-1]  # drop the closing parenthesis
        return text, 'Direct'

    def crawl(self):
        """Scrape every episode in ``[from_episode, to_episode]`` into ``self.data``.

        Opens a Chrome session, reads the episode links from the Story Arcs page,
        visits each episode page and records one row per listed character.
        Episodes whose link or character list cannot be found are reported on
        stdout and skipped. The browser is always closed before returning.
        """
        print("Crawling starts...")
        page_url = "https://myheroacademia.fandom.com/wiki/Story_Arcs"
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        try:
            driver.get(page_url)

            # Collect every link first: navigating to an episode page leaves the
            # Story Arcs page, so only the href strings are kept, not the elements.
            # Each link is stored with its real episode number so that skipped
            # episodes or a custom from_episode cannot shift the numbering.
            episode_links = []
            print("Getting all episode links...")
            for number in range(self.from_episode, self.to_episode + 1):
                try:
                    ep = driver.find_element(By.XPATH, f"//a[@title='Episode {number}']")
                    href = ep.get_attribute('href')
                except Exception:
                    print(f"Cannot find episode {number}. Skipping...")
                    continue
                episode_links.append((number, href))

            appearance_lst = []
            for number, link in episode_links:
                print(f"Getting character in episode {number}...")
                try:
                    driver.get(link)
                    lists_characters = driver.find_element(
                        By.XPATH,
                        "//span[starts-with(@id,'Character')]/parent::h2/following-sibling::div/ul"
                    )
                    characters = lists_characters.find_elements(By.XPATH, "./li")
                    for c in characters:
                        name, kind = self._split_appearance(c.text)
                        appearance_lst.append([number, name, kind])
                except Exception as e:
                    print(f"Cannot find characters in episode {number}. Reason {e}")
        finally:
            # Close the browser even if the crawl fails part-way through.
            driver.quit()

        self.data = pd.DataFrame(appearance_lst, columns=['Episode', 'Character', 'Type of appearance'])
        print("Crawling completed!")

    def export(self, path):
        """Write the crawled data to ``path`` as CSV, without the index column.

        Prints a notice and writes nothing if no crawl has populated ``self.data``.
        """
        if self.data is None:
            print("Data does not exists, please perform a successful crawl!")
        else:
            self.data.to_csv(path, index=False)
            print("Exported successfully")

In [17]:
# Use the constructor defaults: from_episode=1, to_episode=113.
character_crawler = CharacterCrawler()

In [18]:
# Run the full crawl; progress is printed per episode and the result lands in .data.
character_crawler.crawl()

Crawling starts...
Getting all episode links...
Getting character in episode 1...
Getting character in episode 2...
Getting character in episode 3...
Getting character in episode 4...
Getting character in episode 5...
Getting character in episode 6...
Getting character in episode 7...
Getting character in episode 8...
Getting character in episode 9...
Getting character in episode 10...
Getting character in episode 11...
Getting character in episode 12...
Getting character in episode 13...
Getting character in episode 14...
Getting character in episode 15...
Getting character in episode 16...
Getting character in episode 17...
Getting character in episode 18...
Getting character in episode 19...
Getting character in episode 20...
Getting character in episode 21...
Getting character in episode 22...
Getting character in episode 23...
Getting character in episode 24...
Getting character in episode 25...
Getting character in episode 26...
Getting character in episode 27...
Getting characte

In [19]:
# Inspect the crawled table (one row per character appearance per episode).
character_crawler.data

Unnamed: 0,Episode,Character,Type of appearance
0,1,Izuku Midoriya,Direct
1,1,Tsubasa,Flashback
2,1,Katsuki Bakugo,Direct
3,1,Giant Villain,Direct
4,1,Luminescent Baby,Flashback
...,...,...,...
4179,113,Naomasa Tsukauchi,Mentioned
4180,113,Stain,Mentioned
4181,113,Sir Nighteye,Flashback
4182,113,Burnin,Direct


In [20]:
# Save the crawled table as CSV at a path relative to the working directory.
character_crawler.export('myheroacademia.csv')

Exported successfully
