In [2]:
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
import requests
from PIL import Image, ImageSequence, ImageOps
from pathlib import Path

# Scraping numerical data

We will be scraping meleeframedata.com for the amount of damage coming from each attack. After examining the output of the "damage" wrappers on the website, we construct the following function to return a floating point number from the raw text data.

In [11]:
def parseDamage(s: str) -> float:
    """Extract the leading damage value from the raw text of a "damage" field.

    Only the first whitespace-delimited token of ``s`` is examined. Its digits
    and decimal points are concatenated and converted to a float, so ``'12%'``
    becomes ``12.0``.

    Parameters
    ----------
    s : str
        Raw text of a damage element.

    Returns
    -------
    float
        The parsed value, or ``0.0`` when the first token does not contain a
        parsable number.
    """
    # split() with no argument ignores leading whitespace and newlines;
    # split(' ') would instead produce an empty or newline-only first token
    # and the damage would silently come out as 0.
    tokens = s.split()
    if not tokens:
        return 0.0
    numberText = ''.join(char for char in tokens[0] if char.isdigit() or char == '.')
    try:
        return float(numberText)
    except ValueError:
        # No digits at all, a lone '.', several dots, or characters such as
        # superscripts that pass isdigit() but that float() cannot read.
        return 0.0

The following code cell determines which types of attacks we will fetch. Moves come in three types (normal, aerial, and special), and the div class names in the HTML are prefixed according to the move type, as given by the dictionary `typeToDivClass`. The list `attributesToFetch` records the attributes we are interested in; note that the scraping loop below hardcodes the percent and move-name classes, so scraping more or fewer details requires changing that loop as well as this list.

In [12]:
# Maps each move type to the prefix carried by its div class names in the page HTML.
typeToDivClass = {
    'Normal': '',
    'Aerial': 'air-',
    'Special': 'special-',
}
# Move types to scrape, in the dictionary's insertion order.
move_types = list(typeToDivClass)
# Names of the per-move attributes of interest.
attributesToFetch = ['percent', 'name']

The homepage of the website contains links to each character's webpage. Upon inspection of the HTML source, we see that the first 16 links of the page are generic navigation links. We thus collect the remaining links into a list of URLs of character pages.

In [None]:
# Fetch the home page and build the URL of every character page linked from it.
# NOTE: `urls` is reassigned by the image-scraping section further down, so
# re-run this cell before re-running the character-page loop.
homeurl = 'https://meleeframedata.com/'
# Number of leading <a> tags on the home page skipped as generic navigation links.
NUM_NAV_LINKS = 16
# TLS verification stays on (the requests default), matching the image-download
# requests made to this same host later in the notebook. The timeout keeps a
# stalled connection from hanging the kernel.
hr = requests.get(homeurl, timeout=30)
hr.raise_for_status()  # fail loudly instead of parsing an error page
homePageSoup = BeautifulSoup(hr.content, 'html5lib')  # If this line causes an error, run 'pip install html5lib' or install html5lib
links = homePageSoup.find_all('a')
# Read the href attribute itself rather than slicing the tag's string form:
# the value comes back already unescaped ('&' instead of '&amp;') and does not
# depend on href being the first attribute. Anchors without an href are skipped.
suffixes = [link.get('href') for link in links[NUM_NAV_LINKS:] if link.get('href')]
urls = [homeurl + suffix for suffix in suffixes]

Now, on each character's page, we can go through and scrape each attack listed on the website. We save the character name, move name, and parsed damage amount in a list.

In [None]:
# Visit every character page and record one [character, move name, damage]
# row per move container found on it.
listOfMoves = []
for charPage in urls:
    # TLS verification stays on (the requests default); the timeout keeps a
    # stalled connection from hanging the kernel.
    req = requests.get(charPage, timeout=30)
    req.raise_for_status()  # fail loudly instead of parsing an error page
    charSoup = BeautifulSoup(req.content, 'html5lib')
    charName = charSoup.find('title').text
    # Named `moveType` rather than `type` so the builtin type() is not
    # overwritten in the kernel namespace once this cell has run.
    for moveType in move_types:
        modifier = typeToDivClass[moveType]
        container = modifier + 'move-container'
        percent = modifier + 'percent'
        name = modifier + 'movename'
        for move in charSoup.find_all(class_=container):
            nameTag = move.find(class_=name)
            percentTag = move.find(class_=percent)
            if nameTag is None or percentTag is None:
                # Container without a move name or a damage field: nothing to record.
                continue
            listOfMoves.append([charName, nameTag.text, parseDamage(percentTag.text)])


We will save this scraped data as a pandas dataframe.

In [19]:
# Tabulate the scraped rows and persist the table to disk.
columnNames = ['Character', 'Move', 'Damage']
df = pd.DataFrame(data=listOfMoves, columns=columnNames)
df.to_pickle('moveDamages.pkl')

Moves which deal 0 damage tend to have unique effects which are difficult to predict; we will preemptively sanitize our training set by removing them.

In [70]:
# Keep only the moves whose damage is non-zero, then preview the five
# lowest-damage moves that remain.
hasDamage = df['Damage'].ne(0)
df = df[hasDamage]
df.sort_values(by='Damage').head()

# Scraping image data

The images corresponding to attack animations are stored in an index page at the URL below. We will pass through this index and collect a link to each folder in the index into a list.

In [None]:
# Collect the URL of every folder listed on the GIF index page.
url = 'https://meleeframedata.com/static/gifs/'
# Number of leading <a> tags on the index page that are skipped before the folder entries.
NUM_INDEX_HEADER_LINKS = 5
# TLS verification stays on (the requests default), matching the image-download
# requests made to this same host later in the notebook. The timeout keeps a
# stalled connection from hanging the kernel.
reqs = requests.get(url, timeout=30)
reqs.raise_for_status()  # fail loudly instead of parsing an error page
soup = BeautifulSoup(reqs.text, 'html.parser')
# Anchors without an href are skipped; concatenating None would raise TypeError.
urls = [
    url + link.get('href')
    for link in soup.find_all('a')[NUM_INDEX_HEADER_LINKS:]
    if link.get('href')
]
urls[0]

Let's take a look at how the actual image files in the index are formatted so that we can pick them apart when saving them.

In [None]:
# List the image URLs inside the first folder of the index to inspect how
# the files are named.
bowserURL = urls[0]
# TLS verification stays on (the requests default); the timeout keeps a
# stalled connection from hanging the kernel.
bowserReq = requests.get(bowserURL, timeout=30)
bowserReq.raise_for_status()  # fail loudly instead of parsing an error page
bowSoup = BeautifulSoup(bowserReq.text, 'html.parser')
# The first 5 anchors are skipped, as on the top-level index page. Anchors
# without an href are dropped; concatenating None would raise TypeError.
bowserAttackurls = [
    bowserURL + bowLink.get('href')
    for bowLink in bowSoup.find_all('a')[5:]
    if bowLink.get('href')
]
bowserAttackurls[0:3]

Let's also format each folder in local storage from the url from which it is obtained.

In [8]:
# Root directory for the downloaded data (machine-specific absolute path).
PATH = 'C:/Users/gfi8p/imageClassifier/'
# The second-to-last '/'-separated piece of a folder URL names the local folder.
folderName = urls[0].split('/')[-2]
A = f'{PATH}rawimages/{folderName}/'
A

'C:/Users/gfi8p/imageClassifier/rawimages/bowser/'

In the following block of code, the outer for loop creates a folder for each character. The inner loops obtain the URLs of the images relevant to that character, then save each image into the created folder.

In [87]:
# Download every image linked from each character folder into a matching
# local folder under PATH/rawimages/. Downloading is best-effort: anything that
# does not come back with HTTP 200 is recorded in `failedDownloads` and skipped.
failedDownloads = []  # (url, HTTP status code) pairs
for charURL in urls:
    folder = PATH + 'rawimages/' + charURL.split('/')[-2] + '/'
    Path(folder).mkdir(parents=True, exist_ok=True)
    # The timeout keeps a stalled connection from hanging the kernel.
    charRequest = requests.get(charURL, timeout=30)
    if charRequest.status_code != 200:
        # Do not parse an error page as if it were a folder listing.
        failedDownloads.append((charURL, charRequest.status_code))
        continue
    charSoup = BeautifulSoup(charRequest.text, 'html.parser')
    # The first 5 anchors are skipped. Anchors without an href are dropped;
    # concatenating None would raise TypeError.
    charAttackURLs = [
        charURL + charAttack.get('href')
        for charAttack in charSoup.find_all('a')[5:]
        if charAttack.get('href')
    ]
    # Named `attackURL` rather than `url` so the index URL stored in the
    # global `url` is not overwritten by this loop.
    for attackURL in charAttackURLs:
        imageName = attackURL.split('/')[-1]
        if not imageName:
            # A URL ending in '/' has no file name; the target would be the
            # folder itself, which cannot be opened for writing.
            continue
        response = requests.get(attackURL, timeout=30)
        filename = folder + imageName
        if response.status_code == 200:
            with open(filename, 'wb') as f:
                f.write(response.content)
        else:
            failedDownloads.append((attackURL, response.status_code))
if failedDownloads:
    print(f'{len(failedDownloads)} request(s) failed; first few:', failedDownloads[:10])

We now have all of the images on the local machine and it is time to move on to processing the data.