# Web Scraping scryfall.com

In [276]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

url = r'https://www.scryfall.com'

r = requests.get(url)

data = r.text

soup = BeautifulSoup(data)

In [277]:
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Scryfall Magic: The Gathering Search
  </title>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <link crossorigin="true" href="https://assets.scryfall.com" rel="preconnect"/>
  <link crossorigin="true" href="https://img.scryfall.com" rel="preconnect"/>
  <link href="https://assets.scryfall.com" rel="dns-prefetch"/>
  <link href="https://img.scryfall.com" rel="dns-prefetch"/>
  <meta content="A fast, powerful, comprehensive Magic: The Gathering card search." name="description"/>
  <link href="https://scryfall.com/blog/feed" rel="alternate" title="ATOM" type="application/atom+xml"/>
  <link href="https://scryfall.com/opensearch.xml" rel="search" title="Scryfall" type="application/opensearchdescription+xml"/>
  <meta content="Scryfall Magic: The Gathering Search" property="og:site_name"/>
  <meta content="website" property="og:type"/>
  <meta content="Scryfall Magic: The G

In [278]:
links = soup.find_all('a')

display(f'There are {len(links)} links')

for link in links:
    print(link.get('href'))

'There are 50 links'

#main
#footer
/
/advanced
/docs/syntax
/sets
/random
https://scryfall.com/sets/und
/sets/thb?order=set
/sets/sld?order=spoiled
/register
https://scryfall.com/card/ust/162/steel-squirrel
https://scryfall.com/card/ema/158/argothian-enchantress
https://scryfall.com/card/csp/82/goblin-furrier
https://scryfall.com/card/und/31/acornelia-fashionable-filcher
https://scryfall.com/card/eld/200/savvy-hunter
https://scryfall.com/card/mir/228/maro
https://scryfall.com/card/lrw/33/oaken-brawler
/advanced
/docs/syntax
/sets
/account/decks
/random
/bots
/docs/faqs
/blog
https://artgame.scryfall.com
https://tagger.scryfall.com
/settings/profile
/register
/docs/terms
/contact
/docs/privacy
/docs/api
/docs/api/cards
/docs/api/images
/docs/api/bulk-data
/blog/category/api
https://twitter.com/scryfall
https://www.reddit.com/user/Scryfall/
https://github.com/scryfall
#
#
None
None
None
None
/admin/cards/new
/admin
/settings/preferences


Ok, that's a lot of pages. The "sets" page looks promising. I think that will have every card. Let's go there!

In [279]:
r = requests.get(f'{url}/sets')

sets_data = r.text

sets_soup = BeautifulSoup(sets_data)

In [280]:
# print(sets_soup.prettify())

In [281]:
sets_links = sets_soup.find_all('a')

display(f'There are {len(sets_links)} links')

for link in sets_links[:100]:
    print(link.get('href'))

'There are 3345 links'

#main
#footer
/
/advanced
/docs/syntax
/sets
/random
/account/decks
/settings
/advanced
/docs/syntax
/sets
/random
/account/decks
/account/decks
/account/decks/new
/signout
/settings/preferences
/settings/profile
/settings/security
/settings/safety
https://scryfall.com/sets/und
https://scryfall.com/sets/und
https://scryfall.com/sets/und
/sets/und
https://scryfall.com/sets/tund
https://scryfall.com/sets/tund
https://scryfall.com/sets/tund
/sets/tund
https://scryfall.com/sets/thb
https://scryfall.com/sets/thb
https://scryfall.com/sets/thb
/sets/thb
/sets/thb/es
/sets/thb/fr
/sets/thb/de
/sets/thb/it
/sets/thb/pt
/sets/thb/ja
/sets/thb/ko
/sets/thb/ru
/sets/thb/zhs
/sets/thb/zht
https://scryfall.com/sets/pthb
https://scryfall.com/sets/pthb
https://scryfall.com/sets/pthb
/sets/pthb
https://scryfall.com/sets/tthb
https://scryfall.com/sets/tthb
https://scryfall.com/sets/tthb
/sets/tthb
https://scryfall.com/sets/j20
https://scryfall.com/sets/j20
https://scryfall.com/sets/j20
/sets/j20
https:/

Hmm, a lot of repeats and a lot of different formats. Let's fix all the links to be the full link, then create a list without duplicates.

In [282]:
def fix_link(link):
    try:
        if link[:5] == 'https':
            return link
        elif link[0] == '/':
            return f'{url}{link}'
    except:
        print('Empty Link')

In [283]:
sets_list = [link.get('href') for link in sets_links]

fixed_sets_list = [fix_link(string) for string in sets_list]
fixed_sets_list = list(set(fixed_sets_list))
fixed_sets_list

Empty Link
Empty Link
Empty Link
Empty Link


['https://www.scryfall.com/sets/psus',
 'https://scryfall.com/sets/ons',
 'https://www.scryfall.com/sets/tsp/zhs',
 'https://scryfall.com/sets/troe',
 'https://www.scryfall.com/sets/gpt/ja',
 'https://scryfall.com/sets/pmps07',
 'https://www.scryfall.com/sets/pmps09/ja',
 'https://www.scryfall.com/sets/usg/ja',
 'https://www.scryfall.com/sets/gpt/zhs',
 'https://www.scryfall.com/sets/bfz/pt',
 'https://www.scryfall.com/sets/wc97',
 'https://www.scryfall.com/sets/cn2/zhs',
 'https://www.scryfall.com/sets/m14/zhs',
 'https://scryfall.com/sets/ddi',
 'https://www.scryfall.com/sets/5dn/ja',
 'https://scryfall.com/sets/ddl',
 'https://www.scryfall.com/sets/war/de',
 'https://www.scryfall.com/sets/gtc/es',
 'https://scryfall.com/sets/me2',
 'https://www.scryfall.com/sets/pfrf',
 'https://www.scryfall.com/sets/tsoi',
 'https://www.scryfall.com/sets/cp1',
 'https://scryfall.com/sets/tavr',
 'https://www.scryfall.com/sets/pc2/ja',
 'https://www.scryfall.com/sets/m13/es',
 'https://www.scryfall.

Looking good. A few more things to work on:
1. Some have www and some don't. Let's add that to all of them.
2. There will be more duplicates after we do that. (I suspect that we could just drop all that don't have www, but it doesn't hurt to be careful.
3. There are a few bad ones, like reddit and none. Let's lose those as well.

In [284]:
fixed_sets_list.remove(None)

In [285]:
fixed_sets_list

['https://www.scryfall.com/sets/psus',
 'https://scryfall.com/sets/ons',
 'https://www.scryfall.com/sets/tsp/zhs',
 'https://scryfall.com/sets/troe',
 'https://www.scryfall.com/sets/gpt/ja',
 'https://scryfall.com/sets/pmps07',
 'https://www.scryfall.com/sets/pmps09/ja',
 'https://www.scryfall.com/sets/usg/ja',
 'https://www.scryfall.com/sets/gpt/zhs',
 'https://www.scryfall.com/sets/bfz/pt',
 'https://www.scryfall.com/sets/wc97',
 'https://www.scryfall.com/sets/cn2/zhs',
 'https://www.scryfall.com/sets/m14/zhs',
 'https://scryfall.com/sets/ddi',
 'https://www.scryfall.com/sets/5dn/ja',
 'https://scryfall.com/sets/ddl',
 'https://www.scryfall.com/sets/war/de',
 'https://www.scryfall.com/sets/gtc/es',
 'https://scryfall.com/sets/me2',
 'https://www.scryfall.com/sets/pfrf',
 'https://www.scryfall.com/sets/tsoi',
 'https://www.scryfall.com/sets/cp1',
 'https://scryfall.com/sets/tavr',
 'https://www.scryfall.com/sets/pc2/ja',
 'https://www.scryfall.com/sets/m13/es',
 'https://www.scryfall.

In [286]:
# Join them all into one string
# Seems easer to do regex when it's all one string. 
# Should investigate this more. Am I just bad, or is this a good way to do it?

csv = ','.join(fixed_sets_list)

csv = re.sub(r'/scryfall', r'/www.scryfall', csv)

fixed_sets_list = csv.split(',')

In [287]:
fixed_sets_list = list(set(fixed_sets_list))
fixed_sets_list

['https://www.scryfall.com/sets/psus',
 'https://www.scryfall.com/sets/tsp/zhs',
 'https://www.scryfall.com/sets/gpt/ja',
 'https://www.scryfall.com/sets/pmps09/ja',
 'https://www.scryfall.com/sets/usg/ja',
 'https://www.scryfall.com/sets/gpt/zhs',
 'https://www.scryfall.com/sets/bfz/pt',
 'https://www.scryfall.com/sets/wc97',
 'https://www.scryfall.com/sets/cn2/zhs',
 'https://www.scryfall.com/sets/m14/zhs',
 'https://www.scryfall.com/sets/5dn/ja',
 'https://www.scryfall.com/sets/war/de',
 'https://www.scryfall.com/sets/gtc/es',
 'https://www.scryfall.com/sets/pfrf',
 'https://www.scryfall.com/sets/tsoi',
 'https://www.scryfall.com/sets/cp1',
 'https://www.scryfall.com/sets/pc2/ja',
 'https://www.scryfall.com/sets/m13/es',
 'https://www.scryfall.com/sets/ulg/fr',
 'https://www.scryfall.com/sets/mp2',
 'https://www.scryfall.com/sets/cst/es',
 'https://www.scryfall.com/sets/dis/ja',
 'https://www.scryfall.com/admin/cards/new',
 'https://www.scryfall.com/sets/emn/zhs',
 'https://www.scry

Ok, almost there. There's still a few unwanted sites in there. Let's get rid of every site which doesn't have a "www.scryfall.com/sets" in it

In [288]:
csv = ','.join(fixed_sets_list)

regex = r'[^,]*www.scryfall.com/sets[^,]*'

csv = re.findall(regex, csv)

fixed_sets_list = csv
fixed_sets_list

['https://www.scryfall.com/sets/psus',
 'https://www.scryfall.com/sets/tsp/zhs',
 'https://www.scryfall.com/sets/gpt/ja',
 'https://www.scryfall.com/sets/pmps09/ja',
 'https://www.scryfall.com/sets/usg/ja',
 'https://www.scryfall.com/sets/gpt/zhs',
 'https://www.scryfall.com/sets/bfz/pt',
 'https://www.scryfall.com/sets/wc97',
 'https://www.scryfall.com/sets/cn2/zhs',
 'https://www.scryfall.com/sets/m14/zhs',
 'https://www.scryfall.com/sets/5dn/ja',
 'https://www.scryfall.com/sets/war/de',
 'https://www.scryfall.com/sets/gtc/es',
 'https://www.scryfall.com/sets/pfrf',
 'https://www.scryfall.com/sets/tsoi',
 'https://www.scryfall.com/sets/cp1',
 'https://www.scryfall.com/sets/pc2/ja',
 'https://www.scryfall.com/sets/m13/es',
 'https://www.scryfall.com/sets/ulg/fr',
 'https://www.scryfall.com/sets/mp2',
 'https://www.scryfall.com/sets/cst/es',
 'https://www.scryfall.com/sets/dis/ja',
 'https://www.scryfall.com/sets/emn/zhs',
 'https://www.scryfall.com/sets/grn/zht',
 'https://www.scryfal

Now, I know this isn't yet correct, because on that page it says that there's 579 sets. So what's gone wrong?

Well, some of the set pages have additional "/" after the set designation. So it's some kind of subpage. 

Oh, I see, it's for each language. Well, I believe the default for each is English, so let's get rid of everything after the set description.

In [289]:
csv = ','.join(fixed_sets_list)

regex = r'(/sets/[^/,]*?)/[^/,]*?,'

# re.findall(regex, csv, flags = re.VERBOSE)

csv = re.sub(regex, r'\1,', csv, flags = re.VERBOSE)

final_sets_list = csv.split(',')

final_sets_list

['https://www.scryfall.com/sets/psus',
 'https://www.scryfall.com/sets/tsp',
 'https://www.scryfall.com/sets/gpt',
 'https://www.scryfall.com/sets/pmps09',
 'https://www.scryfall.com/sets/usg',
 'https://www.scryfall.com/sets/gpt',
 'https://www.scryfall.com/sets/bfz',
 'https://www.scryfall.com/sets/wc97',
 'https://www.scryfall.com/sets/cn2',
 'https://www.scryfall.com/sets/m14',
 'https://www.scryfall.com/sets/5dn',
 'https://www.scryfall.com/sets/war',
 'https://www.scryfall.com/sets/gtc',
 'https://www.scryfall.com/sets/pfrf',
 'https://www.scryfall.com/sets/tsoi',
 'https://www.scryfall.com/sets/cp1',
 'https://www.scryfall.com/sets/pc2',
 'https://www.scryfall.com/sets/m13',
 'https://www.scryfall.com/sets/ulg',
 'https://www.scryfall.com/sets/mp2',
 'https://www.scryfall.com/sets/cst',
 'https://www.scryfall.com/sets/dis',
 'https://www.scryfall.com/sets/emn',
 'https://www.scryfall.com/sets/grn',
 'https://www.scryfall.com/sets/eld',
 'https://www.scryfall.com/sets/aer',
 'htt

YES! Looking good! Now I just need to remove the duplicates again (could have waited until now, but I just like cleaning things).

In [290]:
final_sets_list = list(set(final_sets_list))
final_sets_list

['https://www.scryfall.com/sets/psus',
 'https://www.scryfall.com/sets/g18',
 'https://www.scryfall.com/sets/mma',
 'https://www.scryfall.com/sets/tcon',
 'https://www.scryfall.com/sets/sld',
 'https://www.scryfall.com/sets/pred',
 'https://www.scryfall.com/sets/p15a',
 'https://www.scryfall.com/sets/tc15',
 'https://www.scryfall.com/sets/l15',
 'https://www.scryfall.com/sets/oc16',
 'https://www.scryfall.com/sets/pdp13',
 'https://www.scryfall.com/sets/phou',
 'https://www.scryfall.com/sets/pdom',
 'https://www.scryfall.com/sets/pbok',
 'https://www.scryfall.com/sets/f05',
 'https://www.scryfall.com/sets/togw',
 'https://www.scryfall.com/sets/p02',
 'https://www.scryfall.com/sets/wc03',
 'https://www.scryfall.com/sets/pgpx',
 'https://www.scryfall.com/sets/pal01',
 'https://www.scryfall.com/sets/pdtp',
 'https://www.scryfall.com/sets/ddh',
 'https://www.scryfall.com/sets/f02',
 'https://www.scryfall.com/sets/wc97',
 'https://www.scryfall.com/sets/psoi',
 'https://www.scryfall.com/sets

So close!!! We ended up with 574 links. But there are actually 579 sets (According to the bottom of the page). We'll I'm not sure what I missed, but I think there's a better way to handle this stuff.

# A Better Way

Oh man, the sets page has some good stuff! Like number of cards and release date! I need to get that!!!

It looks like there's a better way to do all the stuff I just did. I need to look for the appropriate tag on that page. Still, guess that was good practice.

Let's inspect the page in Chrome to see which tag has the info I want.

In [291]:
for tag in sets_soup.find_all('tr'):
    print(tag.prettify())
    print()

<tr>
 <th>
  Name
 </th>
 <th class="em6">
  Cards
 </th>
 <th class="em9">
  Date
 </th>
 <th class="w35">
  Languages
 </th>
</tr>


<tr>
 <td class="flexbox">
  <a href="https://scryfall.com/sets/und">
   <svg>
    <use xlink:href="#sets-default-svg">
    </use>
   </svg>
   Unsanctioned
   <small>
    UND
   </small>
  </a>
 </td>
 <td>
  <a href="https://scryfall.com/sets/und" tabindex="-1">
   96
  </a>
 </td>
 <td>
  <a href="https://scryfall.com/sets/und" tabindex="-1">
   2020-02-29
  </a>
 </td>
 <td>
  <span class="pillbox">
   <a class="pillbox-item" href="/sets/und">
    en
   </a>
   <span aria-hidden="true" class="pillbox-item disabled">
    es
   </span>
   <span aria-hidden="true" class="pillbox-item disabled">
    fr
   </span>
   <span aria-hidden="true" class="pillbox-item disabled">
    de
   </span>
   <span aria-hidden="true" class="pillbox-item disabled">
    it
   </span>
   <span aria-hidden="true" class="pillbox-item disabled">
    pt
   </span>
   <span aria

In [292]:
for tag in sets_soup.find_all('td', class_="flexbox"):
    print(tag.prettify())
    print()

<td class="flexbox">
 <a href="https://scryfall.com/sets/und">
  <svg>
   <use xlink:href="#sets-default-svg">
   </use>
  </svg>
  Unsanctioned
  <small>
   UND
  </small>
 </a>
</td>


<td class="flexbox indent">
 <a href="https://scryfall.com/sets/tund">
  <svg>
   <use xlink:href="#sets-default-svg">
   </use>
  </svg>
  Unsanctioned Tokens
  <small>
   TUND
  </small>
 </a>
</td>


<td class="flexbox">
 <a href="https://scryfall.com/sets/thb">
  <svg>
   <use xlink:href="#sets-thb-svg">
   </use>
  </svg>
  Theros Beyond Death
  <small>
   THB
  </small>
 </a>
</td>


<td class="flexbox indent">
 <a href="https://scryfall.com/sets/pthb">
  <svg>
   <use xlink:href="#sets-thb-svg">
   </use>
  </svg>
  Theros Beyond Death Promos
  <small>
   PTHB
  </small>
 </a>
</td>


<td class="flexbox indent">
 <a href="https://scryfall.com/sets/tthb">
  <svg>
   <use xlink:href="#sets-thb-svg">
   </use>
  </svg>
  Theros Beyond Death Tokens
  <small>
   TTHB
  </small>
 </a>
</td>


<td clas

Ok, let's start a dataframe to keep track of all this info.

Once I learn SQL, that might be a better way to keep track of this stuff

In [293]:
sets = pd.DataFrame()
# magic['links'] = 