# Web Scraping scryfall.com

In [57]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

url = r'https://www.scryfall.com'

r = requests.get(url)

data = r.text

soup = BeautifulSoup(data)

In [58]:
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Scryfall Magic: The Gathering Search
  </title>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <link crossorigin="true" href="https://assets.scryfall.com" rel="preconnect"/>
  <link crossorigin="true" href="https://img.scryfall.com" rel="preconnect"/>
  <link href="https://assets.scryfall.com" rel="dns-prefetch"/>
  <link href="https://img.scryfall.com" rel="dns-prefetch"/>
  <meta content="A fast, powerful, comprehensive Magic: The Gathering card search." name="description"/>
  <link href="https://scryfall.com/blog/feed" rel="alternate" title="ATOM" type="application/atom+xml"/>
  <link href="https://scryfall.com/opensearch.xml" rel="search" title="Scryfall" type="application/opensearchdescription+xml"/>
  <meta content="Scryfall Magic: The Gathering Search" property="og:site_name"/>
  <meta content="website" property="og:type"/>
  <meta content="Scryfall Magic: The G

In [59]:
links = soup.find_all('a')

display(f'There are {len(links)} links')

for link in links:
    print(link.get('href'))

'There are 50 links'

#main
#footer
/
/advanced
/docs/syntax
/sets
/random
https://scryfall.com/sets/und
/sets/thb?order=set
/sets/sld?order=spoiled
/register
https://scryfall.com/card/ust/162/steel-squirrel
https://scryfall.com/card/ema/158/argothian-enchantress
https://scryfall.com/card/csp/82/goblin-furrier
https://scryfall.com/card/und/31/acornelia-fashionable-filcher
https://scryfall.com/card/eld/200/savvy-hunter
https://scryfall.com/card/mir/228/maro
https://scryfall.com/card/lrw/33/oaken-brawler
/advanced
/docs/syntax
/sets
/account/decks
/random
/bots
/docs/faqs
/blog
https://artgame.scryfall.com
https://tagger.scryfall.com
/settings/profile
/register
/docs/terms
/contact
/docs/privacy
/docs/api
/docs/api/cards
/docs/api/images
/docs/api/bulk-data
/blog/category/api
https://twitter.com/scryfall
https://www.reddit.com/user/Scryfall/
https://github.com/scryfall
#
#
None
None
None
None
/admin/cards/new
/admin
/settings/preferences


Ok, that's a lot of pages. The "sets" page looks promising. I think that will have every card. Let's go there!

In [60]:
r = requests.get(f'{url}/sets')

sets_data = r.text

sets_soup = BeautifulSoup(sets_data)

In [61]:
# print(sets_soup.prettify())

In [62]:
sets_links = sets_soup.find_all('a')

display(f'There are {len(sets_links)} links')

for link in sets_links[:100]:
    print(link.get('href'))

'There are 3345 links'

#main
#footer
/
/advanced
/docs/syntax
/sets
/random
/account/decks
/settings
/advanced
/docs/syntax
/sets
/random
/account/decks
/account/decks
/account/decks/new
/signout
/settings/preferences
/settings/profile
/settings/security
/settings/safety
https://scryfall.com/sets/und
https://scryfall.com/sets/und
https://scryfall.com/sets/und
/sets/und
https://scryfall.com/sets/tund
https://scryfall.com/sets/tund
https://scryfall.com/sets/tund
/sets/tund
https://scryfall.com/sets/thb
https://scryfall.com/sets/thb
https://scryfall.com/sets/thb
/sets/thb
/sets/thb/es
/sets/thb/fr
/sets/thb/de
/sets/thb/it
/sets/thb/pt
/sets/thb/ja
/sets/thb/ko
/sets/thb/ru
/sets/thb/zhs
/sets/thb/zht
https://scryfall.com/sets/pthb
https://scryfall.com/sets/pthb
https://scryfall.com/sets/pthb
/sets/pthb
https://scryfall.com/sets/tthb
https://scryfall.com/sets/tthb
https://scryfall.com/sets/tthb
/sets/tthb
https://scryfall.com/sets/j20
https://scryfall.com/sets/j20
https://scryfall.com/sets/j20
/sets/j20
https:/

Hmm, a lot of repeats and a lot of different formats. Let's fix all the links to be the full link, then create a list without duplicates.

In [63]:
def fix_link(link):
    try:
        if link[:5] == 'https':
            return link
        elif link[0] == '/':
            return f'{url}{link}'
    except:
        print('Empty Link')

In [64]:
sets_list = [link.get('href') for link in sets_links]
sets_list

['#main',
 '#footer',
 '/',
 '/advanced',
 '/docs/syntax',
 '/sets',
 '/random',
 '/account/decks',
 '/settings',
 '/advanced',
 '/docs/syntax',
 '/sets',
 '/random',
 '/account/decks',
 '/account/decks',
 '/account/decks/new',
 '/signout',
 '/settings/preferences',
 '/settings/profile',
 '/settings/security',
 '/settings/safety',
 'https://scryfall.com/sets/und',
 'https://scryfall.com/sets/und',
 'https://scryfall.com/sets/und',
 '/sets/und',
 'https://scryfall.com/sets/tund',
 'https://scryfall.com/sets/tund',
 'https://scryfall.com/sets/tund',
 '/sets/tund',
 'https://scryfall.com/sets/thb',
 'https://scryfall.com/sets/thb',
 'https://scryfall.com/sets/thb',
 '/sets/thb',
 '/sets/thb/es',
 '/sets/thb/fr',
 '/sets/thb/de',
 '/sets/thb/it',
 '/sets/thb/pt',
 '/sets/thb/ja',
 '/sets/thb/ko',
 '/sets/thb/ru',
 '/sets/thb/zhs',
 '/sets/thb/zht',
 'https://scryfall.com/sets/pthb',
 'https://scryfall.com/sets/pthb',
 'https://scryfall.com/sets/pthb',
 '/sets/pthb',
 'https://scryfall.com/

In [65]:
fixed_sets_list = [fix_link(string) for string in sets_list]
fixed_sets_list = list(set(fixed_sets_list))
fixed_sets_list

Empty Link
Empty Link
Empty Link
Empty Link


['https://www.scryfall.com/sets/tdgm',
 'https://www.scryfall.com/sets/c19/fr',
 'https://www.scryfall.com/sets/h09',
 'https://scryfall.com/sets/pbfz',
 'https://www.scryfall.com/sets/c16',
 'https://www.scryfall.com/sets/war/zht',
 'https://scryfall.com/sets/tjvc',
 'https://scryfall.com/sets/isd',
 'https://www.scryfall.com/contact',
 'https://www.scryfall.com/sets/mb1',
 'https://www.scryfall.com/sets/brb',
 'https://www.scryfall.com/sets/5dn/zhs',
 'https://www.scryfall.com/sets/w16/ja',
 'https://www.scryfall.com/sets/dom/pt',
 'https://scryfall.com/sets/arb',
 'https://scryfall.com/sets/dst',
 'https://www.scryfall.com/sets/pal06',
 'https://www.scryfall.com/sets/avr',
 'https://www.scryfall.com/sets/p02',
 'https://www.scryfall.com/sets/csp/de',
 'https://www.scryfall.com/sets/arb/ja',
 'https://scryfall.com/sets/pisd',
 'https://www.scryfall.com/sets/f10',
 'https://www.scryfall.com/sets/l16',
 'https://www.scryfall.com/sets/xln/de',
 'https://scryfall.com/sets/tdd2',
 'https:

Looking good. A few more things to work on:
1. Some have www and some don't. Let's add that to all of them.
2. There will be more duplicates after we do that. (I suspect that we could just drop all that don't have www, but it doesn't hurt to be careful.
3. There are a few bad ones, like reddit and none. Let's lose those as well.

In [66]:
fixed_sets_list.remove(None)

In [67]:
fixed_sets_list

['https://www.scryfall.com/sets/tdgm',
 'https://www.scryfall.com/sets/c19/fr',
 'https://www.scryfall.com/sets/h09',
 'https://scryfall.com/sets/pbfz',
 'https://www.scryfall.com/sets/c16',
 'https://www.scryfall.com/sets/war/zht',
 'https://scryfall.com/sets/tjvc',
 'https://scryfall.com/sets/isd',
 'https://www.scryfall.com/contact',
 'https://www.scryfall.com/sets/mb1',
 'https://www.scryfall.com/sets/brb',
 'https://www.scryfall.com/sets/5dn/zhs',
 'https://www.scryfall.com/sets/w16/ja',
 'https://www.scryfall.com/sets/dom/pt',
 'https://scryfall.com/sets/arb',
 'https://scryfall.com/sets/dst',
 'https://www.scryfall.com/sets/pal06',
 'https://www.scryfall.com/sets/avr',
 'https://www.scryfall.com/sets/p02',
 'https://www.scryfall.com/sets/csp/de',
 'https://www.scryfall.com/sets/arb/ja',
 'https://scryfall.com/sets/pisd',
 'https://www.scryfall.com/sets/f10',
 'https://www.scryfall.com/sets/l16',
 'https://www.scryfall.com/sets/xln/de',
 'https://scryfall.com/sets/tdd2',
 'https:

In [68]:
# regex = re.compile(r'/scryfall')

# result = list(filter(regex.match, fixed_sets_list))

# display(result)



# fixed_sets_list = [re.sub('(/scryfall)', r'/www.scryfall', link) for link in fixed_sets_list]
# fixed_sets_list

#, Hmm, not working, want to just capture the /scryfall part

# for link in fixed_sets_list:
#     re.findall(r'www', link)
    
# re.sub()

# Join them all into one string

csv = ','.join(fixed_sets_list)

csv = re.sub(r'/scryfall', r'/www.scryfall', csv)

fixed_sets_list = csv.split(',')

In [69]:
fixed_sets_list = list(set(fixed_sets_list))
fixed_sets_list

['https://www.scryfall.com/sets/tdgm',
 'https://www.scryfall.com/sets/c19/fr',
 'https://www.scryfall.com/sets/h09',
 'https://www.scryfall.com/sets/c16',
 'https://www.scryfall.com/sets/war/zht',
 'https://www.scryfall.com/contact',
 'https://www.scryfall.com/sets/mb1',
 'https://www.scryfall.com/sets/brb',
 'https://www.scryfall.com/sets/5dn/zhs',
 'https://www.scryfall.com/sets/w16/ja',
 'https://www.scryfall.com/sets/dom/pt',
 'https://www.scryfall.com/sets/pal06',
 'https://www.scryfall.com/sets/avr',
 'https://www.scryfall.com/sets/p02',
 'https://www.scryfall.com/sets/csp/de',
 'https://www.scryfall.com/sets/arb/ja',
 'https://www.scryfall.com/sets/f10',
 'https://www.scryfall.com/sets/l16',
 'https://www.scryfall.com/sets/xln/de',
 'https://www.scryfall.com/sets/tala',
 'https://www.scryfall.com/sets/leg/it',
 'https://www.scryfall.com/sets/shm/fr',
 'https://www.scryfall.com/sets/ddg/it',
 'https://www.scryfall.com/sets/m13/zhs',
 'https://www.scryfall.com/sets/f08',
 'https:

Ok, almost there. There's still a few unwanted sites in there. Let's get rid of every site which doesn't have a "www.scryfall.com/sets" in it

In [72]:
csv = ','.join(fixed_sets_list)

csv = re.findall(r'[^,].*www.scryfall.com/sets.*[^,]', csv)

csv

['https://www.scryfall.com/sets/tdgm,https://www.scryfall.com/sets/c19/fr,https://www.scryfall.com/sets/h09,https://www.scryfall.com/sets/c16,https://www.scryfall.com/sets/war/zht,https://www.scryfall.com/contact,https://www.scryfall.com/sets/mb1,https://www.scryfall.com/sets/brb,https://www.scryfall.com/sets/5dn/zhs,https://www.scryfall.com/sets/w16/ja,https://www.scryfall.com/sets/dom/pt,https://www.scryfall.com/sets/pal06,https://www.scryfall.com/sets/avr,https://www.scryfall.com/sets/p02,https://www.scryfall.com/sets/csp/de,https://www.scryfall.com/sets/arb/ja,https://www.scryfall.com/sets/f10,https://www.scryfall.com/sets/l16,https://www.scryfall.com/sets/xln/de,https://www.scryfall.com/sets/tala,https://www.scryfall.com/sets/leg/it,https://www.scryfall.com/sets/shm/fr,https://www.scryfall.com/sets/ddg/it,https://www.scryfall.com/sets/m13/zhs,https://www.scryfall.com/sets/f08,https://www.scryfall.com/sets/phop,https://www.scryfall.com/sets/ons/es,https://www.scryfall.com/sets/tcma

Ok, let's start a dataframe to keep track of all this info

In [None]:
magic = pd.DataFrame()
# magic['links'] = 

# A Better Way

Oh man, the sets page has some good stuff! Like number of cards and release date! I need to get that!!!

It looks like there's a better way to do all the stuff I just did. I need to look for the appropriate tag on that page. Still, guess that was good practice.