In [476]:
import json
import os
import time
import urllib.parse
import urllib.request

import bs4 as bs
import mtgsdk
import numpy as np
import pandas as pd
from bs4 import NavigableString
from selenium import webdriver

## Get all expansion names from website
----

In [390]:
# Download the Cardmarket "Singles" landing page and parse it with lxml.
# The context manager closes the HTTP response as soon as the body is read.
with urllib.request.urlopen('https://www.cardmarket.com/en/Magic/Products/Singles') as response:
    source = response.read()
expansion_parent_soup = bs.BeautifulSoup(source, 'lxml')

In [415]:
def get_all_expansion_names(my_soup, first_value="0", last_value="1669"):
    """Collect the expansion names listed in the page's <option> elements.

    Collection starts *after* the option that carries a ``selected``
    attribute and whose ``value`` equals ``first_value`` (that option itself
    is not included), and stops right after the option whose ``value`` equals
    ``last_value`` (that option is included).

    Parameters
    ----------
    my_soup : object exposing ``find_all('option')`` (e.g. a BeautifulSoup)
        Parsed page to read the options from.
    first_value : str, default "0"
        ``value`` of the selected placeholder option that precedes the list.
    last_value : str or None, default "1669"
        ``value`` of the last option to keep. ``None`` (or a value that never
        appears) collects every option until the end of the document.

    Returns
    -------
    pandas.Series
        Option texts in document order; empty if the start option is missing.
    """
    expansions = []
    collecting = False
    for option in my_soup.find_all('option'):
        # .get() instead of ['value']: an <option> without a value attribute
        # must not abort the whole scan with a KeyError.
        value = option.get('value')
        if collecting:
            expansions.append(option.text)
            if last_value is not None and value == last_value:
                break
        elif 'selected' in option.attrs and value == first_value:
            collecting = True
    # Explicit object dtype keeps the `.str` accessor usable on an empty result.
    return pd.Series(expansions, dtype=object)

# Extract every expansion name from the landing page, then preview the first ten.
expansions = get_all_expansion_names(expansion_parent_soup)
expansions.head(10)

0        2005 Player Cards
1        2006 Player Cards
2        2007 Player Cards
3      Aaron Miller Tokens
4            Aether Revolt
5    Aether Revolt: Promos
6             Alara Reborn
7                Alliances
8                    Alpha
9          ALRadeck Tokens
dtype: object

In [416]:
(len(expansions),)  # Almost 400 expansions

(397,)

## Only get Commander specific expansions, and format the name to be URL ready
----

In [432]:
def format_url_extension(x):
    """Turn a raw expansion name into the form used in the URL path.

    Spaces become hyphens; colons and apostrophes are dropped.
    """
    # One translation table does all three single-character substitutions.
    substitutions = str.maketrans({' ': '-', ':': None, "'": None})
    return x.translate(substitutions)

# Keep only the expansions whose name contains "Commander".
is_commander = expansions.str.find('Commander') != -1
commander_expansions = expansions[is_commander]

formated_commander_names = commander_expansions.apply(format_url_extension)

# Side-by-side view of the raw names and their URL-ready counterparts.
compare_formats = pd.DataFrame({
    'expansion_raw': commander_expansions,
    'expansion_formated': formated_commander_names,
})

compare_formats

Unnamed: 0,expansion_raw,expansion_formated
49,Commander,Commander
50,Commander 2013,Commander-2013
51,Commander 2014,Commander-2014
52,Commander 2015,Commander-2015
53,Commander 2016,Commander-2016
54,Commander 2017,Commander-2017
55,Commander 2018,Commander-2018
56,Commander 2019,Commander-2019
57,Commander Anthology,Commander-Anthology
58,Commander Anthology II,Commander-Anthology-II


In [496]:
def create_url_from_expansion(expansion, page=False):
    """Build the Cardmarket listing URL for an expansion.

    Parameters
    ----------
    expansion : str
        URL-ready expansion name, appended to the "Singles" base URL.
    page : int or False, default False
        When truthy, the URL carries ``site=<page>``; otherwise the
        ``perSite=20`` listing URL is returned.

    Returns
    -------
    str
        The listing URL.
    """
    listing = f"https://www.cardmarket.com/en/Magic/Products/Singles/{expansion}"
    # NOTE(review): any falsy page (including 0) falls through to the
    # unpaged URL - confirm that is intended for callers iterating from 0.
    if not page:
        return f"{listing}?idRarity=0&perSite=20"
    return f"{listing}?idRarity=0&site={page}"

In [497]:
# Sanity check: build the URL for the entry labelled 60 in the filtered Series.
create_url_from_expansion(formated_commander_names.loc[60])

'https://www.cardmarket.com/en/Magic/Products/Singles/Commanders-Arsenal?idRarity=0&perSite=20'

## Getting the number of pages of an expansion
----

In [501]:
expansion = formated_commander_names[52]
# Close the HTTP response as soon as the listing page has been read.
with urllib.request.urlopen(create_url_from_expansion(expansion)) as response:
    source = response.read()
child_soup = bs.BeautifulSoup(source, 'lxml')

# The first token of the first matching <div> is parsed as an integer
# (presumably the number of cards in the expansion - confirm against the page);
# dividing by 20 matches the `perSite=20` listing size used in the URL.
hits_div = child_soup.find_all('div', {"class": "col-auto d-none d-md-block"})[0]
total_hits = int(hits_div.text.split(' ')[0])
n_pages = int(np.ceil(total_hits / 20))
n_pages

19

In [502]:
# One listing URL per page index, from 0 to n_pages - 1.
urls = [
    create_url_from_expansion(expansion, page=page_number)
    for page_number in range(n_pages)
]

## Getting all links from all pages from an expansion

In [539]:
all_links = []
for url in urls:
    # Close each HTTP response as soon as its body has been read.
    with urllib.request.urlopen(url) as response:
        source = response.read()
    soup = bs.BeautifulSoup(source, 'lxml')
    # Keep only anchors pointing inside this expansion. urljoin avoids the
    # doubled slash that plain concatenation gives for root-relative hrefs.
    links_of_page = [
        urllib.parse.urljoin("https://www.cardmarket.com/", anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if expansion + '/' in anchor['href']
    ]
    all_links.extend(links_of_page)

# A link can appear more than once; drop repeats while preserving order so
# that no card page is downloaded twice.
all_links = list(dict.fromkeys(all_links))

## Getting the dates and prices charted on each card page

In [544]:
# Try the extraction on the first two card pages only.
card_frames = []

for link in all_links[:2]:
    with urllib.request.urlopen(link) as response:
        source = response.read()
    soup = bs.BeautifulSoup(source, 'lxml')

    # The chart configuration is embedded in the first chart-init <script>.
    # Cut the text down to the part before 'backgroundColor' and re-close the
    # brackets so the remainder parses as JSON.
    first_chart_obj = soup.find_all('script', {'class': 'chart-init-script'})[0]
    readable_json_string = first_chart_obj.text.split(',', 1)[1].split('backgroundColor')[0][:-2] + "}]}}"

    # Parse once and reuse for both columns.
    chart_data = json.loads(readable_json_string)['data']

    temp = pd.DataFrame({
        'dates': chart_data['labels'],
        'prices': chart_data['datasets'][0]['data'],
    })
    # The last path segment of the link identifies the card.
    temp['card'] = link.split('/')[-1]
    card_frames.append(temp)

# Concatenate once instead of growing the frame inside the loop.
df = pd.concat(card_frames) if card_frames else pd.DataFrame()

In [557]:
def get_data_for_expansion(expansion):
    """Scrape the charted price history of every card in one expansion.

    Bundles the exploratory steps of this notebook into a single call: read
    the number of listing pages, collect the card links found on each page,
    then extract the chart data embedded in every card page.

    Parameters
    ----------
    expansion : str
        URL-ready expansion name (it is inserted verbatim into the URL and
        used to recognise card links, so it must be well formatted).

    Returns
    -------
    pandas.DataFrame
        One row per charted point with columns ``dates``, ``prices`` and
        ``card`` (last path segment of the card link). Empty when no card
        link is found. The original author's note states the chart covers a
        30-day range.

    Raises
    ------
    IndexError
        If the listing page lacks the expected hit-count <div>, or a card
        page lacks a ``chart-init-script`` <script>.
    """

    def _fetch_soup(url):
        # Download and parse one page; the response is closed once read.
        with urllib.request.urlopen(url) as response:
            return bs.BeautifulSoup(response.read(), 'lxml')

    child_soup = _fetch_soup(create_url_from_expansion(expansion))

    # The first token of the first matching <div> is parsed as an integer
    # (presumably the number of cards - confirm); 20 matches `perSite=20`.
    hits_div = child_soup.find_all('div', {"class": "col-auto d-none d-md-block"})[0]
    n_pages = int(np.ceil(int(hits_div.text.split(' ')[0]) / 20))

    # NOTE(review): page 0 is falsy, so create_url_from_expansion returns the
    # unpaged URL for it - confirm whether `site` is 0- or 1-based, otherwise
    # the first page may be read twice and the last one missed.
    all_links = []
    for page in range(n_pages):
        soup = _fetch_soup(create_url_from_expansion(expansion, page=page))
        for anchor in soup.find_all('a', href=True):
            if expansion + '/' in anchor['href']:
                # urljoin avoids the doubled slash that plain concatenation
                # gives for root-relative hrefs.
                all_links.append(
                    urllib.parse.urljoin("https://www.cardmarket.com/", anchor['href'])
                )

    # A link can appear more than once; drop repeats while preserving order
    # so that no card page is downloaded (and recorded) twice.
    all_links = list(dict.fromkeys(all_links))

    card_frames = []
    for link in all_links:
        soup = _fetch_soup(link)

        # Cut the chart-init script down to the part before 'backgroundColor'
        # and re-close the brackets so the remainder parses as JSON.
        first_chart_obj = soup.find_all('script', {'class': 'chart-init-script'})[0]
        readable_json_string = first_chart_obj.text.split(',', 1)[1].split('backgroundColor')[0][:-2] + "}]}}"

        # Parse once and reuse for both columns.
        chart_data = json.loads(readable_json_string)['data']

        temp = pd.DataFrame({
            'dates': chart_data['labels'],
            'prices': chart_data['datasets'][0]['data'],
        })
        temp['card'] = link.split('/')[-1]
        card_frames.append(temp)

    # Concatenate once; pd.concat rejects an empty list, hence the guard.
    if not card_frames:
        return pd.DataFrame()
    return pd.concat(card_frames)

In [565]:
# Display the URL-ready Commander expansion names (the index keeps the labels of the full expansion list).
formated_commander_names

49                 Commander
50            Commander-2013
51            Commander-2014
52            Commander-2015
53            Commander-2016
54            Commander-2017
55            Commander-2018
56            Commander-2019
57       Commander-Anthology
58    Commander-Anthology-II
59          Commander-Ikoria
60        Commanders-Arsenal
dtype: object

In [571]:
t1 = time.time()

# Scrape every expansion except the first two entries, tagging each frame
# with the expansion it came from.
expansion_frames = []

for expansion in formated_commander_names[2:]:
    temp = get_data_for_expansion(expansion)
    temp['expansion'] = expansion  # scalar is broadcast to every row
    expansion_frames.append(temp)

# Concatenate once instead of re-copying the growing frame on each iteration.
df = pd.concat(expansion_frames) if expansion_frames else pd.DataFrame()

t2 = time.time()

In [588]:
# The two entries that the scraping loop leaves out (it iterates over formated_commander_names[2:]).
formated_commander_names[:2]

49         Commander
50    Commander-2013
dtype: object

In [583]:
# The entries that the scraping loop iterates over.
formated_commander_names[2:]

51            Commander-2014
52            Commander-2015
53            Commander-2016
54            Commander-2017
55            Commander-2018
56            Commander-2019
57       Commander-Anthology
58    Commander-Anthology-II
59          Commander-Ikoria
60        Commanders-Arsenal
dtype: object

In [572]:
# Elapsed scraping time divided by the number of distinct cards:
# about 11 seconds to get the data of one card.
(t2 - t1) / len(pd.unique(df['card']))

11.01252188262504

In [581]:
# Create the output folder if needed so the export cannot fail on a missing
# directory after the long scrape has finished.
os.makedirs('./Data', exist_ok=True)
df.to_csv('./Data/CardPricePull_05-20.csv', index=False)