In [1]:
import codecs
import collections
import gzip
import itertools
import json
import re
from operator import itemgetter

import nltk.stem, nltk.corpus
from gensim import corpora, models, similarities

# Generate documents

In [2]:
# curl -O https://mtgjson.com/json/AllCards.json.gz
fname = 'AllCards.json.gz'
cards = json.load(gzip.open(fname, 'rt'))

In [3]:
cards['Mossdog']

{'colorIdentity': ['G'],
 'colors': ['G'],
 'convertedManaCost': 1.0,
 'foreignData': [{'language': 'German',
   'name': 'Mooshund',
   'text': 'Immer wenn der Mooshund das Ziel eines Zauberspruchs oder einer Fähigkeit ist, die ein Gegner kontrolliert, lege eine +1/+1-Marke auf den Mooshund.',
   'type': 'Kreatur — Hund'},
  {'language': 'Spanish', 'name': 'Perro de musgo'},
  {'language': 'French', 'name': 'Chien de mousse'},
  {'language': 'Italian', 'name': 'Cane di Muschio'},
  {'language': 'Japanese', 'name': '苔犬'},
  {'language': 'Portuguese (Brazil)', 'name': 'Cão de Musgo'}],
 'layout': 'normal',
 'legalities': {'commander': 'Legal',
  'duel': 'Legal',
  'legacy': 'Legal',
  'pauper': 'Legal',
  'penny': 'Legal',
  'vintage': 'Legal'},
 'manaCost': '{G}',
 'mtgstocksId': 8192,
 'name': 'Mossdog',
 'power': '1',
 'printings': ['NEM'],
 'purchaseUrls': {'cardmarket': 'https://mtgjson.com/links/cc00eafc98e25a19',
  'mtgstocks': 'https://mtgjson.com/links/2caa0df46f99b875',
  'tcgp

In [4]:
card_names = list(cards.keys())
with open('card_names.txt', 'w') as f:
    f.writelines(t + '\n' for t in card_names)

In [5]:
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/clayton_davis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
stemmer = nltk.stem.snowball.SnowballStemmer('english')

In [7]:
def tokenize(card):
    text = ' '.join([card.get('text', '')]
                   # + card.get('types', [])
                   + card.get('subtypes', [])
                    )
    text = text.lower()
    ## Replace card name with ~
    text = text.replace(card['name'].lower(), '~')
    ## remove reminder text (in parentheses)
    text = re.sub(r'\([^)]+\)', '', text)
    ## remove costs
    text = re.sub(r'\{[^}]+\}', '', text)
    ## genericize all p/t (de)buffs
    text = re.sub(r'([+-])[\dX*]/([+-])[\dX*]', r'\1X/\2X', text)
    ## genericize numbers
    text = re.sub(r'\d+', 'N', text)
    ## split on punctuation and spaces
    tokens = re.split(r'[\s.,;:—()]+', text)
    # use only unique tokens?
    # tokens = set(tokens)
    # stem tokens
    tokens = (stemmer.stem(t) for t in tokens if t and t not in stopwords)

    ## The following allows us to singularize certain terms.
    ## For example, the word 'equip' is way over-represented on equipment
    counter = collections.Counter(tokens)
    if counter['equip']:
        counter['equip'] = 1
    
    tokens = itertools.chain.from_iterable([token] * count for token, count in counter.items())
        
    return list(tokens)

documents = [tokenize(c) for c in cards.values()]
# next(documents)

In [8]:
dictionary = corpora.Dictionary(documents)
dictionary.save('dictionary.dict')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [9]:
print(len(dictionary))

1917


In [10]:
corpus = [dictionary.doc2bow(doc) for doc in documents]
corpora.MmCorpus.serialize('card_text_corpus.mm', corpus)

# Model - LSI

In [11]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [12]:
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=100)
corpus_lsi = lsi[corpus_tfidf]

In [13]:
lsi.print_topics()[:5]

[(0,
  '0.537*"creatur" + 0.415*"~" + 0.257*"target" + 0.253*"card" + 0.219*"control" + 0.179*"turn" + 0.170*"battlefield" + 0.132*"end" + 0.131*"n" + 0.131*"+x/+x"'),
 (1,
  '-0.576*"card" + 0.506*"creatur" + -0.255*"librari" + -0.183*"~" + -0.177*"put" + -0.153*"player" + -0.143*"hand" + 0.140*"turn" + 0.139*"enchant" + -0.132*"battlefield"'),
 (2,
  '-0.649*"~" + 0.411*"card" + 0.381*"creatur" + 0.185*"librari" + -0.178*"counter" + -0.158*"enter" + -0.142*"damag" + -0.133*"battlefield" + 0.131*"enchant" + -0.127*"n"'),
 (3,
  '-0.407*"target" + -0.376*"damag" + -0.349*"n" + -0.307*"player" + -0.293*"deal" + 0.284*"battlefield" + 0.197*"enter" + 0.188*"+x/+x" + 0.171*"~" + 0.156*"counter"'),
 (4,
  '-0.577*"turn" + -0.473*"end" + -0.282*"get" + 0.265*"creatur" + 0.222*"enchant" + -0.215*"+x/+x" + -0.161*"gain" + 0.160*"damag" + 0.153*"deal" + 0.131*"n"')]

In [14]:
card = cards['Windfall']
card['text']

'Each player discards their hand, then draws cards equal to the greatest number of cards a player discarded this way.'

In [15]:
vec_bow = dictionary.doc2bow(tokenize(card))
vec_lsi = lsi[vec_bow]
vec_lsi[:5]

[(0, 1.026754946131473),
 (1, -1.8650618411934503),
 (2, 1.0913062891403233),
 (3, -0.6766340059751109),
 (4, -0.06561613951360196)]

# Create index

In [16]:
index = similarities.MatrixSimilarity(corpus_lsi)
index.save('all_cards_lsi.index')

In [17]:
def similarity(card):
    vec_bow = dictionary.doc2bow(tokenize(card))
    vec_lsi = lsi[tfidf[vec_bow]]
    scores = index[vec_lsi]
    return sorted(enumerate(scores),
            key=itemgetter(1), reverse=True)

In [18]:
similarity(card)[:10]

[(19418, 1.0),
 (19306, 0.9999841),
 (8673, 0.94538784),
 (9486, 0.8555135),
 (9504, 0.8487427),
 (16923, 0.84758043),
 (10650, 0.82489854),
 (487, 0.8227881),
 (6156, 0.80599296),
 (13493, 0.8019552)]

In [19]:
def get_similar_cards(card_name, N=10):
    card = cards[card_name]
    similarity_scores = similarity(card)
    similar_cards = []
    for name_idx, score in similarity_scores:
        this_card_name = card_names[name_idx]
        if this_card_name != card_name:
            this_card = cards[this_card_name]
            similar_cards.append((score,
                    ' '.join([this_card['name'], this_card.get('manaCost', '')]),
                    this_card['text'],
                    ))
        if len(similar_cards) >= N:
            break
    return similar_cards
        

# Query similar cards

In [20]:
get_similar_cards('Cadaverous Bloom')

[(0.80681056, 'Inner Fire {3}{R}', 'Add {R} for each card in your hand.'),
 (0.769855,
  'Serum Powder {3}',
  '{T}: Add {C}.\nAny time you could mulligan and Serum Powder is in your hand, you may exile all the cards from your hand, then draw that many cards. (You can do this in addition to taking mulligans.)'),
 (0.7496449,
  'Simian Spirit Guide {2}{R}',
  'Exile Simian Spirit Guide from your hand: Add {R}.'),
 (0.74501336,
  'Reliquary Tower ',
  'You have no maximum hand size.\n{T}: Add {C}.'),
 (0.74501336,
  'Thought Vessel {2}',
  'You have no maximum hand size.\n{T}: Add {C}.'),
 (0.7206175,
  'Chrome Mox {0}',
  "Imprint — When Chrome Mox enters the battlefield, you may exile a nonartifact, nonland card from your hand.\n{T}: Add one mana of any of the exiled card's colors."),
 (0.7045518,
  'Scavenger Grounds ',
  '{T}: Add {C}.\n{2}, {T}, Sacrifice a Desert: Exile all cards from all graveyards.'),
 (0.70360094,
  'Struggle for Sanity {2}{B}{B}',
  'Target opponent reveals the

In [21]:
cards['Bonehoard']['text']

'Living weapon (When this Equipment enters the battlefield, create a 0/0 black Germ creature token, then attach this to it.)\nEquipped creature gets +X/+X, where X is the number of creature cards in all graveyards.\nEquip {2}'

In [22]:
tokenize(cards['Bonehoard'])

['live',
 'weapon',
 'equip',
 'creatur',
 'creatur',
 'get',
 '+x/+x',
 'x',
 'number',
 'card',
 'graveyard']

# Model - LDA

In [23]:
lda = models.LdaMulticore(corpus, id2word=dictionary, num_topics=100)
corpus_lda = lda[corpus]

In [24]:
index = similarities.MatrixSimilarity(corpus_lda)
index.save('all_cards_lda.index')

In [25]:
def similarity_lda(card):
    vec_bow = dictionary.doc2bow(tokenize(card))
    vec_lda = lda[vec_bow]
    scores = index[vec_lda]
    return sorted(enumerate(scores),
            key=itemgetter(1), reverse=True)

In [26]:
def get_similar_cards_lda(card_name, N=10):
    card = cards[card_name]
    similarity_scores = similarity_lda(card)
    similar_cards = []
    for name_idx, score in similarity_scores:
        this_card_name = card_names[name_idx]
        if this_card_name != card_name:
            this_card = cards[this_card_name]
            similar_cards.append((score, this_card['name'], this_card.get('text', '')))
        if len(similar_cards) >= N:
            break
    return similar_cards

In [27]:
get_similar_cards_lda('Fatal Push')

[(1.0,
  '"Rumors of My Death . . ."',
  '{3}{B}, Exile a permanent you control with a League of Dastardly Doom watermark: Return a permanent card with a League of Dastardly Doom watermark from your graveyard to the battlefield.'),
 (1.0,
  'Airdrop Aeronauts',
  'Flying\nRevolt — When Airdrop Aeronauts enters the battlefield, if a permanent you controlled left the battlefield this turn, you gain 5 life.'),
 (1.0,
  'Anafenza, Kin-Tree Spirit',
  'Whenever another nontoken creature enters the battlefield under your control, bolster 1. (Choose a creature with the least toughness among creatures you control and put a +1/+1 counter on it.)'),
 (1.0,
  'Arcbound Bruiser',
  'Modular 3 (This enters the battlefield with three +1/+1 counters on it. When it dies, you may put its +1/+1 counters on target artifact creature.)'),
 (1.0,
  'Arcbound Crusher',
  'Trample\nWhenever another artifact enters the battlefield, put a +1/+1 counter on Arcbound Crusher.\nModular 1 (This enters the battlefiel

In [28]:
tokenize(cards['Fatal Push'])

['destroy',
 'destroy',
 'target',
 'creatur',
 'creatur',
 'convert',
 'convert',
 'mana',
 'mana',
 'cost',
 'cost',
 'n',
 'n',
 'less',
 'less',
 'revolt',
 'instead',
 'perman',
 'control',
 'left',
 'battlefield',
 'turn']

In [29]:
cards['Fatal Push']

{'colorIdentity': ['B'],
 'colors': ['B'],
 'convertedManaCost': 1.0,
 'foreignData': [],
 'layout': 'normal',
 'legalities': {'commander': 'Legal',
  'duel': 'Legal',
  'frontier': 'Legal',
  'legacy': 'Legal',
  'modern': 'Legal',
  'vintage': 'Legal'},
 'manaCost': '{B}',
 'name': 'Fatal Push',
 'printings': ['AER', 'F17', 'PRM'],
 'rulings': [{'date': '2017-02-09',
   'text': 'Fatal Push can target any creature, even one with converted mana cost 5 or greater. The creature’s converted mana cost is checked only as Fatal Push resolves.'},
  {'date': '2017-02-09',
   'text': 'If the mana cost of a creature on the battlefield includes {X}, X is considered to be 0.'},
  {'date': '2017-02-09',
   'text': 'Revolt abilities check only whether a permanent you controlled left the battlefield this turn or not. They don’t apply multiple times if more than one permanent you controlled left the battlefield. They don’t check whether the permanent that left the battlefield is still in the zone it m

# Model - HDP

In [None]:
hdp = models.HdpModel(corpus, id2word=dictionary)
corpus_hdp = hdp[corpus]

In [None]:
index = similarities.MatrixSimilarity(corpus_hdp)
index.save('all_cards_hdp.index')

In [None]:
def similarity_hdp(card):
    vec_bow = dictionary.doc2bow(tokenize(card))
    vec_hdp = hdp[vec_bow]
    scores = index[vec_hdp]
    return sorted(enumerate(scores),
            key=itemgetter(1), reverse=True)

In [None]:
def get_similar_cards_hdp(card_name, N=10):
    card = cards[card_name]
    similarity_scores = similarity_hdp(card)
    similar_cards = []
    for name_idx, score in similarity_scores:
        this_card_name = card_names[name_idx]
        if this_card_name != card_name:
            this_card = cards[this_card_name]
            similar_cards.append((score, this_card['name'], this_card.get('text', '')))
        if len(similar_cards) >= N:
            break
    return similar_cards

In [None]:
get_similar_cards_hdp('Windfall')

In [None]:
hdp.print_topics()