In [1]:
import codecs
import collections
import gzip
import itertools
import json
import re
from operator import itemgetter

import nltk.stem, nltk.corpus
import wget
from gensim import corpora, models, similarities

# Generate documents

In [3]:
try:
    fname = 'AllCards.json.gz'
    cards = json.load(gzip.open(fname, 'rt'))
except FileNotFoundError:
    url = 'https://mtgjson.com/json/AllCards.json.gz'
    fname = wget.download(url)
    cards = json.load(gzip.open(fname, 'rt'))

In [4]:
cards['Mossdog']

{'cmc': 1,
 'colorIdentity': ['G'],
 'colors': ['Green'],
 'imageName': 'mossdog',
 'layout': 'normal',
 'manaCost': '{G}',
 'name': 'Mossdog',
 'power': '1',
 'subtypes': ['Plant', 'Hound'],
 'text': 'Whenever Mossdog becomes the target of a spell or ability an opponent controls, put a +1/+1 counter on Mossdog.',
 'toughness': '1',
 'type': 'Creature — Plant Hound',
 'types': ['Creature']}

In [5]:
card_names = list(cards.keys())
with open('card_names.txt', 'w') as f:
    f.writelines(t + '\n' for t in card_names)

In [6]:
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/claydavi/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
stemmer = nltk.stem.snowball.SnowballStemmer('english')

In [8]:
def tokenize(card):
    text = ' '.join([card.get('text', '')]
                   # + card.get('types', [])
                   + card.get('subtypes', [])
                    )
    text = text.lower()
    ## Replace card name with ~
    text = text.replace(card['name'].lower(), '~')
    ## remove reminder text (in parentheses)
    text = re.sub(r'\([^)]+\)', '', text)
    ## remove costs
    text = re.sub(r'\{[^}]+\}', '', text)
    ## genericize all p/t (de)buffs
    text = re.sub(r'([+-])[\dX*]/([+-])[\dX*]', r'\1X/\2X', text)
    ## genericize numbers
    text = re.sub(r'\d+', 'N', text)
    ## split on punctuation and spaces
    tokens = re.split(r'[\s.,;:—()]+', text)
    # use only unique tokens?
    # tokens = set(tokens)
    # stem tokens
    tokens = (stemmer.stem(t) for t in tokens if t and t not in stopwords)

    ## The following allows us to singularize certain terms.
    ## For example, the word 'equip' is way over-represented on equipment
    counter = collections.Counter(tokens)
    if counter['equip']:
        counter['equip'] = 1
    
    tokens = itertools.chain.from_iterable([token] * count for token, count in counter.items())
        
    return list(tokens)

documents = [tokenize(c) for c in cards.values()]
# next(documents)

In [9]:
dictionary = corpora.Dictionary(documents)
dictionary.save('dictionary.dict')

In [10]:
print(len(dictionary))

1661


In [11]:
corpus = [dictionary.doc2bow(doc) for doc in documents]
corpora.MmCorpus.serialize('card_text_corpus.mm', corpus)

# Model - LSI

In [12]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [13]:
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=100)
corpus_lsi = lsi[corpus_tfidf]

In [14]:
lsi.print_topics()[:5]

[(0,
  '0.555*"creatur" + 0.438*"~" + 0.250*"target" + 0.231*"card" + 0.198*"control" + 0.174*"turn" + 0.156*"battlefield" + 0.152*"player" + 0.136*"damag" + 0.130*"n"'),
 (1,
  '0.559*"card" + -0.517*"creatur" + 0.240*"librari" + 0.232*"~" + 0.174*"put" + -0.156*"enchant" + 0.152*"player" + 0.142*"battlefield" + 0.139*"hand" + -0.129*"turn"'),
 (2,
  '0.632*"~" + -0.458*"card" + -0.349*"creatur" + -0.200*"librari" + 0.195*"counter" + 0.148*"enter" + -0.136*"enchant" + 0.120*"battlefield" + -0.098*"reveal" + 0.096*"damag"'),
 (3,
  '-0.426*"player" + -0.421*"damag" + -0.348*"n" + -0.330*"deal" + -0.278*"target" + 0.236*"+x/+x" + 0.225*"battlefield" + 0.186*"get" + 0.159*"end" + 0.149*"enter"'),
 (4,
  '-0.575*"turn" + -0.442*"end" + 0.277*"creatur" + 0.269*"enchant" + -0.243*"target" + -0.238*"get" + 0.198*"battlefield" + -0.175*"gain" + -0.149*"+x/+x" + 0.132*"enter"')]

In [15]:
card = cards['Windfall']
card['text']

'Each player discards his or her hand, then draws cards equal to the greatest number of cards a player discarded this way.'

In [16]:
vec_bow = dictionary.doc2bow(tokenize(card))
vec_lsi = lsi[vec_bow]
vec_lsi[:5]

[(0, 1.0178801543798091),
 (1, 1.815634316389064),
 (2, -1.2813018304561745),
 (3, -0.79626354443940306),
 (4, -0.32558171210953024)]

# Create index

In [17]:
index = similarities.MatrixSimilarity(corpus_lsi)
index.save('all_cards_lsi.index')

In [18]:
def similarity(card):
    vec_bow = dictionary.doc2bow(tokenize(card))
    vec_lsi = lsi[tfidf[vec_bow]]
    scores = index[vec_lsi]
    return sorted(enumerate(scores),
            key=itemgetter(1), reverse=True)

In [19]:
similarity(card)[:10]

[(2397, 1.0),
 (13742, 0.99997938),
 (688, 0.94405633),
 (13934, 0.85117394),
 (9855, 0.85089725),
 (6873, 0.83937097),
 (4267, 0.83885503),
 (5414, 0.81919342),
 (15507, 0.80748463),
 (8377, 0.79543173)]

In [38]:
def get_similar_cards(card_name, N=10):
    card = cards[card_name]
    similarity_scores = similarity(card)
    similar_cards = []
    for name_idx, score in similarity_scores:
        this_card_name = card_names[name_idx]
        if this_card_name != card_name:
            this_card = cards[this_card_name]
            similar_cards.append((score,
                    ' '.join([this_card['name'], this_card.get('manaCost', '')]),
                    this_card['text'],
                    ))
        if len(similar_cards) >= N:
            break
    return similar_cards
        

# Query similar cards

In [39]:
get_similar_cards('Cadaverous Bloom')

[(0.87844652,
  'Inner Fire {3}{R}',
  'Add {R} to your mana pool for each card in your hand.'),
 (0.84163749,
  'Reliquary Tower ',
  'You have no maximum hand size.\n{T}: Add {C} to your mana pool.'),
 (0.84163749,
  'Thought Vessel {2}',
  'You have no maximum hand size.\n{T}: Add {C} to your mana pool.'),
 (0.83419704,
  'Simian Spirit Guide {2}{R}',
  'Exile Simian Spirit Guide from your hand: Add {R} to your mana pool.'),
 (0.80424207,
  'Chrome Mox {0}',
  "Imprint — When Chrome Mox enters the battlefield, you may exile a nonartifact, nonland card from your hand.\n{T}: Add one mana of any of the exiled card's colors to your mana pool."),
 (0.76806962,
  'Serum Powder {3}',
  '{T}: Add {C} to your mana pool.\nAny time you could mulligan and Serum Powder is in your hand, you may exile all the cards from your hand, then draw that many cards. (You can do this in addition to taking mulligans.)'),
 (0.76363921,
  'Magnifying Glass {3}',
  '{T}: Add {C} to your mana pool.\n{4}, {T}: In

In [38]:
cards['Bonehoard']['text']

'Living weapon (When this Equipment enters the battlefield, create a 0/0 black Germ creature token, then attach this to it.)\nEquipped creature gets +X/+X, where X is the number of creature cards in all graveyards.\nEquip {2}'

In [39]:
tokenize(cards['Bonehoard'])

['graveyard',
 'live',
 'get',
 'number',
 'weapon',
 'card',
 'creatur',
 'creatur',
 'equip',
 '+x/+x',
 'x']

# Model - LDA

In [187]:
lda = models.LdaMulticore(corpus, id2word=dictionary, num_topics=100)
corpus_lda = lda[corpus]

In [188]:
index = similarities.MatrixSimilarity(corpus_lda)
index.save('all_cards_lda.index')

In [189]:
def similarity_lda(card):
    vec_bow = dictionary.doc2bow(tokenize(card))
    vec_lda = lda[vec_bow]
    scores = index[vec_lda]
    return sorted(enumerate(scores),
            key=itemgetter(1), reverse=True)

In [25]:
def get_similar_cards_lda(card_name, N=10):
    card = cards[card_name]
    similarity_scores = similarity_lda(card)
    similar_cards = []
    for name_idx, score in similarity_scores:
        this_card_name = card_names[name_idx]
        if this_card_name != card_name:
            this_card = cards[this_card_name]
            similar_cards.append((score, this_card['name'], this_card.get('text', '')))
        if len(similar_cards) >= N:
            break
    return similar_cards

In [26]:
get_similar_cards_lda('Fatal Push')

NameError: name 'similarity_lda' is not defined

In [145]:
tokenize(cards['Fatal Push'])

['destroy',
 'target',
 'creatur',
 'convert',
 'mana',
 'cost',
 'n',
 'less',
 'revolt',
 'destroy',
 'creatur',
 'convert',
 'mana',
 'cost',
 'n',
 'less',
 'instead',
 'perman',
 'control',
 'left',
 'battlefield',
 'turn']

In [146]:
cards['Fatal Push']

{'cmc': 1,
 'colorIdentity': ['B'],
 'colors': ['Black'],
 'imageName': 'fatal push',
 'layout': 'normal',
 'manaCost': '{B}',
 'name': 'Fatal Push',
 'text': 'Destroy target creature if it has converted mana cost 2 or less.\nRevolt — Destroy that creature if it has converted mana cost 4 or less instead if a permanent you controlled left the battlefield this turn.',
 'type': 'Instant',
 'types': ['Instant']}

# Model - HDP

In [None]:
hdp = models.HdpModel(corpus, id2word=dictionary)
corpus_hdp = hdp[corpus]

In [None]:
index = similarities.MatrixSimilarity(corpus_hdp)
index.save('all_cards_hdp.index')

In [None]:
def similarity_hdp(card):
    vec_bow = dictionary.doc2bow(tokenize(card))
    vec_hdp = hdp[vec_bow]
    scores = index[vec_hdp]
    return sorted(enumerate(scores),
            key=itemgetter(1), reverse=True)

In [None]:
def get_similar_cards_hdp(card_name, N=10):
    card = cards[card_name]
    similarity_scores = similarity_hdp(card)
    similar_cards = []
    for name_idx, score in similarity_scores:
        this_card_name = card_names[name_idx]
        if this_card_name != card_name:
            this_card = cards[this_card_name]
            similar_cards.append((score, this_card['name'], this_card.get('text', '')))
        if len(similar_cards) >= N:
            break
    return similar_cards

In [None]:
get_similar_cards_hdp('Windfall')

In [None]:
hdp.print_topics()