In [1]:
import codecs
import collections
import itertools
import json
import re
import zipfile
from operator import itemgetter

import nltk.stem, nltk.corpus
import wget
from gensim import corpora, models, similarities

# Generate documents

In [2]:
try:
    fname = 'AllCards.json.zip'
    zf = zipfile.ZipFile(fname)
except FileNotFoundError:
    url = 'https://mtgjson.com/json/AllCards.json.zip'
    fname = wget.download(url)
    zf = zipfile.ZipFile(fname) 

In [3]:
reader = codecs.getreader("utf-8")
cards = json.load(reader(zf.open('AllCards.json')))

In [4]:
cards['Mossdog']

{'cmc': 1,
 'colorIdentity': ['G'],
 'colors': ['Green'],
 'imageName': 'mossdog',
 'layout': 'normal',
 'manaCost': '{G}',
 'name': 'Mossdog',
 'power': '1',
 'subtypes': ['Plant', 'Hound'],
 'text': 'Whenever Mossdog becomes the target of a spell or ability an opponent controls, put a +1/+1 counter on Mossdog.',
 'toughness': '1',
 'type': 'Creature — Plant Hound',
 'types': ['Creature']}

In [5]:
card_names = list(cards.keys())
with open('card_names.txt', 'w') as f:
    f.writelines(t + '\n' for t in card_names)

In [6]:
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/claydavi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
stemmer = nltk.stem.snowball.SnowballStemmer('english')

In [24]:
def tokenize(card):
    text = ' '.join([card.get('text', '')]
                   # + card.get('types', [])
                   + card.get('subtypes', [])
                    )
    text = text.lower()
    ## Replace card name with ~
    text = text.replace(card['name'].lower(), '~')
    ## remove reminder text (in parentheses)
    text = re.sub(r'\([^)]+\)', '', text)
    ## remove costs
    text = re.sub(r'\{[^}]+\}', '', text)
    ## genericize all p/t (de)buffs
    text = re.sub(r'([+-])[\dX*]/([+-])[\dX*]', r'\1X/\2X', text)
    ## genericize numbers
    text = re.sub(r'\d+', 'N', text)
    ## split on punctuation and spaces
    tokens = re.split(r'[\s.,;:—()]+', text)
    # use only unique tokens?
    # tokens = set(tokens)
    # stem tokens
    tokens = (stemmer.stem(t) for t in tokens if t and t not in stopwords)

    ## The following allows us to singularize certain terms.
    ## For example, the word 'equip' is way over-represented on equipment
    counter = collections.Counter(tokens)
    if counter['equip']:
        counter['equip'] = 1
    
    tokens = itertools.chain.from_iterable([token] * count for token, count in counter.items())
        
    return list(tokens)

documents = [tokenize(c) for c in cards.values()]
# next(documents)

In [25]:
dictionary = corpora.Dictionary(documents)
dictionary.save('dictionary.dict')

In [26]:
print(len(dictionary))

1661


In [27]:
corpus = [dictionary.doc2bow(doc) for doc in documents]
corpora.MmCorpus.serialize('card_text_corpus.mm', corpus)

# Model - LSI

In [28]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [29]:
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=100)
corpus_lsi = lsi[corpus_tfidf]

In [30]:
lsi.print_topics()[:5]

[(0,
  '0.555*"creatur" + 0.438*"~" + 0.250*"target" + 0.231*"card" + 0.198*"control" + 0.174*"turn" + 0.156*"battlefield" + 0.152*"player" + 0.136*"damag" + 0.130*"n"'),
 (1,
  '0.559*"card" + -0.517*"creatur" + 0.240*"librari" + 0.232*"~" + 0.174*"put" + -0.156*"enchant" + 0.152*"player" + 0.142*"battlefield" + 0.139*"hand" + -0.129*"turn"'),
 (2,
  '-0.632*"~" + 0.458*"card" + 0.349*"creatur" + 0.200*"librari" + -0.195*"counter" + -0.148*"enter" + 0.136*"enchant" + -0.120*"battlefield" + 0.098*"reveal" + -0.096*"damag"'),
 (3,
  '0.426*"player" + 0.421*"damag" + 0.348*"n" + 0.330*"deal" + 0.278*"target" + -0.236*"+x/+x" + -0.225*"battlefield" + -0.186*"get" + -0.159*"end" + -0.149*"enter"'),
 (4,
  '-0.575*"turn" + -0.442*"end" + 0.277*"creatur" + 0.269*"enchant" + -0.243*"target" + -0.238*"get" + 0.198*"battlefield" + -0.175*"gain" + -0.149*"+x/+x" + 0.132*"enter"')]

In [31]:
card = cards['Windfall']
card['text']

'Each player discards his or her hand, then draws cards equal to the greatest number of cards a player discarded this way.'

In [32]:
vec_bow = dictionary.doc2bow(tokenize(card))
vec_lsi = lsi[vec_bow]
vec_lsi[:5]

[(0, 1.0178801609098955),
 (1, 1.8156341013312149),
 (2, 1.2813015562618124),
 (3, 0.79626321583329962),
 (4, -0.32558114210264161)]

# Create index

In [33]:
index = similarities.MatrixSimilarity(corpus_lsi)
index.save('all_cards_lsi.index')

In [34]:
def similarity(card):
    vec_bow = dictionary.doc2bow(tokenize(card))
    vec_lsi = lsi[tfidf[vec_bow]]
    scores = index[vec_lsi]
    return sorted(enumerate(scores),
            key=itemgetter(1), reverse=True)

In [35]:
similarity(card)[:10]

[(15407, 1.0),
 (7638, 0.9999792),
 (3952, 0.9443627),
 (6033, 0.85077488),
 (13712, 0.8505941),
 (5159, 0.83898509),
 (11175, 0.83895528),
 (9489, 0.81935519),
 (13713, 0.80658031),
 (5024, 0.79528826)]

In [36]:
def get_similar_cards(card_name, N=10):
    card = cards[card_name]
    similarity_scores = similarity(card)
    similar_cards = []
    for name_idx, score in similarity_scores:
        this_card_name = card_names[name_idx]
        if this_card_name != card_name:
            this_card = cards[this_card_name]
            similar_cards.append((score, this_card['name'], this_card['text']))
        if len(similar_cards) >= N:
            break
    return similar_cards
        

In [37]:
get_similar_cards('Bonehoard')

[(0.88850492,
  'Nighthowler',
  "Bestow {2}{B}{B} (If you cast this card for its bestow cost, it's an Aura spell with enchant creature. It becomes a creature again if it's not attached to a creature.)\nNighthowler and enchanted creature each get +X/+X, where X is the number of creature cards in all graveyards."),
 (0.8348543,
  "Ghoul's Feast",
  'Target creature gets +X/+0 until end of turn, where X is the number of creature cards in your graveyard.'),
 (0.77709156,
  'Meishin, the Mind Cage',
  'All creatures get -X/-0, where X is the number of cards in your hand.'),
 (0.7465235,
  'Multani',
  'Creatures you control get +X/+0, where X is the number of cards in your hand.'),
 (0.74499315,
  "Marshal's Anthem",
  "Multikicker {1}{W} (You may pay an additional {1}{W} any number of times as you cast this spell.)\nCreatures you control get +1/+1.\nWhen Marshal's Anthem enters the battlefield, return up to X target creature cards from your graveyard to the battlefield, where X is the num

In [38]:
cards['Bonehoard']['text']

'Living weapon (When this Equipment enters the battlefield, create a 0/0 black Germ creature token, then attach this to it.)\nEquipped creature gets +X/+X, where X is the number of creature cards in all graveyards.\nEquip {2}'

In [39]:
tokenize(cards['Bonehoard'])

['graveyard',
 'live',
 'get',
 'number',
 'weapon',
 'card',
 'creatur',
 'creatur',
 'equip',
 '+x/+x',
 'x']

# Model - LDA

In [187]:
lda = models.LdaMulticore(corpus, id2word=dictionary, num_topics=100)
corpus_lda = lda[corpus]

In [188]:
index = similarities.MatrixSimilarity(corpus_lda)
index.save('all_cards_lda.index')

In [189]:
def similarity_lda(card):
    vec_bow = dictionary.doc2bow(tokenize(card))
    vec_lda = lda[vec_bow]
    scores = index[vec_lda]
    return sorted(enumerate(scores),
            key=itemgetter(1), reverse=True)

In [190]:
def get_similar_cards_lda(card_name, N=10):
    card = cards[card_name]
    similarity_scores = similarity_lda(card)
    similar_cards = []
    for name_idx, score in similarity_scores:
        this_card_name = card_names[name_idx]
        if this_card_name != card_name:
            this_card = cards[this_card_name]
            similar_cards.append((score, this_card['name'], this_card.get('text', '')))
        if len(similar_cards) >= N:
            break
    return similar_cards

In [191]:
get_similar_cards_lda('Fatal Push')

[(0.97876126,
  "Ephara's Warden",
  '{T}: Tap target creature with power 3 or less.'),
 (0.97876126,
  'Renegade Wheelsmith',
  "Whenever Renegade Wheelsmith becomes tapped, target creature can't block this turn."),
 (0.97876126,
  'Tel-Jilad Justice',
  'Destroy target artifact. Scry 2. (Look at the top two cards of your library, then put any number of them on the bottom of your library and the rest on top in any order.)'),
 (0.97876126,
  'Pontiff of Blight',
  'Extort (Whenever you cast a spell, you may pay {W/B}. If you do, each opponent loses 1 life and you gain that much life.)\nOther creatures you control have extort. (If a creature has multiple instances of extort, each triggers separately.)'),
 (0.97876126,
  'Darksteel Pendant',
  'Indestructible (Effects that say "destroy" don\'t destroy this artifact.)\n{1}, {T}: Scry 1. (Look at the top card of your library. You may put that card on the bottom of your library.)'),
 (0.97876126,
  'Smite the Monstrous',
  'Destroy target c

In [145]:
tokenize(cards['Fatal Push'])

['destroy',
 'target',
 'creatur',
 'convert',
 'mana',
 'cost',
 'n',
 'less',
 'revolt',
 'destroy',
 'creatur',
 'convert',
 'mana',
 'cost',
 'n',
 'less',
 'instead',
 'perman',
 'control',
 'left',
 'battlefield',
 'turn']

In [146]:
cards['Fatal Push']

{'cmc': 1,
 'colorIdentity': ['B'],
 'colors': ['Black'],
 'imageName': 'fatal push',
 'layout': 'normal',
 'manaCost': '{B}',
 'name': 'Fatal Push',
 'text': 'Destroy target creature if it has converted mana cost 2 or less.\nRevolt — Destroy that creature if it has converted mana cost 4 or less instead if a permanent you controlled left the battlefield this turn.',
 'type': 'Instant',
 'types': ['Instant']}

# Model - HDP

In [None]:
hdp = models.HdpModel(corpus, id2word=dictionary)
corpus_hdp = hdp[corpus]

In [None]:
index = similarities.MatrixSimilarity(corpus_hdp)
index.save('all_cards_hdp.index')

In [None]:
def similarity_hdp(card):
    vec_bow = dictionary.doc2bow(tokenize(card))
    vec_hdp = hdp[vec_bow]
    scores = index[vec_hdp]
    return sorted(enumerate(scores),
            key=itemgetter(1), reverse=True)

In [None]:
def get_similar_cards_hdp(card_name, N=10):
    card = cards[card_name]
    similarity_scores = similarity_hdp(card)
    similar_cards = []
    for name_idx, score in similarity_scores:
        this_card_name = card_names[name_idx]
        if this_card_name != card_name:
            this_card = cards[this_card_name]
            similar_cards.append((score, this_card['name'], this_card.get('text', '')))
        if len(similar_cards) >= N:
            break
    return similar_cards

In [None]:
get_similar_cards_hdp('Windfall')

In [None]:
hdp.print_topics()