In [1]:
import codecs
import json
import re
import requests
import zipfile
from operator import itemgetter

import nltk.stem, nltk.corpus
from gensim import corpora, models, similarities

import wget

# Generate documents

In [2]:
try:
    fname = 'AllCards.json.zip'
    zf = zipfile.ZipFile(fname)
except FileNotFoundError:
    url = 'https://mtgjson.com/json/AllCards.json.zip'
    fname = wget.download(url)
    zf = zipfile.ZipFile(fname) 

In [3]:
zf = zipfile.ZipFile(fname)

In [4]:
reader = codecs.getreader("utf-8")
cards = json.load(reader(zf.open('AllCards.json')))

In [5]:
cards['Mossdog']

{'cmc': 1,
 'colorIdentity': ['G'],
 'colors': ['Green'],
 'imageName': 'mossdog',
 'layout': 'normal',
 'manaCost': '{G}',
 'name': 'Mossdog',
 'power': '1',
 'subtypes': ['Plant', 'Hound'],
 'text': 'Whenever Mossdog becomes the target of a spell or ability an opponent controls, put a +1/+1 counter on Mossdog.',
 'toughness': '1',
 'type': 'Creature — Plant Hound',
 'types': ['Creature']}

In [134]:
card_names = list(cards.keys())
with open('card_names.txt', 'w') as f:
    f.writelines(t + '\n' for t in card_names)

In [6]:
nltk.download('stopwords')
stops = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/claydavi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
## This is a choice: which stemmer to use

# stemmer = nltk.stem.porter.PorterStemmer()
stemmer = nltk.stem.snowball.SnowballStemmer('english')

In [171]:
def tokenize(card):
    text = ' '.join([card.get('text', '')]
                   # + card.get('types', [])
                   + card.get('subtypes', [])
                    )
    text = text.lower()
    # Replace card name with ~
    text = text.replace(card['name'].lower(), '~')
    # remove parentheticals
    text = re.sub(r'\([^)]+\)', '', text)
    # remove costs
    text = re.sub(r'\{[^}]+\}', '', text)
    # genericize all p/t (de)buffs
    text = re.sub(r'([+-])[\dX*]/([+-])[\dX*]', r'\1X/\2X', text)
    # split on punctuation and spaces
    text = re.sub(r'\d+', 'N', text)
    tokens = re.split(r'[\s.,;:—]+', text)
    # use only unique tokens?
    # tokens = set(tokens)
    # stem tokens
    tokens = [stemmer.stem(token) for token in tokens]
    return [t for t in tokens if t and t not in stops]

documents = [tokenize(c) for c in cards.values()]
# next(documents)

In [172]:
dictionary = corpora.Dictionary(documents)
dictionary.save('dictionary.dict')

In [173]:
print(len(dictionary))

1662


In [174]:
corpus = [dictionary.doc2bow(doc) for doc in documents]
corpora.MmCorpus.serialize('card_text_corpus.mm', corpus)

# Model - LSI

In [175]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [176]:
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=100)
corpus_lsi = lsi[corpus_tfidf]

In [177]:
lsi.print_topics()[:5]

[(0,
  '0.554*"creatur" + 0.437*"~" + 0.249*"target" + 0.231*"card" + 0.198*"control" + 0.174*"turn" + 0.155*"battlefield" + 0.152*"player" + 0.136*"damag" + 0.130*"n"'),
 (1,
  '0.557*"card" + -0.517*"creatur" + 0.240*"librari" + 0.231*"~" + 0.174*"put" + -0.156*"enchant" + 0.151*"player" + 0.142*"battlefield" + 0.139*"hand" + -0.128*"turn"'),
 (2,
  '-0.632*"~" + 0.458*"card" + 0.348*"creatur" + 0.200*"librari" + -0.194*"counter" + -0.147*"enter" + 0.135*"enchant" + -0.119*"battlefield" + 0.098*"reveal" + -0.096*"damag"'),
 (3,
  '-0.426*"player" + -0.420*"damag" + -0.348*"n" + -0.330*"deal" + -0.279*"target" + 0.236*"+x/+x" + 0.225*"battlefield" + 0.186*"get" + 0.158*"end" + 0.149*"enter"'),
 (4,
  '-0.575*"turn" + -0.442*"end" + 0.276*"creatur" + 0.267*"enchant" + -0.243*"target" + -0.238*"get" + 0.197*"battlefield" + -0.175*"gain" + -0.148*"+x/+x" + 0.132*"enter"')]

In [178]:
card = cards['Windfall']
card['text']

'Each player discards his or her hand, then draws cards equal to the greatest number of cards a player discarded this way.'

In [179]:
vec_bow = dictionary.doc2bow(tokenize(card))
vec_lsi = lsi[vec_bow]
vec_lsi

[(0, 1.0173168667074808),
 (1, 1.8099304437434234),
 (2, 1.2833624902882144),
 (3, -0.79700062794686799),
 (4, -0.32419765992324617),
 (5, -0.41207544695007997),
 (6, 0.34370367606671653),
 (7, -0.314118756321635),
 (8, -0.76352433853457269),
 (9, 0.65106258132516825),
 (10, 0.52963137408671068),
 (11, -0.34745641882672196),
 (12, -1.337062292803092),
 (13, 0.20827782781931109),
 (14, 0.67674741597809585),
 (15, 0.40002497439306944),
 (16, -0.21366608981334761),
 (17, 0.16849037148106757),
 (18, -0.49114123793541781),
 (19, 0.062648504581883688),
 (20, 0.11622326370676878),
 (21, -0.26751281187346465),
 (22, 0.32351832618598814),
 (23, -0.56393667057954755),
 (24, -0.21144423100758816),
 (25, 0.41754060322172631),
 (26, 0.44712626623858898),
 (27, -0.15720710971598176),
 (28, 0.13830899434698576),
 (29, -0.0079188790232541887),
 (30, 0.14125488325319499),
 (31, -0.46605974933366234),
 (32, 0.0051460513691150098),
 (33, 0.2961037372141811),
 (34, -0.062412640277062885),
 (35, -0.3192228

# Create index

In [180]:
index = similarities.MatrixSimilarity(corpus_lsi)
index.save('all_cards_lsi.index')

In [181]:
def similarity(card):
    vec_bow = dictionary.doc2bow(tokenize(card))
    vec_lsi = lsi[tfidf[vec_bow]]
    scores = index[vec_lsi]
    return sorted(enumerate(scores),
            key=itemgetter(1), reverse=True)

In [182]:
similarity(card)

[(2719, 0.99999994),
 (3068, 0.99997729),
 (12934, 0.94199342),
 (13711, 0.85900944),
 (2861, 0.84266651),
 (8784, 0.83459491),
 (12105, 0.82252502),
 (7255, 0.81517601),
 (15682, 0.79437119),
 (9950, 0.79231244),
 (14208, 0.79169929),
 (7049, 0.78010231),
 (3000, 0.76560974),
 (10962, 0.75456107),
 (6188, 0.75322604),
 (2679, 0.75294548),
 (8241, 0.75201714),
 (7004, 0.74551946),
 (4644, 0.74383867),
 (8349, 0.73900098),
 (15409, 0.73574346),
 (6870, 0.73558068),
 (5323, 0.73527348),
 (12170, 0.72897118),
 (12607, 0.72817612),
 (6042, 0.72202152),
 (1446, 0.71857899),
 (13596, 0.71787655),
 (13236, 0.70887917),
 (165, 0.70709872),
 (1794, 0.70700109),
 (9264, 0.70419872),
 (10183, 0.70042592),
 (11069, 0.69921058),
 (3962, 0.69601625),
 (1557, 0.69472349),
 (6652, 0.69436395),
 (7521, 0.69374961),
 (16501, 0.69374961),
 (4095, 0.69301534),
 (11330, 0.69151336),
 (6827, 0.6877147),
 (11965, 0.68174988),
 (11486, 0.68123066),
 (7848, 0.67883658),
 (4498, 0.67827737),
 (10313, 0.67808104

In [183]:
def get_similar_cards(card_name, N=10):
    card = cards[card_name]
    similarity_scores = similarity(card)
    similar_cards = []
    for name_idx, score in similarity_scores:
        this_card_name = card_names[name_idx]
        if this_card_name != card_name:
            this_card = cards[this_card_name]
            similar_cards.append((score, this_card['name'], this_card['text']))
        if len(similar_cards) >= N:
            break
    return similar_cards
        

In [185]:
get_similar_cards('Windfall')

[(0.99997729,
  'Whispering Madness',
  'Each player discards his or her hand, then draws cards equal to the greatest number of cards a player discarded this way.\nCipher (Then you may exile this spell card encoded on a creature you control. Whenever that creature deals combat damage to a player, its controller may cast a copy of the encoded card without paying its mana cost.)'),
 (0.94199342,
  "Jace's Archivist",
  '{U}, {T}: Each player discards his or her hand, then draws cards equal to the greatest number of cards a player discarded this way.'),
 (0.85900944,
  "Laquatus's Creativity",
  'Target player draws cards equal to the number of cards in his or her hand, then discards that many cards.'),
 (0.84266651,
  'Syphon Mind',
  'Each other player discards a card. You draw a card for each card discarded this way.'),
 (0.83459491,
  'Ancient Excavation',
  'Draw cards equal to the number of cards in your hand, then discard a card for each card drawn this way.\nBasic landcycling {2} 

In [186]:
tokenize(cards['Ancient Excavation'])

['draw',
 'card',
 'equal',
 'number',
 'card',
 'hand',
 'discard',
 'card',
 'card',
 'drawn',
 'way',
 'basic',
 'landcycl']

# Model - LDA

In [187]:
lda = models.LdaMulticore(corpus, id2word=dictionary, num_topics=100)
corpus_lda = lda[corpus]

In [188]:
index = similarities.MatrixSimilarity(corpus_lda)
index.save('all_cards_lda.index')

In [189]:
def similarity_lda(card):
    vec_bow = dictionary.doc2bow(tokenize(card))
    vec_lda = lda[vec_bow]
    scores = index[vec_lda]
    return sorted(enumerate(scores),
            key=itemgetter(1), reverse=True)

In [190]:
def get_similar_cards_lda(card_name, N=10):
    card = cards[card_name]
    similarity_scores = similarity_lda(card)
    similar_cards = []
    for name_idx, score in similarity_scores:
        this_card_name = card_names[name_idx]
        if this_card_name != card_name:
            this_card = cards[this_card_name]
            similar_cards.append((score, this_card['name'], this_card.get('text', '')))
        if len(similar_cards) >= N:
            break
    return similar_cards

In [191]:
get_similar_cards_lda('Fatal Push')

[(0.97876126,
  "Ephara's Warden",
  '{T}: Tap target creature with power 3 or less.'),
 (0.97876126,
  'Renegade Wheelsmith',
  "Whenever Renegade Wheelsmith becomes tapped, target creature can't block this turn."),
 (0.97876126,
  'Tel-Jilad Justice',
  'Destroy target artifact. Scry 2. (Look at the top two cards of your library, then put any number of them on the bottom of your library and the rest on top in any order.)'),
 (0.97876126,
  'Pontiff of Blight',
  'Extort (Whenever you cast a spell, you may pay {W/B}. If you do, each opponent loses 1 life and you gain that much life.)\nOther creatures you control have extort. (If a creature has multiple instances of extort, each triggers separately.)'),
 (0.97876126,
  'Darksteel Pendant',
  'Indestructible (Effects that say "destroy" don\'t destroy this artifact.)\n{1}, {T}: Scry 1. (Look at the top card of your library. You may put that card on the bottom of your library.)'),
 (0.97876126,
  'Smite the Monstrous',
  'Destroy target c

In [145]:
tokenize(cards['Fatal Push'])

['destroy',
 'target',
 'creatur',
 'convert',
 'mana',
 'cost',
 'n',
 'less',
 'revolt',
 'destroy',
 'creatur',
 'convert',
 'mana',
 'cost',
 'n',
 'less',
 'instead',
 'perman',
 'control',
 'left',
 'battlefield',
 'turn']

In [146]:
cards['Fatal Push']

{'cmc': 1,
 'colorIdentity': ['B'],
 'colors': ['Black'],
 'imageName': 'fatal push',
 'layout': 'normal',
 'manaCost': '{B}',
 'name': 'Fatal Push',
 'text': 'Destroy target creature if it has converted mana cost 2 or less.\nRevolt — Destroy that creature if it has converted mana cost 4 or less instead if a permanent you controlled left the battlefield this turn.',
 'type': 'Instant',
 'types': ['Instant']}

# Model - HDP

In [None]:
hdp = models.HdpModel(corpus, id2word=dictionary)
corpus_hdp = hdp[corpus]

In [None]:
index = similarities.MatrixSimilarity(corpus_hdp)
index.save('all_cards_hdp.index')

In [None]:
def similarity_hdp(card):
    vec_bow = dictionary.doc2bow(tokenize(card))
    vec_hdp = hdp[vec_bow]
    scores = index[vec_hdp]
    return sorted(enumerate(scores),
            key=itemgetter(1), reverse=True)

In [None]:
def get_similar_cards_hdp(card_name, N=10):
    card = cards[card_name]
    similarity_scores = similarity_hdp(card)
    similar_cards = []
    for name_idx, score in similarity_scores:
        this_card_name = card_names[name_idx]
        if this_card_name != card_name:
            this_card = cards[this_card_name]
            similar_cards.append((score, this_card['name'], this_card.get('text', '')))
        if len(similar_cards) >= N:
            break
    return similar_cards

In [None]:
get_similar_cards_hdp('Windfall')

In [None]:
hdp.print_topics()