In [1]:
import numpy as np
import itertools
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.util import ngrams
from nltk.stem.snowball import SnowballStemmer
import scipy
from scipy import spatial
from scipy.sparse import hstack
import gensim
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Importação do Dataset

In [2]:
# Load the previously processed dataset from its Feather file and preview the first rows
# (head() shows 5 rows by default).
starter_dataset = pd.read_feather(path='data/processed_data')
starter_dataset.head()

Unnamed: 0,cmc,oracle_text,power,toughness,colors,produced_mana,loyalty,devotion,phyrexian_mana,flavor_size,...,Tribal,World,Creature,Instant,Land,Artifact,Planeswalker,Sorcery,Enchantment,Equipment
0,6.0,All Sliver creatures have double strike. Abili...,3,3,1,0,0,1,0,228,...,0,0,0,1,0,0,0,0,0,0
1,2.0,"When CARDNAME enters the battlefield, you may ...",2,2,1,0,0,2,0,71,...,0,0,0,1,0,0,0,0,0,0
2,3.0,"Flying\nWhen CARDNAME enters the battlefield, ...",1,2,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1.0,"When CARDNAME dies, put a +1/+1 counter on tar...",2,1,1,0,0,1,0,71,...,0,0,0,1,0,0,0,0,0,0
4,3.0,"Whenever you draw your second card each turn, ...",3,1,1,0,0,1,0,99,...,0,0,0,1,0,0,0,0,0,0


# 1. Conversão do Texto para Minúsculas

In [3]:
def _lowercase_words(text):
    """Lower-case every whitespace-separated word and re-join the words with single spaces."""
    return " ".join(word.lower() for word in text.split())


# Work on a copy so the loaded frame stays untouched; only 'oracle_text' is rewritten.
df_lower_case = starter_dataset.copy()
df_lower_case['oracle_text'] = df_lower_case['oracle_text'].apply(_lowercase_words)
df_lower_case

Unnamed: 0,cmc,oracle_text,power,toughness,colors,produced_mana,loyalty,devotion,phyrexian_mana,flavor_size,...,Tribal,World,Creature,Instant,Land,Artifact,Planeswalker,Sorcery,Enchantment,Equipment
0,6.0,all sliver creatures have double strike. abili...,3,3,1,0,0,1,0,228,...,0,0,0,1,0,0,0,0,0,0
1,2.0,"when cardname enters the battlefield, you may ...",2,2,1,0,0,2,0,71,...,0,0,0,1,0,0,0,0,0,0
2,3.0,"flying when cardname enters the battlefield, i...",1,2,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1.0,"when cardname dies, put a +1/+1 counter on tar...",2,1,1,0,0,1,0,71,...,0,0,0,1,0,0,0,0,0,0
4,3.0,"whenever you draw your second card each turn, ...",3,1,1,0,0,1,0,99,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24650,2.0,enchantment spells you cast cost {1} less to c...,2,2,1,0,0,1,0,100,...,0,0,0,1,0,0,0,0,0,0
24651,0.0,"{t}: add {c}. {1}, {t}: add one mana of any co...",0,0,0,1,0,0,0,86,...,0,0,0,0,0,0,0,1,0,0
24652,4.0,({u/p} can be paid with either {u} or 2 life.)...,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
24653,5.0,"morbid — when cardname enters the battlefield,...",4,4,1,0,0,2,0,104,...,0,0,0,1,0,0,0,0,0,0


# 2. Remoção de Pontuação

In [4]:
import re

# Character class listing the punctuation marks to strip from the text.
# The slash, the plus/minus signs and the curly braces are not part of the class, so they are kept.
regex_mask = r"[~`!@#\$\%\^&*()_\=\[\]\\|:;\"',.?’“”]"
punctuation_re = re.compile(regex_mask)

# Copy the lower-cased frame and delete every listed punctuation character from 'oracle_text'.
df_without_ponctuation = df_lower_case.copy()
df_without_ponctuation['oracle_text'] = df_without_ponctuation['oracle_text'].apply(
    lambda text: punctuation_re.sub("", text)
)
df_without_ponctuation['oracle_text']

0        all sliver creatures have double strike abilit...
1        when cardname enters the battlefield you may a...
2        flying when cardname enters the battlefield it...
3        when cardname dies put a +1/+1 counter on targ...
4        whenever you draw your second card each turn c...
                               ...                        
24650    enchantment spells you cast cost {1} less to c...
24651        {t} add {c} {1} {t} add one mana of any color
24652    {u/p} can be paid with either {u} or 2 life dr...
24653    morbid — when cardname enters the battlefield ...
24654                                               flying
Name: oracle_text, Length: 24655, dtype: object

# 3. Remoção das Stopwords

In [5]:
import nltk
from nltk.corpus import stopwords

# Download only the stop-word corpus. A bare nltk.download() opens the interactive
# downloader, which blocks an unattended "Restart Kernel -> Run All".
nltk.download('stopwords')

sp = stopwords.words('english')
# Set membership is O(1) per word; the list is kept under its original name as well.
stopword_set = set(sp)

# Copy the punctuation-free frame and drop every whitespace-separated word that is an
# English stop word; the remaining words are re-joined with single spaces.
df_without_stopwords = df_without_ponctuation.copy()
df_without_stopwords['oracle_text'] = df_without_stopwords['oracle_text'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stopword_set)
)
df_without_stopwords['oracle_text']

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


0        sliver creatures double strike abilities slive...
1        cardname enters battlefield may attach target ...
2        flying cardname enters battlefield explores re...
3        cardname dies put +1/+1 counter target knight ...
4        whenever draw second card turn cardname gains ...
                               ...                        
24650    enchantment spells cast cost {1} less cast whe...
24651               {t} add {c} {1} {t} add one mana color
24652    {u/p} paid either {u} 2 life draw two cards pr...
24653    morbid — cardname enters battlefield creature ...
24654                                               flying
Name: oracle_text, Length: 24655, dtype: object

# 4. Substituição de menções a poder e resistência

In [6]:
import re

def replaceModifierStats(text: str) -> str:
    """Classify the first power/toughness modifier found in ``text``.

    A modifier is a pair of signed integers separated by a slash, such as
    ``+1/+1``, ``-2/-2`` or ``+3/-3``. Only the signs decide the category.

    Args:
        text: Text to inspect. ``None`` or an empty string is accepted.

    Returns:
        ``'buff_modifier'`` when both signs are ``+``,
        ``'debuff_modifier'`` when both signs are ``-``,
        ``'mixed_modifier'`` when the two signs differ,
        and ``""`` when ``text`` is empty or contains no modifier.
    """
    if not text:
        return ""

    # Raw string: avoids the invalid "\+" / "\/" escapes of a plain literal.
    # "[0-9]+" also accepts multi-digit values such as +10/+10.
    match = re.search(r"([+-])[0-9]+/([+-])[0-9]+", text)
    if match is None:
        return ""

    power_sign, toughness_sign = match.groups()

    if power_sign != toughness_sign:
        return 'mixed_modifier'

    # Decide from the matched modifier itself, not from the whole text, so a later
    # "-1/-1" elsewhere in the text cannot relabel a leading "+1/+1".
    if power_sign == '-':
        return 'debuff_modifier'

    return 'buff_modifier'

In [7]:
# Pattern for a power/toughness modifier such as +1/+1, -2/-2 or +10/-10 (raw string,
# so no invalid escape sequences; "[0-9]+" accepts multi-digit values).
modifier_pattern = re.compile(r"[+-][0-9]+/[+-][0-9]+")

# Replace each modifier with the category token of that specific match. Passing a
# callable to sub() classifies every occurrence on its own instead of stamping one
# label, computed from the whole row, over all occurrences.
# If the classifier returns an empty string the matched text is kept unchanged, so a
# modifier is never silently deleted.
df_tokenized_modifiers = df_without_stopwords.copy()
df_tokenized_modifiers['oracle_text'] = df_tokenized_modifiers['oracle_text'].apply(
    lambda text: modifier_pattern.sub(
        lambda match: replaceModifierStats(match.group()) or match.group(),
        text,
    )
)
df_tokenized_modifiers['oracle_text'][3]

'cardname dies put buff_modifier counter target knight control knight control dealt lethal damage time venerable knight die time knight cant receive counter venerable knights ability time save'

# 5. Tokenização do Texto

In [8]:
from textblob import TextBlob, Word

# Build a new frame with an extra 'oracle_text_tokenized' column holding the word
# tokens that TextBlob extracts from each text; the source frame is not modified.
df_tokenized = df_tokenized_modifiers.assign(
    oracle_text_tokenized=lambda frame: frame['oracle_text'].apply(
        lambda text: TextBlob(text).words
    )
)
df_tokenized['oracle_text_tokenized'].head()

0    [sliver, creatures, double, strike, abilities,...
1    [cardname, enters, battlefield, may, attach, t...
2    [flying, cardname, enters, battlefield, explor...
3    [cardname, dies, put, buff_modifier, counter, ...
4    [whenever, draw, second, card, turn, cardname,...
Name: oracle_text_tokenized, dtype: object

# 6. Stem com NLTK

In [9]:
from nltk import PorterStemmer

st = PorterStemmer()


def _stem_tokens(tokens):
    """Return a list with the Porter stem of every token, in the original order."""
    return list(map(st.stem, tokens))


# Copy the tokenized frame and add the stemmed token lists as 'oracle_text_stem'.
df_stem = df_tokenized.copy()
df_stem['oracle_text_stem'] = df_stem['oracle_text_tokenized'].apply(_stem_tokens)
df_stem['oracle_text_stem']

0        [sliver, creatur, doubl, strike, abil, sliver,...
1        [cardnam, enter, battlefield, may, attach, tar...
2        [fli, cardnam, enter, battlefield, explor, rev...
3        [cardnam, die, put, buff_modifi, counter, targ...
4        [whenev, draw, second, card, turn, cardnam, ga...
                               ...                        
24650    [enchant, spell, cast, cost, 1, less, cast, wh...
24651             [t, add, c, 1, t, add, one, mana, color]
24652    [u/p, paid, either, u, 2, life, draw, two, car...
24653    [morbid, —, cardnam, enter, battlefield, creat...
24654                                                [fli]
Name: oracle_text_stem, Length: 24655, dtype: object

# 7. Salvando o Conjunto de Dados

In [10]:
import pyarrow.feather as feather
import pyarrow as pa
import pyarrow.parquet as pq

In [11]:
# Cast every column of the stemmed frame to str (token lists become their string form).
# NOTE(review): presumably done so the token-list columns can be written to Feather — confirm.
str_dataset = df_stem.astype(dtype=str)

In [12]:
# Persist the all-string frame as a Feather file for later use.
str_dataset.to_feather(path='data/processed_data_token_and_stem')