In [1]:
import pandas as pd 
import numpy as np

import requests 
from bs4 import BeautifulSoup

In [2]:
allSpells = pd.read_html('view-source_dnd5e.wikidot.com_spells_wizard.html',index_col = 'Spell Name')

In [3]:
for i in range(len(allSpells)): 
    allSpells[i]['Spell Level'] = int(i)
    allSpells[i]['Spell Level'] = allSpells[i]['Spell Level'].astype(int)

allSpells = pd.concat(allSpells)
allSpells.head()

Unnamed: 0_level_0,School,Casting Time,Range,Duration,Components,Spell Level
Spell Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Acid Splash,Conjuration,1 Action,60 Feet,Instantaneous,"V, S",0
Blade Ward,Abjuration,1 Action,Self,1 round,"V, S",0
Booming Blade,Evocation,1 Action,Self (5-foot radius),1 round,"S, M",0
Chill Touch,Necromancy,1 Action,120 Feet,1 round,"V, S",0
Control Flames,Transmutation,1 Action,60 Feet,Instantaneous or 1 hour,S,0


In [4]:
def getDescription(url : str): 
    #print(url)
    txt = requests.get(url).text
    soup = BeautifulSoup(txt, 'html.parser',multi_valued_attributes=None)

    try: 
        description = soup.find("div", {"id": "page-content"})
        description = [x.text for x in description('p') if not ('Spell Lists.' in x.text)]
        #parsed = '\n'.join([x.text for x in description('p') if x.text != ""])

        return ' '.join(description[3:])
    except Exception as e: 
        print(e)
        return np.NaN

def getSpellText(row : pd.Series): 
    name = str(row.name).lower()

    for c in [' (ua)','\'']: #characters to delete 
        name = name.replace(c,'')

    for c in [' ','/']:  #characters to replace with hyphen 
        name = name.replace(c,'-')
        
    url = f'http://dnd5e.wikidot.com/spell:{name}'
    
    return getDescription(url)

In [5]:
#debugging block 
print(getSpellText(allSpells.loc['System Backdoor (UA)']))


This spell allows you to bypass system security in order to create a secure login on a foreign system. The login you create allows you administrator-level privileges in any computer system not enhanced through technomagic. The login defeats any technomagic spells of 3rd level or lower. Once the duration of the spell expires, the login and all privileges are wiped from the system. System logs still show the activity of the user, but the user identification cannot be found or traced. At Higher Levels. When you cast this spell using a spell slot of 5th level or higher, you are able to bypass technomagic spells if the spell’s level is equal to or less than the level of the spell slot you used.


In [6]:
allSpells['Spell Description'] = allSpells.apply(getSpellText,axis=1)

In [7]:
#initialize to False
allSpells['Concentration'] = False
allSpells['Ritual'] = False 

filter = allSpells['Duration'].str.contains('Concentration') #find row index where spell takes concentration 
allSpells.loc[filter,'Concentration'] = True                 #at row index, concentration column, set value to True

filter = allSpells['Casting Time'].str[-1] == 'R'            #find row indices where spell can be cast as ritual 
allSpells.loc[filter,'Ritual'] = True                        #give index,col to set value to True 

In [None]:
#inspecting a certain spell's spell description 
allSpells.loc['Pulse Wave']['Spell Description']

In [None]:
allSpells.to_csv('Wizard Spells.csv')

In [None]:
allSpells.shape

<h2> Trying to do some NMF on the descriptions </h2>

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.decomposition

from nltk import PorterStemmer
from nltk.tokenize.regexp import regexp_tokenize
from nltk.corpus import stopwords

import re
from typing import List,Set


In [13]:
stops = set(stopwords.words('english'))
stops.update({'level','feet','within'})

In [14]:
def pprocess(desc : str,_stops : Set[str] = stops) -> str : 
    ret = desc.replace('\'','')       #delete apostrophes so when we tolkenize we don't split contractions
    ret = re.sub('[\d]+[trs ][hdt]','',ret) #stuff in the format of 1st or 4rd or 45th 
    ret = regexp_tokenize(ret,r'(?u)\b\w\w+\b') #default regexp from sklearn tolkenizer 
    ret = [word for word in ret if not word.isnumeric()]
    ret = [PorterStemmer().stem(x) for x in ret]
    ret = [word for word in ret if word not in _stops]
    
    return ' '.join(ret)

In [15]:
pprocess(allSpells.loc['Acid Splash']['Spell Description'])

'hurl bubbl acid choos one creatur see rang choos two creatur see rang target must succeed dexter save throw take 1d6 acid damag higher thi spell damag increas 1d6 reach 2d6 3d6 4d6'

In [16]:
allSpells['stemmed'] = allSpells['Spell Description'].apply(pprocess)

In [17]:
vectorizer = TfidfVectorizer(max_df = .80
                             ,ngram_range=(1,1)
#                             ,max_features=5
                             ) 
X = vectorizer.fit_transform(allSpells['stemmed'])


In [18]:
print(X.shape)
print(vectorizer.get_feature_names_out())

(402, 2296)
['10d10' '10d12' '10d4' ... 'younger' 'yugoloth' 'zombi']


In [27]:
model = sklearn.decomposition.NMF(n_components=3)
W = model.fit_transform(X)
H = model.components_



In [28]:
W

array([[0.16052237, 0.        , 0.06951127],
       [0.04921774, 0.0048186 , 0.02040316],
       [0.08925637, 0.        , 0.1541074 ],
       ...,
       [0.01958648, 0.13614127, 0.13907832],
       [0.09020379, 0.01690822, 0.12687025],
       [0.01009162, 0.10131564, 0.02192703]])

In [29]:
H

array([[0.00000000e+00, 1.83350362e-04, 1.66457414e-02, ...,
        0.00000000e+00, 0.00000000e+00, 1.32818002e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.91033527e-03, 1.77191464e-02, 9.70896576e-03],
       [9.30197212e-03, 5.70313518e-03, 0.00000000e+00, ...,
        1.63441038e-05, 0.00000000e+00, 1.81831439e-02]])

In [30]:
import seaborn as sns 
import matplotlib.pyplot as plt 

In [31]:
h_df = pd.DataFrame(data = H
                   ,columns = vectorizer.get_feature_names_out())

In [32]:
h_df

Unnamed: 0,10d10,10d12,10d4,10d6,10d8,12d12,12d6,14d6,1d10,1d12,...,wrote,wrought,ye,year,yellow,yet,yield,younger,yugoloth,zombi
0,0.0,0.000183,0.016646,0.012599,0.022232,0.00172,0.014761,5e-06,0.044026,0.041089,...,0.0,0.0,0.0,0.0,0.016545,0.005482,0.0,0.0,0.0,0.013282
1,0.0,0.0,0.0,0.001726,0.0,0.001352,0.0,0.0,0.0,0.0,...,0.002227,0.006593,0.001021,0.021123,6.1e-05,0.0,0.001016,0.00191,0.017719,0.009709
2,0.009302,0.005703,0.0,0.0,0.006615,0.0,0.008959,0.008604,0.012722,0.024139,...,0.0,0.003142,0.001247,0.008266,0.004642,0.0,0.00012,1.6e-05,0.0,0.018183


In [33]:
h_df.loc[0].nlargest()

damag      0.929306
save       0.594439
higher     0.501674
slot       0.501306
creatur    0.470182
Name: 0, dtype: float64

In [34]:
nKeywords = h_df.apply(lambda x: x.nlargest()
           ,axis=1)
nKeywords

Unnamed: 0,block,creatur,damag,end,higher,object,save,slot,stat,target,throw,use
0,,0.470182,0.929306,,0.501674,,0.594439,0.501306,,,,
1,0.306122,0.441772,,,,0.301404,,,0.330372,,,0.3668
2,,0.39542,,0.382909,,,0.398026,,,1.187369,0.326555,


In [None]:
nKeywords.loc[0].dropna()

In [None]:
W[0].argmax()

In [None]:
W[7].argmax()

In [None]:
allSpells.iloc[0]

In [None]:
allSpells.iloc[0,-1]