<a href="https://colab.research.google.com/github/erheault/DnDCharacterPrediction/blob/main/GRUDnDBackstoryToCharacter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Global values to set what to run

In [None]:
# Gates the web-scraper cell: the scraping loop only runs when this is True.
run_web_scraper = False

# Obtain Data Set
Imports needed for web scraping

In [None]:
import requests as req
from bs4 import BeautifulSoup as bs

Used by the web scraper to parse questionable class names. Also helps with the D&D 3.5 to D&D 5e conversion.

In [None]:
# Tell whether a name is exactly one of the core 3.5/5e classes
def is_core_class(name):
    """Return True when ``name`` equals a core class name, otherwise False.

    The comparison is exact and case-sensitive; callers are expected to
    lower-case the name first.
    """
    core_classes = (
        "fighter", "sorcerer", "barbarian", "monk", "paladin", "rogue",
        "warlock", "wizard", "bard", "druid", "ranger", "cleric",
        "artificer",
    )
    # Tuple membership is an equality test against each entry in turn.
    return name in core_classes
# End is_core_class

"""
Attempt to parse questionable class names like
"dark knight" "ftr" "wiz/rog" "theif" "scout"
into usefull information.
"""
def parse_class_name(class_name):
    """Map a free-form class label onto a core class name.

    Resolution order:
      1. Empty, blank or "???" labels give "BAD_CLASS".
      2. A label that is itself a core class is returned as-is (lower-cased).
      3. The label is split on "/" (multiclassing); the first part that is
         exactly a core class wins.
      4. Otherwise parts carrying a level such as "paladin(9)" are tried from
         the highest level down (ties keep their original order); the first
         one whose base name is a core class wins.  If none is, the base name
         of the highest-level part is what the fallback rules look at.
      5. The ordered fallback rules (exact names, prefixes, substrings) are
         applied; the first matching rule wins.

    Parameters:
        class_name: raw class text, e.g. "Ftr", "wizard(5)/knight(2)".

    Returns:
        A core class name, "BAD_CLASS" for an unusable label, or
        "BAD_CLASS" followed by the unparsed name when no rule matched.
    """
    name = class_name.strip().lower() if class_name else ""
    if not name or name == "???":
        return "BAD_CLASS"

    # if this is a core class, return it
    if is_core_class(name):
        return name

    # Multiclassing case: the first part that is already a core class wins
    parts = [part.strip() for part in name.split('/')]
    for part in parts:
        if is_core_class(part):
            return part

    # Collect the parts that carry a level label, e.g. "paladin(9)"
    leveled = []
    for part in parts:
        base, level = _split_class_level(part)
        if level is not None:
            leveled.append((level, base))

    # Highest level first; sort() is stable, so equal levels keep input order
    leveled.sort(key=lambda pair: pair[0], reverse=True)
    for _, base in leveled:
        if is_core_class(base):
            return base

    # None of the levelled parts is a core class: keep parsing the
    # highest-level one below
    if leveled:
        name = leveled[0][1]

    for target, exact_names, prefix, fragments in _CLASS_FALLBACK_RULES:
        if name in exact_names:
            return target
        # startswith() is safe on an empty name, unlike name[0]
        if prefix is not None and name.startswith(prefix):
            return target
        if any(fragment in name for fragment in fragments):
            return target

    # Nothing matched: flag the label and keep the unparsed name for inspection
    return "BAD_CLASS" + name
# End parse_class_name


def _split_class_level(part):
    """Split "name(N)" into (name, level).

    level is the integer formed by the digits directly after the first "("
    and is None when there is no "(" or no digit follows it.
    """
    base, paren, rest = part.partition("(")
    digits = ""
    if paren:
        for ch in rest:
            if ch not in "0123456789":
                break
            digits += ch
    return base.strip(), (int(digits) if digits else None)
# End _split_class_level


# Ordered fallback rules for parse_class_name.  Each rule is
# (core class, exact names, prefix or None, substrings); the first rule that
# matches wins, so the order is significant.  "ranger" sits before "rogue"
# because the rogue rule accepts any name starting with "r".
_CLASS_FALLBACK_RULES = (
    ("bard",
     {"factotum", "lurk"},
     None,
     ("bard",)),
    ("warlock",
     {"sha-ir", "sha ir", "shair", "sha'ir", "hexblade", "soulborn",
      "wu jen", "wu-jen", "wujen", "wu'jen", "binder", "shadowcaster",
      "dragonfire adept", "dragonfireadept"},
     None,
     ("warlock",)),
    ("sorcerer",
     {"favored soul", "favored-soul", "favoredsoul", "favoredsould",
      "wilder", "swordsage", "sorceress"},
     None,
     ("sorcerer",)),
    ("cleric",
     {"healer", "divine mind", "divinemind"},
     None,
     ("priest", "cleric")),
    ("druid",
     {"spirit shaman", "spiritshaman", "spirit-shaman", "incarnate",
      "totemist", "animal"},
     None,
     ("druid",)),
    ("paladin",
     {"crusader"},
     None,
     ("paladin",)),
    ("ranger",
     set(),
     None,
     ("ranger",)),
    ("rogue",
     {"assassin", "swashbuckler", "theif", "ninja", "beguiler", "soulknife",
      "soul knife", "psychic rogue", "psychicrouge", "thief"},
     "r",
     ("rouge", "rogue", "scout")),
    ("fighter",
     {"archer", "gladiator", "knight", "marshal", "samurai", "duskblade",
      "psychic warrior", "psychicwarrior", "warblade"},
     "f",
     ("warrior", "fighter")),
    ("wizard",
     {"mage", "spellthief", "dread necromancer", "dreadnecromancer",
      "warmage", "archivist", "shugenja", "erudite", "truenamer",
      "illusionist", "witch", "conjurer", "necromancer"},
     None,
     ("wizard", "illusionist", "magic", "mage")),
    # Core classes that would otherwise end up as BAD_CLASS when written with
    # extra text such as "monk 3"
    ("barbarian", set(), None, ("barbarian",)),
    ("monk", set(), None, ("monk",)),
    ("artificer", set(), None, ("artificer",)),
)

def parse_race_name(race):
    """Return ``race`` unchanged, or "BAD_RACE" when it has zero length."""
    return race if len(race) != 0 else "BAD_RACE"
# End parse_race_name

Parse "https://3edb.com/selection.asp" for character backgrounds.

In [None]:
if (run_web_scraper):
    # Scraper settings
    BACKSTORY_MIN_LENGTH = 50   # skip backstories shorter than this many characters
    FILTER_CLASS_NAMES = True   # drop characters whose class could not be parsed
    FILTER_RACE_NAMES = False   # drop characters whose race is empty
    WRITE_TO_OUTPUT = True      # set to False to parse without writing rows
    REQUEST_TIMEOUT = 30        # seconds before a page request is abandoned

    # URL for a character page, cid is the character id, starting from 12 to 23690.
    # Note: Not all id's are used
    URL = "https://3edb.com/viewCharacter.asp?cid="

    first_cid = 12 # first cid in the database
    last_cid = 23690 # maximum cid in the database

    # Batch boundaries for scraping in several steps.  The loop below walks the
    # whole [first_cid, last_cid] range; substitute a pair of these into the
    # range() call to scrape one batch at a time.
    b1s = first_cid
    b1e = 3000
    b2s = 3001
    b2e = 6000
    b3s = 6001
    b3e = 9000
    b4s = 9001
    b4e = 12000
    b5s = 12001
    b5e = 15000
    b6s = 15001
    b6e = 18000
    b7s = 18001
    b7e = 21000
    b8s = 21001
    b8e = last_cid + 1

    headers = {'User-Agent': 'Mozilla/5.0'} # Set headers to pass bot detection

    # The with-block closes the file even if a page raises part-way through.
    # NOTE(review): a "|" or a line break inside a backstory would corrupt the
    # pipe-delimited rows -- confirm the scraped text never contains them.
    with open("CharacterData8.data", "w", encoding="utf-8") as data_file:
        data_file.write("class|race|str|dex|con|int|wis|cha|backstory\n")

        # Iterate through all cid's in range [first_cid,last_cid]
        for cid in range(first_cid, last_cid + 1):

            # Fetch the character page; a network failure skips this cid only
            Character_URL = URL + str(cid)
            try:
                character_sheet = req.get(Character_URL, headers=headers,
                                          timeout=REQUEST_TIMEOUT)
            except req.RequestException as err:
                print("cid {0}: request failed ({1})".format(cid, err))
                continue

            soup = bs(character_sheet.content, "html.parser")

            # Get only font and small class from html page
            results = soup.findAll('font', class_="small")
            if (len(results) < 11):
                continue # Page does not exist, go to next page

            # Get Character Class; filter on the PARSED value, which is the
            # one that carries the BAD_CLASS marker
            Class = parse_class_name(results[1].text.lower())
            if (FILTER_CLASS_NAMES and Class.startswith("BAD_CLASS")):
                continue # Skip classes with bad labels

            # Get Character Race
            Race = parse_race_name(results[2].text.lower())
            if (FILTER_RACE_NAMES and Race == "BAD_RACE"):
                continue

            # Get Character Backstory (last matching element on the page)
            Backstory = results[-1].text.lower()

            # Skip characters with no backstory
            if (Backstory == "no background assigned." or
                    len(Backstory) < BACKSTORY_MIN_LENGTH):
                continue

            # Locate the "Abilities" heading and climb to its enclosing table
            ability = soup.find(lambda tag: tag.name == "u" and "Abilities" in tag.text)
            if ability is None:
                continue # Page has no ability table
            table = ability.parent.parent.parent

            # The six rows after the heading hold STR, DEX, CON, INT, WIS, CHA
            scores = []
            row = table
            for _ in range(6):
                row = row.findNext('tr')
                if row is None:
                    break
                cell = row.findChild('font', class_="small")
                if cell is None:
                    break
                scores.append(cell.text)
            # End for
            if len(scores) != 6:
                continue # Incomplete ability table
            STR, DEX, CON, INT, WIS, CHA = scores

            # Write data to data_file
            if (WRITE_TO_OUTPUT):
                data_file.write("{0}|{1}|{2}|{3}|{4}|{5}|{6}|{7}|{8}\n".format(
                    Class, Race, STR, DEX, CON, INT, WIS, CHA, Backstory))
            # End if

        # End for each cid
    # End with
# End if

# Load the data file

In [None]:
# Mount Google Drive at /content/drive so the data file can be read from it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Define a function to load the data
import csv

def load_data(filename):
    """Read a comma-separated file with a header row into a list of dicts.

    Parameters:
        filename: path of the CSV file; its first row supplies the keys.

    Returns:
        One dict per data row, in file order, mapping column name to the
        cell text.
    """
    # newline='' hands line-ending handling to the csv module, so line breaks
    # inside quoted fields are kept as written.
    with open(filename, newline='', encoding='utf-8') as csvfile:
        return list(csv.DictReader(csvfile))

In [None]:
# Append the directory to your python path using sys
import sys
import os
# Root folder of the mounted Google Drive
prefix = '/content/drive/My Drive/'
# Modify this sub-folder so that it points at the directory holding the data file
custom_path_to_data_file = 'Colab Notebooks/'
sys_path = ''.join((prefix, custom_path_to_data_file))
sys.path.append(sys_path)

# Load every row of the character CSV into `data`
fp_data = os.path.join(sys_path, 'AllCharacterData.csv')
data = load_data(fp_data)
print('Path to AllCharacterData.data: ' + str(fp_data))

Path to AllCharacterData.data: /content/drive/My Drive/Colab Notebooks/AllCharacterData.csv


In [None]:
# Sanity-check the load: row count, then the first row seen from several angles
for shown in (len(data), data[0], len(data[0]), data[0].keys(),
              data[0].values(), data[0]['class'], data[0]['backstory']):
    print(shown)

4961
OrderedDict([('class', 'fighter'), ('race', 'minotaur'), ('str', '20'), ('dex', '13'), ('con', '20'), ('int', '13'), ('wis', '10'), ('cha', '13'), ('backstory', "mayor jihadmy name is jihad my master added the destroyer some years ago. i was captured as a calf some fifty years ago. i was sold on the block here in nexus a few months later. my master mustofa delaroach put me into a school for gladiators at that time. i've been in the ring since i was eight. i fought for years for mustofa. he sold me to a man named frezil'gordon my second owner. my third and finial owner the st.jacks family. garthe purchased me as a present for rudy's 10th birthday rudy loved to watch me fight in the arena. for a time i lived with more freedom than i had ever known. i had a room in the mansion house i was nearly a pet for rudy. i became a mentor to him though in a wierd way.i tought rudy to love freedom. as he learned what it washe demanded i his friend be given it. rudy was going to sneak me off the

# Lemmatize and embed data


https://www.nltk.org/howto/wordnet.html

https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

In [None]:
from nltk.classify.rte_classify import lemmatize
import nltk
from nltk.stem import WordNetLemmatizer
import random
import numpy as np

# Download the NLTK packages this notebook relies on (later cells call
# nltk.word_tokenize, WordNetLemmatizer and nltk.pos_tag).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Lemmatize the data
lemmatizer = WordNetLemmatizer()

# First letter of a Penn Treebank tag -> WordNet part-of-speech code accepted
# by lemmatize(): adjective, verb, noun, adverb.  Anything else is treated as
# a noun, which is also lemmatize()'s default.
_TREEBANK_TO_WORDNET = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

print(data[0]['backstory'])

for record in data:
    tokens = nltk.word_tokenize(record['backstory'].lower())
    # Lemmatizing every token as a noun turns "was" into "wa" and "has" into
    # "ha"; passing each token's own part of speech avoids that.
    record['backstory'] = ' '.join(
        lemmatizer.lemmatize(token, _TREEBANK_TO_WORDNET.get(tag[:1], 'n'))
        for token, tag in nltk.pos_tag(tokens))
# End for

print(data[0]['backstory'])

mayor jihadmy name is jihad my master added the destroyer some years ago. i was captured as a calf some fifty years ago. i was sold on the block here in nexus a few months later. my master mustofa delaroach put me into a school for gladiators at that time. i've been in the ring since i was eight. i fought for years for mustofa. he sold me to a man named frezil'gordon my second owner. my third and finial owner the st.jacks family. garthe purchased me as a present for rudy's 10th birthday rudy loved to watch me fight in the arena. for a time i lived with more freedom than i had ever known. i had a room in the mansion house i was nearly a pet for rudy. i became a mentor to him though in a wierd way.i tought rudy to love freedom. as he learned what it washe demanded i his friend be given it. rudy was going to sneak me off the isle.when garthe found out he arranged to make money on the setting me free. garthe never really felt like i was anything more than a beast like you in the beginning.

Embed the data: source https://austingwalters.com/word-embedding-and-data-splitting/

In [None]:

def create_word_embedding(data, feature, add_pos_tags=False):
    """Integer-encode one text feature of every record.

    Each record's ``feature`` text is tokenized with nltk.word_tokenize and
    every distinct token receives the next free integer id, starting at 0, in
    order of first appearance.

    Parameters:
        data: sequence of records indexable by ``feature``.
        feature: key of the text field to encode, e.g. 'backstory'.
        add_pos_tags: when True, each token is followed by its part-of-speech
            tag from nltk.pos_tag and the tags are encoded like ordinary
            tokens.

    Returns:
        (encoded, mapping): ``encoded`` holds one list of ids per record, in
        record order; ``mapping`` is the token -> id dictionary.
    """
    # NOTE(review): ids start at 0, which is also the value pad_sequences pads
    # with by default -- confirm that collision is acceptable for the model.
    word_embedding = {}
    encoded_backsories = []

    for record in data:
        tokens = nltk.word_tokenize(record[feature])

        if add_pos_tags:
            # Flatten [(word, tag), ...] into [word, tag, word, tag, ...]
            tokens = [ele for word_tuple in nltk.pos_tag(tokens) for ele in word_tuple]
        # End if

        # Create mapping: { "this": 0, "is": 1, ... } & encode each backstory
        encoded_backstory = []
        for token in tokens:
            # A new token gets the current vocabulary size as its id
            encoded_backstory.append(
                word_embedding.setdefault(token, len(word_embedding)))
        # End for
        encoded_backsories.append(encoded_backstory)
    # End for

    return encoded_backsories, word_embedding
# End create_word_embedding

def encode_and_split_data(data, randomize=False, data_split=0.8, add_pos_to_tags=False):
    """Encode backstories and class labels and split them into train/test sets.

    Parameters:
        data: list of records with 'backstory' and 'class' entries.
        randomize: when True, ``data`` is shuffled IN PLACE before encoding,
            so data[i] stays aligned with the i-th encoded row afterwards.
        data_split: fraction of the records that goes to the training set.
        add_pos_to_tags: forwarded to create_word_embedding for the
            backstories; class labels are never POS-tagged.

    Returns:
        (x_train, x_test, y_train, y_test, backstory_embedding,
        class_embedding): the x arrays hold one variable-length id list per
        record, the y arrays hold the encoded class labels, and the two
        embeddings are the token -> id dictionaries.
    """
    character_data = data

    if randomize:
        # character_data aliases data, so the caller's list is reordered too
        random.shuffle(character_data)
    # End if

    # Encode the data
    encoded_backstories, backstory_embedding = create_word_embedding(
        character_data, 'backstory', add_pos_tags=add_pos_to_tags)

    # Encode the classes/labels
    encoded_classes, class_embedding = create_word_embedding(
        character_data, 'class', add_pos_tags=False)

    # Determine the training sample split point
    training_sample = int(len(encoded_backstories) * data_split)

    print("encoded_backstories: {0}\ndata_split: {1}\ntraining_sample: {2}".format(len(encoded_backstories), data_split, training_sample))
    print("encoded_classes: {0}\nsplit: {1}".format(len(encoded_classes), int(len(encoded_classes) * data_split)))

    # Backstories differ in length; dtype=object keeps them as per-record
    # sequences instead of asking NumPy for a rectangular array, which it
    # refuses to build from ragged input.
    x_train = np.array(encoded_backstories[:training_sample], dtype=object)
    x_test  = np.array(encoded_backstories[training_sample:], dtype=object)
    y_train = np.array(encoded_classes[:training_sample])
    y_test  = np.array(encoded_classes[training_sample:])

    return x_train, x_test, y_train, y_test, backstory_embedding, class_embedding
# End encode_and_split_data

Run the above algorithm to encode the data

In [None]:
# Shuffle, encode and split the loaded records (80% train / 20% test)
(x_train, x_test, y_train, y_test,
 backstory_embedding, class_embedding) = encode_and_split_data(
    data, randomize=True, data_split=0.8, add_pos_to_tags=True)

encoded_backstories: 4961
data_split: 0.8
training_sample: 3968
encoded_classes: 4961
split: 3968




Print out the embedded data

In [None]:
# The first test row sits right after the last training row in `data`;
# deriving the index keeps this cell correct when the data set size or the
# split ratio changes.
first_test_index = len(x_train)

print("--- Lematized Data ---")
print(data[0]['backstory'])
print(data[1]['backstory'])

print("\n--- Backstories ---")
print(x_train[0])
print(x_test[0])

# Each encoded label is printed next to the class text of the same record
print("\n--- Classes ---")
print(y_train[0])
print(data[0]['class'])
print(y_train[1])
print(data[1]['class'])
print(y_test[0])
print(data[first_test_index]['class'])
print(y_test[1])
print(data[first_test_index + 1]['class'])

print("\n --- Embedding ---")
print(backstory_embedding)
print(class_embedding)


--- Lematized Data ---
lyithoeryn ha no past ; that is to say her origin her childhood memory her parent and so on all the thing that most people take for granted were erased when she underwent the psychic surgery which turned her into an elan . she know nothing of her former life only that she wa found to posse extremely rare mental gift and wa deemed worthy to become one of the select and secretive race known a elans.the invasive process left her forever changed seperating her further from the normal physical process of the humanoid race and leaving her conscious mind in supreme control both of her body and her environment.lyithoeryn came to the lhazaar principality 6 year ago seeking a number of rare tome which were rumored to be stashed away somewhere in the archive of the ( insert library ) there . although the library cater more to student of the arcane art lyithoeryn wa able to uncover tome which hid mystery so deeply even the most intelligent of mages could never unlock them . 

# Build the Recurrent Neural Net


In [None]:
# libraries
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation
from keras.layers import RNN, LSTM, Embedding
from keras.layers import GRU, SimpleRNN
from keras.preprocessing import sequence

import math
import numpy as np
import matplotlib.pyplot as plt
from keras.utils.np_utils import to_categorical

In [None]:
# parameters
# Vocabulary size: number of distinct tokens in the backstory mapping
max_words = len(backstory_embedding)
print("max_word: {0}".format(max_words))
batch_size = 32
maxlen = 1200  # length every backstory sequence is padded to below
epochs = 30
optimizer = 'Adam'  # class_rnn checks for "Adam" / "Adagrad"; any other value selects RMSprop

# fighter sorcerer barbarian monk paladin rogue
# warlock wizard bard druid ranger cleric
# NOTE(review): is_core_class also accepts "artificer" and parse_class_name can
# return "BAD_CLASS..." labels; confirm every encoded class id is < num_classes.
num_classes = 12

# variables: x_train, x_test, y_train, y_test, backstory_embedding, class_embedding

# Vectorize the output (one-hot rows of width num_classes)
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)


# Pad the input vectors to ensure a consistent length
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test  = sequence.pad_sequences(x_test, maxlen=maxlen)

# Cast the padded id matrices to float32 (no scaling is applied here)
x_train = x_train.astype('float32')
x_test  = x_test.astype('float32')


max_word: 69131


Documentation: 


*   https://www.tensorflow.org/guide/keras/rnn
*   https://austingwalters.com/classify-sentences-via-a-recurrent-neural-network-lstm/

In [None]:
def class_rnn():
    """Build, train and evaluate the backstory -> class recurrent network.

    Architecture: Embedding(max_words, 512) -> GRU(512, returning the full
    sequence) -> SimpleRNN(256) -> Dense(num_classes, softmax).

    Reads the module-level settings max_words, num_classes, optimizer,
    batch_size and epochs, and the data x_train, y_train, x_test, y_test.
    Prints the test accuracy after training.

    Returns:
        The trained model, so it can be reused for prediction or saved
        instead of being thrown away when the function ends.
    """
    model = Sequential()

    model.add(Embedding(max_words, 512))
    # return_sequences=True feeds the whole GRU output sequence to the next
    # recurrent layer
    model.add(GRU(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))

    model.add(SimpleRNN(256))

    model.add(Dense(num_classes, activation='softmax'))

    # Pick the optimizer named by the global setting; RMSprop is the fallback
    if optimizer == "Adam":
        opt = keras.optimizers.Adam()
    elif optimizer == "Adagrad":
        opt = keras.optimizers.Adagrad()
    else:
        opt = keras.optimizers.RMSprop(learning_rate=0.0001, decay=1e-6)
    # End if else

    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    model.fit(x_train, y_train, batch_size=batch_size,
              epochs=epochs, validation_data=(x_test, y_test))

    # score is [loss, accuracy] because 'accuracy' is the only metric compiled in
    score = model.evaluate(x_test, y_test, batch_size=batch_size)

    print('Test accuracy:', score[1])

    return model
# End class_rnn

In [None]:
# Train and evaluate the network; prints per-epoch progress and the test accuracy.
class_rnn()

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
 18/124 [===>..........................] - ETA: 8:14 - loss: 2.3389 - accuracy: 0.2118

KeyboardInterrupt: ignored