In [1]:
import pandas as pd
import seaborn as sns
import re

In [2]:
tsv = pd.read_csv('dnd_chars_tsv.txt', sep='\t')

In [3]:
tsv.head()

Unnamed: 0,ip,finger,hash,name,race,background,date,class,justClass,subclass,...,country,countryCode,processedAlignment,good,lawful,processedRace,processedSpells,processedWeapons,levelGroup,alias
0,081b9c8,038162c,b36cc828d3d069c3ebf4916c3072480b,bb0cd171,Human,Knight,2021-07-27T01:30:16Z,Warlock 4,Warlock,The Hexblade,...,United States,US,,,,Human,Eldritch Blast*0|Create Bonfire*0|Friends*0|He...,"Crossbow, Light|Dagger|Mace|Longsword",4-7,competent_mclaren
1,081b9c8,038162c,cae2a5212b208dc3c9fdd13c6677662d,bb0cd171,Human,Knight,2021-07-27T01:07:20Z,Warlock 1,Warlock,The Hexblade,...,United States,US,,,,Human,,"Crossbow, Light|Dagger|Mace",1-3,competent_mclaren
2,347f396,ce53022,574c56f949de0adfaa2c4142fb4c54e5,24415eef,Hengenyokai- Sabre Toothed Tiger,Knight,2021-07-26T19:09:58Z,Warlock 3,Warlock,The Great Old One,...,United States,US,,,,Shifter,Chill Touch*0|Eldritch Blast*0|Fire Bolt*0|Mag...,Dagger|Quarterstaff|||,1-3,hardcore_dijkstra
3,0062322,542e778,de4c50489033371f5e2c15ac8d992c08,e9d88d5f,Serpentblood,Knight,2021-07-25T21:15:34Z,Sorcerer 4,Sorcerer,Storm Sorcery,...,Canada,CA,,,,Yaun-Ti,,"Crossbow, Light|Dagger",4-7,zealous_volhard
4,3e52d52,dec07a2,f7a17d3836c7fe22d7fb78d4f2b7d14c,a35cdfac,Half-Elf,Urchin,2021-07-25T18:53:42Z,Rogue 3,Rogue,Swashbuckler,...,United States,US,,,,Half-Elf,,"Rapier|Shortbow|Dagger|Crossbow, Light",1-3,pensive_bhabha


In [4]:
tsv.shape

(9784, 36)

In [5]:
pd.unique(tsv['class'])

array(['Warlock 4', 'Warlock 1', 'Warlock 3', ..., 'Monk 3|Druid 1',
       'Warlock 5|Sorcerer 15', 'Druid 2|Bard 2'], dtype=object)

In [6]:
tsv.describe()

Unnamed: 0,level,HP,AC,Str,Dex,Con,Int,Wis,Cha,good,lawful
count,9784.0,9784.0,9784.0,9784.0,9784.0,9784.0,9784.0,9784.0,9784.0,0.0,0.0
mean,4.892069,45.32267,15.406991,12.82083,14.652903,14.346893,11.978945,13.121934,13.143806,,
std,5.140607,92.583394,3.381186,3.988983,3.183303,2.543954,3.160591,3.141482,3.690033,,
min,1.0,-24.0,7.0,1.0,3.0,4.0,0.0,1.0,0.0,,
25%,2.0,14.0,14.0,10.0,13.0,13.0,10.0,11.0,10.0,,
50%,4.0,31.0,15.0,12.0,15.0,14.0,12.0,13.0,13.0,,
75%,6.0,55.0,17.0,16.0,17.0,16.0,14.0,15.0,16.0,,
max,261.0,6894.0,222.0,103.0,101.0,103.0,99.0,100.0,99.0,,


In [7]:
tsv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9784 entries, 0 to 9783
Data columns (total 36 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ip                  8105 non-null   object 
 1   finger              8867 non-null   object 
 2   hash                9784 non-null   object 
 3   name                9580 non-null   object 
 4   race                9760 non-null   object 
 5   background          9766 non-null   object 
 6   date                9784 non-null   object 
 7   class               9784 non-null   object 
 8   justClass           9784 non-null   object 
 9   subclass            7500 non-null   object 
 10  level               9784 non-null   int64  
 11  feats               3066 non-null   object 
 12  HP                  9784 non-null   int64  
 13  AC                  9784 non-null   int64  
 14  Str                 9784 non-null   int64  
 15  Dex                 9784 non-null   int64  
 16  Con   

## Features

**name**: This column has hashes that represent character names. If the hashes are the same, that means the names are the same. Real names are removed to protect character anonymity. Yes D&D characters have rights.

**race**: This is the race field as it come out of the application. It is not really helpful as subrace and race information all mixed up together and unevenly available. It also includes some homebrew content. You probably want to use the processedRace column if you are interested in this.

**background**: Background as it comes out of the application.

**date**: Time & date of input. Dates before 2018-04-16 are unreliable as some has accidentally changed while moving files around.

**class**: Class and level. Different classes are separated by | when needed.

**justClass**: Class without level. Different classes are separated by | when needed.

**subclass**: Subclasses. Again, separated by | when needed.

**level**: Total character level.

**feats**: Feats chosen by character. Separated by | when needed.

**HP**: Character HP.

**AC**: Character AC.

**Str, Dex, Con, Int, Wis, Cha**: ability scores

**alignment**: Alignment free text field. It is a mess, don't touch it. See processedAlignment,good and lawful instead.

**skills**: List of skills with proficiency. Separated by |.

**weapons**: List weapons. Separated by |. It is somewhat of a mess as it allows free text inputs. See processedWeapons.

**spells**: List of spells and their levels. Spells are separated by |s. Each spell has its level next to it separated by *s. This is a huge mess as its a free text field and some users included things like damage dice in them. See processedSpells.

**day**: A shortened version of date. Only includes day information.

**processedAlignment**: Processed version of the alignment column. Way people wrote up their alignments are manually sifted through and assigned to the matching aligmment. First character represents lawfulness (L, N, C), second one goodness (G,N,E). An empty string means alignment wasn't written or unclear.

**good, lawful**: Isolated columns for goodness and lawfulness.

**processedRace**: I have gone through the way race column is filled by the app and asigned them to correct races. If empty, indiciates a homebrew race not natively supported by the app.

**processedSpells**: Formatting is same as the spells column but it is cleaned up. Using string similarity I tried to match the spells to the full list of spells available in the official publications. The spell is removed if the spell I guessed does not have the correct level or doesn't include all words of the original spell and has too many modifications to be recognizable. It may have a few false matches but it should be mostly fine

**processedWeapons**: Similar to processedSpells, weapons column is matched to the closest official weapon with some restrictions.

**levelGroup**: splits levels into groups as used in the feat percentage plot. Only present in the filtered data but easy enough to make on your own.

## Features to drop for Determining a Character's DND class
- name since it is usually arbitrary and it uses hashes to represent the character names
- race because it is unprocessed data and very messy


In [8]:
tsv.drop(labels = ['ip', 'hash', 'finger', 'name', 'race', 'date', 'class', 'subclass', 'alignment', 'spells', 'choices', 
                    'country', 'countryCode', 'processedAlignment','good', 'lawful', 'alias'], axis=1, inplace=True)

In [9]:
tsv.shape

(9784, 19)

In [10]:
tsv.head()

Unnamed: 0,background,justClass,level,feats,HP,AC,Str,Dex,Con,Int,Wis,Cha,skills,weapons,castingStat,processedRace,processedSpells,processedWeapons,levelGroup
0,Knight,Warlock,4,Fighting Initiate,29,14,9,16,12,12,8,18,History|Investigation|Deception|Intimidation|P...,"Crossbow, light|Dagger|Mace|Longsword",Cha,Human,Eldritch Blast*0|Create Bonfire*0|Friends*0|He...,"Crossbow, Light|Dagger|Mace|Longsword",4-7
1,Knight,Warlock,1,Fighting Initiate,9,13,10,15,13,12,8,16,History|Investigation|Deception|Intimidation|P...,"Crossbow, light|Dagger|Mace",Cha,Human,,"Crossbow, Light|Dagger|Mace",1-3
2,Knight,Warlock,3,,28,12,11,14,14,11,13,19,Investigation|Nature|Intimidation|Persuasion,Dagger|Quarterstaff|Bite|Claw|Eldritch Blast,Cha,Shifter,Chill Touch*0|Eldritch Blast*0|Fire Bolt*0|Mag...,Dagger|Quarterstaff|||,1-3
3,Knight,Sorcerer,4,,20,11,14,12,12,14,11,16,History|Religion|Insight|Persuasion,"Crossbow, light|Dagger",Cha,Yaun-Ti,,"Crossbow, Light|Dagger",4-7
4,Urchin,Rogue,3,,29,15,8,17,14,10,15,12,Sleight of Hand|Stealth|Investigation|Percepti...,"Rapier|Shortbow|Dagger|Crossbow, Light",Int,Half-Elf,,"Rapier|Shortbow|Dagger|Crossbow, Light",1-3


In [11]:
#tsv['skills']

In [12]:
def processor(df, column):
    global lofvals
    col_dict = df[column].to_dict()
    vals = list(col_dict.values())
    lofvals = []
    for i in vals:
        s = str(i).split('|')
        lofvals.append(s)
    return lofvals

In [26]:
def splitter(df, column, term, col):
    global df2
    processor(df, column)
    data = []
    for lis in lofvals:
        if term in lis:
            data.append(1)
        else:
            data.append(0)
    df2 = df[col] = data
    return df2
    

In [14]:
terms = ['Acrobatics', 'Animal Handling', 'Arcana', 'Athletics', 'Deception', 'History', 'Insight', 
         'Intimidation', 'Investigation', 'Medicine', 'Nature', 'Perception', 'Performance', 'Persuasion', 
         'Religion', 'Sleight of Hand', 'Stealth', 'Survival']
for term in terms:
    splitter(tsv, 'skills', term, term)

In [23]:
def single(df, column, new_col):
    processor(df, column)
    mc = []
    for lis in lofvals:
        if len(lis) > 1:
            mc.append(1)
        else:
            mc.append(0)
    first = df[new_col] = mc
    return first          

In [24]:
single(tsv, 'justClass', 'multiclass')

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [89]:
def categorize(df, column, cats, n_col):
    processor(df, column)
    for lis in lofvals:
        check = all(c in cats for c in lis)
        if check:
            df[n_col] = 1
        else:
            df[n_col] = 0
    return df[n_col]

In [90]:
categorize(tsv, 'processedWeapons', simple, 'simpleWeapon')

0       0
1       0
2       0
3       0
4       0
       ..
9779    0
9780    0
9781    0
9782    0
9783    0
Name: simpleWeapon, Length: 9784, dtype: int64

In [91]:
categorize(tsv, 'processedWeapons', martial, 'martialWeapon')

0       0
1       0
2       0
3       0
4       0
       ..
9779    0
9780    0
9781    0
9782    0
9783    0
Name: martialWeapon, Length: 9784, dtype: int64

In [92]:
tsv['simpleWeapon'].unique()

array([0], dtype=int64)

In [93]:
tsv['martialWeapon'].unique()

array([0], dtype=int64)

In [94]:
categorize(tsv, 'processedWeapons', ranged, 'rangedWeapon')

0       0
1       0
2       0
3       0
4       0
       ..
9779    0
9780    0
9781    0
9782    0
9783    0
Name: rangedWeapon, Length: 9784, dtype: int64

In [95]:
tsv['rangedWeapon'].unique()

array([0], dtype=int64)

In [40]:
tsv['processedWeapons']

0        Crossbow, Light|Dagger|Mace|Longsword
1                  Crossbow, Light|Dagger|Mace
2                       Dagger|Quarterstaff|||
3                       Crossbow, Light|Dagger
4       Rapier|Shortbow|Dagger|Crossbow, Light
                         ...                  
9779                Shortsword|Crossbow, Light
9780                                  Scimitar
9781                         Longsword|Javelin
9782                             Dagger|Dagger
9783                    Rapier|Shortbow|Dagger
Name: processedWeapons, Length: 9784, dtype: object

In [80]:
simple = ['Club', 'Dagger', 'Greatclub', 'Handaxe', 'Javelin',
         'Light Hammer', 'Mace', 'Quarterstaff', 'Sickle', 
          'Spear']

In [81]:
martial = ['Battleaxe', 'Flail', 'Glaive', 'Greataxe', 
           'Greatsword', 'Halberd', 'Lance', 'Longsword',
          'Maul', 'Morningstar', 'Pike', 'Rapier', 'Scimitar',
          'Shortsword', 'Trident', 'War pick', 'Warhammer', 'Whip']

In [82]:
ranged = ['Crossbow, light', 'Dart', 'Shortbow', 'Sling',
          'Blowgun', 'Crossbow, hand', 'Crossbow, heavy',
          'Longbow', 'Net']

In [18]:
tsv.columns

Index(['background', 'justClass', 'level', 'feats', 'HP', 'AC', 'Str', 'Dex',
       'Con', 'Int', 'Wis', 'Cha', 'skills', 'weapons', 'castingStat',
       'processedRace', 'processedSpells', 'processedWeapons', 'levelGroup',
       'Acrobatics', 'Animal Handling', 'Arcana', 'Athletics', 'Deception',
       'History', 'Insight', 'Intimidation', 'Investigation', 'Medicine',
       'Nature', 'Perception', 'Performance', 'Persuasion', 'Religion',
       'Sleight of Hand', 'Stealth', 'Survival'],
      dtype='object')

In [20]:
tsv['background'].unique()

array(['Knight', 'Urchin', 'Hermit', 'Sage', 'Entertainer', 'Smuggler',
       'Criminal', 'Acolyte', 'Soldier', 'Courtier', 'Folk Hero',
       'Athlete', 'Charlatan', 'Faction Agent', 'Outlander', 'Noble',
       'Custom', 'Reporter', 'Guild Artisan', 'Clan Crafter',
       'House Agent', 'Haunted One', 'Sailor', 'Barbarian Tribe Member',
       'City Watch', 'Far Traveler', 'Archaeologist', 'Fisher',
       'Mercenary Veteran', 'Cloistered Scholar', 'Investigator',
       'Violent Assassin', 'Guild Member - Spycraft',
       'Urban Bounty Hunter', 'Inheritor', 'Archaelogist',
       'Sage (Wiz Apprentice)', 'Gambler', 'Criminal Spy(Custom)',
       'Anthropologist', 'Grand Minstrel', nan,
       'City Watch, Investigator', 'Student of Magic', 'Shipwright',
       'gladiator', 'Cult Hero', 'Marine', 'Outlander (Mountaineer)',
       'Pirate', 'member of the shades', 'Archeologist', 'noble',
       'Knight of the Order', 'First Mate', 'Assassin', 'Merc Veteran',
       'Guild Member -

In [21]:
tsv['feats'].unique()

array(['Fighting Initiate', nan, 'Squat Nimbleness', 'Keen Mind', 'Tough',
       'Fey Touched|Shadow Touched', 'Alert', 'Great Weapon Master',
       'Sharpshooter', 'Fey Touched',
       'Great Weapon Master|Savage Attacker', 'Elemental Adept',
       'War Caster|Skill Expert|Healer', 'Defensive Duelist|Lucky',
       'Weapon Master', 'Great Weapon Master|Polearm Master',
       'Mounted Combatant', 'Telekinetic', 'Skill Expert|Metamagic Adept',
       'Polearm Master',
       'Heavy Armor Master|Tough|Dwarven Fortitude|Durable',
       'Great Weapon Master|Tough|Durable|Tavern Brawler|Medium Armor Master|Slasher',
       'War Caster', 'Inspiring Leader',
       'Magic Initiate|War Caster|Resilient',
       'Magic Initiate|Savage Attacker|Metamagic Adept', 'Observant',
       'Lucky', 'Skill Expert', 'Dual Wielder|Telepathic',
       'Magic Initiate', 'Grappler', 'Great Weapon Master|Slasher',
       'Lucky|Shield Master', 'Resilient', 'Fey Teleportation',
       'Heavy Armor Master|

In [56]:
tsv['multiclass'].unique()

array([0, 1], dtype=int64)