# Making a document-term matrix quickly
Let's say you already counted all of your words, and you want to drop the stopwords in your document-term matrix.

We're going to walk through this one step at a time, again for *Harry Potter*.

In [214]:
import os

def absolute_paths(directory, txt_only = True):
    """Return the path of every entry in `directory`, joined onto `directory`.

    Parameters
    ----------
    directory : str
        Directory to list (handed to os.listdir).
    txt_only : bool, default True
        When True, keep only the entries whose file name ends in '.txt'.
        Otherwise every entry is returned.

    Returns
    -------
    list of str
        One path per entry, in the order os.listdir reports them. That order
        is arbitrary, so sort the result if the order matters to you.
    """
    paths = [os.path.join(directory, name) for name in os.listdir(directory)]

    if txt_only is True:
        # Test the ending of the file name itself. A substring test on the
        # whole path would also accept names like 'notes.txt.bak', and would
        # accept *every* file if a parent directory contained '.txt'.
        return [path for path in paths if os.path.basename(path).endswith('.txt')]

    return paths

In [215]:
# get harry potter paths using our absolute_paths function
# NOTE: hp_dir is a hard-coded absolute path; change it to wherever your copy
# of the corpus lives before running this cell
hp_dir = '/Users/e/code/literarytextmining/corpora/harry_potter/texts'
hp_files = absolute_paths(hp_dir)
hp_files

['/Users/e/code/literarytextmining/corpora/harry_potter/texts/5 Order of the Phoenix.txt',
 '/Users/e/code/literarytextmining/corpora/harry_potter/texts/4 Goblet of Fire.txt',
 '/Users/e/code/literarytextmining/corpora/harry_potter/texts/6 Half-Blood Prince.txt',
 '/Users/e/code/literarytextmining/corpora/harry_potter/texts/1 Sorcerers Stone.txt',
 '/Users/e/code/literarytextmining/corpora/harry_potter/texts/3 Prisoner of Azkaban.txt',
 '/Users/e/code/literarytextmining/corpora/harry_potter/texts/7 Deathly Hallows.txt',
 '/Users/e/code/literarytextmining/corpora/harry_potter/texts/2 Chamber of Secrets.txt']

In [216]:
import string
import re

def tokenize(text, keep_punct = False):
    """Split `text` into a list of lowercase tokens.

    Every character in string.punctuation acts as a token boundary. Tokens
    that are not purely alphabetic (numbers, mixed strings such as 'b2') are
    discarded.

    Parameters
    ----------
    text : str
        The text to tokenize.
    keep_punct : bool, default False
        When True, each character from string.punctuation is kept as its own
        token. When False, punctuation is removed.

    Returns
    -------
    list of str
        The tokens, in the order they occur in `text`.
    """
    if keep_punct is True:
        # pad each punctuation mark with spaces so it becomes a separate token
        for punct in string.punctuation:
            text = text.replace(punct, ' ' + punct + ' ')
    else:
        for punct in string.punctuation:
            text = text.replace(punct, ' ')

    # this replaces *any* amount of whitespace with a single space using regular expressions
    # (raw string, so that '\s' reaches the regex engine instead of being an invalid string escape)
    text = re.sub(r'\s+', ' ', text)

    # Punctuation tokens fail .isalpha(), so they have to be let through
    # explicitly; otherwise keep_punct = True would have no effect at all.
    kept_punct = set(string.punctuation) if keep_punct is True else set()

    result = []

    for x in text.lower().split(' '):
        if x.isalpha() or x in kept_punct:
            result.append(x)

    return result

In [217]:
# tokenize the first file in hp_files; the `with` block closes the file
# as soon as its contents have been read
with open(hp_files[0]) as f:
    test = tokenize(f.read())

In [218]:
test[:5]

['chapter', 'one', 'dudley', 'demented', 'the']

In [219]:
def count_words(word_list):
    """Tally how many times each item occurs in `word_list`.

    Returns a dict that maps every distinct word to its number of
    occurrences; keys are inserted in order of first appearance.
    """
    counts = {}

    for token in word_list:
        # .get() supplies 0 the first time a token is seen
        counts[token] = counts.get(token, 0) + 1

    return counts

In [220]:
test_d = count_words(test)

In [221]:
test_d

{'chapter': 51,
 'one': 511,
 'dudley': 90,
 'demented': 1,
 'the': 11902,
 'hottest': 1,
 'day': 131,
 'of': 5448,
 'summer': 44,
 'so': 705,
 'far': 100,
 'was': 3668,
 'drawing': 26,
 'to': 6452,
 'a': 4967,
 'close': 76,
 'and': 6169,
 'drowsy': 2,
 'silence': 55,
 'lay': 57,
 'over': 582,
 'large': 123,
 'square': 21,
 'houses': 19,
 'privet': 24,
 'drive': 32,
 'cars': 10,
 'that': 2395,
 'were': 1001,
 'usually': 27,
 'gleaming': 17,
 'stood': 117,
 'dusty': 22,
 'in': 3149,
 'their': 596,
 'drives': 1,
 'lawns': 9,
 'once': 252,
 'emerald': 11,
 'green': 69,
 'parched': 2,
 'yellowing': 3,
 'use': 82,
 'hosepipes': 1,
 'had': 2422,
 'been': 802,
 'banned': 7,
 'due': 11,
 'drought': 2,
 'deprived': 1,
 'usual': 66,
 'car': 15,
 'washing': 1,
 'lawn': 23,
 'mowing': 1,
 'pursuits': 1,
 'inhabitants': 3,
 'retreated': 8,
 'into': 705,
 'shade': 12,
 'cool': 33,
 'windows': 45,
 'thrown': 23,
 'wide': 50,
 'hope': 52,
 'tempting': 1,
 'nonexistent': 3,
 'breeze': 6,
 'only': 302,


## Looping through all of these to quickly make a DTM out of a directory of text files
We're going to combine our functions to automatically output a DTM.

In [225]:
import pandas as pd

def make_dtm(directory, scaled = False):
    """Build a document-term matrix from the .txt files in `directory`.

    Each file found by absolute_paths() is read, tokenized with tokenize(),
    and counted with count_words().

    Parameters
    ----------
    directory : str
        Directory containing the text files.
    scaled : bool, default False
        When True, each count is divided by the total number of tokens in
        its file, giving relative frequencies instead of raw counts.

    Returns
    -------
    pandas.DataFrame
        One row per file (indexed by file name, sorted) and one column per
        word. A word that does not occur in a file is NaN in that row.
    """
    rows = [] # one {word: count} dictionary per file

    for path in absolute_paths(directory):
        # the `with` block closes the file as soon as it has been read
        # (the file is opened with the platform's default text encoding)
        with open(path) as handle:
            text = handle.read()

        counts = count_words(tokenize(text))

        if scaled is True:
            total_words = sum(counts.values())
            counts = {word: n / total_words for word, n in counts.items()}

        # os.path.split() returns the base path and the filename as a pair.
        # tokenize() only keeps alphabetic tokens, so the leading _ means this
        # key can never collide with a word from the text.
        counts['_filename'] = os.path.split(path)[-1]
        rows.append(counts) # raw counts, or relative frequencies if scaled

    return pd.DataFrame(rows).set_index('_filename').sort_index()

In [261]:
df = make_dtm(hp_dir, scaled = False)

You could apply this exact same code to your corpus directory!

In [231]:
df

Unnamed: 0_level_0,a,aaaaaaaaargh,aaaaaaaarrrrrgh,aaaaaaand,aaaaaand,aaaaahed,aaaaargh,aaaah,aaah,aargh,...,zograf,zombie,zone,zonko,zoo,zoological,zoom,zoomed,zooming,éclairs
_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Sorcerers Stone.txt,1066,,,,,,,,,,...,,2.0,,,7.0,,1.0,1,2.0,1.0
2 Chamber of Secrets.txt,1879,,,,,,,,,,...,,,,,2.0,,,2,,
3 Prisoner of Azkaban.txt,2222,,,,,,,,,,...,,1.0,,1.0,,,,9,3.0,
4 Goblet of Fire.txt,3680,,1.0,1.0,1.0,1.0,,,1.0,,...,1.0,,,,,1.0,4.0,9,12.0,
5 Order of the Phoenix.txt,4967,1.0,,,,,1.0,1.0,,2.0,...,,,1.0,,,,2.0,23,7.0,
6 Half-Blood Prince.txt,3323,,,,,,1.0,1.0,,,...,,,,,,,,7,2.0,1.0
7 Deathly Hallows.txt,3604,,,,,,,,,1.0,...,,,,,,,1.0,6,5.0,


# Filtering our dataframe for "real" words
Our functions counted 19,911 types in the *Harry Potter* books. But this includes some "words" that we're not really interested in like "aaaaaaaarrrrrgh."

We can make our processing faster by dropping very rare words. To do this, we have to do a few things:

## Dropping low-frequency words
Even if a word were really interesting to us as readers, we can't do much with it computationally if it only appears once or twice in a corpus.

To do this, one approach would be to sum all of the columns, and find out which ones appear below our cutoff.

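Let's say we only want to keep words that appear in every one of our seven books. The `min_count` argument of `.sum()` sets the minimum number of non-null values a column must have before Pandas will sum it, so we want a minimum value of `7`. (Note that this is not the same as requiring a *total* of at least 7 occurrences, i.e. an average of once per book: a word that appears 50 times but is missing from a single book still gets a null sum.)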

In [270]:
# min_count = 7 tells Pandas to sum a column only if it holds at least 7
# non-null values; with 7 books, that means the word occurs in every book.
# Every other column gets NaN instead of a sum.
word_sums = df.sum(min_count = 7)

In [267]:
word_sums[word_sums > 7]

a               20741.0
able              385.0
about            2455.0
above             220.0
absurd              9.0
accept             41.0
accepted           17.0
accidentally       32.0
accidents          18.0
according          41.0
across            548.0
act                79.0
acted              26.0
acting             43.0
actually          142.0
add                32.0
added             249.0
addressed          36.0
advice             36.0
affect             13.0
afford             17.0
afraid            127.0
after            1143.0
afternoon          99.0
afterward          29.0
again            1932.0
against           640.0
age                93.0
ages               61.0
ago               221.0
                 ...   
wrenched           48.0
wrinkled           12.0
wrist              38.0
write              72.0
writing            91.0
written            77.0
wrong             244.0
wrote              65.0
yawned             27.0
yawning            24.0
yeah            

In [245]:
word_sums2 = df.sum()

In [264]:
word_sums[df.sum() > 7]

a               0.148597
able            0.002571
about           0.017481
above           0.001436
absurd          0.000077
accept          0.000251
accepted        0.000137
accidentally    0.000239
accidents       0.000117
according       0.000252
across          0.003649
act             0.000500
acted           0.000181
acting          0.000313
actually        0.000808
add             0.000241
added           0.001653
addressed       0.000285
advice          0.000238
affect          0.000110
afford          0.000173
afraid          0.000833
after           0.007515
afternoon       0.000751
afterward       0.000185
again           0.012539
against         0.004194
age             0.000608
ages            0.000490
ago             0.001459
                  ...   
wrenched        0.000328
wrinkled        0.000118
wrist           0.000326
write           0.000512
writing         0.000648
written         0.000592
wrong           0.001755
wrote           0.000465
yawned          0.000209


In [246]:
word_sums2[df.sum() > 7]

a                 20741.0
aback                31.0
abandon              10.0
abandoned            33.0
abandoning           15.0
abbott               16.0
aberforth            71.0
ability              23.0
able                385.0
about              2455.0
above               220.0
abroad               14.0
abrupt               16.0
abruptly             64.0
absence              33.0
absent               10.0
absently             12.0
absentmindedly       17.0
absolute             10.0
absolutely           30.0
absorbed             10.0
absurd                9.0
abuse                 8.0
accept               41.0
accepted             17.0
accepting             9.0
access               10.0
accident             36.0
accidentally         32.0
accidents            18.0
                   ...   
yellowish            19.0
yells                24.0
yelp                  8.0
yelped               13.0
yer                  91.0
yes                 337.0
yesterday            39.0
yet         

In [271]:
word_sums = word_sums[word_sums.notnull()] # .notnull() drops all of the None values

In [272]:
len(word_sums)

2494

## How many are left?

In [16]:
len(word_sums[word_sums.notnull()])

2494

There are only 2,494 words that appear in all seven *Harry Potter* books. If we were to cut our dataframe down just for these words, we would be able to do almost *90% fewer computations* to analyze our data.

We could change that `min_count` argument value to anything we wanted. Maybe we just want to get rid of words that appear in only one book (`min_count = 2`)? There are a lot of possible approaches you can make the case for, but this makes your data much more manageable, and (usually) much closer to what you want for your project.

# How do these words change over time?

The value of Pandas comes primarily from the fact that it allows us to easily slice, dice, and add new data to our results.

Let's say we want to analyze how Rowling's use of these words changes over time. We'll start with a scaled data frame so that each book is given equal weight:

In [268]:
df = make_dtm(hp_dir, scaled = True)

First things first, let's cut this dataframe down to size using our new list of words that appear in every book in our corpus.

The `.index` attribute allows us to see all of the words:

In [273]:
word_sums.index

Index(['a', 'able', 'about', 'above', 'absurd', 'accept', 'accepted',
       'accidentally', 'accidents', 'according',
       ...
       'yesterday', 'yet', 'you', 'young', 'younger', 'youngest', 'your',
       'yours', 'yourself', 'zoomed'],
      dtype='object', length=2494)

And we can use that list to filter our DataFrame!

Our `make_dtm` function already set `_filename` as our index (AKA our row names), so we won't lose it when we select columns:

In [274]:
df

Unnamed: 0_level_0,a,aaaaaaaaargh,aaaaaaaarrrrrgh,aaaaaaand,aaaaaand,aaaaahed,aaaaargh,aaaah,aaah,aargh,...,zograf,zombie,zone,zonko,zoo,zoological,zoom,zoomed,zooming,éclairs
_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Sorcerers Stone.txt,0.025712,,,,,,,,,,...,,4.8e-05,,,0.000169,,2.4e-05,2.4e-05,4.8e-05,2.4e-05
2 Chamber of Secrets.txt,0.022353,,,,,,,,,,...,,,,,2.4e-05,,,2.4e-05,,
3 Prisoner of Azkaban.txt,0.021197,,,,,,,,,,...,,1e-05,,1e-05,,,,8.6e-05,2.9e-05,
4 Goblet of Fire.txt,0.019706,,5e-06,5e-06,5e-06,5e-06,,,5e-06,,...,5e-06,,,,,5e-06,2.1e-05,4.8e-05,6.4e-05,
5 Order of the Phoenix.txt,0.020075,4e-06,,,,,4e-06,4e-06,,8e-06,...,,,4e-06,,,,8e-06,9.3e-05,2.8e-05,
6 Half-Blood Prince.txt,0.020399,,,,,,6e-06,6e-06,,,...,,,,,,,,4.3e-05,1.2e-05,6e-06
7 Deathly Hallows.txt,0.019155,,,,,,,,,5e-06,...,,,,,,,5e-06,3.2e-05,2.7e-05,


Now we can use the words in `word_sums` to drop a bunch of columns:

In [256]:
valid_words = word_sums[word_sums.notnull()]

In [257]:
valid_words.index

Index(['a', 'able', 'about', 'above', 'absurd', 'accept', 'accepted',
       'accidentally', 'accidents', 'according',
       ...
       'yesterday', 'yet', 'you', 'young', 'younger', 'youngest', 'your',
       'yours', 'yourself', 'zoomed'],
      dtype='object', length=2494)

In [275]:
df = df[valid_words.index]

In [276]:
df

Unnamed: 0_level_0,a,able,about,above,absurd,accept,accepted,accidentally,accidents,according,...,yesterday,yet,you,young,younger,youngest,your,yours,yourself,zoomed
_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Sorcerers Stone.txt,0.025712,0.000241,0.003184,0.000121,2.4e-05,2.4e-05,4.8e-05,2.4e-05,2.4e-05,2.4e-05,...,0.000121,0.000507,0.009913,0.000169,4.8e-05,9.6e-05,0.002002,0.000145,0.000193,2.4e-05
2 Chamber of Secrets.txt,0.022353,0.000428,0.002332,0.00025,1.2e-05,1.2e-05,1.2e-05,5.9e-05,1.2e-05,2.4e-05,...,5.9e-05,0.000345,0.009815,9.5e-05,2.4e-05,2.4e-05,0.001761,4.8e-05,9.5e-05,2.4e-05
3 Prisoner of Azkaban.txt,0.021197,0.000343,0.002471,0.0002,1e-05,1.9e-05,1e-05,5.7e-05,1e-05,1.9e-05,...,3.8e-05,0.000487,0.010551,8.6e-05,1.9e-05,1.9e-05,0.001965,7.6e-05,0.000219,8.6e-05
4 Goblet of Fire.txt,0.019706,0.000278,0.002811,0.000246,5e-06,4.3e-05,1.1e-05,3.7e-05,1.1e-05,3.7e-05,...,4.3e-05,0.000386,0.009398,0.000155,4.3e-05,1.6e-05,0.001826,3.7e-05,0.000171,4.8e-05
5 Order of the Phoenix.txt,0.020075,0.000388,0.002425,0.000243,8e-06,2.8e-05,1.2e-05,3.2e-05,3.2e-05,4e-05,...,3.6e-05,0.000505,0.011127,0.000137,8e-06,8e-06,0.002183,6.9e-05,0.000146,9.3e-05
6 Half-Blood Prince.txt,0.020399,0.000503,0.002302,0.000153,1.2e-05,5.5e-05,1.2e-05,1.2e-05,1.2e-05,4.9e-05,...,1.8e-05,0.00043,0.012455,0.000209,7.4e-05,6e-06,0.002314,6.1e-05,0.000172,4.3e-05
7 Deathly Hallows.txt,0.019155,0.000388,0.001956,0.000223,5e-06,6.9e-05,3.2e-05,1.6e-05,1.6e-05,5.8e-05,...,2.7e-05,0.000489,0.010412,0.000234,9.6e-05,2.7e-05,0.001844,0.000117,0.000117,3.2e-05


# Which words are the most frequent overall?
There are multiple ways of approaching this question.

Since we're using a scaled matrix, let's evaluate their frequencies as an average of each word's relative frequency over all of the books:

In [277]:
len(df) # len provides the number of rows in a dataframe

7

We can use `len` in combination with `df.sum()` to easily create an average:

Let's look at what `df.sum()` outputs:

In [278]:
df.sum()[:3]

a        0.148597
able     0.002571
about    0.017481
dtype: float64

In [26]:
df.sum()[-3:]

yours       0.000553
yourself    0.001113
zoomed      0.000350
dtype: float64

In [27]:
df['a']

_filename
1 Sorcerers Stone.txt         0.025712
2 Chamber of Secrets.txt      0.022353
3 Prisoner of Azkaban.txt     0.021197
4 Goblet of Fire.txt          0.019706
5 Order of the Phoenix.txt    0.020075
6 Half-Blood Prince.txt       0.020399
7 Deathly Hallows.txt         0.019155
Name: a, dtype: float64

So now we can use `df.sum()` and simply divide each value in the series by the `len` of df:

In [279]:
avg_freq = df.sum() / len(df)

In [29]:
avg_freq[:5]

a         0.021228
able      0.000367
about     0.002497
above     0.000205
absurd    0.000011
dtype: float64

Our 25 most frequent words contain two character names ('harry', 'ron') along with other high-frequency words that are usually present in English-language documents:

In [30]:
avg_freq.sort_values(ascending=False)[:25]

the      0.050390
and      0.026280
to       0.025690
a        0.021228
of       0.020903
he       0.019339
harry    0.016008
was      0.015512
his      0.014060
said     0.013590
in       0.012398
it       0.012019
you      0.010524
had      0.009809
that     0.009180
at       0.008438
as       0.007308
on       0.007186
i        0.006993
him      0.006619
with     0.006328
they     0.005794
ron      0.005506
for      0.005458
but      0.005015
dtype: float64

# Which words does Rowling use most differently over time?
We can use these averages to compare the frequency of a word in a given text with its average frequency. That shows us whether a word is used more or less often than we would expect, if all of the words were distributed evenly across the books.

Positive values indicate that the observation exceeded the average; negative values indicate that they fell below the average.

The command is amazingly simple. Pandas matches the column names correctly for us:

In [283]:
df + 30

Unnamed: 0_level_0,a,able,about,above,absurd,accept,accepted,accidentally,accidents,according,...,yesterday,yet,you,young,younger,youngest,your,yours,yourself,zoomed
_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Sorcerers Stone.txt,30.025712,30.000241,30.003184,30.000121,30.000024,30.000024,30.000048,30.000024,30.000024,30.000024,...,30.000121,30.000507,30.009913,30.000169,30.000048,30.000096,30.002002,30.000145,30.000193,30.000024
2 Chamber of Secrets.txt,30.022353,30.000428,30.002332,30.00025,30.000012,30.000012,30.000012,30.000059,30.000012,30.000024,...,30.000059,30.000345,30.009815,30.000095,30.000024,30.000024,30.001761,30.000048,30.000095,30.000024
3 Prisoner of Azkaban.txt,30.021197,30.000343,30.002471,30.0002,30.00001,30.000019,30.00001,30.000057,30.00001,30.000019,...,30.000038,30.000487,30.010551,30.000086,30.000019,30.000019,30.001965,30.000076,30.000219,30.000086
4 Goblet of Fire.txt,30.019706,30.000278,30.002811,30.000246,30.000005,30.000043,30.000011,30.000037,30.000011,30.000037,...,30.000043,30.000386,30.009398,30.000155,30.000043,30.000016,30.001826,30.000037,30.000171,30.000048
5 Order of the Phoenix.txt,30.020075,30.000388,30.002425,30.000243,30.000008,30.000028,30.000012,30.000032,30.000032,30.00004,...,30.000036,30.000505,30.011127,30.000137,30.000008,30.000008,30.002183,30.000069,30.000146,30.000093
6 Half-Blood Prince.txt,30.020399,30.000503,30.002302,30.000153,30.000012,30.000055,30.000012,30.000012,30.000012,30.000049,...,30.000018,30.00043,30.012455,30.000209,30.000074,30.000006,30.002314,30.000061,30.000172,30.000043
7 Deathly Hallows.txt,30.019155,30.000388,30.001956,30.000223,30.000005,30.000069,30.000032,30.000016,30.000016,30.000058,...,30.000027,30.000489,30.010412,30.000234,30.000096,30.000027,30.001844,30.000117,30.000117,30.000032


In [280]:
df - avg_freq

Unnamed: 0_level_0,a,able,about,above,absurd,accept,accepted,accidentally,accidents,according,...,yesterday,yet,you,young,younger,youngest,your,yours,yourself,zoomed
_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Sorcerers Stone.txt,0.004484,-0.000126,0.000687,-8.5e-05,1.317925e-05,-1.2e-05,2.9e-05,-1e-05,7.431419e-06,-1.2e-05,...,7.2e-05,5.7e-05,-0.000611,1.38202e-05,4e-06,6.8e-05,1.7e-05,6.6e-05,3.4e-05,-2.6e-05
2 Chamber of Secrets.txt,0.001125,6.1e-05,-0.000166,4.5e-05,9.554399e-07,-2.4e-05,-8e-06,2.5e-05,-4.79239e-06,-1.2e-05,...,1.1e-05,-0.000105,-0.00071,-5.985005e-05,-2.1e-05,-4e-06,-0.000224,-3.1e-05,-6.4e-05,-2.6e-05
3 Prisoner of Azkaban.txt,-3.1e-05,-2.4e-05,-2.6e-05,-5e-06,-1.401257e-06,-1.7e-05,-1e-05,2.3e-05,-7.149087e-06,-1.7e-05,...,-1.1e-05,3.7e-05,2.7e-05,-6.916392e-05,-2.5e-05,-9e-06,-2e-05,-3e-06,6e-05,3.6e-05
4 Goblet of Fire.txt,-0.001522,-8.9e-05,0.000314,4.1e-05,-5.586042e-06,7e-06,-9e-06,3e-06,-5.978947e-06,1e-06,...,-6e-06,-6.4e-05,-0.001126,2.715052e-07,-2e-06,-1.2e-05,-0.000159,-4.2e-05,1.2e-05,-2e-06
5 Order of the Phoenix.txt,-0.001153,2.1e-05,-7.2e-05,3.7e-05,-2.857611e-06,-8e-06,-7e-06,-2e-06,1.564463e-05,4e-06,...,-1.3e-05,5.6e-05,0.000602,-1.760426e-05,-3.6e-05,-2e-05,0.000198,-1e-05,-1.4e-05,4.3e-05
6 Half-Blood Prince.txt,-0.00083,0.000136,-0.000195,-5.2e-05,1.336279e-06,1.9e-05,-7e-06,-2.2e-05,-4.411551e-06,1.3e-05,...,-3.1e-05,-2e-05,0.001931,5.369186e-05,2.9e-05,-2.2e-05,0.000329,-1.8e-05,1.3e-05,-7e-06
7 Deathly Hallows.txt,-0.002073,2.1e-05,-0.000541,1.8e-05,-5.626058e-06,3.3e-05,1.2e-05,-1.8e-05,-7.440709e-07,2.2e-05,...,-2.2e-05,3.9e-05,-0.000112,7.883466e-05,5.1e-05,-1e-06,-0.000141,3.8e-05,-4.2e-05,-1.8e-05


Now we want to compute the *biggest* difference between the low value and the high value for each column:

In [281]:
df_scaled = df - avg_freq

Here is the largest positive difference for each word. In the next step we will take the absolute value with `abs` to make sure that we're seeing the biggest change in either direction:

In [282]:
df_scaled.max()

a               0.004484
able            0.000136
about           0.000687
above           0.000045
absurd          0.000013
accept          0.000033
accepted        0.000029
accidentally    0.000025
accidents       0.000016
according       0.000022
across          0.000156
act             0.000040
acted           0.000027
acting          0.000015
actually        0.000075
add             0.000049
added           0.000063
addressed       0.000056
advice          0.000023
affect          0.000022
afford          0.000048
afraid          0.000043
after           0.000255
afternoon       0.000086
afterward       0.000032
again           0.000440
against         0.000193
age             0.000026
ages            0.000051
ago             0.000038
                  ...   
wrenched        0.000015
wrinkled        0.000031
wrist           0.000074
write           0.000058
writing         0.000122
written         0.000084
wrong           0.000039
wrote           0.000088
yawned          0.000018


In [61]:
# NOTE(review): this subtracts each word's smallest *absolute* deviation from
# its largest absolute deviation. That is not the same as the high-to-low
# range, which would be df_scaled.max() - df_scaled.min().
(df_scaled.abs().max() - df_scaled.abs().min()).sort_values(ascending = False)

a             0.004453
the           0.004183
hermione      0.002822
dumbledore    0.002802
said          0.002736
of            0.002503
on            0.002491
hagrid        0.002472
i             0.002407
they          0.002359
her           0.002348
ron           0.002307
that          0.002290
and           0.002234
to            0.002147
harry         0.002099
uncle         0.001935
he            0.001928
dudley        0.001928
you           0.001904
black         0.001777
not           0.001678
vernon        0.001621
it            0.001582
wand          0.001546
professor     0.001524
very          0.001247
she           0.001208
all           0.001199
mr            0.001147
                ...   
supply        0.000008
crime         0.000008
brushed       0.000008
dozen         0.000008
spots         0.000008
tin           0.000008
behave        0.000008
sallow        0.000007
greeting      0.000007
occasions     0.000007
slapping      0.000007
splattered    0.000007
sunny      

So how do we read these results? Characters like Dumbledore, Dudley, and Sirius Black ('black') have some of the largest discrepancies between their smallest values and their largest values. Of course, at the top of this list we still see many of the highest frequency words because they have largest absolute amounts to gain or lose.

At the bottom we see words that have almost no difference between their highest and lowest values, implying that they are used at very similar rates across all of the books.

Indeed, let's look at 'awkward':

In [68]:
avg_freq['awkward']

2.5798655897512307e-05

In [67]:
df['awkward']

_filename
1 Sorcerers Stone.txt         0.000024
2 Chamber of Secrets.txt      0.000024
3 Prisoner of Azkaban.txt     0.000029
4 Goblet of Fire.txt          0.000021
5 Order of the Phoenix.txt    0.000032
6 Half-Blood Prince.txt       0.000018
7 Deathly Hallows.txt         0.000032
Name: awkward, dtype: float64

In [69]:
df_scaled['awkward']

_filename
1 Sorcerers Stone.txt        -0.000002
2 Chamber of Secrets.txt     -0.000002
3 Prisoner of Azkaban.txt     0.000003
4 Goblet of Fire.txt         -0.000004
5 Order of the Phoenix.txt    0.000007
6 Half-Blood Prince.txt      -0.000007
7 Deathly Hallows.txt         0.000006
Name: awkward, dtype: float64

This is both a word with relatively few instances, and those instances are distributed similarly across the books.

So this result is perhaps overdetermined by the differences in the frequencies of the words. How can we correct that?

# Observations and expectations
We can try to account for this by looking at the number of observed values over the number of expected values: Which word furthest outperformed its average?

In [72]:
(df.abs().max() / avg_freq).sort_values(ascending=False)[:30]

dursley      5.868460
prince       5.844709
yer          5.366949
platforms    5.302555
chamber      5.296560
spiders      5.161556
sword        5.148417
flavor       4.988612
egg          4.823550
hut          4.804517
twins        4.733027
toad         4.704410
lots         4.535718
gaunt        4.509343
storm        4.420254
bus          4.348360
dudley       4.312980
coat         4.290419
shops        4.194387
bronze       4.131053
noses        4.109805
goblin       4.093261
car          4.044799
gringotts    4.035403
package      4.035296
monster      3.993382
lumpy        3.990766
problems     3.976754
bird         3.976557
nick         3.943294
dtype: float64

Now we're getting into the subject matter of the books! There are words that are much more frequent in certain books than they are throughout the rest of the series, like the Dursleys, the "prince" of book 6, the "chamber" of secrets, etc.

Which words most seriously under-perform their averages?

In [74]:
(df.abs().min() / avg_freq).sort_values()[:30]

fudge       0.012596
severus     0.035531
points      0.035930
tent        0.035993
hospital    0.037188
sir         0.037962
field       0.038720
class       0.042556
upon        0.047986
charlie     0.049545
w           0.050747
lily        0.052189
bludger     0.053000
bell        0.056057
team        0.057067
quaffle     0.058731
twins       0.061349
leaky       0.062511
tom         0.062591
envelope    0.064803
norris      0.065580
pomfrey     0.066955
feast       0.068260
package     0.068399
classes     0.068458
shop        0.068813
james       0.071593
goblin      0.072029
lunch       0.074397
among       0.077819
dtype: float64

In this result, we see that there are words that are used much less than expected on average in certain books, like Cornelius Fudge ('fudge') who comes up very little in at least one book. Interesting! Let's see which ones:

In [75]:
(df / avg_freq)['fudge']

_filename
1 Sorcerers Stone.txt         0.057163
2 Chamber of Secrets.txt      0.479293
3 Prisoner of Azkaban.txt     2.283456
4 Goblet of Fire.txt          0.964502
5 Order of the Phoenix.txt    2.126429
6 Half-Blood Prince.txt       1.076561
7 Deathly Hallows.txt         0.012596
Name: fudge, dtype: float64

So, if we can assume that most instances of 'fudge' refer to the character Cornelius Fudge, then we can see that *Azkaban* and *Phoenix* have a lot to do with him, where he shows up very little in others.

# Calculate type-token ratio with DTM
Let's compare the type-token ratios of each of the books in our corpus. We're going to make a new DataFrame with unscaled counts:

In [76]:
df = make_dtm(hp_dir, scaled = False)

In [77]:
df

Unnamed: 0_level_0,a,aaaaaaaaargh,aaaaaaaarrrrrgh,aaaaaaand,aaaaaand,aaaaahed,aaaaargh,aaaah,aaah,aargh,...,zograf,zombie,zone,zonko,zoo,zoological,zoom,zoomed,zooming,éclairs
_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Sorcerers Stone.txt,1066,,,,,,,,,,...,,2.0,,,7.0,,1.0,1,2.0,1.0
2 Chamber of Secrets.txt,1879,,,,,,,,,,...,,,,,2.0,,,2,,
3 Prisoner of Azkaban.txt,2222,,,,,,,,,,...,,1.0,,1.0,,,,9,3.0,
4 Goblet of Fire.txt,3680,,1.0,1.0,1.0,1.0,,,1.0,,...,1.0,,,,,1.0,4.0,9,12.0,
5 Order of the Phoenix.txt,4967,1.0,,,,,1.0,1.0,,2.0,...,,,1.0,,,,2.0,23,7.0,
6 Half-Blood Prince.txt,3323,,,,,,1.0,1.0,,,...,,,,,,,,7,2.0,1.0
7 Deathly Hallows.txt,3604,,,,,,,,,1.0,...,,,,,,,1.0,6,5.0,


Let's get our sum of tokens. We have to ask Pandas to sum *across* the columns since we want all of the words in each book, which correspond with our rows:

In [80]:
tokens = df.sum(axis = 'columns')

In [81]:
tokens

_filename
1 Sorcerers Stone.txt          41459.0
2 Chamber of Secrets.txt       84059.0
3 Prisoner of Azkaban.txt     104825.0
4 Goblet of Fire.txt          186744.0
5 Order of the Phoenix.txt    247422.0
6 Half-Blood Prince.txt       162903.0
7 Deathly Hallows.txt         188150.0
dtype: float64

To get our types, we need one more dataframe trick, `transpose`. Transpose, which is called with `df.T`, flips our dataframe such that our rows become columns, and our columns become rows.

We want the total number of columns in each row that are not null (i.e. there is at least 1 instance of the word in the text.)

In [82]:
df.T.head()

_filename,1 Sorcerers Stone.txt,2 Chamber of Secrets.txt,3 Prisoner of Azkaban.txt,4 Goblet of Fire.txt,5 Order of the Phoenix.txt,6 Half-Blood Prince.txt,7 Deathly Hallows.txt
a,1066.0,1879.0,2222.0,3680.0,4967.0,3323.0,3604.0
aaaaaaaaargh,,,,,1.0,,
aaaaaaaarrrrrgh,,,,1.0,,,
aaaaaaand,,,,1.0,,,
aaaaaand,,,,1.0,,,


Now when we sum the number of values that are `notnull`, we get the result we want:

In [83]:
types = df.T.notnull().sum()

In [84]:
types

_filename
1 Sorcerers Stone.txt          4314
2 Chamber of Secrets.txt       6823
3 Prisoner of Azkaban.txt      7335
4 Goblet of Fire.txt          10054
5 Order of the Phoenix.txt    11760
6 Half-Blood Prince.txt       10017
7 Deathly Hallows.txt         10781
dtype: int64

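Rowling did not quite linearly increase the vocabulary she used over the series. Note that the cell below divides tokens by types, which is the *inverse* of the type-token ratio (types / tokens): it tells us how many times each type is used on average, so lower values indicate a more varied vocabulary relative to the book's length: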

In [85]:
(tokens / types).sort_values()

_filename
1 Sorcerers Stone.txt          9.610338
2 Chamber of Secrets.txt      12.319947
3 Prisoner of Azkaban.txt     14.291070
6 Half-Blood Prince.txt       16.262653
7 Deathly Hallows.txt         17.451999
4 Goblet of Fire.txt          18.574100
5 Order of the Phoenix.txt    21.039286
dtype: float64

# Using NLTK for high-frequency words
Now we need to talk about which words *not* to count. It's easy to say why we shouldn't count typos. It becomes more complicated to talk about why we might not want to count high-frequency words, also known as "stopwords."

Why do we want to skip them? Because their very frequency will override other signals we want to look at.

But we certainly need to know which words these are. There are standard lists in some packages like NLTK, which stands for "Natural Language Toolkit."

To start, run the following commands:

In [275]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

This opens a new window, which will allow you to download NLTK data. The collection "book" should be sufficient for this class.

In [86]:
from nltk.corpus import stopwords

This will show you the beginning of the stopword list:

In [87]:
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

There are quite a few!

In [88]:
len(stopwords.words('english'))

179

## Skipping stopwords
How would we use stopwords to determine which words to keep in our lists of words?

This is one way using familiar techniques:

In [12]:
s = 'which words in this string will get cut if we filter it using the list of stopwords'
l = s.split(' ')

# Fetch the stopwords once and store them in a set: calling
# stopwords.words('english') inside the loop asks NLTK for the full list again
# on every iteration, and checking membership in a set is faster than in a list.
english_stops = set(stopwords.words('english'))

not_stops = []

for word in l:
    if word not in english_stops:
        not_stops.append(word)

not_stops

['words', 'string', 'get', 'cut', 'filter', 'using', 'list', 'stopwords']

We could also drop stopwords from our dataframe's columns if we wanted to:

In [90]:
# Fetch the stopwords once, as a set, instead of asking NLTK for the full list
# again for every one of the column names.
english_stops = set(stopwords.words('english'))

columns_no_stops = []
for x in list(df.columns):
    if x not in english_stops:
        columns_no_stops.append(x)

You can then pass Pandas a list of column names to keep only those columns:

In [91]:
df[columns_no_stops]

Unnamed: 0_level_0,aaaaaaaaargh,aaaaaaaarrrrrgh,aaaaaaand,aaaaaand,aaaaahed,aaaaargh,aaaah,aaah,aargh,ab,...,zograf,zombie,zone,zonko,zoo,zoological,zoom,zoomed,zooming,éclairs
_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Sorcerers Stone.txt,,,,,,,,,,,...,,2.0,,,7.0,,1.0,1,2.0,1.0
2 Chamber of Secrets.txt,,,,,,,,,,,...,,,,,2.0,,,2,,
3 Prisoner of Azkaban.txt,,,,,,,,,,,...,,1.0,,1.0,,,,9,3.0,
4 Goblet of Fire.txt,,1.0,1.0,1.0,1.0,,,1.0,,,...,1.0,,,,,1.0,4.0,9,12.0,
5 Order of the Phoenix.txt,1.0,,,,,1.0,1.0,,2.0,1.0,...,,,1.0,,,,2.0,23,7.0,
6 Half-Blood Prince.txt,,,,,,1.0,1.0,,,,...,,,,,,,,7,2.0,1.0
7 Deathly Hallows.txt,,,,,,,,,1.0,,...,,,,,,,1.0,6,5.0,
