# A couple of quick things from HW05

When you set a variable name as `True` or `False` in a function declaration, that allows your user to decide *how* they want the function to run. It can be used to determine which parts of the function get run in a particular case.

For example:

In [1]:
def weird_print(my_string, reverse_it = False):
    if reverse_it is True:
        print(my_string[::-1]) # this reverses each character in the string
    else:
        print(my_string)

In [2]:
weird_print('check it out')

check it out


In [3]:
weird_print('check it out', reverse_it=True)

tuo ti kcehc


# Correlator
We're going to use our document-term matrix to create a version of "Correlator" from Heuser and Le-Khac's pamphlet.

First, we need to load up our DTM:

In [4]:
import os

def absolute_paths(directory, txt_only = True):
    files = os.listdir(directory)
    absolute_paths = []
    
    for file in files:
        path = os.path.join(directory, file)
        absolute_paths.append(path)
    
    if txt_only is True:
        txts = []
        for x in absolute_paths:
            if str('.txt') in str(x):
                txts.append(x)
        return txts
    
    else:        
        return absolute_paths

In [5]:
import string
import re

def tokenize(text, keep_punct = False):
    if keep_punct is True:
        for punct in string.punctuation:
            text = text.replace(punct, ' ' + punct + ' ')
    else:
        for punct in string.punctuation:
            text = text.replace(punct, ' ')
    
    # this replaces *any* amount of whitespace with a single space using regular expressions
    text = re.sub('\s+', ' ', text)
    
    result = []
    
    for x in text.lower().split(' '):
        if x.isalpha():
            result.append(x)
    
    return result

In [6]:
def count_words(word_list):
    d = {}
    
    for word in word_list:
        if word not in d:
            d[word] = 1
        else:
            d[word] += 1
    
    return d

In [7]:
import pandas as pd

def make_dtm(directory, scaled = False):
    files = absolute_paths(directory)
    
    result = [] # empty list where I will append the dictionaries of word counts
    
    for file in files: # looping over the results
        text = open(file).read() # read in text file
        tokens = tokenize(text) # make tokens list
        d = count_words(tokens) # use count_words to create a dictionary
        
        if scaled is True:
            total_words = sum(list(d.values()))
            for key,value in d.items():
                d[key] = d[key] / total_words
        
        # os.path.split() returns the base path and the filename as a pair:
        d['_filename'] = os.path.split(file)[-1] # include the _ before filename in case the text contains "filename"
        result.append(d) # append the unscaled result
    
    return pd.DataFrame(result).set_index('_filename').sort_index()

In [8]:
hp_dir = '/Users/e/code/literarytextmining/corpora/harry_potter/texts'
df = make_dtm(hp_dir, scaled = True)

In [9]:
df

Unnamed: 0_level_0,a,aaaaaaaaargh,aaaaaaaarrrrrgh,aaaaaaand,aaaaaand,aaaaahed,aaaaargh,aaaah,aaah,aargh,...,zograf,zombie,zone,zonko,zoo,zoological,zoom,zoomed,zooming,éclairs
_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Sorcerers Stone.txt,0.025712,,,,,,,,,,...,,4.8e-05,,,0.000169,,2.4e-05,2.4e-05,4.8e-05,2.4e-05
2 Chamber of Secrets.txt,0.022353,,,,,,,,,,...,,,,,2.4e-05,,,2.4e-05,,
3 Prisoner of Azkaban.txt,0.021197,,,,,,,,,,...,,1e-05,,1e-05,,,,8.6e-05,2.9e-05,
4 Goblet of Fire.txt,0.019706,,5e-06,5e-06,5e-06,5e-06,,,5e-06,,...,5e-06,,,,,5e-06,2.1e-05,4.8e-05,6.4e-05,
5 Order of the Phoenix.txt,0.020075,4e-06,,,,,4e-06,4e-06,,8e-06,...,,,4e-06,,,,8e-06,9.3e-05,2.8e-05,
6 Half-Blood Prince.txt,0.020399,,,,,,6e-06,6e-06,,,...,,,,,,,,4.3e-05,1.2e-05,6e-06
7 Deathly Hallows.txt,0.019155,,,,,,,,,5e-06,...,,,,,,,5e-06,3.2e-05,2.7e-05,


Let's cut it down to size for words for which we have data:

In [10]:
words_in_each_book = df.sum(min_count = 7)

In [11]:
my_columns = words_in_each_book[words_in_each_book.notnull()].index

In [12]:
my_columns

Index(['a', 'able', 'about', 'above', 'absurd', 'accept', 'accepted',
       'accidentally', 'accidents', 'according',
       ...
       'yesterday', 'yet', 'you', 'young', 'younger', 'youngest', 'your',
       'yours', 'yourself', 'zoomed'],
      dtype='object', length=2494)

In [13]:
df = df[my_columns]

In [14]:
df

Unnamed: 0_level_0,a,able,about,above,absurd,accept,accepted,accidentally,accidents,according,...,yesterday,yet,you,young,younger,youngest,your,yours,yourself,zoomed
_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 Sorcerers Stone.txt,0.025712,0.000241,0.003184,0.000121,2.4e-05,2.4e-05,4.8e-05,2.4e-05,2.4e-05,2.4e-05,...,0.000121,0.000507,0.009913,0.000169,4.8e-05,9.6e-05,0.002002,0.000145,0.000193,2.4e-05
2 Chamber of Secrets.txt,0.022353,0.000428,0.002332,0.00025,1.2e-05,1.2e-05,1.2e-05,5.9e-05,1.2e-05,2.4e-05,...,5.9e-05,0.000345,0.009815,9.5e-05,2.4e-05,2.4e-05,0.001761,4.8e-05,9.5e-05,2.4e-05
3 Prisoner of Azkaban.txt,0.021197,0.000343,0.002471,0.0002,1e-05,1.9e-05,1e-05,5.7e-05,1e-05,1.9e-05,...,3.8e-05,0.000487,0.010551,8.6e-05,1.9e-05,1.9e-05,0.001965,7.6e-05,0.000219,8.6e-05
4 Goblet of Fire.txt,0.019706,0.000278,0.002811,0.000246,5e-06,4.3e-05,1.1e-05,3.7e-05,1.1e-05,3.7e-05,...,4.3e-05,0.000386,0.009398,0.000155,4.3e-05,1.6e-05,0.001826,3.7e-05,0.000171,4.8e-05
5 Order of the Phoenix.txt,0.020075,0.000388,0.002425,0.000243,8e-06,2.8e-05,1.2e-05,3.2e-05,3.2e-05,4e-05,...,3.6e-05,0.000505,0.011127,0.000137,8e-06,8e-06,0.002183,6.9e-05,0.000146,9.3e-05
6 Half-Blood Prince.txt,0.020399,0.000503,0.002302,0.000153,1.2e-05,5.5e-05,1.2e-05,1.2e-05,1.2e-05,4.9e-05,...,1.8e-05,0.00043,0.012455,0.000209,7.4e-05,6e-06,0.002314,6.1e-05,0.000172,4.3e-05
7 Deathly Hallows.txt,0.019155,0.000388,0.001956,0.000223,5e-06,6.9e-05,3.2e-05,1.6e-05,1.6e-05,5.8e-05,...,2.7e-05,0.000489,0.010412,0.000234,9.6e-05,2.7e-05,0.001844,0.000117,0.000117,3.2e-05


# Writing correlator

Heuser and Le-Khac describe their correlator like so:

> To do so, we made use of a feature of the novelistic database Matthew Jockers had designed: a data-table of the number of occurrences of each word in our corpus. From this, we selected the words that appeared at least once in each decade of the nineteenth century, creating a new data-table of the se- lected words’ frequencies of appearance.3 We used normalized frequencies—the number of occurrences of a given word in a given decade, divided by the total number of word- occurrences in that decade—to correct for the over-representation of late century texts in our corpus. Then, we built a script to loop through each unique word-to-word comparison, calculate the degree of correlation between the two words’ decade-by-decade frequen- cies, and store this information in a new data-table. As a measure of correlation, we used the Pearson product-moment correlation coefficient, a simple and widely-used statistical measure of the covariance of two numerical series, converted into standard deviations so that differences in scale were ignored4. (This scale-invariance was important, as we hoped to find words that behaved similarly despite differences in their overall frequencies.)

Now that we have this document-term matrix, we can easily make one of our own.

Remember, correlation is not causation, but it can show us whether and how words "move together."

With Pandas, we can easily correlate all of the columns of a DataFrame with each other, giving us a new correlation matrix:

In [15]:
corrs = df.corr()

In [16]:
corrs

Unnamed: 0,a,able,about,above,absurd,accept,accepted,accidentally,accidents,according,...,yesterday,yet,you,young,younger,youngest,your,yours,yourself,zoomed
a,1.000000,-0.452957,0.714343,-0.634130,0.954499,-0.596491,0.617015,0.142322,0.214967,-0.671289,...,0.936722,0.121581,-0.283874,-0.260596,-0.217702,0.874082,-0.052842,0.523618,0.262625,-0.392595
able,-0.452957,1.000000,-0.775012,0.147052,-0.310355,0.268301,-0.498790,-0.143118,-0.181401,0.432767,...,-0.684735,-0.277239,0.745634,0.164652,0.197033,-0.664819,0.416372,-0.402871,-0.439619,0.034746
about,0.714343,-0.775012,1.000000,-0.458758,0.665811,-0.462024,0.390882,0.081576,0.216598,-0.587046,...,0.801773,0.091270,-0.420420,-0.235979,-0.331880,0.688034,-0.006262,0.230448,0.562374,-0.101896
above,-0.634130,0.147052,-0.458758,1.000000,-0.806128,-0.062758,-0.615364,0.484011,-0.099691,0.112292,...,-0.492245,-0.437171,-0.342931,-0.332560,-0.330268,-0.644551,-0.493720,-0.639231,-0.570150,0.279729
absurd,0.954499,-0.310355,0.665811,-0.806128,1.000000,-0.414568,0.655769,-0.111351,0.276006,-0.486530,...,0.851838,0.221386,-0.024904,-0.047774,-0.058131,0.842198,0.204366,0.580423,0.315114,-0.396184
accept,-0.596491,0.268301,-0.462024,-0.062758,-0.414568,1.000000,0.150697,-0.802943,-0.116636,0.933934,...,-0.517021,0.159594,0.354467,0.914910,0.874706,-0.247319,0.183983,0.153125,-0.157178,-0.210428
accepted,0.617015,-0.498790,0.390882,-0.615364,0.655769,0.150697,1.000000,-0.477066,0.373075,0.044649,...,0.716435,0.498841,-0.245543,0.463534,0.439264,0.896089,-0.099819,0.931711,0.061593,-0.542812
accidentally,0.142322,-0.143118,0.081576,0.484011,-0.111351,-0.802943,-0.477066,1.000000,-0.315073,-0.782133,...,0.098327,-0.411753,-0.491250,-0.948997,-0.755990,-0.142884,-0.523607,-0.449422,-0.002643,0.257406
accidents,0.214967,-0.181401,0.216598,-0.099691,0.276006,-0.116636,0.373075,-0.315073,1.000000,0.103844,...,0.317487,0.605315,0.123399,0.112075,-0.262632,0.284937,0.394459,0.406573,-0.084161,0.279475
according,-0.671289,0.432767,-0.587046,0.112292,-0.486530,0.933934,0.044649,-0.782133,0.103844,1.000000,...,-0.591220,0.133708,0.439710,0.863956,0.732671,-0.380663,0.264710,0.048162,-0.388389,-0.116930


Now, we can write a little function to show the correlations for any word in the list:

In [34]:
def my_correlator(word, corrs, n_corrs = 15):
    values = corrs[word]
    highs = values.sort_values(ascending = False)[:n_corrs] # this puts the most positive number at the top
    lows = values.sort_values(ascending = True)[:n_corrs] # this puts the most negative number at the top
    print('-'*50)
    print('Most strongly positive correlations with \"{}\": '.format(word))
    print(highs)
    print('-'*50)
    print('Most strongly negative correlations with \"{}\": '.format(word))
    print(lows)
    print('-'*50)

In [156]:
my_correlator('hagrid', corrs)

--------------------------------------------------
Most strongly positive correlations with "hagrid": 
hagrid      1.000000
knows       0.993978
owl         0.991770
uncle       0.987799
nervous     0.986590
station     0.986559
nine        0.983492
ordinary    0.983129
barked      0.982662
allowed     0.982459
balls       0.981992
vernon      0.980256
lock        0.980218
everyone    0.979804
privet      0.979674
Name: hagrid, dtype: float64
--------------------------------------------------
Most strongly negative correlations with "hagrid": 
split       -0.966914
closed      -0.959544
longer      -0.939648
of          -0.939006
to          -0.932863
clear       -0.926546
is          -0.913224
invisible   -0.900331
folded      -0.899863
bellowed    -0.899033
free        -0.898842
return      -0.898589
lifted      -0.895958
anymore     -0.895881
below       -0.881991
Name: hagrid, dtype: float64
--------------------------------------------------


What do these results tell us? First, the positive correlates: They show that when 'hagrid' increases, 'knows' changes in a *proportionally similar* way each time.

That is, when 'hagrid' goes up, 'knows' goes up. When 'hagrid' goes down, 'knows' also goes down:

In [67]:
df[['hagrid', 'knows']]

Unnamed: 0_level_0,hagrid,knows
_filename,Unnamed: 1_level_1,Unnamed: 2_level_1
1 Sorcerers Stone.txt,0.00439,0.000338
2 Chamber of Secrets.txt,0.001606,0.000202
3 Prisoner of Azkaban.txt,0.001917,0.0002
4 Goblet of Fire.txt,0.001665,0.000193
5 Order of the Phoenix.txt,0.001495,0.00019
6 Half-Blood Prince.txt,0.001056,0.00016
7 Deathly Hallows.txt,0.000702,0.000133


Strongly negative correlates show an inverse relationship: When 'hagrid' goes down, 'split' goes up:

In [68]:
df[['hagrid', 'split']]

Unnamed: 0_level_0,hagrid,split
_filename,Unnamed: 1_level_1,Unnamed: 2_level_1
1 Sorcerers Stone.txt,0.00439,2.4e-05
2 Chamber of Secrets.txt,0.001606,9.5e-05
3 Prisoner of Azkaban.txt,0.001917,7.6e-05
4 Goblet of Fire.txt,0.001665,8e-05
5 Order of the Phoenix.txt,0.001495,8.5e-05
6 Half-Blood Prince.txt,0.001056,0.000117
7 Deathly Hallows.txt,0.000702,0.000117


As you can already see, one of the problems here is that we sometimes compare relatively common words to relatively rare ones. We can improve on this by cutting our DataFrame again to words with a certain total frequency.

Below, we recalculate our dataframe and only retain words that have 10 or more instances *in each book*.

In [239]:
df = make_dtm(hp_dir, scaled = False)

`.all()` allows us to check a dataframe (or a slice) to see if all of its cells in each column evaluate to `True`.

In [240]:
df['the']

_filename
1 Sorcerers Stone.txt          2073
2 Chamber of Secrets.txt       4275
3 Prisoner of Azkaban.txt      5368
4 Goblet of Fire.txt           9494
5 Order of the Phoenix.txt    11902
6 Half-Blood Prince.txt        7617
7 Deathly Hallows.txt         10341
Name: the, dtype: int64

In [241]:
df['the'] > 10

_filename
1 Sorcerers Stone.txt         True
2 Chamber of Secrets.txt      True
3 Prisoner of Azkaban.txt     True
4 Goblet of Fire.txt          True
5 Order of the Phoenix.txt    True
6 Half-Blood Prince.txt       True
7 Deathly Hallows.txt         True
Name: the, dtype: bool

In [243]:
(df['the'] > 10).all()

True

But in cases where only some books meet our criteria:

In [163]:
df['crumpet'] >= 1

_filename
1 Sorcerers Stone.txt         False
2 Chamber of Secrets.txt      False
3 Prisoner of Azkaban.txt      True
4 Goblet of Fire.txt          False
5 Order of the Phoenix.txt     True
6 Half-Blood Prince.txt       False
7 Deathly Hallows.txt         False
Name: crumpet, dtype: bool

In [164]:
(df['crumpet'] > 10).all()

False

So, if we create a new dataframe with all of our results, we can make another filter for our data:

In [165]:
gt10 = df > 10

In [166]:
gt10.all()[:5]

a                   True
aaaaaaaaargh       False
aaaaaaaarrrrrgh    False
aaaaaaand          False
aaaaaand           False
dtype: bool

In [167]:
df.sum()[gt10.all()]

a            20741.0
about         2455.0
across         548.0
after         1143.0
again         1932.0
against        640.0
air            525.0
all           4116.0
almost         390.0
along          491.0
already        463.0
always         396.0
an            2246.0
and          26557.0
another        688.0
any            886.0
anyone         366.0
anything       696.0
are           1830.0
arms           287.0
around        2225.0
arrived        202.0
as            7477.0
ask            335.0
asked         1090.0
at            8575.0
aunt           364.0
away          1109.0
back          3155.0
bad            224.0
              ...   
went           655.0
were          4198.0
what          3243.0
when          2222.0
where         1176.0
which         1428.0
while          678.0
whispered      428.0
who           2925.0
whole          350.0
why            718.0
will          1317.0
window         428.0
with          6454.0
without        568.0
wizard         479.0
wizards      

This gives us just 457 words of our original 19,911 that meet our criteria for analysis. Again, those criteria are: each word must appear in every single *Harry Potter* book, and in each of its appearances, it must appear 10 or more times.

Let's use this to further cut down our dataframe for correlation:

In [169]:
my_cols = df.sum()[gt10.all()].index # calling index at the end gets the words that meet our criteria

In [170]:
my_cols

Index(['a', 'about', 'across', 'after', 'again', 'against', 'air', 'all',
       'almost', 'along',
       ...
       'would', 'wrong', 'year', 'years', 'yeh', 'yelled', 'yes', 'yet', 'you',
       'your'],
      dtype='object', length=457)

Now, I'm going to recreate my dataframe as a scaled version so that our correlation isn't messed up by the different lengths of the books:

In [172]:
df = make_dtm(hp_dir, scaled = True)

We're going to run the correlation *only on the columns that meet our criteria*, i.e. `my_cols`:

In [173]:
new_corr = df[my_cols].corr()

In [174]:
new_corr

Unnamed: 0,a,about,across,after,again,against,air,all,almost,along,...,would,wrong,year,years,yeh,yelled,yes,yet,you,your
a,1.000000,0.714343,-0.552382,-0.726172,-0.729902,-0.705164,0.131147,0.867714,0.810412,-0.623688,...,-0.849800,0.604382,0.230158,0.926257,0.871785,0.153888,0.572620,0.121581,-0.283874,-0.052842
about,0.714343,1.000000,-0.116899,-0.734824,-0.403272,-0.744638,0.236365,0.862675,0.832297,-0.487316,...,-0.669423,-0.006695,0.281683,0.684524,0.842393,0.052070,0.593145,0.091270,-0.420420,-0.006262
across,-0.552382,-0.116899,1.000000,0.389497,0.744860,0.513963,-0.018243,-0.513688,-0.538109,0.808975,...,0.112851,-0.624386,0.113408,-0.660056,-0.405487,0.589928,-0.705200,0.039700,-0.282798,-0.292703
after,-0.726172,-0.734824,0.389497,1.000000,0.730875,0.809722,-0.061339,-0.899050,-0.840304,0.466760,...,0.663710,0.016309,-0.345733,-0.587964,-0.578245,0.118186,-0.387902,0.346175,0.607782,0.342802
again,-0.729902,-0.403272,0.744860,0.730875,1.000000,0.753237,-0.101307,-0.704302,-0.678470,0.794891,...,0.480243,-0.481969,-0.114225,-0.670582,-0.501183,0.197048,-0.473699,0.361412,0.350159,0.316998
against,-0.705164,-0.744638,0.513963,0.809722,0.753237,1.000000,-0.552792,-0.911428,-0.717889,0.734865,...,0.600051,-0.111678,0.172378,-0.774296,-0.776276,0.269226,-0.690405,-0.034193,0.511674,0.225336
air,0.131147,0.236365,-0.018243,-0.061339,-0.101307,-0.552792,1.000000,0.283929,-0.115110,-0.230598,...,-0.308703,-0.040757,-0.830790,0.367629,0.452084,-0.019428,0.324327,0.599465,-0.377740,-0.228968
all,0.867714,0.862675,-0.513688,-0.899050,-0.704302,-0.911428,0.283929,1.000000,0.896571,-0.658981,...,-0.732781,0.198627,0.131775,0.854738,0.862988,-0.158795,0.696273,0.037649,-0.415762,-0.067095
almost,0.810412,0.832297,-0.538109,-0.840304,-0.678470,-0.717889,-0.115110,0.896571,1.000000,-0.689913,...,-0.534951,0.234821,0.472907,0.739244,0.729581,-0.226041,0.684937,-0.199641,-0.179687,0.128519
along,-0.623688,-0.487316,0.808975,0.466760,0.794891,0.734865,-0.230598,-0.658981,-0.689913,1.000000,...,0.214650,-0.509510,0.087062,-0.757478,-0.643373,0.475868,-0.856010,0.026332,-0.070194,-0.198272


In [175]:
my_correlator('bad', new_corr)

--------------------------------------------------
Most strongly positive correlations with "bad": 
bad           1.000000
seamus        0.968107
pair          0.964503
five          0.961552
start         0.956068
on            0.952860
snapped       0.948121
ten           0.942005
knows         0.935258
got           0.933499
mcgonagall    0.924770
something     0.923745
week          0.917303
getting       0.910475
a             0.903803
Name: bad, dtype: float64
--------------------------------------------------
Most strongly negative correlations with "bad": 
light      -0.911939
own        -0.898202
is         -0.891130
to         -0.876991
of         -0.862705
felt       -0.861041
would      -0.849656
moment     -0.833288
both       -0.823459
still      -0.813622
after      -0.799030
did        -0.776998
hermione   -0.773693
who        -0.766327
done       -0.762777
Name: bad, dtype: float64
--------------------------------------------------


You will find that, as the size of your corpus increases, your correlation values will get smaller.

Heuser and Le-Khac express surprise that some words they expected to more together did not. We can easily check to see whether and how words move together by slightly altering our function to take a list of words:

In [236]:
def do_they_correlate(words, corrs):
    for word in words:
        values = corrs[word] # get correlation data for each word
        words_copy = words.copy() # to not mess up the original word list
        words_copy.remove(word) # no need to test against itself
        
        for test in words_copy:
            result = round(values[test],2)
            output = "{} and {} correlation:".format(test, word)
            spaces = (40 - len(output)) * ' '
            print('{}{}{}'.format(output,spaces,result))
            print('-'*50)

In [237]:
do_they_correlate(['wizard', 'hermione', 'harry', 'ron', 'malfoy', 'dumbledore'], new_corr)

hermione and wizard correlation:        -0.66
--------------------------------------------------
harry and wizard correlation:           0.19
--------------------------------------------------
ron and wizard correlation:             0.0
--------------------------------------------------
malfoy and wizard correlation:          0.22
--------------------------------------------------
dumbledore and wizard correlation:      -0.44
--------------------------------------------------
wizard and hermione correlation:        -0.66
--------------------------------------------------
harry and hermione correlation:         -0.11
--------------------------------------------------
ron and hermione correlation:           0.35
--------------------------------------------------
malfoy and hermione correlation:        -0.41
--------------------------------------------------
dumbledore and hermione correlation:    0.03
--------------------------------------------------
wizard and harry correlation:       

# Creating cohorts

Now we're going to extend to the next part of Heuser and Le-Khac's analysis, where we propose cohorts ourselves:

In [195]:
def word_to_cohort(word, cor_matrix = gale_corr, threshold = 0.5):
    '''
    cohort base represents the user-proposed word to create a cohort.
    population begins with the total number of words in the corpus.
    '''
    
    population = list(cor_matrix.columns)
    population.remove(word) # drop processed word from list of candidates 
    
    values = cor_matrix[word][population]

    if (values < threshold).any(): # not taking absolute values; interested in positively correlated words
        # drop weak links
        dropped = list(values[values < threshold].index)
        for drop in dropped:
            population.remove(drop)

    if len(population) == 0:
        print('No viable cohort for {}'.format(word))
        return

    # using that baseline, test the correlations of each of the other population elements
    # add the one that correlates above the threshold with the greatest number of other elements to the list
    results = {}
    
    for candidate in population:
        values = cor_matrix[candidate][population]
        results[candidate] = len(values[values > threshold])
    
    cohort_words = []
    for k,v in results.items():
        if v == max(results.values()):
            cohort_words.append(k)
    
    print('cohort is: {}'.format(l))
    
    return cohort_words

`average` and `intuition` share a ton of correlates.

In [251]:
def corr_cohort(cohort, scaled_freqs):
    # ^ this is always going to start with a full copy of the original scaled freqs
    combined_vector = scaled_freqs[cohort].sum(axis = 'columns') # sum across the columns to get a new vector
    for x in cohort: # can't drop all of them because some will be repeats in the loop
        try:
            scaled_freqs = scaled_freqs.drop(x, axis = 'columns')
        except:
            continue
            
    scaled_freqs['my_cohort'] = combined_vector
    return scaled_freqs.corr()

In [217]:
words_to_cohort(cohort, gale_copy.corr())

cohort is: ['intuition', 'average', 'my_cohort', 'san', 'twinkle']


['intuition', 'average', 'my_cohort', 'san', 'twinkle']

In [422]:
def words_to_cohort(my_cohort, cor_matrix, threshold = 0.9):
    population = list(cor_matrix.columns)
    
    for word in my_cohort: # on the off-chance words weren't succesffully removed previously
        try:
            population.remove(word)
        except:
            continue
            
    population.remove('my_cohort') # make sure this is not in the column names
    
    values = cor_matrix['my_cohort'][population] # column name always fixed as it is always overwritten

    if (values < threshold).any(): # not taking absolute values; interested in positively correlated words
        # drop weak links
        dropped = list(values[values < threshold].index)
        for drop in dropped:
            population.remove(drop)

    if len(population) == 0:
        print('No viable additional cohort members')
        return 

    # using that baseline, test the correlations of each of the other population elements
    # add the one that correlates above the threshold with the greatest number of other elements to the list
    results = {}

    for candidate in population:
        values = cor_matrix[candidate][population]
        results[candidate] = len(values[values > threshold])

    for k,v in results.items():
        if v == max(results.values()):
            return k

In [252]:
test_scaled_freqs = gale.copy()
test_scaled_freqs.set_index('decade', inplace = True)

In [446]:
test_scaled_freqs = pd.read_csv('/Users/e/Desktop/gale_5years_scaled.csv')
test_scaled_freqs.set_index('fives', inplace = True)
# this only retains 1820-1915
test_scaled_freqs = test_scaled_freqs.iloc[7:-1]

In [284]:
cohort = ['idle', 'cares', 'sober', 'yielding', 'patient', 'kind', 'lithe', 'spotless', 'indifferent', 'offered',
          'enjoying', 'charitable', 'joyful', 'impetuous']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [294]:
cohort = ['god', 'jesus', 'divine', 'holy', 'sacred'] # very sturdy
test_m = corr_cohort(cohort, test_scaled_freqs)

In [310]:
cohort = ['average', 'ordinary', 'normal', 'middle', 'class', 'people']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [316]:
cohort = ['average', 'american'] # lol first result is "fools"
test_m = corr_cohort(cohort, test_scaled_freqs)

In [321]:
cohort = ['citizen'] # excellent results
test_m = corr_cohort(cohort, test_scaled_freqs)

In [323]:
cohort = ['person', 'people'] # very good results
test_m = corr_cohort(cohort, test_scaled_freqs)

In [336]:
cohort = ['average', 'normal', 'straight', 'queer', 'type'] # very good results
test_m = corr_cohort(cohort, test_scaled_freqs)

In [395]:
cohort = ['average', 'normal', 'straight', 'queer', 'type'] # very good results
test_m = corr_cohort(cohort, test_scaled_freqs)

In [393]:
cohort = ['type', 'person', 'generally', 'expect', 'opinion', 'formed', 'contrary', 'expectation', 'every',
          'probable', 'particular']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [356]:
cohort = ['every', 'most', 'numerous', 'opinion', 'highly', 'calculated', 'native',
         'sufficient', 'particulars', 'peculiarly']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [381]:
cohort = ['liberty', 'freedom', 'equally'] # this outputs results pertaining to the restriction of freedom!
test_m = corr_cohort(cohort, test_scaled_freqs)

In [362]:
cohort = ['female', 'domestic', 'graces', 'appear', 'appears', 'sensible']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [366]:
cohort = ['government', 'constitution', 'liberal', 'governed', 'equally']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [369]:
cohort = ['rural', 'city', 'country']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [384]:
cohort = ['middle', 'class', 'people']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [389]:
cohort = ['foreign', 'compared', 'anticipate']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [406]:
# the word most strongly *negatively* correlated with average is 'heaven'
# which says something about the secularization hypothesis...
cohort = ['average', 'nowadays']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [414]:
# strongest correlate of photograph telegram and telegraph is average
# obviously it's sensitive to history
cohort = ['photograph', 'telegram', 'telegraph']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [425]:
cohort = ['love']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [458]:
cohort = ['fear', 'pity', 'joy', 'love', 'hate']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [429]:
cohort = ['slave'] # third correlate is "freely..."
test_m = corr_cohort(cohort, test_scaled_freqs)

In [461]:
cohort = ['america', 'american', 'united', 'states'] # extremely good results
test_m = corr_cohort(cohort, test_scaled_freqs)

In [433]:
cohort = ['average', 'human']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [436]:
cohort = ['intuition', 'guess', 'suspicion']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [444]:
cohort = ['negro']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [454]:
cohort = ['average', 'american'] # lol first result is "fools"
test_m = corr_cohort(cohort, test_scaled_freqs)

In [463]:
cohort = ['united', 'states'] # lol first result is "fools"
test_m = corr_cohort(cohort, test_scaled_freqs)

In [473]:
cohort = ['type','average','citizen'] # not thinking about the discourse correctly; need to go back to texts
test_m = corr_cohort(cohort, test_scaled_freqs)

In [487]:
cohort = ['civil','civilized']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [None]:
cohort = ['character','characters']

In [495]:
cohort = ['hard']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [498]:
cohort = ['anyone']
test_m = corr_cohort(cohort, test_scaled_freqs)

In [None]:
while len(cohort) <= 15:
    cohort.append(words_to_cohort(cohort, test_m))
    print('{}'.format(cohort[-1]))
    test_m = corr_cohort(cohort, test_scaled_freqs)

personality
facing
faced
behind


Now, the above list becomes the basis for the next loop, which requires that you sum the values in the source matrix, generate a new correlation table, and repeat the process.

In [38]:
def correlator(cohort, corrs = gale_corr):#scaled_dtm):
    # 1. Propose a cohort of words in the variable corrs
    # 2. Check the correlation of the cohort against itself
    
    
    # 3. If each of the elements in the cohort correlates
    # 4. Sum their vectors in the scaled dtm
    # 5. Drop the columns for scaled dtm 
    # 6. Calculate new correlations for the summed vectors against the remaining dtm
    # 7. Return the top correlates from the new DTM

In [63]:
cohort = ['average', 'ordinary', 'normal']

In [64]:
values = gale_corr['ordinary'][cohort]

In [75]:
values.abs().sort_values().index[0]

'normal'

In [66]:
(values.abs() < 0.1).any()

True

In [39]:
correlator(['average', 'ordinary'])

TypeError: list indices must be integers or slices, not str

In [None]:
def correlator2(start_word, corrs, scaled_dtm):
    # 1. starting with one word of interest
    # 2. automatically select words that correlate closely with it
    # 3. create a new vector
    # 4. check for words that correlate with the new vector
    # 5. if none cross the threshold, drop the one with the weakest correlation with the rest of the group
    # 6. recalculate the vector with the new terms
    # 7. choose the highest correlate relative to those
    # 8. see if you can successfully add a new element to the cohort
    # 9. repeat until you have a strongly-grouped cohort of ... 20?

In [178]:
gale = '/Users/e/Desktop/gale_decades_scaled.csv'
gale = pd.read_csv(gale)

In [30]:
gale.set_index('decade', inplace=True)

In [32]:
gale_corr = gale.corr()

In [36]:
my_correlator('normal', gale_corr)

--------------------------------------------------
Most strongly positive correlations with "normal": 
normal          1.000000
grimly          0.998200
straightened    0.996595
downstairs      0.996095
anyway          0.995453
worry           0.995070
worried         0.994241
upstairs        0.994029
doesn           0.993871
problems        0.993016
funny           0.992052
anyhow          0.990398
big             0.989432
tucked          0.988951
stared          0.988773
Name: normal, dtype: float64
--------------------------------------------------
Most strongly negative correlations with "normal": 
complexion      -0.961367
and             -0.952962
bear            -0.945628
shall           -0.941061
miserable       -0.940240
gaining         -0.935714
possession      -0.935382
remain          -0.933881
bade            -0.928094
should          -0.921118
comprehend      -0.918889
circumstances   -0.918222
whose           -0.907437
kindred         -0.905886
strive          -0.904338
