Which names tend to co-occur within the same document?

In [66]:
import os
import pandas as pd
from multiprocessing import Pool
import time
import re
from collections import Counter
import itertools

In [2]:
DATA = '/oak/stanford/groups/malgeehe/celebs/chicago_results/chicago_names'

In [3]:
files = [os.path.join(DATA,x) for x in os.listdir(DATA) if x.endswith('.tsv')]

So, let's read in everything, and then start with all of the documents that have > 1 name.

In [9]:
def load_tsv(tsv):
    """Read one tab-separated file and label its two columns.

    Parameters
    ----------
    tsv : str
        Path of the tab-separated file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed file with its columns renamed to ``path`` and ``name``.
    """
    # NOTE(review): read_csv treats the first line as a header by default, so
    # that line is replaced by the labels below -- confirm the files really
    # start with a header row.
    frame = pd.read_csv(tsv, sep='\t')
    frame.columns = ['path', 'name']
    return frame

In [11]:
start = time.time()
with Pool() as p:
    L = p.map(load_tsv, files)
print(time.time()-start)

13.610511064529419


In [12]:
df = pd.concat(L)

In [28]:
L = None # free up memory

Find docs with more than one name

In [13]:
g = df.groupby('path').count()

In [14]:
docs = g[g['name'] > 1].index # docs with multiple people

There are too many of these to process, so which of them fall within the relevant time period?

In [15]:
meta = pd.read_csv('/oak/stanford/groups/malgeehe/celebs/chicago_results/chicago_1919-1939_meta.csv')

In [16]:
# File name (last path component) of every document listed in the metadata.
in_period = [os.path.basename(fullpath) for fullpath in meta['fullpath']]

In [17]:
# File name of each multi-name document, cut off at the first '.xml'.
candidates = [os.path.basename(doc).split('.xml')[0] for doc in docs]

In [18]:
# fix the chunks
candidates = [x.split('_chunk')[0] for x in candidates]

In [19]:
candidates[0]

'CD_20151209220246_00001_491877180.txt'

In [22]:
len(candidates) == len(docs)

True

In [23]:
# match
in_period[0], candidates[0]

('CT_20170929192812_00001_181362810.txt',
 'CD_20151209220246_00001_491877180.txt')

In [24]:
in_period_multiple_names = list(set(in_period) & set(candidates))

In [25]:
len(in_period_multiple_names)

440078

In [27]:
# Article file name embedded in each path, e.g.
# 'CD_20151209220246_00001_491877180.txt'.  The dot is escaped so that only a
# literal '.txt' suffix matches (an unescaped '.' would accept any character).
word_pattern = re.compile(r'C[A-Z]_[0-9]{14}_[0-9]{5}_[0-9]{9}\.txt')

In [32]:
def extract_txt(path):
    """Return the first substring of ``path`` matched by ``word_pattern``.

    Raises ``AttributeError`` when ``path`` contains no match, because the
    search result is then ``None``.
    """
    match = word_pattern.search(path)
    return match.group()

In [56]:
start = time.time()
df['txt'] = [extract_txt(x) for x in df['path']] # is there a way to use a generator to feed this?
print(time.time()-start)

42.642592668533325


Then, filter `df` for elements in the `txt` column that `isin` `in_period_multiple_names`

In [58]:
subset = df[df['txt'].isin(in_period_multiple_names)]

In [60]:
subset.shape[0], len(in_period_multiple_names)

(4625679, 440078)

In [61]:
subset.head()

Unnamed: 0,path,name,txt
8,/scratch/groups/malgeehe/celebs/chicago_corenl...,George Mem,CD_20151209222115_00011_492423658.txt
9,/scratch/groups/malgeehe/celebs/chicago_corenl...,Timmy Eritt,CD_20151209222115_00011_492423658.txt
10,/scratch/groups/malgeehe/celebs/chicago_corenl...,Jack Blackburn,CD_20151209222115_00011_492423658.txt
11,/scratch/groups/malgeehe/celebs/chicago_corenl...,Mike Twin Sul,CD_20151209222115_00011_492423658.txt
12,/scratch/groups/malgeehe/celebs/chicago_corenl...,Joe Was Sure ``,CD_20151209222115_00011_492423658.txt


Ok, and the goal here is to create tuples containing every possible combination of names that appear in the *same* article, and get the total frequencies with which a given pair appears.

Order doesn't matter, so maybe create a tuple like this:

`({name_a, name_b}, 0000)`

Where index `[1]` is a numeric ID for the tuple. (Unless maybe it's just faster to use a `Counter` of sets?)

In [63]:
# counter of sets
articles = subset['txt'].unique()

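This is much faster if we sort the DataFrame by `txt` first and precompute how many rows belong to each article, so that each article's names can be taken as one contiguous slice.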

In [105]:
sub_sort = subset.sort_values('txt')

In [106]:
g = sub_sort.groupby('txt').count()

In [108]:
g.reset_index(inplace = True)

In [109]:
g.columns

Index(['txt', 'path', 'name'], dtype='object')

In [115]:
rows = g['path'].to_list()

Maybe there would be a way to do this with a generator? Not sure since we need to sum the names in the end.

Another way would be to update the Counter to keep the list size down, and trash the unaggregated data.

In [1]:
def get_pairs(rows, names=None):
    """Count how often each unordered pair of names shares an article.

    Parameters
    ----------
    rows : list of int
        Number of name rows belonging to each article, in the order in which
        the articles appear in ``names``.
    names : sequence of str, optional
        Flat sequence of names in which every article's names are contiguous.
        Defaults to ``sub_sort['name']``, the frame sorted by ``txt`` from
        which the per-article counts were derived.

    Returns
    -------
    collections.Counter
        Maps ``frozenset({name_a, name_b})`` to the number of times that pair
        was generated.  A name paired with itself (the same name listed twice
        in one article) is skipped, since it is not a pair; other repeated
        mentions still contribute one count per combination.

    Raises
    ------
    ValueError
        If ``rows`` asks for more names than ``names`` contains.
    """
    if names is None:
        # The counts in `rows` were computed on the frame sorted by `txt`, so
        # the contiguous slices below only line up with that sorted frame.
        names = sub_sort['name']
    # Slicing a plain list is far cheaper than slicing a DataFrame per article.
    names = list(names)

    total = len(rows)
    if sum(rows) > len(names):
        raise ValueError(
            'rows covers {} names but only {} are available'.format(
                sum(rows), len(names)))

    pair_counts = Counter()
    start = 0
    last_pct = -1

    for i, row in enumerate(rows):
        article_names = names[start:start + row]
        start += row

        # Update the running Counter directly so the unaggregated pairs are
        # never held in memory.
        pairs = (frozenset(p) for p in itertools.combinations(article_names, 2))
        pair_counts.update(fs for fs in pairs if len(fs) == 2)

        # Progress: integer arithmetic, printed only when the percentage moves.
        pct = (i + 1) * 100 // total
        if pct != last_pct:
            print('\r{}%'.format(pct), end='')
            last_pct = pct

    return pair_counts

In [2]:
data = get_pairs(rows[:10000])

NameError: name 'rows' is not defined

And counting the pairs:

In [None]:
# get_pairs already returns a Counter, so this only makes a copy of it.
pair_freqs = Counter(data)

Drop everything with freq < 2