Which names tend to co-occur within the same document?

In [3]:
import os
import pandas as pd
from multiprocessing import Pool
import time
import re
from collections import Counter
import itertools

In [4]:
DATA = '/oak/stanford/groups/malgeehe/celebs/chicago_results/chicago_names'

In [5]:
files = [os.path.join(DATA,x) for x in os.listdir(DATA) if x.endswith('.tsv')]

So, let's read in everything, and then start with all of the documents that have > 1 name.

In [6]:
def load_tsv(tsv):
    df = pd.read_csv(tsv, sep = '\t')
    df.columns = ['path', 'name']
    return df

In [7]:
start = time.time()
with Pool() as p:
    L = p.map(load_tsv, files)
print(time.time()-start)

12.032985210418701


In [8]:
df = pd.concat(L)

In [9]:
L = None # free up memory

Find docs with more than one name

In [10]:
g = df.groupby('path').count()

In [11]:
docs = g[g['name'] > 1].index # docs with multiple people

Of these, which are in the relevant time period since we have too many?

In [12]:
meta = pd.read_csv('/oak/stanford/groups/malgeehe/celebs/chicago_results/chicago_1919-1939_meta.csv')

In [13]:
in_period = [os.path.split(x)[1] for x in meta['fullpath']]

In [14]:
candidates = [os.path.split(x)[1].split('.xml')[0] for x in docs]

In [15]:
# fix the chunks
candidates = [x.split('_chunk')[0] for x in candidates]

In [16]:
candidates[0]

'CD_20151209220246_00001_491877180.txt'

In [17]:
len(candidates) == len(docs)

True

In [18]:
# match
in_period[0], candidates[0]

('CT_20170929192812_00001_181362810.txt',
 'CD_20151209220246_00001_491877180.txt')

In [19]:
in_period_multiple_names = list(set(in_period) & set(candidates))

In [20]:
len(in_period_multiple_names)

440078

In [21]:
word_pattern = re.compile(r'C[A-Z]{1}_[0-9]{14}_[0-9]{5}_[0-9]{9}.txt')

In [22]:
def extract_txt(path):
    return re.search(word_pattern, path).group() #tuple: (path, re.search(word_pattern, x).group())

In [23]:
start = time.time()
df['txt'] = [extract_txt(x) for x in df['path']] # is there a way to use a generator to feed this?
print(time.time()-start)

42.00756359100342


Then, filter `df` for elements in the `txt` column that `isin` `in_period_multiple_names`

In [24]:
subset = df[df['txt'].isin(in_period_multiple_names)]

In [25]:
subset.shape[0], len(in_period_multiple_names)

(4625679, 440078)

In [26]:
subset.head()

Unnamed: 0,path,name,txt
8,/scratch/groups/malgeehe/celebs/chicago_corenl...,George Mem,CD_20151209222115_00011_492423658.txt
9,/scratch/groups/malgeehe/celebs/chicago_corenl...,Timmy Eritt,CD_20151209222115_00011_492423658.txt
10,/scratch/groups/malgeehe/celebs/chicago_corenl...,Jack Blackburn,CD_20151209222115_00011_492423658.txt
11,/scratch/groups/malgeehe/celebs/chicago_corenl...,Mike Twin Sul,CD_20151209222115_00011_492423658.txt
12,/scratch/groups/malgeehe/celebs/chicago_corenl...,Joe Was Sure ``,CD_20151209222115_00011_492423658.txt


In [27]:
# in case below crashes
subset.to_csv('/home/users/fredner/workspace/df.csv')

In [28]:
# counter of sets
articles = subset['txt'].unique()

Way faster if we sort the df and chop for specific rows in advance

In [29]:
sub_sort = subset.sort_values('txt')

In [30]:
g = sub_sort.groupby('txt').count()

In [31]:
g.reset_index(inplace = True)

In [32]:
g.columns

Index(['txt', 'path', 'name'], dtype='object')

In [33]:
rows = g['path'].to_list()

In [34]:
rows[:5]

[11, 2, 19, 2, 2]

Update a Counter to keep memory down

In [119]:
def get_pairs(rows, min_ = 2):
    START = 0
    N = len(rows)
    C = Counter()
    
    for i,row in enumerate(rows):
        candidates = list(subset[START:START + row]['name'])
        L = Counter([frozenset(x) for x in itertools.combinations(candidates, r = 2)])
        START += row
        
        # update Counter
        C += L
        L = None
        
        # loop progress
        pct = round(i/N, 2) * 100
        if pct % 0.1 == 0:
            print('\r{}%'.format(pct), end = '')
        
    # optionally trash values < N at the end
    if min_:
        C = {x : C[x] for x in C if C[x] >= min_}
        
    return C

In [54]:
start = time.time()
data = get_pairs(rows[:round(len(rows)*0.01)])
print(time.time()-start)

100.0%310.0932824611664


Ok, at that rate it would take ~9 hours to process everything, assuming nothing crashes.

Could try setting start values and passing this to multiprocessing. Then just combine all of the resulting Counters.

In [168]:
def make_groups(rows):
    pct = 0.01 # what percentage of the data to process at a time?
    start_idx = list(range(0, len(rows), round(len(rows) * pct)))
    end_idx = start_idx.copy()[1:]
    end_idx.append(len(rows))
    start_sum = [sum(rows[:x]) for x in idx]
    return list(zip(start_idx, end_idx, start_sum))

In [169]:
groups = make_groups(rows)

In [170]:
def get_pairs_groups(group):
    START_IDX = group[0]
    END_IDX = group[1]
    START_SUM = group[2]
    C = Counter()
    
    for i,row in enumerate(rows[START_IDX:END_IDX]):
        candidates = list(subset[START_SUM:START_SUM + row]['name'])
        L = Counter([frozenset(x) for x in itertools.combinations(candidates, r = 2)])
        START_SUM += row
        
        # update Counter
        C += L
        L = None
    
    print('group complete')
    
    return C

In [None]:
start = time.time()
with Pool() as p:
    data = p.map(get_pairs_groups, groups)
print(time.time()-start)

group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete
group complete


This is giving about a 4x speedup, which makes sense. So...

In [152]:
# merge results
C = Counter()
for x in data:
    C += x

In [153]:
len(C)

786976

In [65]:
test = pd.DataFrame(data, index = [0]).T.reset_index()

In [66]:
test.columns = ['pair', 'co-occurrences']

In [70]:
test.sort_values('co-occurrences', ascending = False, inplace = True)

And counting the pairs:

In [None]:
pair_freqs = Counter(data)

Drop everything with freq < 2