Which names tend to co-occur within the same document?

In [1]:
import os
import pandas as pd
from multiprocessing import Pool
import time
import re
from collections import Counter
import itertools

In [None]:
DATA = '/oak/stanford/groups/malgeehe/celebs/chicago_results/chicago_names'

In [None]:
files = [os.path.join(DATA,x) for x in os.listdir(DATA) if x.endswith('.tsv')]

So, let's read in everything, and then start with all of the documents that have > 1 name.

In [None]:
def load_tsv(tsv):
    df = pd.read_csv(tsv, sep = '\t')
    df.columns = ['path', 'name']
    return df

In [None]:
start = time.time()
with Pool() as p:
    L = p.map(load_tsv, files)
print(time.time()-start)

In [None]:
df = pd.concat(L)

In [None]:
L = None # free up memory

Find docs with more than one name

In [None]:
g = df.groupby('path').count()

In [None]:
docs = g[g['name'] > 1].index # docs with multiple people

Of these, which are in the relevant time period since we have too many?

In [None]:
meta = pd.read_csv('/oak/stanford/groups/malgeehe/celebs/chicago_results/chicago_1919-1939_meta.csv')

In [None]:
in_period = [os.path.split(x)[1] for x in meta['fullpath']]

In [None]:
candidates = [os.path.split(x)[1].split('.xml')[0] for x in docs]

In [None]:
# fix the chunks
candidates = [x.split('_chunk')[0] for x in candidates]

In [None]:
candidates[0]

In [None]:
len(candidates) == len(docs)

In [None]:
# match
in_period[0], candidates[0]

In [None]:
in_period_multiple_names = list(set(in_period) & set(candidates))

In [None]:
len(in_period_multiple_names)

In [None]:
word_pattern = re.compile(r'C[A-Z]{1}_[0-9]{14}_[0-9]{5}_[0-9]{9}.txt')

In [None]:
def extract_txt(path):
    return re.search(word_pattern, path).group() #tuple: (path, re.search(word_pattern, x).group())

In [None]:
start = time.time()
df['txt'] = [extract_txt(x) for x in df['path']] # is there a way to use a generator to feed this?
print(time.time()-start)

Then, filter `df` for elements in the `txt` column that `isin` `in_period_multiple_names`

In [None]:
subset = df[df['txt'].isin(in_period_multiple_names)]

In [None]:
subset.shape[0], len(in_period_multiple_names)

In [None]:
subset.head()

In [None]:
# in case below crashes
subset.to_csv('/home/users/fredner/workspace/df.csv')

# Import pairs

In [2]:
# import
subset = pd.read_csv('/home/users/fredner/workspace/df.csv')

In [3]:
subset.head()

Unnamed: 0.1,Unnamed: 0,path,name,txt
0,8,/scratch/groups/malgeehe/celebs/chicago_corenl...,George Mem,CD_20151209222115_00011_492423658.txt
1,9,/scratch/groups/malgeehe/celebs/chicago_corenl...,Timmy Eritt,CD_20151209222115_00011_492423658.txt
2,10,/scratch/groups/malgeehe/celebs/chicago_corenl...,Jack Blackburn,CD_20151209222115_00011_492423658.txt
3,11,/scratch/groups/malgeehe/celebs/chicago_corenl...,Mike Twin Sul,CD_20151209222115_00011_492423658.txt
4,12,/scratch/groups/malgeehe/celebs/chicago_corenl...,Joe Was Sure ``,CD_20151209222115_00011_492423658.txt


In [4]:
# counter of sets
articles = subset['txt'].unique()

Way faster if we sort the df and chop for specific rows in advance

In [5]:
subset = subset.sort_values('txt')

In [6]:
g = subset.groupby('txt').count()

In [7]:
g.reset_index(inplace = True)

In [8]:
g.columns

Index(['txt', 'Unnamed: 0', 'path', 'name'], dtype='object')

In [9]:
rows = g['path'].to_list()

In [10]:
def make_groups(rows):
    L = list(range(0, len(rows), round(len(rows) * 0.001)))
    from_ = 0
    to_ = 0
    groups = []

    for i, x in enumerate(L):
        try:
            group_rows = rows[x:L[i+1]]
            to_ += sum(group_rows)
            groups.append((group_rows, from_, to_))
            from_ += sum(group_rows) # start the next loop where the previous one ended
        except IndexError: # final loop
            group_rows = rows[x:]
            to_ += sum(group_rows)
            groups.append((group_rows, from_, to_))
        
    return groups

In [11]:
groups = make_groups(rows)

In [12]:
def get_pairs_group(group):
    rows = group[0]
    from_ = group[1]
    to_ = group[2] # actually unnecessary
#    df = subset[from_:to_].copy() # also unnecessary
    C = Counter()
    
    for i,row in enumerate(rows):
        candidates = list(subset[from_:from_ + row]['name'])
        L = Counter([frozenset(x) for x in itertools.combinations(candidates, r = 2)])
        from_ += row
        
        # update Counter
        C += L
        L = None
        
    return C

In [13]:
start = time.time()
with Pool() as p:
    data = p.map(get_pairs_group, groups)
print(time.time()-start)

519.9058060646057


In [26]:
len(data) # should be 1001

1001

In [22]:
C = Counter()

for i, x in enumerate(data):
    C += x
    print('\r{}'.format(i), end = '')

1000

In [33]:
test = {k:v for k,v in C.items() if v > 1} # get multiples

In [50]:
df = pd.DataFrame(test, index = [0]).T.reset_index()

In [51]:
print('df done')

df done


In [52]:
df.columns = ['names', 'co-occs']

In [53]:
df.head()

Unnamed: 0,names,co-occs
0,"(Florence Mcclain, Milton Starr)",2
1,"(T. O. B. A., Milton Starr)",6
2,"(Florence Mcclain, Charles Turpin)",2
3,"(Florence Mcclain, T. O. B. A.)",2
4,"(T. O. B. A., Bob Russell)",2


In [54]:
df.sort_values('co-occs', ascending = False, inplace = True)

In [56]:
df.to_csv('/oak/stanford/groups/malgeehe/celebs/chicago_results/celeb_co-occurrences.csv')

# Notes

Because a linear merge is super slow, let's try a parallelized merge

In [15]:
def chunks(lst, n):
    L = []
    for i in range(0, len(lst), n): #round(len(lst)/n)): for total n chunks
        L.append(lst[i:i + n])
    return L

In [16]:
data_chunks = chunks(data, 3)

In [17]:
len(data_chunks)

334

For some reason, multiprocessing fails consistently:

In [19]:
def merge_data(data_chunk):
    C = Counter()
    for x in data_chunk:
        C += x
    return C

In [20]:
# %%prun

# with Pool() as p:
#     counters = p.map(merge_data, data_chunks[:8])

Process ForkPoolWorker-28:
Process ForkPoolWorker-30:
Process ForkPoolWorker-36:
Process ForkPoolWorker-23:
Process ForkPoolWorker-31:
Process ForkPoolWorker-26:
Process ForkPoolWorker-38:
Process ForkPoolWorker-32:
Process ForkPoolWorker-37:
Process ForkPoolWorker-25:
Process ForkPoolWorker-20:
Process ForkPoolWorker-19:
Process ForkPoolWorker-27:
Process ForkPoolWorker-18:
Process ForkPoolWorker-29:
Process ForkPoolWorker-17:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/share/software/user/open/python/3.6.1/lib/python3.6/multiprocessing/p

  File "/share/software/user/open/python/3.6.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/share/software/user/open/python/3.6.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/share/software/user/open/python/3.6.1/lib/python3.6/multiprocessing/queues.py", line 342, in get
    with self._rlock:
  File "/share/software/user/open/python/3.6.1/lib/python3.6/multiprocessing/queues.py", line 342, in get
    with self._rlock:
  File "/share/software/user/open/python/3.6.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/share/software/user/open/python/3.6.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/share/software/user/open/python/3.6.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "

Traceback (most recent call last):
  File "<string>", line 4, in <module>
  File "/share/software/user/open/python/3.6.1/lib/python3.6/multiprocessing/pool.py", line 260, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "/share/software/user/open/python/3.6.1/lib/python3.6/multiprocessing/pool.py", line 602, in get
    self.wait(timeout)
  File "/share/software/user/open/python/3.6.1/lib/python3.6/multiprocessing/pool.py", line 599, in wait
    self._event.wait(timeout)
  File "/share/software/user/open/python/3.6.1/lib/python3.6/threading.py", line 551, in wait
    signaled = self._cond.wait(timeout)
  File "/share/software/user/open/python/3.6.1/lib/python3.6/threading.py", line 295, in wait
    waiter.acquire()
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/share/software/user/open/py-jupyter/1.0.0_py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", 

TypeError: must be str, not list

In [None]:
import pickle
with open('/home/users/fredner/celebs/counter.pickle', 'wb') as outputfile:
    pickle.dump(C, outputfile)

In [None]:
test.columns = ['pair', 'co-occurrences']

In [None]:
test.sort_values('co-occurrences', ascending = False, inplace = True)

And counting the pairs:

In [None]:
pair_freqs = Counter(data)

Drop everything with freq < 2

In [None]:
df.columns = ['names', 'co-occs']