# Chicago merge names and preexisting metadata

- Import full names extracted from news articles
- Match filenames with existing metadata filenames
- Merge the names with the metadata using `pd.merge`
- Export aggregate mentions table
- Export series of annualized tables to traceback mentions

In [1]:
import os
import pandas as pd

In [2]:
# get name files
# os.listdir returns entries in arbitrary order; sorting keeps the load order
# (and therefore the row order of everything built from it) the same on every run.
base = '/scratch/groups/malgeehe/celebs/chicago_people/full_names'
files = [os.path.join(base,x) for x in sorted(os.listdir(base)) if x.endswith('.tsv')]

In [3]:
# how many .tsv name files were found under `base`
len(files)

300

In [4]:
# import data: one two-column (path, person) table per .tsv file
# NOTE(review): read_csv uses each file's first line as the header before the
# columns are renamed below. If the .tsv files have no header row, that first
# record is being dropped -- confirm against the files.
data = []

for tsv_path in files:
    names_table = pd.read_csv(tsv_path, sep='\t')
    names_table.columns = ['path','person']
    data.append(names_table)

9

In [6]:
# Stack the per-file tables into one frame. ignore_index gives the result a
# fresh 0..n-1 index; otherwise every file's own 0..k labels would repeat and
# the index would not be unique.
df = pd.concat(data, ignore_index=True)

In [None]:
# file_id = basename of `path`, cut at the first '.txt'.
# The row-by-row version of this step was noted to kill the job on sherlock.
# The same path recurs on every row that shares a source file, so parse each
# distinct path once and map the result back; the values are identical.
path_to_file_id = {
    path: os.path.split(path)[-1].split('.txt')[0]
    for path in df['path'].unique()
}
# .values assigns by position, independent of df's index labels
df['file_id'] = df['path'].map(path_to_file_id).values

In [7]:
# import metadata and derive the two helper columns used for joining/grouping
meta = pd.read_csv('/scratch/groups/malgeehe/celebs/chicago_meta/chicago_1919-1939_meta.csv')
# file_id: basename of `fullpath`, up to (not including) the first '.txt'
meta['file_id'] = [os.path.basename(full_path).partition('.txt')[0] for full_path in meta['fullpath']]
# title_normed: everything in `title` before the first '('
meta['title_normed'] = [title.partition('(')[0] for title in meta['title']]

In [8]:
# file_ids present in both the metadata and the extracted-names table
overlap = list(set(meta['file_id']).intersection(set(df['file_id'])))

In [9]:
# keep only the names whose file_id also occurs in the metadata,
# then attach the metadata columns to every remaining name row
subset = df.loc[df['file_id'].isin(overlap)].merge(meta, on='file_id')

print('# names: {}'.format(len(subset)))

# names: 321144


In [11]:
# mentions of each name per year: count file_id rows within (person, year),
# most-mentioned first, indexed by person
name_year = (
    subset
    .groupby(['person','year'])['file_id']
    .count()
    .reset_index(name='n_mentions')
    .sort_values('n_mentions',ascending=False)
    .set_index('person')
)
name_year.head()
# write output here

Unnamed: 0_level_0,year,n_mentions
person,Unnamed: 1_level_1,Unnamed: 2_level_1
N. Clark,1935,73
- Cago,1935,66
N. Y.,1935,64
- Cago,1937,54
W. Madison,1935,53


In [12]:
# mentions of each person per paper per year: count file_id rows within
# (person, year, title_normed), most-mentioned first
name_paper_year = (
    subset
    .groupby(['person','year','title_normed'])['file_id']
    .count()
    .reset_index(name='n_mentions')
    .rename(columns={'title_normed':'paper'})
    .sort_values('n_mentions',ascending=False)
)
name_paper_year.head()
#write output here

Unnamed: 0,person,year,paper,n_mentions
203797,N. Clark,1935,Chicago Daily Tribune,73
1510,- Cago,1935,Chicago Daily Tribune,61
210035,N. Y.,1935,Chicago Daily Tribune,57
275510,W. Madison,1935,Chicago Daily Tribune,53
1514,- Cago,1937,Chicago Daily Tribune,53


In [None]:
# create annualized tables with clean meta
#
# `subset` is already the names table joined to the metadata, so each annual
# table is simply a year slice of it. The previous version merged that slice
# back onto `df` by file_id; both sides then carried one row per name, so a
# file with k names produced k*k rows and every mention was written k times.
out_dir = '/scratch/groups/malgeehe/celebs/chicago_people/results/names_annual'
os.makedirs(out_dir, exist_ok=True)  # to_csv does not create missing directories

# The old re-merge exported subset's own path column under the name 'path_y'.
# Inside `subset` that column is 'path_y' when meta also has a 'path' column,
# and plain 'path' otherwise.
path_col = 'path_y' if 'path_y' in subset.columns else 'path'

# source columns in output order, and the names they are exported under
clmns = ['file_id', 'person', 'title_normed', 'year', 'objecttype', 'recordtitle',
         'n_valid_words', 'ocr_acc', path_col]
out_names = ['file_id','person','paper','year','doc_type','doc_title','n_words','ocr','xml_path']

for year in range(1919,1940):
    # reset_index keeps the index column written to the CSV a plain 0..n-1 counter
    year_table = subset.loc[subset['year']==year, clmns].reset_index(drop=True)
    year_table.columns = out_names
    filename = '{}_names.csv'.format(year)
    out_path = os.path.join(out_dir,filename)
    year_table.to_csv(out_path)

The annual CSVs are going to be gigabytes in size.

We may well want to scale these name mentions. That's an interpretation decision for the group.

# N.B. We processed way more files beyond 1919-1939
...why are there so many fewer files in the metadata table than in the archive? Did I process all of the Tribune and the Defender? That might explain some of the anachronistic names...

This might be the problem:

In [None]:
# NOTE(review): this rebinds `meta` -- a DataFrame in the cells above -- to a plain
# path string, so earlier cells that use `meta` as a table cannot be re-run after this.
meta = '/scratch/groups/malgeehe/celebs/chicago_meta/chicago_subcorpus_meta.csv'

In [None]:
# NOTE(review): overwrites `df` (the concatenated names table built above) with
# the table read from the path held in `meta`.
df = pd.read_csv(meta)

In [None]:
# (rows, columns) of the table just loaded into `df`
df.shape

Even that only gets to a million! Where are these other files coming from?? The initial Chicago subset was 3.1M. Maybe I mistakenly processed every article in Chicago...?