# Import and preprocess LittleSis dataset

In [1]:
import pandas as pd
import json
import glob

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
def combine_dictionaries(list_of_dicts):
    result = {}
    for d in list_of_dicts:
        result.update(d)
    return result

## Download open sanctions data as .json

In [4]:
# open .json file
f = open('../assets/entities.json')
data = json.load(f)
# Read data into dataframe 
d={}
for ent in data:
    row=pd.DataFrame.from_dict(ent['attributes'], orient='index')
    d[ent['id']-1]=row.transpose().iloc[0]
data=pd.DataFrame.from_dict(d,orient='index')

## Wrangle data

In [5]:
# Filter out non-PERSON entities (for a PERSON only EL model) 
data=data[data['primary_ext']=='Person']

In [6]:
data.shape

(284620, 13)

In [7]:
data['id'].nunique()

284620

In [8]:
data['name'].nunique()

273467

In [9]:
# Insert alias into context field (for later encoding and use in training an EL model) 
data['blurb'] = data['name'] + ' is a ' + data['blurb'] + '.'
# Edit summary field
data['summary'].fillna('').str.split('.').apply(lambda x: '. '.join(x[:2]) + '.')

1005           Mr.  Questrom was the Chairman and CEO of J.
1006      Ms.  Wolf is the former Chairman and CEO of Le...
1007      Ms.  Alvarez is the former Administrator of th...
1008      Mr.  Penner has been a General Partner at Madr...
1009      Mr.  Breyer is a Managing Partner of Accel Par...
                                ...                        
427650                                                    .
427651                                                    .
427652                                                    .
427654                                                    .
427656                                                    .
Name: summary, Length: 284620, dtype: object

In [10]:
# Transform start date into sentence
data.loc[~data['start_date'].isnull(),'start_date_sentence']=data.loc[~data['start_date'].isnull(),'start_date'].apply(lambda x: f'This person was born in {x}.')

In [11]:
# Transform end date into sentence
data.loc[~data['end_date'].isnull(),'end_date_sentence']=data.loc[~data['end_date'].isnull(),'end_date'].apply(lambda x: f'This person died in {x}.')

In [12]:
# Transform types into sentence
data['types']=data['types'].apply(lambda x:' '.join(x)).str.replace('Person','')
data['types']=data['types'].str.replace(' ',', ').apply(lambda x: 'This person is associated with: '+x[1:]+'.')
# Fix typos
data['types']=data['types'].str.replace(', ,',',').str.replace(', \.','.').str.replace('Media, ality','Media Personality')
# Remove sentences without context 
data.loc[data['types']=='This person is associated with: .','types']=''

  """


In [13]:
# Concatenate information from blurb and summary fields together into single context field
data['context'] = data['blurb'].fillna('') + ' ' + data['summary'].fillna('')
data['context']=data['context'].str.replace('\r','').str.replace('\n', '')
data.drop(['blurb','summary', 'updated_at', 'parent_id'],1,inplace=True)

  after removing the cwd from sys.path.


In [14]:
data.shape

(284620, 12)

In [15]:
data.head(2)

Unnamed: 0,id,name,website,primary_ext,start_date,end_date,aliases,types,extensions,start_date_sentence,end_date_sentence,context
1005,1006,Allen I Questrom,,Person,1940-04-13,,"[Allen I Questrom, Mr Allen Irving Questrom]",This person is associated with: Business.,"{'Person': {'name_last': 'Questrom', 'name_fir...",This person was born in 1940-04-13.,,Allen I Questrom is a former chairman & CEO of...
1006,1007,Linda S Wolf,,Person,,,"[Linda S Wolf, Ms Linda S. Wolf]",This person is associated with: Business.,"{'Person': {'name_last': 'Wolf', 'name_first':...",,,Ms. Wolf is the former Chairman and CEO of Le...


In [16]:
# Extract extra contexual info from extensions field
data['extensions']=data['extensions'].apply(lambda r: [r[key] for key in r.keys()])
unique_extension_keys_extension_keys=[]
df=data['extensions'].apply(lambda r: list(combine_dictionaries(r).keys()))
for row in df:
    unique_extension_keys_extension_keys.extend(row)
del(df)
# manual selection of columns in  extension_fields - most other fields were useless, sparse or null
extensions_fields=['birthplace']
valid_indices=data[data['extensions'].apply(lambda x: len(x))>0].index.values
for field in extensions_fields:
    data.loc[valid_indices,field]=data.loc[valid_indices,'extensions'].apply(lambda x: [x[0][field] if field in x[0] else None][0])
# drop column
data.drop(['extensions'],1,inplace=True)

  


In [17]:
# Transform birth place into sentence
data.loc[~data['birthplace'].isnull(),'birthplace']=data.loc[~data['birthplace'].isnull(),'birthplace'].apply(lambda x: f'This person was born in {x}.')

In [18]:
# Concatenate extra info into context field
extra_context_cols=['start_date_sentence','end_date_sentence','types','birthplace']
for col in extra_context_cols:
    data['context'] = data['context'] + ' ' + data[col].fillna('')

## Export dataset

In [19]:
data.to_csv('../assets/lilsis_entities.csv')