# Import and preprocess Open Sanctions dataset

In [1]:
import requests
import json
import pandas as pd
import numpy as np
import glob
import pycountry
import re
import shutil

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
def transform_into_sentence(df,col,sentence,separator=', '): 
    """
    Transform keywords in certain fields into sentences.
    This was done to try to improve the quality of sentence encodings. 
    """
    df.loc[~df[col].isna(),col]=df.loc[~df[col].isna(),col].apply(lambda x: f'{separator}'.join(x))
    df.loc[~df[col].isna(),col]=df.loc[~df[col].isna(),col].apply(lambda x: f'{sentence}{x}.')

def show_not_na(df,col,n_rows=20):
    """
    Helper function to dispay values excluding NaNs
    """
    return df[~df[col].isna()].head(n_rows)

## Download open sanctions data as .json

In [None]:
# Download dataset from website (will have been updated since 2022)
url = "https://data.opensanctions.org/datasets/latest/default/entities.ftm.json"
r = requests.get(url, allow_redirects=True)
open('open_sanctions.json', 'wb').write(r.content)
# Move downloaded file into assets folder
shutil.move('open_sanctions.json','../assets/open_sanctions.json') 

## Import open sanctions data .json

In [5]:
with open('../assets/open_sanctions.json') as f:
    sanctions_data = f.readlines()

In [6]:
json.loads(sanctions_data[0])

{'id': 'acf-00040861bc3f593000830d987d09967ef3503ef1',
 'target': True,
 'first_seen': '2022-05-05T15:26:25',
 'last_seen': '2022-10-05T06:32:14',
 'schema': 'Person',
 'properties': {'notes': ['Russian propagandist: host of news program "Segodnia" ("Today") on NTV'],
  'position': ['Propagandists', 'Federal media employees'],
  'gender': ['male'],
  'name': ['Kolyvanov Egor'],
  'alias': ['Колыванов Егор'],
  'birthDate': ['1980-11-15']},
 'referents': [],
 'datasets': ['ru_acf_bribetakers'],
 'caption': 'Kolyvanov Egor'}

In [7]:
# load all entities into variable
data=[]
for ent_index in range(len(sanctions_data)):
    data.append(json.loads(sanctions_data[ent_index]))
data=pd.DataFrame(data)

In [8]:
data.shape

(431828, 9)

In [9]:
data.head(2)

Unnamed: 0,id,target,first_seen,last_seen,schema,properties,referents,datasets,caption
0,acf-00040861bc3f593000830d987d09967ef3503ef1,True,2022-05-05T15:26:25,2022-10-05T06:32:14,Person,{'notes': ['Russian propagandist: host of news...,[],[ru_acf_bribetakers],Kolyvanov Egor
1,acf-0011c68a768924609dc5da5707ac7fa4c4d645a2,True,2022-07-09T18:14:08,2022-10-05T06:32:14,Person,"{'name': ['Shipov Sergei Yurievich'], 'notes':...",[],[ru_acf_bribetakers],Shipov Sergei Yurievich


In [10]:
# Drop unwanted columns
data.drop(['target','first_seen','last_seen','datasets', 'referents'],1,inplace=True)

  


## Extract properties field into separate dataframe 

In [11]:
# Create dictionary storing subfields within properties 
unique_keys=[]
for row in data['properties']:
    for key in row.keys():
        if key not in unique_keys:
            unique_keys.append(key)

# Only keep entities with an entry on the name field
df_dict={}
for df_dict_index,row in enumerate(data['properties']):
    for key in unique_keys:
        if key not in row:
            row[key]=''    
    if row['name']:
        df_dict[df_dict_index]=row 

# Cast dictionary as dataframe
properties=pd.DataFrame.from_dict(df_dict,orient='index')
properties=properties.replace('',np.nan)

In [12]:
properties.shape

(244612, 112)

## Wrangle data

In [13]:
# Remove properties (stored as separate df) from data  
data.drop(['properties'],1,inplace=True)
# Standardise KB aliases (first letter to upper case)
data['caption']=data['caption'].str.title()

  


In [14]:
# Drop columns made up entirely of NaN from properties df
cols_to_drop=properties.isna().all(0)
cols_to_drop=list(cols_to_drop[cols_to_drop.values].index)
properties.drop(cols_to_drop,1,inplace=True)

  after removing the cwd from sys.path.


In [15]:
# Create copy of wikidataID field
wikidataIDs=properties['wikidataId'].copy()
wikidataIDs[~wikidataIDs.isna()]=wikidataIDs[~wikidataIDs.isna()].apply(lambda x: ', '.join(x))

In [16]:
# Create copy of website field
websites=properties['website'].copy()
websites[~websites.isna()]=websites[~websites.isna()].apply(lambda x: ', '.join(x))

In [17]:
# manually selected columns to drop (including wikidataID and website)
more_cols_to_drop=[
                'bikCode',
                'dunsCode',
                'callSign',
                'tonnage',
                'grossRegisteredTonnage', 
                'ogrnCode', 
                'innCode',
                'leiCode',
                'swiftBic',
                'ogrnCode', 
                'classification', 
                'program', 
                'sourceUrl', 
                'addressEntity', 
                'imoNumber', 
                'mmsi',
                'registrationNumber',
                'modifiedAt',
                'idNumber',
                'passportNumber',
                'phone',
                'kppCode',
                'vatCode',
                'serialNumber',
                'owner',
                'opencorporatesUrl',
                'taxNumber',
                'flag',
                'status',
                'jurisdiction',
                'wikidataId',
                'email',
                'website',
                'education',
                'type',
                'firstName',
                'secondName',
                'createdAt',
                'middleName',
                'lastName',
                'title',
                'religion',
                'buildDate',
                'model',
                'incorporationDate',
                'previousName',
                'fatherName',
                'motherName',
                'address',
                'legalForm',
                ]

properties.drop(more_cols_to_drop,1,inplace=True)



In [18]:
# Edit crime related topic keyswords and transform them into sentences
topics_translation_dict={
    'crime':'Crime',
    'crime.fraud':'Fraud',
    'crime.cyber':'Cybercrime',
    'crime.fin':'Financial crime',
    'crime.theft':'Theft',
    'crime.war':'War crimes',
    'crime.boss':'Criminal leadership',
    'crime.terror':'Terrorism',
    'crime.traffick':'Trafficking',
    'crime.traffick.drug':'Drug trafficking',
    'crime.traffick.human':'Human trafficking',
    'corp.offshore':'Offshore',
    'corp.shell':'Shell company',
    'gov':'Government',
    'gov.national':'National government',
    'gov.state':'State government',
    'gov.muni':'Municipal government',
    'gov.soe':'State-owned enterprise',
    'gov.igo':'Intergovernmental organization',
    'fin':'Financial services',
    'fin.bank':'Bank',
    'fin.fund':'Fund',
    'fin.adivsor':'Financial advisor',
    'role.pep':'Politician',
    'role.rca':'Close Associate',
    'role.judge':'Judge',
    'role.civil':'Civil servant',
    'role.diplo':'Diplomat',
    'role.lawyer':'Lawyer',
    'role.acct':'Accountant',
    'role.spy':'Spy',
    'role.oligarch':'Oligarch',
    'role.journo':'Journalist',
    'role.act':'Activist',
    'pol.party':'Political party',
    'pol.union':'Union',
    'rel':'Religion',
    'mil':'Military',
    'asset.frozen':'Frozen asset',
    'sanction':'Sanctioned entity',
    'debarment':'Debarred entity',
    'poi':'Person of interest'}

for key,value in topics_translation_dict.items():
    value=f'Associated with {value}.'
    topics_translation_dict[key]=value

properties['topics']=properties['topics'].fillna('').apply(lambda x: [topics_translation_dict[key] for key in x])
properties['topics']=properties['topics'].apply(lambda x: ' '.join(x))

In [19]:
print(list(properties.loc[properties['name'].apply(lambda x: x[0]=='Joe Biden'), 'position'])[:5])
# Fix positions which appear multiple times due to changing dates by removing dates and ordinals from each string in the list.
date_expr = re.compile('\d{4}-\d{4}')
digit_expr = re.compile('\s\d{2}[a-zA-Z]{2}\s')
for expr in [date_expr, digit_expr]:
    # Delete expression from string
    properties.loc[~properties['position'].isna(),'position']=properties.loc[~properties['position'].isna(),'position'].apply(lambda x: [re.sub(expr, ' ', i)  for i in x])
    # Remove parentheses and comma
    properties.loc[~properties['position'].isna(),'position']=properties.loc[~properties['position'].isna(),'position'].apply(lambda x: [i.replace('(','').replace(')','').replace(',','') for i in x])
    # Remove position redundancy
    properties.loc[~properties['position'].isna(),'position']=properties.loc[~properties['position'].isna(),'position'].apply(lambda x: set(x))
print('---')
print(list(properties.loc[properties['name'].apply(lambda x: x[0]=='Joe Biden'), 'position'])[:5])

[['United States senator (1989-1991)', 'United States senator (2005-2007)', 'Senate (member, 1999-2001)', 'Senate (member, 1993-1995)', 'United States senator (1987-1989)', 'United States senator (1997-1999)', 'Senate (member, 1989-1991)', 'Senate (member, 2007-2009)', 'United States senator (1993-1995)', 'United States senator (2007-2009)', 'Senate (member, 1985-1987)', 'United States senator (2009-2009)', 'Senate (member, 1987-1989)', 'Senate (member, 2003-2005)', 'Vice President of the United States (2009-2017)', 'United States senator (1977-1979)', 'Senate (member, 1995-1997)', 'United States senator (1991-1993)', 'President-elect of the United States (2020-2021)', 'council member (1970-1972)', 'United States senator (2001-2003)', 'United States senator (1973-1975)', 'United States senator (1979-1981)', 'United States senator (2003-2005)', 'Senate (member, 2001-2003)', 'United States senator (1995-1997)', 'United States senator (1975-1977)', 'Senate (member, 1997-1999)', 'Senate (m

In [20]:
# Create copy of dates in dateformat (original dates to be converted into sentences) 
properties['birthdate']=properties['birthDate']
properties['deathdate']=properties['deathDate']

In [21]:
# Convert keywords accross fields into full sentences. 
cols_to_sentence={
    'gender':'This person is a ',
    'position':'This person has held these positions: ',
    'birthDate':'This person was born in ',
    'birthPlace': 'This person was born in ',
    'deathDate':'This person died in ',
    'keywords':'This person has worked in: ',
    'sector':'This person worked for: ',
    'publisher':'This person was present in ',
    'pastFlags':'In the past this person was at ',
    'ethnicity':'This person\'s ethnicity is '
}
for col,sentence in cols_to_sentence.items():
    transform_into_sentence(properties,col,sentence)

In [22]:
# Edit most common position abbreviations 
positions_in_full={'Min\.':'Minister',
                   'Dep\.':'Deputy',
                   'Pres\.':'President', 
                   'Chmn\.':'Chairman',
                   'Dir\.':'Director',
                   'Cdr\.':'Commander', 
                   'Sec\.':'Secretary', 
                   'Gen\.':'General',
                   'Col\.':'Colonel', 
                   'Brig\.':'Brigadier',
                   'Lt\.':'Lieutenant'}
for abbv, full in positions_in_full.items():
    properties['position']=properties['position'].str.replace(abbv, full)

  


In [23]:
# Convert country ISO alpha 2 codes into names (UK into United Kingdom)
for regionality in ['country','nationality']:
    properties.loc[~properties[regionality].isna(),regionality]=\
        properties.loc[~properties[regionality].isna(),regionality].apply(lambda x: [country for country in x if len(country)==2])

    exceptions=set()
    countries=set()
    for row in properties.loc[~properties[regionality].isna(),regionality]:
        for country in row:
            try:
                countries.add(pycountry.countries.get(alpha_2=country).name)
            except:
                exceptions.add(country)

    properties.loc[~properties[regionality].isna(),regionality]=\
        properties.loc[~properties[regionality].isna(),regionality].apply(lambda x: 
                                ','.join([pycountry.countries.get(alpha_2=country).name
                                 for country in x if country not in exceptions])
                               )

# Transform country and nationality into sentences
properties.loc[~properties['country'].isna(),'country']=properties.loc[~properties['country'].isna(),'country'].apply(lambda x: f'This person belongs to these countries: {x}.')
properties.loc[~properties['nationality'].isna(),'nationality']=properties.loc[~properties['nationality'].isna(),'nationality'].apply(lambda x: f'This person has these nationalities: {x}.')

In [24]:
# Store alternative aliases
properties['AKA'] = properties['name'] + properties['alias'] + properties['weakAlias']
properties.drop(['name','alias', 'weakAlias'],1,inplace=True)

  This is separate from the ipykernel package so we can avoid doing imports until


In [25]:
# Concatenate context columns into a single field
context_cols=[
    'position', 
    'gender',
    'birthDate', 
    'country',
    'topics', 
    'birthPlace', 
    'nationality', 
    'sector', 
    'keywords', 
    'deathDate',
    'publisher',
    'pastFlags',  
    'ethnicity'
]

i = 0
for col in context_cols:
    if i==0:
        properties['context']=properties[col].fillna('NAN')
        i+=1
    else:
        properties['context']= properties['context'] + properties[col].fillna('NAN')
properties['context']=properties['context'].str.replace('NAN','')
properties['context']=properties['context'].str.split('.').apply(lambda x:'. '.join(x))

In [26]:
# Drop redundant columns 
properties=properties[['notes','AKA','context','birthdate','deathdate']]

In [27]:
# Edit notes field
properties['notes']=properties['notes'].fillna('').apply(lambda x:' '.join(x))

In [28]:
# Merge wikidataID and website back into dataset
properties=properties.merge(wikidataIDs,left_index=True, right_index=True)
properties=properties.merge(websites,left_index=True, right_index=True)

In [29]:
# Merge properties into dataset
data=data.merge(properties,left_index=True,right_index=True)

In [30]:
data

Unnamed: 0,id,schema,caption,notes,AKA,context,birthdate,deathdate,wikidataId,website
0,acf-00040861bc3f593000830d987d09967ef3503ef1,Person,Kolyvanov Egor,"Russian propagandist: host of news program ""Se...",,This person has held these positions: Federal ...,[1980-11-15],,,
1,acf-0011c68a768924609dc5da5707ac7fa4c4d645a2,Person,Shipov Sergei Yurievich,"Russian chess player, grandmaster, chess coach...",,This person has held these positions: Athletes...,[1966-04-17],,,
2,acf-001e7e4c0363f08f1e784c230457960b84a6416f,Person,Egorov Ivan Mikhailovich,Deputy of the State Council of the Republic of...,,This person has held these positions: Regional...,[1961-01-21],,,
3,acf-002c208139012c8d93b6298358188d7cadafe648,Person,Goreslavsky Alexey Sergeyevich,Russian journalist and media manager. Helped d...,,This person has held these positions: Investig...,[1977-07-13],,,
4,acf-002cc8fdf8fe41185091a7cb6c598663e7a22eb5,Person,Samoilova Natalya Vladimirovna,"Russian singer, composer. Supported the action...",,This person has held these positions: Celebrit...,[1987-06-24],,,
...,...,...,...,...,...,...,...,...,...,...
412324,wbdeb-692816,LegalEntity,Laxminarayan Construction,,,This person belongs to these countries: India....,,,,
412325,wbdeb-692833,LegalEntity,Quvasoy Mukammal Tamir Llc,,,This person belongs to these countries: Uzbeki...,,,,
412326,wbdeb-692834,LegalEntity,Stroy Montaj Invest,,,This person belongs to these countries: Uzbeki...,,,,
412328,wbdeb-698401,LegalEntity,Göksin Insaat Gida Turizm Bilisim Tüketim Mall...,,,This person belongs to these countries: Turkey...,,,,


In [31]:
# Remove dates from list
data.loc[~data['birthdate'].isna(),'birthdate']=data.loc[~data['birthdate'].isna(),'birthdate'].apply(lambda x: x[0])
data.loc[~data['deathdate'].isna(),'deathdate']=data.loc[~data['deathdate'].isna(),'deathdate'].apply(lambda x: x[0])

In [32]:
data['schema'].value_counts()

Person          218690
Company          16560
LegalEntity       4893
Organization      3659
Vessel             524
Airplane           286
Name: schema, dtype: int64

In [33]:
# Filter out non-PERSON entities (for a PERSON only EL model) 
data=data[data['schema']=='Person']

In [34]:
data.shape

(218690, 10)

In [35]:
# Rename alias field
data=data.rename(columns={'caption':'name'})
# Ensure alias is included in the context field (for later encoding and use in training an EL model) 
min_context_thres=10
min_context_notes_indices=data[data['notes'].str.replace(' ','').apply(len)<min_context_thres].index.values
data.loc[min_context_notes_indices,'notes']=''
empty_notes_indices=data[data['notes'].str.replace(' ','').str.len()==0].index.values
name_in_notes_indices=data[data.apply(lambda x: x['name'].lower() in x['notes'].lower(),axis=1)].index.values
filled_notes_indices=data[data['notes'].str.replace(' ','').str.len()>=min_context_thres].index.values
filled_notes_indices=set(filled_notes_indices).difference(set(name_in_notes_indices))
# Include name in description when context is null
data.loc[empty_notes_indices, 'notes'] = data.loc[empty_notes_indices,'name'].apply(lambda x: f'This person is called {x}.')
# Include name in description when context is not null
data.loc[filled_notes_indices,'notes']=data.loc[filled_notes_indices, 'name'].apply(lambda x: f'{x} is a ') + data.loc[filled_notes_indices,'notes'] + '.'

In [36]:
# Convert crime vocabulary into sentences 
crime_vocab=['murder', 
            'fraud', 
            'corruption',
            'conspiracy',
            'crime', 
            'dealing', 
            'drug', 
            'trafficking', 
            'criminal', 
            'cheating', 
            'forgery', 
            'robbery', 
            'violen', #violent, violence
            'sexual', 
            'rape', 
            'assault', 
            'illegal', 
            'transport',
            'travel']
default_expr = 'is a '
crime_expr = 'was involved in '

for crime in crime_vocab:
    crime_indices = data.loc[data['notes'].str.lower().str.contains(default_expr + crime)].index.values
    data.loc[crime_indices,'notes'] = data.loc[crime_indices,'notes'].str.lower().str.replace(default_expr, crime_expr)

In [37]:
# Concatenate information from context and notes fields together 
data['full_notes']= data['notes'].fillna('') + ' ' + data['context'].fillna('')

## Export dataset

In [38]:
data.to_csv('../assets/open_sanctions_entities.csv')