In [1]:
import requests
import json
import pandas as pd
import numpy as np
import glob
import pycountry
import re

In [2]:
# Raise pandas' display limits (rows, columns, line width) for the previews below
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

## Download open sanctions data as .json

## Import open sanctions data .json

In [3]:
# Read the export line by line; each line is parsed as its own JSON document below.
# The encoding is passed explicitly because the records contain non-ASCII text
# (e.g. Cyrillic aliases) and the platform default encoding is not always UTF-8.
with open('open_sanctions.json', encoding='utf-8') as f:
    sanctions_data = f.readlines()

In [4]:
json.loads(sanctions_data[0])

{'id': 'acf-00040861bc3f593000830d987d09967ef3503ef1',
 'target': True,
 'first_seen': '2022-05-05T15:26:25',
 'last_seen': '2022-10-05T06:32:14',
 'schema': 'Person',
 'properties': {'notes': ['Russian propagandist: host of news program "Segodnia" ("Today") on NTV'],
  'position': ['Propagandists', 'Federal media employees'],
  'gender': ['male'],
  'name': ['Kolyvanov Egor'],
  'alias': ['Колыванов Егор'],
  'birthDate': ['1980-11-15']},
 'referents': [],
 'datasets': ['ru_acf_bribetakers'],
 'caption': 'Kolyvanov Egor'}

In [5]:
# Parse every line of the export into a dict (one entity per line)
data = [json.loads(raw_line) for raw_line in sanctions_data]

In [6]:
len(data)

431828

In [7]:
data[0]

{'id': 'acf-00040861bc3f593000830d987d09967ef3503ef1',
 'target': True,
 'first_seen': '2022-05-05T15:26:25',
 'last_seen': '2022-10-05T06:32:14',
 'schema': 'Person',
 'properties': {'notes': ['Russian propagandist: host of news program "Segodnia" ("Today") on NTV'],
  'position': ['Propagandists', 'Federal media employees'],
  'gender': ['male'],
  'name': ['Kolyvanov Egor'],
  'alias': ['Колыванов Егор'],
  'birthDate': ['1980-11-15']},
 'referents': [],
 'datasets': ['ru_acf_bribetakers'],
 'caption': 'Kolyvanov Egor'}

In [8]:
# Turn the list of parsed entity dicts into a DataFrame with one row per entity
data=pd.DataFrame(data)

In [9]:
data.head()

Unnamed: 0,id,target,first_seen,last_seen,schema,properties,referents,datasets,caption
0,acf-00040861bc3f593000830d987d09967ef3503ef1,True,2022-05-05T15:26:25,2022-10-05T06:32:14,Person,{'notes': ['Russian propagandist: host of news...,[],[ru_acf_bribetakers],Kolyvanov Egor
1,acf-0011c68a768924609dc5da5707ac7fa4c4d645a2,True,2022-07-09T18:14:08,2022-10-05T06:32:14,Person,"{'name': ['Shipov Sergei Yurievich'], 'notes':...",[],[ru_acf_bribetakers],Shipov Sergei Yurievich
2,acf-001e7e4c0363f08f1e784c230457960b84a6416f,True,2022-05-05T15:26:25,2022-10-05T06:32:14,Person,"{'name': ['Egorov Ivan Mikhailovich'], 'alias'...",[],[ru_acf_bribetakers],Egorov Ivan Mikhailovich
3,acf-002c208139012c8d93b6298358188d7cadafe648,True,2022-09-20T01:05:26,2022-10-05T06:32:14,Person,"{'name': ['Goreslavsky Alexey Sergeyevich'], '...",[],[ru_acf_bribetakers],Goreslavsky Alexey Sergeyevich
4,acf-002cc8fdf8fe41185091a7cb6c598663e7a22eb5,True,2022-09-20T01:05:26,2022-10-05T06:32:14,Person,"{'gender': ['female'], 'name': ['Samoilova Nat...",[],[ru_acf_bribetakers],Samoilova Natalya Vladimirovna


In [10]:
# Drop columns that are not used by any later step.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which is
# deprecated in pandas 1.x and no longer accepted by pandas 2.x.
data = data.drop(columns=['target','first_seen','last_seen','datasets'])

  """Entry point for launching an IPython kernel.


In [11]:
# Collect every property key that occurs in any entity, in first-seen order
unique_keys = list(dict.fromkeys(
    prop_name
    for entity_props in data['properties']
    for prop_name in entity_props.keys()
))

In [12]:
# Only keep entities with an entry on the name field
df_dict = {}
for position, entity_props in enumerate(data['properties']):
    # Pad the dict in place so every entity exposes the same set of keys
    for prop_name in unique_keys:
        entity_props.setdefault(prop_name, '')
    if not entity_props['name']:
        continue
    df_dict[position] = entity_props

In [13]:
len(df_dict)

244612

In [14]:
# The nested properties now live in `df_dict`; drop the raw columns from the entity frame.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer accepts.
data = data.drop(columns=['properties','referents'])

  """Entry point for launching an IPython kernel.


In [15]:
# Standardise lower and upper cases in the kb names
# (str.title() upper-cases the first letter of every word and lower-cases the rest)
data['caption']=data['caption'].str.title()

In [16]:
# Build the properties frame: one row per kept entity, labelled with the keys of df_dict
properties=pd.DataFrame.from_dict(df_dict,orient='index')

In [17]:
properties.shape

(244612, 112)

In [18]:
# Replace the '' placeholders with NaN so the isna()-based steps below see them as missing
properties=properties.replace('',np.nan)

In [19]:
# Boolean flag per column: True when every value in that column is missing
cols_to_drop = properties.isna().all(axis=0)

In [20]:
# Keep only the labels of the flagged columns
cols_to_drop = cols_to_drop[cols_to_drop].index.tolist()

In [21]:
# Remove the columns collected in `cols_to_drop`.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer accepts.
properties = properties.drop(columns=cols_to_drop)

  """Entry point for launching an IPython kernel.


In [22]:
# Keep a separate copy of the Wikidata ids, with each list flattened to 'a, b, ...'
wikidataIDs = properties['wikidataId'].copy()
has_wikidata_id = wikidataIDs.notna()
wikidataIDs[has_wikidata_id] = wikidataIDs[has_wikidata_id].apply(', '.join)

In [23]:
# Keep a separate copy of the websites, with each list flattened to 'a, b, ...'
websites = properties['website'].copy()
has_website = websites.notna()
websites[has_website] = websites[has_website].apply(', '.join)

In [24]:
# manually selected columns to drop
# (each label is listed once; the duplicated 'ogrnCode' entry was removed)
more_cols_to_drop=[
                'bikCode',
                'dunsCode',
                'callSign',
                'tonnage',
                'grossRegisteredTonnage',
                'ogrnCode',
                'innCode',
                'leiCode',
                'swiftBic',
                'classification',
                'program',
                'sourceUrl',
                'addressEntity',
                'imoNumber',
                'mmsi',
                'registrationNumber',
                'modifiedAt',
                'idNumber',
                'passportNumber',
                'phone',
                'kppCode',
                'vatCode',
                'serialNumber',
                'owner',
                'opencorporatesUrl',
                'taxNumber',
                'flag',
                'status',
                'jurisdiction',
                'wikidataId',
                'email',
                'website',
                'education',
                'type',
                'firstName',
                'secondName',
                'createdAt',
                'middleName',
                'lastName',
                'title',
                'religion',
                'buildDate',
                'model',
                'incorporationDate',
                'previousName',
                'fatherName',
                'motherName',
                'address',
                'legalForm',
                ]

In [25]:
# Remove the manually selected columns.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer accepts.
properties = properties.drop(columns=more_cols_to_drop)

  """Entry point for launching an IPython kernel.


In [26]:
# Convert topics into sentences
# Topic code -> human-readable label.
topic_labels={
    'crime':'Crime',
    'crime.fraud':'Fraud',
    'crime.cyber':'Cybercrime',
    'crime.fin':'Financial crime',
    'crime.theft':'Theft',
    'crime.war':'War crimes',
    'crime.boss':'Criminal leadership',
    'crime.terror':'Terrorism',
    'crime.traffick':'Trafficking',
    'crime.traffick.drug':'Drug trafficking',
    'crime.traffick.human':'Human trafficking',
    'corp.offshore':'Offshore',
    'corp.shell':'Shell company',
    'gov':'Government',
    'gov.national':'National government',
    'gov.state':'State government',
    'gov.muni':'Municipal government',
    'gov.soe':'State-owned enterprise',
    'gov.igo':'Intergovernmental organization',
    'fin':'Financial services',
    'fin.bank':'Bank',
    'fin.fund':'Fund',
    # NOTE(review): 'fin.adivsor' looks like a misspelling of 'fin.advisor'. Both
    # spellings are mapped so whichever one the export uses resolves — confirm
    # against the data and drop the unused one.
    'fin.adivsor':'Financial advisor',
    'fin.advisor':'Financial advisor',
    'role.pep':'Politician',
    'role.rca':'Close Associate',
    'role.judge':'Judge',
    'role.civil':'Civil servant',
    'role.diplo':'Diplomat',
    'role.lawyer':'Lawyer',
    'role.acct':'Accountant',
    'role.spy':'Spy',
    'role.oligarch':'Oligarch',
    'role.journo':'Journalist',
    'role.act':'Activist',
    'pol.party':'Political party',
    'pol.union':'Union',
    'rel':'Religion',
    'mil':'Military',
    'asset.frozen':'Frozen asset',
    'sanction':'Sanctioned entity',
    'debarment':'Debarred entity',
    'poi':'Person of interest'}

# Topic code -> full sentence. Built as a new dict instead of rewriting the
# label dict while iterating over it.
topics_translation_dict={code: f'Associated with {label}.' for code, label in topic_labels.items()}

# Missing topics become '' (iterating '' yields nothing, so the join is empty).
# A topic code that is not in the mapping still raises KeyError, as before.
properties['topics']=properties['topics'].fillna('').apply(
    lambda codes: ' '.join(topics_translation_dict[code] for code in codes))

In [27]:
def transform_into_sentence(df,col,sentence,separator=', '):
    """Rewrite, in place, every non-missing entry of `df[col]` as a sentence.

    Each non-missing entry is expected to be an iterable of strings. Its items
    are joined with `separator`, prefixed with `sentence` and closed with a
    full stop. Missing entries are left untouched. Nothing is returned.
    """
    populated = df[col].notna()
    df.loc[populated, col] = df.loc[populated, col].apply(
        lambda items: f'{sentence}{separator.join(items)}.'
    )

In [28]:
def show_not_na(df,col,n_rows=20):
    """Return the first `n_rows` rows of `df` whose value in `col` is not missing."""
    populated = df[col].notna()
    return df.loc[populated].head(n_rows)

In [29]:
data.columns

Index(['id', 'schema', 'caption'], dtype='object')

In [30]:
# Remove dates and ordinals from each string in the list
# Raw strings keep `\d` / `\s` as regex escapes rather than (invalid) Python string escapes.
date_expr = re.compile(r'\d{4}-\d{4}')
digit_expr = re.compile(r'\s\d{2}[a-zA-Z]{2}\s')

def _clean_positions(positions):
    """Strip year ranges, parentheses, commas and ordinals from each position string.

    Returns a list without duplicates, in first-seen order.
    """
    cleaned = []
    for position in positions:
        position = date_expr.sub(' ', position)
        # Remove parentheses and comma
        position = position.replace('(','').replace(')','').replace(',','')
        # Ordinals are matched after the parentheses are gone, so ' (12th) ' is caught too
        position = digit_expr.sub(' ', position)
        cleaned.append(position)
    # Remove position redundancy. dict.fromkeys de-duplicates like set() but keeps
    # a stable order, so the generated sentence is reproducible between runs.
    return list(dict.fromkeys(cleaned))

has_position = properties['position'].notna()
properties.loc[has_position,'position']=properties.loc[has_position,'position'].apply(_clean_positions)

In [31]:
# export dates
# Keep the raw date values under lower-case column names; the camelCase
# 'birthDate' / 'deathDate' columns are rewritten into sentences further down.
properties['birthdate']=properties['birthDate']
properties['deathdate']=properties['deathDate']

In [32]:
# Sentence prefix per column; the joined values and a closing '.' are appended
# by transform_into_sentence.
cols_to_sentence = dict(
    gender='This person is a ',
    position='This person has held these positions: ',
    birthDate='This person was born in ',
    birthPlace='This person was born in ',
    deathDate='This person died in ',
    keywords='This person has worked in: ',
    sector='This person worked for: ',
    publisher='This person was present in ',
    pastFlags='In the past this person was at ',
    ethnicity="This person's ethnicity is ",
)
for column_name, prefix in cols_to_sentence.items():
    transform_into_sentence(properties, column_name, prefix)

In [33]:
# Fix most common position abbreviations
# Keys are regular expressions (the dot is escaped); raw strings keep `\.` from
# being read as an invalid Python string escape.
positions_in_full={r'Min\.':'Minister',
                   r'Dep\.':'Deputy',
                   r'Pres\.':'President',
                   r'Chmn\.':'Chairman',
                   r'Dir\.':'Director',
                   r'Cdr\.':'Commander',
                   r'Sec\.':'Secretary',
                   r'Gen\.':'General',
                   r'Col\.':'Colonel',
                   r'Brig\.':'Brigadier',
                   r'Lt\.':'Lieutenant'}
for abbv, full in positions_in_full.items():
    # regex=True is explicit because the default of Series.str.replace changed to
    # literal matching in pandas 2.0, which would make these patterns never match.
    # The leading \b keeps an abbreviation from matching inside a longer word
    # (e.g. 'Admin.' must not turn into 'AdMinister').
    properties['position']=properties['position'].str.replace(rf'\b{abbv}', full, regex=True)

  


In [34]:
# Convert country ISO alpha 2 codes into names

In [35]:
def _alpha2_to_name(code):
    """Return the pycountry name for an alpha-2 country code, or None when it is unknown."""
    try:
        match = pycountry.countries.get(alpha_2=code)
    except LookupError:
        # NOTE(review): some pycountry releases appear to raise for unknown codes
        # instead of returning None — confirm for the installed version.
        return None
    return None if match is None else match.name

for regionality in ['country','nationality']:
    has_value = properties[regionality].notna()
    # Only two-character entries are kept; they are treated as alpha-2 codes
    codes = properties.loc[has_value,regionality].apply(
        lambda x: [country for country in x if len(country)==2])

    # Resolve every distinct code once instead of once per row
    names_by_code = {}
    exceptions = set()   # codes pycountry could not resolve
    for row in codes:
        for country in row:
            if country in names_by_code or country in exceptions:
                continue
            name = _alpha2_to_name(country)
            if name is None:
                exceptions.add(country)
            else:
                names_by_code[country] = name
    countries = set(names_by_code.values())   # resolved names, kept for inspection

    # Rows left without any resolvable code become NaN, so the sentence step that
    # follows skips them instead of producing a sentence with an empty list.
    properties.loc[has_value,regionality]=codes.apply(
        lambda x: ','.join(names_by_code[country] for country in x if country in names_by_code) or np.nan)

In [36]:
# Transform country and nationality into sentences
sentence_builders = {
    'country': lambda names: f'This person belongs to these countries: {names}.',
    'nationality': lambda names: f'This person has these nationalities: {names}.',
}
for column_name, build_sentence in sentence_builders.items():
    filled = properties[column_name].notna()
    properties.loc[filled, column_name] = properties.loc[filled, column_name].apply(build_sentence)

In [37]:
def _as_name_list(value):
    """Return `value` as a list of names; anything that is not a list/tuple/set (e.g. NaN) counts as empty."""
    return list(value) if isinstance(value, (list, tuple, set)) else []

# All known names of an entity: name + alias + weakAlias.
# Adding the columns directly yields NaN (or an error) for every row where
# 'alias' or 'weakAlias' is missing, because list + NaN is not a concatenation;
# missing parts are therefore treated as empty lists.
properties['AKA'] = pd.Series(
    [
        _as_name_list(names) + _as_name_list(aliases) + _as_name_list(weak_aliases)
        for names, aliases, weak_aliases in zip(
            properties['name'], properties['alias'], properties['weakAlias'])
    ],
    index=properties.index,
    dtype=object,
)

In [38]:
# The three name columns are now combined in 'AKA'.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer accepts.
properties = properties.drop(columns=['name','alias','weakAlias'])

  """Entry point for launching an IPython kernel.


In [39]:
# Columns whose text is concatenated, in this order, into the 'context' field
context_cols=[
    'position', 
    'gender',
    'birthDate', 
    'country',
    'topics', 
    'birthPlace', 
    'nationality', 
    'sector', 
    'keywords', 
    'deathDate',
    'publisher',
    'pastFlags',  
    'ethnicity'
]

In [40]:
# Build the free-text context by joining every available sentence with one space.
# Missing values are skipped directly, so no placeholder has to be inserted and
# stripped again (stripping 'NAN' also removed genuine text such as 'HAINAN').
# The sentences are no longer re-split on '.', which broke up abbreviations like 'U.S.'.
properties['context'] = pd.Series(
    [
        ' '.join(sentence for sentence in row if isinstance(sentence, str) and sentence)
        for row in properties[context_cols].to_numpy()
    ],
    index=properties.index,
    dtype=object,
)

In [41]:
# Keep only the columns needed for the export. The explicit copy makes the
# result an independent frame, so assigning to its columns afterwards does not
# trigger SettingWithCopyWarning.
properties=properties[['notes','AKA','context','birthdate','deathdate']].copy()

In [42]:
# Flatten each list of notes into one space-separated string ('' when missing)
notes_or_blank = properties['notes'].fillna('')
properties['notes'] = notes_or_blank.apply(' '.join)

In [43]:
# Re-attach the flattened Wikidata ids and websites by row label
properties = (
    properties
    .merge(wikidataIDs, left_index=True, right_index=True)
    .merge(websites, left_index=True, right_index=True)
)

In [44]:
# Attach the cleaned properties to the entity frame by row label. No `how` is
# given, so merge() performs its default inner join: entities without a row in
# `properties` are dropped here.
data=data.merge(properties,left_index=True,right_index=True)

In [45]:
# Replace each date list with its first element
for date_col in ('birthdate', 'deathdate'):
    has_date = data[date_col].notna()
    data.loc[has_date, date_col] = data.loc[has_date, date_col].apply(lambda dates: dates[0])

In [46]:
data['schema'].value_counts()

Person          218690
Company          16560
LegalEntity       4893
Organization      3659
Vessel             524
Airplane           286
Name: schema, dtype: int64

In [47]:
# Keep only the Person entities. The explicit copy makes the filtered frame
# independent, so the .loc assignments in later cells do not trigger
# SettingWithCopyWarning.
data=data[data['schema']=='Person'].copy()

In [48]:
# Expose the caption column as 'name'; later cells read data['name']
data=data.rename(columns={'caption':'name'})

In [49]:
data.shape

(218690, 10)

In [50]:
## Ramzan Akhmadovitch Kadyrov test case ##

In [51]:
# Include name in context info
# Rows are selected with boolean masks rather than label arrays: the previous
# version indexed .loc with a set, which pandas 2.x rejects.
min_context_thres=10

# Notes with fewer than `min_context_thres` non-space characters are treated as empty
too_short = data['notes'].str.replace(' ','',regex=False).str.len() < min_context_thres
data.loc[too_short,'notes']=''

# Case-insensitive check whether the note already mentions the entity's name
name_in_notes = pd.Series(
    [name.lower() in note.lower() for name, note in zip(data['name'], data['notes'])],
    index=data.index,
    dtype=bool,
)
needs_name = ~too_short & ~name_in_notes

# Include name in description when context is null
data.loc[too_short,'notes']=data.loc[too_short,'name'].apply(lambda x: f'This person is called {x}.')
# Include name in description when context is not null
data.loc[needs_name,'notes']=data.loc[needs_name,'name'].apply(lambda x: f'{x} is a ') + data.loc[needs_name,'notes'] + '.'

In [52]:
crime_vocab=['murder', 
            'fraud', 
            'corruption',
            'conspiracy',
            'crime', 
            'dealing', 
            'drug', 
            'trafficking', 
            'criminal', 
            'cheating', 
            'forgery', 
            'robbery', 
            'violen', #violent, violence
            'sexual', 
            'rape', 
            'assault', 
            'illegal', 
            'transport',
            'travel']
default_expr = 'is a '
crime_expr = 'was involved in '

# Rewrite '<name> is a <crime term>...' as '<name> was involved in <crime term>...'.
# One case-insensitive pattern matches `default_expr` only where a crime term
# follows directly (lookahead), so:
#   * the rest of the note, including the name, keeps its original casing
#     (previously the whole note was lower-cased), and
#   * other occurrences of 'is a ' in the same note are left alone.
crime_pattern = re.compile(
    r'\b' + re.escape(default_expr)
    + '(?=' + '|'.join(re.escape(crime) for crime in crime_vocab) + ')',
    flags=re.IGNORECASE,
)
data['notes'] = data['notes'].str.replace(crime_pattern, crime_expr, regex=True)

In [53]:
# Final description: the notes text followed by the generated context, separated by one space
data['full_notes']= data['notes'].fillna('') + ' ' + data['context'].fillna('')

In [54]:
# Write the result as CSV (no index= argument is passed, so pandas' default of writing the index applies)
data.to_csv('../../kb_datasets/open_sanctions_entities.csv')