In [1]:
import jellyfish
from fuzzywuzzy import fuzz
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import requests
from newspaper import Article
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
import dateutil.parser as parser
from geopy.geocoders import Nominatim
import pycountry
import time
from datetime import date
from dateutil.relativedelta import relativedelta
import pinyin

# Load the spaCy pipeline from the en_core_web_sm package imported above.
nlp = en_core_web_sm.load()
# Fetch the VADER lexicon through nltk's downloader.
nltk.download('vader_lexicon')

# Read the sample-names workbook using the openpyxl engine.
df = pd.read_excel("NUS sample names_V2.xlsx", engine="openpyxl")
# Swap null cells for None so later code can test `is not None`.
# NOTE(review): on numeric-dtype columns pandas may keep NaN instead of None — confirm.
df = df.where(pd.notnull(df), None)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/lionel/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# The name to be screened must be in English.
# A non-ASCII alias is assumed to be Chinese and is converted to pinyin; if the conversion fails the alias becomes None.
def preprocess_df_to_dict(df):
    """Turn the raw screening sheet into a list of cleaned client records.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose the columns 'Name to be screened', 'Alias name',
        'Date of birth', 'Gender', 'Nationality',
        'Type of variation (if any)' and 'Actual name'.

    Returns
    -------
    list of dict
        One dict per row with the keys 'name', 'alias', 'year_of_birth',
        'month_of_birth', 'day_of_birth', 'gender', 'nationality',
        'type_of_error' and 'actual_name'.

    Notes
    -----
    * An alias that is not a string (None, NaN, a number) becomes None.
    * A non-ASCII alias is passed to ``pinyin.get``; if that fails the alias
      becomes None.
    * A date field that the text does not actually contain is returned as
      None instead of being silently filled in from today's date.
    """
    # Function-scope import: the notebook only imports `date` from datetime.
    from datetime import datetime

    # dateutil copies every field that is absent from the text out of the
    # `default` datetime.  Parsing against two defaults that differ in year,
    # month and day therefore reveals which fields were really present: a
    # field that changes with the default was not in the text.  Both years
    # are leap years and both months have 31 days, so any valid day/month
    # combination parses against either default.
    default_a = datetime(2000, 1, 1)
    default_b = datetime(2004, 3, 2)

    def is_missing(value):
        """True for None / NaN / NaT style placeholders."""
        if value is None:
            return True
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # e.g. list-like cells, for which pd.isna is not a single bool
            return False

    def split_date(value):
        """Return (year, month, day); any field that is unavailable is None."""
        if is_missing(value):
            return None, None, None
        text = str(value)
        try:
            first = parser.parse(text, default=default_a)
            second = parser.parse(text, default=default_b)
        except Exception:
            # Unparseable dates are treated as unknown (best effort).
            return None, None, None

        year = first.year if first.year == second.year else None
        if len(text) > 4:
            month = first.month if first.month == second.month else None
            day = first.day if first.day == second.day else None
        else:
            # Values of four characters or fewer (e.g. a bare year) have
            # always been treated as carrying no month/day information.
            month = day = None
        return year, month, day

    def is_ascii(text):
        """True when `text` consists of ASCII characters only."""
        try:
            text.encode('ascii')
        except UnicodeEncodeError:
            return False
        return True

    cleaned_dict_list = []
    for record in df.to_dict('records'):
        alias = record['Alias name']
        if not isinstance(alias, str):
            # Covers None as well as NaN / numeric cells, which previously
            # crashed on `.encode`.
            alias = None
        elif not is_ascii(alias):
            try:
                alias = pinyin.get(alias, format='strip', delimiter=' ')
            except Exception:
                alias = None

        # Parse the date once per row rather than once per field.
        year, month, day = split_date(record['Date of birth'])

        cleaned_dict_list.append({
            'name': record['Name to be screened'],
            'alias': alias,
            'year_of_birth': year,
            'month_of_birth': month,
            'day_of_birth': day,
            'gender': record['Gender'],
            'nationality': record['Nationality'],
            ### delete these later on, for testing only###
            'type_of_error': record['Type of variation (if any)'],
            'actual_name': record['Actual name'],
        })
    return cleaned_dict_list

In [69]:
def ER_name_matching(name1, name2, names_dict=None, threshold=89):
    """Score how well two personal names match.

    The names are lower-cased, split into tokens, and each token that is a
    known variant in `names_dict` is replaced by its canonical key.  Two
    layers are then applied:

    1. ``fuzz.token_sort_ratio`` on the re-joined names; a score of at least
       `threshold` is returned as-is.
    2. Otherwise the Metaphone and NYSIIS codes of the tokens are compared as
       sets; if either set of codes is identical the pair is reported with
       the score `threshold` (assumption: a phonetic match is worth exactly
       the threshold).

    Parameters
    ----------
    name1, name2 : str
        Names to compare.
    names_dict : dict, optional
        Mapping ``canonical -> list of variants``.  When omitted it is read
        from 'names_dict.xlsx' (columns `key` and `value`, the latter a
        comma-separated list).  Pass a pre-loaded mapping when calling this
        function in a loop so the workbook is not re-read on every call.
    threshold : int, optional
        Minimum token-sort score that counts as a match (default 89).

    Returns
    -------
    int or None
        The match score, or None when neither layer matches, when either
        name has no tokens, or when the phonetic encoders fail.
    """
    def split_name_list(name):
        # split() without an argument also collapses repeated whitespace,
        # so stray double spaces no longer create empty tokens.
        return name.lower().split()

    def preprocess_name(names_dict, word):
        # Map a variant to its canonical key; unknown words pass through.
        for key, variants in names_dict.items():
            if word in variants:
                return key
        return word

    def stitch_name(tokens):
        return ' '.join(tokens)

    def phonetic_comparison(list1, list2):
        meta_1 = {jellyfish.metaphone(token) for token in list1}
        nysiis_1 = {jellyfish.nysiis(token) for token in list1}
        meta_2 = {jellyfish.metaphone(token) for token in list2}
        nysiis_2 = {jellyfish.nysiis(token) for token in list2}
        return meta_1 == meta_2 or nysiis_1 == nysiis_2

    def excel_to_dict(excel_file):
        # Rows without a key or value cannot contribute a mapping.
        excel_df = pd.read_excel(excel_file).dropna(subset=['key', 'value'])
        # The original `excel_df.value.apply(str)` discarded its result;
        # the cast is applied for real here so numeric cells can be split.
        variant_text = excel_df['value'].astype(str)
        # Variants are stripped because name tokens never contain spaces,
        # so a padded variant such as ' jon' could never match.
        return {
            str(key): [part.strip() for part in text.split(',') if part.strip()]
            for key, text in zip(excel_df['key'], variant_text)
        }

    if names_dict is None:
        names_dict = excel_to_dict('names_dict.xlsx')

    split_list_1 = [preprocess_name(names_dict, token) for token in split_name_list(name1)]
    split_list_2 = [preprocess_name(names_dict, token) for token in split_name_list(name2)]

    # An empty name carries no evidence; without this guard two blank names
    # would "match" at the threshold through the phonetic layer.
    if not split_list_1 or not split_list_2:
        return None

    # 1st layer: token sort ratio against the threshold.
    score1 = fuzz.token_sort_ratio(stitch_name(split_list_1), stitch_name(split_list_2))
    if score1 >= threshold:
        return score1

    # 2nd layer: Metaphone / NYSIIS phonetic encoding.
    try:
        matched_phonetic = phonetic_comparison(split_list_1, split_list_2)
    except Exception:
        # Best effort: an encoder failure is treated as "no match".
        return None
    return threshold if matched_phonetic else None

In [3]:
# Clean every row of the loaded sheet; despite its name, df_dict is a list of record dicts.
df_dict = preprocess_df_to_dict(df)
test_record_1 = df_dict[0]
# NOTE(review): the variable is called test_record_18 but it takes index 15 — confirm which row is meant.
test_record_18 = df_dict[15]
print(test_record_18)

{'name': 'Lange Vivian', 'alias': None, 'year_of_birth': 1997, 'month_of_birth': None, 'day_of_birth': None, 'gender': 'Female', 'nationality': 'Singapore', 'type_of_error': '-', 'actual_name': 'Lange Vivian'}


In [6]:
# Collect every cleaned record whose name contains 'Mun San' and keep the
# first hit as the client used by the screening cells further down.
outputs = [candidate for candidate in df_dict if 'Mun San' in candidate['name']]

test = outputs[0]
test

{'name': 'Son Mun San',
 'alias': None,
 'year_of_birth': 1951,
 'month_of_birth': None,
 'day_of_birth': None,
 'gender': '-',
 'nationality': None,
 'type_of_error': '-',
 'actual_name': 'Son Mun San'}

# Sanction screening algorithm

In [90]:
import time

def sanction_screening(client, max_length_diff=3, names_dict=None, sanction_records=None):
    """Screen one client against the individual sanction list.

    Each sanction record goes through two cheap pre-filters (same number of
    name tokens, and normalised names whose lengths differ by at most
    `max_length_diff` characters) before the pair is handed to
    ``ER_name_matching``.  Screening stops at the first record that yields a
    positive score.

    Parameters
    ----------
    client : dict
        Must contain the key 'name' (a string).
    max_length_diff : int, optional
        Largest allowed difference in length between the two normalised
        names (default 3).  Raise it to let more heavily truncated
        spellings through to the matcher.
    names_dict : dict, optional
        Mapping ``canonical -> list of variants``; read from
        'names_dict.xlsx' when omitted.
    sanction_records : list of dict, optional
        Records with at least a 'name' key; read from
        'cleaned_indiv_sanction_list.csv' when omitted.

    Returns
    -------
    bool
        True when a match was found (the score and the matching record are
        printed), False otherwise.
    """
    def split_name_list(name):
        return name.lower().split()

    def preprocess_name(names_dict, word):
        # Map a variant to its canonical key; unknown words pass through.
        for key, variants in names_dict.items():
            if word in variants:
                return key
        return word

    def stitch_name(tokens):
        return ' '.join(tokens)

    def excel_to_dict(excel_file):
        excel_df = pd.read_excel(excel_file).dropna(subset=['key', 'value'])
        # The original `excel_df.value.apply(str)` discarded its result;
        # the cast is applied for real here so numeric cells can be split.
        variant_text = excel_df['value'].astype(str)
        return {
            str(key): [part.strip() for part in text.split(',') if part.strip()]
            for key, text in zip(excel_df['key'], variant_text)
        }

    start = time.time()
    if names_dict is None:
        names_dict = excel_to_dict('names_dict.xlsx')
    if sanction_records is None:
        sanction_records = pd.read_csv("cleaned_indiv_sanction_list.csv").to_dict('records')

    client_name = client['name']
    split_client_name = [preprocess_name(names_dict, token) for token in split_name_list(client_name)]
    stitched_client_name = stitch_name(split_client_name)

    for record in sanction_records:
        current_sanc_name = record['name']
        if not isinstance(current_sanc_name, str):
            # Blank cells come back from read_csv as NaN; they used to crash
            # on `.lower()` because that call sat outside the try block.
            continue

        split_sanction_name = split_name_list(current_sanc_name)
        if len(split_client_name) != len(split_sanction_name):
            continue
        split_sanction_name = [preprocess_name(names_dict, token) for token in split_sanction_name]
        stitched_sanc_name = stitch_name(split_sanction_name)

        if abs(len(stitched_client_name) - len(stitched_sanc_name)) > max_length_diff:
            continue

        try:
            flag = ER_name_matching(client_name, current_sanc_name)
        except Exception:
            # Best effort: a record the matcher cannot handle is skipped.
            continue
        if flag is None or flag <= 0:
            continue

        end = time.time()
        print("Process completed in " + str(round((end-start),2)) + " seconds - Found a match in the sanction list with a score of " + str(flag))
        print(record)
        return True

    end = time.time()
    print("Process completed in " + str(round((end-start),2)) + " seconds - with no match on the sanction list")
    return False

# Screen the 'Mun San' record selected earlier; the call prints its own report and print() then shows the returned boolean.
print(sanction_screening(test))

Process completed in 1.29 seconds - Found a match in the sanction list with a score of 100
{'Unnamed: 0': 2040, 'name': 'son mun san', 'title': 'External Affairs Bureau Chief, General Bureau of Atomic Energy', 'dob': '23-Jan-51', 'pob': nan, 'nationality': nan, 'citizenship': nan, 'aliases': nan}
True


In [74]:
# NOTE(review): test1 is assigned in a cell that appears below this one (In [54]), so a
# top-to-bottom run fails here with NameError unless test1 is defined elsewhere.
# NOTE(review): the recorded output below does not match the message format of the
# sanction_screening definition shown in this notebook (no rounding, no " seconds") — it
# looks like it came from an earlier version; re-run to refresh.
print(sanction_screening(test1))

Process completed in 8.22984504699707- Found a match in the sanction list with a score of 94
{'Unnamed: 0': 1902, 'name': 'chaudhry aamir ali', 'title': nan, 'dob': '3-Aug-86', 'pob': nan, 'nationality': 'pakistan', 'citizenship': nan, 'aliases': "['huzaifa']"}
True


In [54]:

# Hand-written client records for screening; each only carries the 'name' key that sanction_screening reads.
test1 = {'name' : 'aamir ali chaudary'} # in the sanction list
test2 = {'name' : 'lionel lew'}  # presumably a negative control (not expected on the list) — confirm
test3 = {'name' : 'murov evgen aleksyevic'}  # shortened spelling of 'murov evgeniy alekseyevich'

In [57]:
# Character-length gap between a full spelling (test4) and the shortened one used in test3 (test5).
# The gap is 4, which exceeds the `> 3` length pre-filter in sanction_screening, so that pair
# would be skipped before matching unless names_dict rewrites a token and changes the lengths.
test4 = 'murov evgeniy alekseyevich'
test5 = 'murov evgen aleksyevic'
abs(len(test4)-len(test5))

4

In [75]:
# NOTE(review): the recorded output below does not match the message format of the
# sanction_screening definition shown in this notebook (no rounding, no " seconds"), so it
# appears to predate the length pre-filter; re-run to confirm test3 is still matched.
print(sanction_screening(test3))

Process completed in 9.4784677028656- Found a match in the sanction list with a score of 94
{'Unnamed: 0': 2223, 'name': 'murov evgeniy alekseyevich', 'title': 'Director of the Federal Protective Service of the Russian Federation; Army General', 'dob': '18-Nov-45', 'pob': 'zvenigorod, moscow, russia', 'nationality': nan, 'citizenship': nan, 'aliases': nan}
True


In [65]:
# NOTE(review): the first literal is spelled 'alekseyevic', which is one character longer than
# the 'aleksyevic' used in test3/test5, so the 23-vs-26 result here does not describe test3.
print(len('murov evgen alekseyevic'))
print(len('murov evgeniy alekseyevich'))


23
26


In [85]:
# Ad-hoc check of the matcher on two differently misspelled versions of the same name.
print(ER_name_matching("aamir ali chaudary", "amir ali chaundaary"))

94


# Name Matching