In [12]:
import jellyfish
from fuzzywuzzy import fuzz
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import requests
from newspaper import Article
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
import dateutil.parser as parser

nlp = en_core_web_sm.load()
nltk.download('vader_lexicon')
df = pd.read_excel("NUS sample names_V2.xlsx", engine="openpyxl")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/lionel/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Web Scraper

In [13]:
# Main Function
def search_articles_on_individual(individual_dict, no_of_articles=10):
    def generate_link(person_dict, attributes_used = ['name', 'alias'], keywords=['crimes', 'sentenced']):
        link_start = "https://www.google.com/search?q="
        link_end = "&sxsrf=ALeKk01K1bOuJFHjy4HBARo1cRpUYakYPg:1629640327633&source=lnms&tbm=nws&sa=X&sqi=2&ved=2ahUKEwiu29um48TyAhWGqpUCHYuoAlcQ_AUoAnoECAEQBA&biw=1441&bih=718&dpr=2" 
        link_query = ""

        for attributes in attributes_used:
            temp_attr = person_dict[attributes]
            if temp_attr is not None:
                temp_attr = str(temp_attr)
                link_query += temp_attr.replace(' ', '+') + '+'       
                
        links = []
        for keyword in keywords:
            temp_search_link = link_start + link_query + keyword + link_end + "&num=" + str(no_of_articles)
            links.append(temp_search_link)
        return links
    
    def article_extraction(link):
        article = Article(link)
        article.download()
        try:
            article.parse()
        except:
            pass
        return article.text
    def parse(text):
        #try:     
        doc = nlp(text)
        tags = [[X.text, X.label_] for X in doc.ents]
        labels = [x.label_ for x in doc.ents]
        items = [x.text for x in doc.ents]

        return tags
    def find_names(tags):
        names = []
        for tag in tags:
            if tag[1] == 'PERSON' and tag[0] not in names:
                names.append(tag[0])
        return names
    def sentiment_analysis(text):
        return vader.polarity_scores(text)

    
    link = generate_link(individual_dict)
    
    unique_links_checker = []
    
    output = []
    for x in link:
        print("SEARCHING THIS", x)
        req = Request(x, headers = {'User-Agent': 'Mozilla/5.0'})

        webpage = urlopen(req).read()

        links = []
        with requests.Session() as c:
            soup = BeautifulSoup(webpage, 'html5lib')
            #print(soup)
            for item in soup.find_all('div', attrs = {'class': "ZINbbc xpd O9g5cc uUPGi"}):
                current_dict = {}
                raw_link = (item.find('a', href = True)['href'])
                link = (raw_link.split("/url?q=")[1]).split('&sa=U&')[0]
                if link not in unique_links_checker:
                    unique_links_checker.append(link)
                    title = (item.find('div',attrs = {'class': 'BNeawe vvjwJb AP7Wnd'})).get_text()
                    description  = (item.find('div',attrs = {'class': 'BNeawe s3v9rd AP7Wnd'}).get_text())
                    time = description.split(" · ")[0]
                    #print(description)
                    descript = description.split(" · ")[1]
                    current_dict['title'] = title
                    current_dict['time'] = time
                    current_dict['description'] = descript
                    current_dict['link'] = link
                    current_dict['text'] = article_extraction(link)
                    # current_dict['sentiment?'] = sentiment_analysis(article_extraction(link))
                    parsed_description = parse(description)
                    names_in_description = find_names(parsed_description)
                    
                    current_dict['desc_names'] = names_in_description
                    
                    parsed_text = parse(article_extraction(link))
                    names_in_text = find_names(parsed_text)
                    
                    current_dict['text_names'] = names_in_text
                    output.append(current_dict)
                else:
                    pass
    return output


new_words = {
    'crushes': 10,
    'beats': 5,
    'misses': -5,
    'trouble': -10,
    'falls': -100,
    'bankrupt': -100,
    'fraud': -100,
    'crime': -100,
    'prison': -100,
    'embezzlement': -100,
    'money_trail':-100,
    'jail': -100,
    'sentenced':-100,
    'forfeiture': -100,
    'ill-gotten gain':-100,
    'laundering':-100
}


vader = SentimentIntensityAnalyzer()

vader.lexicon.update(new_words)

In [14]:
def preprocess_df_to_dict(df):
    def get_year(date):
        try:
            parser_obj = parser.parse(str(date))
            return parser_obj.year
        except:
            return None
    def get_month(date):
        if len(str(date))>4:
            try:
                return parser.parse(str(date)).month
            except:
                return None
        else:
            return None
    def get_day(date):
        if len(str(date))>4:
            try:
                return parser.parse(str(date)).day
            except:
                return None
        else:
            return None    
    
    df_dict_list = df.to_dict('records')
    cleaned_dict_list = []
    for record in df_dict_list:   
        current_record = {
            'name': record['Name to be screened'],
            'alias' : None if 'nan' else record['alias'],
            'year_of_birth': get_year(record['Date of birth']),
            'month_of_birth': get_month(record['Date of birth']),
            'day_of_birth': get_day(record['Date of birth']),
            'gender': None if 'nan' else record['Gender'],
            'nationality': None if 'nan' else record['Nationality'],
            ### delete these later on, for testing only###
            'type_of_error': record['Type of variation (if any)'],
            'actual_name': record['Actual name'],
        }
        cleaned_dict_list.append(current_record)
    return cleaned_dict_list

In [15]:
df_dict = preprocess_df_to_dict(df)
test_record_1 = df_dict[0]
print(test_record_1)

{'name': 'Chia Teck Leng', 'alias': None, 'year_of_birth': 1960, 'month_of_birth': None, 'day_of_birth': None, 'gender': None, 'nationality': None, 'type_of_error': '-', 'actual_name': 'Chia Teck Leng'}


In [16]:
test_query = search_articles_on_individual(test_record_1, 10)
pd.DataFrame(test_query)

SEARCHING THIS https://www.google.com/search?q=Chia+Teck+Leng+crimes&sxsrf=ALeKk01K1bOuJFHjy4HBARo1cRpUYakYPg:1629640327633&source=lnms&tbm=nws&sa=X&sqi=2&ved=2ahUKEwiu29um48TyAhWGqpUCHYuoAlcQ_AUoAnoECAEQBA&biw=1441&bih=718&dpr=2&num=10
SEARCHING THIS https://www.google.com/search?q=Chia+Teck+Leng+sentenced&sxsrf=ALeKk01K1bOuJFHjy4HBARo1cRpUYakYPg:1629640327633&source=lnms&tbm=nws&sa=X&sqi=2&ved=2ahUKEwiu29um48TyAhWGqpUCHYuoAlcQ_AUoAnoECAEQBA&biw=1441&bih=718&dpr=2&num=10


Unnamed: 0,title,time,description,link,text,desc_names,text_names
0,Guilty As Charged: Chia Teck Leng led a double...,63 months ago,Asia Pacific Breweries executive Chia Teck Len...,https://www.straitstimes.com/singapore/courts-...,This story was first published in July 2015 in...,[Chia Teck Leng],"[Chia Teck Leng, Chia, Tay Yong Kwang, Tay, Li..."
1,Guilty As Charged: Shocking crimes that have s...,64 months ago,Chia Teck Leng secretly led a double life as a...,https://www.straitstimes.com/singapore/courts-...,The following stories were first published in ...,[Chia Teck Leng],"[Ms Jenny Cheok Cheng Kid, Mimi Wong, Mimi Won..."
2,John Soh could face longest-ever jail term for...,54 months ago,"SINGAPORE (March 3): Chia Teck Leng, a former ...",https://www.theedgemarkets.com/article/john-so...,"SINGAPORE (March 3): Chia Teck Leng, a former ...",[Chia Teck Leng],"[Chia Teck Leng, Chia, John Soh Chee Wen, Soh,..."
3,Accountant who embezzled $40m a compulsive gam...,27 months ago,While Ewe did not amass riches from his crimes...,https://www.straitstimes.com/singapore/courts-...,SINGAPORE - Certified public accountant Ewe Pa...,[Chia Teck Leng],"[Ewe Pang Kooi, Ewe, Chan Seng, Michael Khoo -..."
4,"Sherlock Holmes of Shenton Way, Brunch - THE B...",38 months ago,PwC forensics leader Chan Kheng Tek adds that ...,https://www.businesstimes.com.sg/brunch/sherlo...,,"[Chan Kheng, Chia Teck Leng]",[]
5,A higher standard of financial prudence and re...,123 months ago,Chia Teck Leng (“Chia”) was the finance manage...,https://www.lexology.com/library/detail.aspx%3...,,"[Chia Teck Leng, Chia]",[]
6,Accountant who embezzled $40 million a compuls...,27 months ago,"In March, he was convicted of 50 charges of cr...",https://www.asiaone.com/singapore/accountant-w...,SINGAPORE - Certified public accountant Ewe Pa...,[Chia Teck Leng],"[Ewe Pang Kooi, Ewe, Chan Seng, Michael Khoo -..."
7,Traffic police officer dies in accident in Ser...,51 months ago,— with Chia So Chai and Leng Lim. 4.6K. 1.7K,https://www.straitstimes.com/singapore/acciden...,SINGAPORE - A traffic police officer died in a...,"[Chia So Chai, Leng Lim]","[Sergeant Nadzrie Matin, Sergeant Nadzrie]"
8,"When being mobile means quality of life, Lates...",40 months ago,"Mr Jason Peh, an occupational therapist at Kho...",https://www.tnp.sg/news/singapore/when-being-m...,“It allows them to connect with their social n...,"[Jason Peh, Thang Leng Leng]","[Thang Leng Leng, Goh Lam Chie, Warren Chew, V..."
9,COVID-19 pandemic (2019-21),10 months ago,"March 4, 2021 Aaron Leong, Joanne B. Cole, Lau...",https://collections.plos.org/collection/covid-19/,Biology & Life Sciences\n\nThis Collection of ...,"[Aaron Leong, Joanne B. Cole, Laura N. Brenner...",[]


In [21]:
type(test_query)

list

# Name Matching

In [38]:
def ER_name_matching(name1, name2):
    def split_name_list(name):
        name = name.lower()
        output = name.split(" ")
        return output
    def preprocess_name(names_dict, word):
        for key, value in names_dict.items():
            if word in value:
                return key
        else:
            return word
    def stitch_name(list1):
        output = ''
        for x in range(len(list1)):
            if x==0:
                output += list1[x]
            else:
                output += ' ' + list1[x]
        return output
    def phonetic_comparison(list1, list2):
        meta_list1 = []
        meta_list2 = []
        nysiis_list1 = []
        nysiis_list2 = []
        for name_1 in list1:
            meta_list1.append(jellyfish.metaphone(name_1))
            nysiis_list1.append(jellyfish.nysiis(name_1))
        for name_2 in list2:
            meta_list2.append(jellyfish.metaphone(name_2))
            nysiis_list2.append(jellyfish.nysiis(name_2))
        if (set(meta_list1) == set(meta_list2)) or (set(nysiis_list1) == set(nysiis_list2)):
            return True
        else:
            return False
    names_dict = {
        'gan' : ['kan','ban'],
        'shawn' : ['sean', 'shon', 'shaun'],
        'huang' : ['wang'],
        'michael' : ['mikael'],
        'joko' : ['djoko'],
        'budy' : ['budi'],
        'jang' : ['chang'],
        'song' : ['sung', 'seong'],
        'jo' : ['cho'],
        'jun' : ['chun'],
        'yong' : ['ryong'],
        'jong' : ['jung'],
        'hyeok' : ['hyok'],
        'mun' : ['moon'],
        'zhi' : ['zih'],
        'qian' : ['chian'],
        'kuin' : ['quin'],
        # Short/Long form
        'bernard' : ['bern', 'bernie'],
        'senior' : ['sr.', 'snr.', 'sr', 'snr'],
        'junior' : ['jr.', 'jnr.', 'jr', 'jnr'],
        'michael' : ['mike','mikey'],
        'barbara' : ['barb'],
        'catherine' : ['cathy', 'kathy'],
        'kate' : ['cate'],
        'clayton' : ['clay'],
        'clifford' : ['cliff'],
        'clinton' : ['cliff'],
        'curtis' : ['curt'],
        'kurtis' : ['kurt'],
        'daniel' : ['dan'],
        'deborah' : ['deb', 'debbie'],
        'dennis' : ['denny'],
        'nicholas' : ['nick', 'nic', 'nikolas', 'nicolas', 'nickolas', 'nicky'],
        'dominic' : ['dom', 'dominik', 'dominick'],
        'douglas' : ['doug'],
        'donald' : ['don', 'donnie'],
        'gabrielle' : ['gabby', 'gabi'],
        'gabriel' : ['gabe', 'gaby'],
        'gwendolyn' : ['gwen', 'gwendy'],
        'herbert' : ['herb'],
        'howard' : ['howie'],
        'hunter' : ['hunt'],
        'javier' : ['havier'],
        'jacob' : ['jake'],
        'josephine' : ['jo','josephina'],
        'joshua' : ['josh'],
        'jessica' : ['jess'], #KIV
        'madeline' : ['magdaline', 'maddy', 'mad', 'madelina', 'maddie'],
        'marvin' : ['marv'],
        'marcus' : ['marc', 'markus'],
        'matthew' : ['matty', 'matt'],
        'maximillian' : ['max','maxi','maxie','maxim', 'maxam', 'maxus'],
        'megan' : ['meg'],
        'mitchell' : ['mitch'],
        'natasha' : ['tasha', 'tash'],
        'nelson' : ['nel'],
        'oliver' : ['olivier', 'ollie'],
        'oscar' : ['ozzy'],
        'pamela' : ['pam'],
        'patricia' : ['patty', 'patsy'],
        'penelope' : ['penny'],
        'percipval' : ['percy'],
        'peter' : ['pete', 'petey'],
        'phillip' : ['philip', 'pilip', 'filip', 'filipo', 'phil'],
        'quincy' : ['quin','quinn'],
        'rachel' : ['rach'],
        'raphael' : ['ralph', 'rafi'],
        'raymond' : ['ray'],
        # 'richard' : ['rich', 'rick', 'dicky', 'dick'], 
        'ronald' : ['ron', 'ronny'],
        'roxanne' : ['rox', 'roxy', 'roxie'],
        'russell' : ['russ'],
        'samuel' : ['sam'],
        'sebastian' : ['seb', 'sebby'],
        'sidney' : ['sydney','sid'],
        'spencer' : ['spence'],
        'stanley' : ['stan'],
        'stephan' : ['steph', 'stef', 'stefan'],
        'steven' : ['steve', 'stevie', 'stevo'],
        'stuart' : ['stuie', 'stu'],
        'spencer' : ['spence'],
        'susan' : ['sue', 'susie', 'suzy'],
        'sylvia' : ['sylvie'],
        'sophia' : ['sofie', 'sofia', 'sophie'],
        'tracy' : ['tracey'],
        'teresa' : ['teri', 'terri', 'terry', 'terrie'],
        'thomas' : ['tom', 'tommmy'],
        'tiana' : ['tia'],
        'theodore' : ['teddy', 'ted', 'theo'],
        'tiffany' : ['tiff', 'tiffy', 'tiffani'],
        'timothy' : ['tim', 'timo', 'timmy'],
        'tobias' : ['toby', 'tobi'],
        'trenton' : ['trent'],
        'trevor' : ['trev'],
        'travis' : ['trav'],
        'tyrone' : ['tyron'],
        'victoria' : ['vicky', 'vick', 'vicki'],
        'wallace' : ['wally'],
        'wesley' : ['wes'],
        'william' : ['will', 'willy'],
        'xander' : ['xan'],
        'xavier' : ['xav', 'xavi'],
        'zachary' : ['zach', 'zac', 'zacharias', 'zack', 'zach'],
        'zane' : ['zayn', 'zain'],
        'zoe' : ['zoey'],
    }    
    
    # START #
    ### Change this if needed ###
    threshold = 89
    #############################
    
    split_list_1 = split_name_list(name1)
    split_list_2 = split_name_list(name2)    
    
    for i in range(len(split_list_1)):
        split_list_1[i] = preprocess_name(names_dict, split_list_1[i])
    for i in range(len(split_list_2)):
        split_list_2[i] = preprocess_name(names_dict, split_list_2[i])
    
    stitched_name1 = stitch_name(split_list_1)
    stitched_name2 = stitch_name(split_list_2)
    
    # 1st layer of testing: Token Sort Ratio with threshold
    score1 = fuzz.token_sort_ratio(stitched_name1, stitched_name2)
    if score1 >= threshold:
        # score_list.append(score1)
        return score1
        # do something
# 4) 2nd layer of testing - Metaphone and NYSIIS phonetic encoding - DONE
    else: 
        matched_phonetic = phonetic_comparison(split_list_1, split_list_2)
        if matched_phonetic:
            return threshold # assumption that phonetic match will give threshold score
        else: 
            return None
    
    try:
        return score1
    except:
        pass

In [39]:
def entity_recognition_scoring(queried_name, list_of_article_dicts):
    output=[]
    for article in list_of_article_dicts:
        # Descriptions
        current_desc_names = article['desc_names']
        matched = False
        for desc_name in current_desc_names:
            score = ER_name_matching(queried_name, desc_name)
            if score is not None:
                matched=True
            if matched:
                break
        if not matched:
            # try checking text names
            current_text_names = article['text_names']
            for text_name in current_text_names:
                score = ER_name_matching(queried_name, text_name)
                if score is not None:
                    matched=True
                if matched:
                    break
        if matched:
            # keep article
            article['name_matching_score'] = score
            output.append(article)
    return output
            
        
    ##### ADD IN SECONCARY IDENTIFIERS #######
    
    ##### ADD IN SCORING SYSTEM HERE #######    

In [40]:
name_matched = entity_recognition_scoring(test_record_1['name'], test_query)
pd.DataFrame(name_matched)

Unnamed: 0,title,time,description,link,text,desc_names,text_names,name_matching_score
0,Guilty As Charged: Chia Teck Leng led a double...,63 months ago,Asia Pacific Breweries executive Chia Teck Len...,https://www.straitstimes.com/singapore/courts-...,This story was first published in July 2015 in...,[Chia Teck Leng],"[Chia Teck Leng, Chia, Tay Yong Kwang, Tay, Li...",100
1,Guilty As Charged: Shocking crimes that have s...,64 months ago,Chia Teck Leng secretly led a double life as a...,https://www.straitstimes.com/singapore/courts-...,The following stories were first published in ...,[Chia Teck Leng],"[Ms Jenny Cheok Cheng Kid, Mimi Wong, Mimi Won...",100
2,John Soh could face longest-ever jail term for...,54 months ago,"SINGAPORE (March 3): Chia Teck Leng, a former ...",https://www.theedgemarkets.com/article/john-so...,"SINGAPORE (March 3): Chia Teck Leng, a former ...",[Chia Teck Leng],"[Chia Teck Leng, Chia, John Soh Chee Wen, Soh,...",100
3,Accountant who embezzled $40m a compulsive gam...,27 months ago,While Ewe did not amass riches from his crimes...,https://www.straitstimes.com/singapore/courts-...,SINGAPORE - Certified public accountant Ewe Pa...,[Chia Teck Leng],"[Ewe Pang Kooi, Ewe, Chan Seng, Michael Khoo -...",100
4,"Sherlock Holmes of Shenton Way, Brunch - THE B...",38 months ago,PwC forensics leader Chan Kheng Tek adds that ...,https://www.businesstimes.com.sg/brunch/sherlo...,,"[Chan Kheng, Chia Teck Leng]",[],100
5,A higher standard of financial prudence and re...,123 months ago,Chia Teck Leng (“Chia”) was the finance manage...,https://www.lexology.com/library/detail.aspx%3...,,"[Chia Teck Leng, Chia]",[],100
6,Accountant who embezzled $40 million a compuls...,27 months ago,"In March, he was convicted of 50 charges of cr...",https://www.asiaone.com/singapore/accountant-w...,SINGAPORE - Certified public accountant Ewe Pa...,[Chia Teck Leng],"[Ewe Pang Kooi, Ewe, Chan Seng, Michael Khoo -...",100


In [30]:
a = False
if a is False:
    print("F")

F
