In [1]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import requests
from newspaper import Article
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/nelly.loh/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
## Types of Financial Crimes & Keywords
# fraud
# electronic crime
# money laundering
# terrorist financing
# bribery
# corruption
# market abuse 
# information security
# insider dealing
# identity theft
# money mule
# national crime agency
# tipping off

# References [Keywords]
# https://www.int-comp.org/careers/your-career-in-financial-crime-prevention/what-is-financial-crime/
# https://www.shireleasing.co.uk/wp-content/uploads/2020/01/Financial-Crime-Key-Words.pdf

# References [Methodology]
# https://towardsdatascience.com/keyword-extraction-process-in-python-with-natural-language-processing-nlp-d769a9069d5c

In [3]:
# Search terms passed one at a time to the article search; multi-word
# terms already use '+' in place of spaces.
news_search = [
    'fraud',
    'electronic+crime',
    'money+laundering',
    'terrorist+financing',
    'bribery',
    'corruption',
    'market+abuse',
    'information+security',
    'insider+dealing',
    'identity+theft',
    'money+mule',
    'national+crime+agency',
    'tipping+off',
]

In [7]:
def generate_link(identifier_dict, no_of_articles):
    """Build a Google search URL (``tbm=nws``) from the values of a mapping.

    Args:
        identifier_dict: Mapping whose *values* are the search terms; the
            keys are ignored. Terms may contain spaces or already use ``+``.
        no_of_articles: Value appended as the ``num`` query parameter.

    Returns:
        The complete search URL as a string.
    """
    # Function-scope import so this cell does not depend on another cell's imports.
    from urllib.parse import quote_plus

    link_start = "https://www.google.com/search?q="
    link_end = "&sxsrf=ALeKk01K1bOuJFHjy4HBARo1cRpUYakYPg:1629640327633&source=lnms&tbm=nws&sa=X&sqi=2&ved=2ahUKEwiu29um48TyAhWGqpUCHYuoAlcQ_AUoAnoECAEQBA&biw=1441&bih=718&dpr=2"
    # Join with '+' so no stray trailing '+' is left on the query.
    # quote_plus turns spaces into '+', keeps existing '+' (safe='+') and
    # percent-encodes reserved characters such as '&' that would otherwise
    # terminate the q= parameter.
    link_query = '+'.join(
        quote_plus(identifier, safe='+') for identifier in identifier_dict.values()
    )
    return link_start + link_query + link_end + "&num=" + str(no_of_articles)

def article_extraction(link):
    """Download and parse the article at ``link`` and return its text.

    Extraction is best-effort: a failure while downloading or parsing is
    swallowed and whatever text the ``Article`` object holds is returned
    (presumably an empty string when nothing was parsed — TODO confirm).

    Args:
        link: URL of the article to fetch.

    Returns:
        The ``text`` attribute of the ``Article`` object.
    """
    article = Article(link)
    try:
        article.download()
        article.parse()
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit can still stop a long scraping run.
        pass
    return article.text

# One analyzer instance (imported from nltk.sentiment.vader) shared by every call below.
analyzer = SentimentIntensityAnalyzer()


def sentiment_analysis(text):
    """Return the analyzer's polarity scores for ``text``."""
    scores = analyzer.polarity_scores(text)
    return scores

In [8]:
# Main Function
def search_articles_on_text(search_text, no_of_articles):
    """Search Google (news tab) for ``search_text`` and scrape each result.

    Args:
        search_text: Query inserted verbatim after ``q=``; it is not
            URL-encoded here, so spaces should already be written as ``+``.
        no_of_articles: Value sent as the ``num`` query parameter.

    Returns:
        A list of dicts, one per result card that could be parsed, with the
        keys ``title``, ``time``, ``description``, ``link``, ``text`` and
        ``sentiment`` (in that order).

    Raises:
        urllib.error.URLError: If the search results page cannot be fetched.
    """
    # CSS class strings matched against the results page markup. They are
    # tied to one particular page layout, so cards that do not match are
    # skipped instead of aborting the whole search.
    result_class = "ZINbbc xpd O9g5cc uUPGi"
    title_class = 'BNeawe vvjwJb AP7Wnd'
    description_class = 'BNeawe s3v9rd AP7Wnd'
    separator = " · "

    def build_search_url(query, count):
        """Assemble the news-search URL for ``query`` asking for ``count`` results."""
        link_start = "https://www.google.com/search?q=" + query
        link_end = "&sxsrf=ALeKk01K1bOuJFHjy4HBARo1cRpUYakYPg:1629640327633&source=lnms&tbm=nws&sa=X&sqi=2&ved=2ahUKEwiu29um48TyAhWGqpUCHYuoAlcQ_AUoAnoECAEQBA&biw=1441&bih=718&dpr=2"
        return link_start + link_end + "&num=" + str(count)

    req = Request(
        build_search_url(search_text, no_of_articles),
        headers={'User-Agent': 'Mozilla/5.0'},
    )

    # Context manager closes the HTTP response once the body has been read.
    with urlopen(req) as response:
        webpage = response.read()

    soup = BeautifulSoup(webpage, 'html5lib')

    links = []
    for item in soup.find_all('div', attrs={'class': result_class}):
        anchor = item.find('a', href=True)
        title_tag = item.find('div', attrs={'class': title_class})
        description_tag = item.find('div', attrs={'class': description_class})
        if anchor is None or title_tag is None or description_tag is None:
            # Card does not have the expected structure; skip it.
            continue

        raw_link = anchor['href']
        if "/url?q=" not in raw_link:
            # Not a redirect-style result link, so no target URL to extract.
            continue
        article_link = raw_link.split("/url?q=")[1].split('&sa=U&')[0]

        # The description is expected to read "<time> · <summary>"; when the
        # separator is missing keep the whole string as the summary.
        description = description_tag.get_text()
        parts = description.split(separator)
        if len(parts) >= 2:
            published, summary = parts[0], parts[1]
        else:
            published, summary = "", description

        # Fetch the article once and reuse the text for the sentiment score,
        # so each article costs a single download.
        text = article_extraction(article_link)

        links.append({
            'title': title_tag.text,
            'time': published,
            'description': summary,
            'link': article_link,
            'text': text,
            'sentiment': sentiment_analysis(text),
        })
    return links

In [9]:
# Number of results requested for each search term.
ARTICLES_PER_SEARCH = 20

# Collect one frame per search term and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all earlier rows on every iteration.
frames = []
for search in news_search:
    articles = search_articles_on_text(search, ARTICLES_PER_SEARCH)
    if articles:
        frames.append(pd.DataFrame(articles))

# pd.concat keeps each frame's own 0..n-1 index (no ignore_index), and
# raises on an empty list, hence the fallback to an empty frame.
df = pd.concat(frames) if frames else pd.DataFrame()



In [10]:
# Display the combined DataFrame built from the search results.
df

Unnamed: 0,title,time,description,link,text,sentiment
0,Exclusive: Swift launches double financing fra...,12 hours ago,Swift is trialling a new trade finance fraud p...,https://www.gtreview.com/news/fintech/exclusiv...,Privacy Policy\n\nOur privacy commitments\n\nT...,"{'neg': 0.008, 'neu': 0.92, 'pos': 0.071, 'com..."
1,Fraud warning for young: 'I had £700 stolen',2 days ago,Anti-fraud charity the Fraud Advisory Panel is...,https://www.bbc.com/news/business-58499998,"""I was aware my real bank would never ask for ...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
2,OCBC Bank's fraud detection recovers SG$8 mill...,2 weeks ago,While cybercriminals still manage to avoid the...,https://techwireasia.com/2021/08/ocbc-banks-fr...,Fraud detection solutions continue to grow in ...,"{'neg': 0.177, 'neu': 0.755, 'pos': 0.068, 'co..."
3,ED arrests Rajiv Saxena in connection with ban...,3 hours ago,... the Enforcement Directorate (ED) in a conn...,https://sg.news.yahoo.com/ed-arrests-rajiv-sax...,"Rajiv Saxena (File Pic)\n\nNew Delhi [India], ...","{'neg': 0.083, 'neu': 0.86, 'pos': 0.057, 'com..."
4,The role of auditors in fighting fraud,5 hours ago,Corporate fraud remains a growing threat to ev...,https://gulfbusiness.com/the-role-of-auditors-...,Corporate fraud remains a growing threat to ev...,"{'neg': 0.135, 'neu': 0.708, 'pos': 0.157, 'co..."
...,...,...,...,...,...,...
15,Geelong avoided the ‘end of days’ 11 years ago...,53 mins ago,... the last time it was thought the club was ...,https://www.foxsports.com.au/afl/afl-2021-geel...,"Geelong is facing the “end of days”, but must ...","{'neg': 0.057, 'neu': 0.859, 'pos': 0.084, 'co..."
16,SEC Games Complete Men's Hoops Schedule,3 days ago,The Rebels host Ole Miss alum Mike White and t...,https://olemisssports.com/news/2021/9/9/mens-b...,2021-22 OLE MISS MEN'S HOOPS SEC SCHEDULE DATE...,"{'neg': 0.099, 'neu': 0.861, 'pos': 0.04, 'com..."
17,Birmingham 2-0 Derby County: Troy Deeney makes...,2 days ago,... expertly tipping away a dipping cross from...,https://www.skysports.com/football/birmham-vs-...,Birmingham continued their impressive climb up...,"{'neg': 0.042, 'neu': 0.812, 'pos': 0.146, 'co..."
18,You’re Nuts: What is the toughest game on Ohio...,2 days ago,"This week, we took one huge step towards colle...",https://www.landgrantholyland.com/2021/9/10/22...,"This week, we took one huge step towards colle...","{'neg': 0.034, 'neu': 0.852, 'pos': 0.114, 'co..."


In [12]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer = WordNetLemmatizer()
import nltk
nltk.download('wordnet')

# Article bodies as a plain list of strings.
db = df['text'].tolist()

# For each article: tokenize, lowercase and lemmatize every token, re-join
# the tokens with single spaces, then replace newlines with spaces and drop
# apostrophes.
array_text = [
    " ".join(lemmatizer.lemmatize(token.lower()) for token in word_tokenize(body))
    .replace("\n", " ")
    .replace("\'", "")
    for body in db
]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nelly.loh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
# YAKE

In [14]:
import yake

kw_extractor = yake.KeywordExtractor()

# Extractor settings.
language = "en"
max_ngram_size = 2
deduplication_threshold = 0.7  # repetition of words is allowed in keywords
numOfKeywords = 5

custom_kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    top=numOfKeywords,
    features=None,
)

# Print the extracted keywords for each article, numbered from 1.
for article_no, article_text in enumerate(array_text, start=1):
    keywords = custom_kw_extractor.extract_keywords(article_text)
    print("Keywords of article", str(article_no), "\n", keywords)

Keywords of article 1 
 [('personal data', 0.0001363594930937236), ('exporta publishing', 0.00042019631953918283), ('data protection', 0.00045036239239012354), ('personal information', 0.0005889852037361924), ('data', 0.0009264967200959569)]
Keywords of article 2 
 [('one-time passcode', 0.006073385605980323), ('remote access', 0.008476777234338777), ('real bank', 0.014410137163119993), ('bank', 0.06565386046861375), ('otp', 0.07769634652849257)]
Keywords of article 3 
 [('fraud detection', 0.0004685542097902067), ('detection solution', 0.0008939786717409632), ('ocbc bank', 0.0009920343917039776), ('anti-fraud program', 0.001166518498758537), ('customer activity', 0.0015899640962379763)]
Keywords of article 4 
 [('central bank', 0.0005619222660194742), ('moser baer', 0.0010372569208450234), ('bank loan', 0.0010822413543773027), ('rajiv saxena', 0.0014175747486877338), ('deepak puri', 0.001664720613594593)]
Keywords of article 5 
 [('financial centre', 0.001370562225476095), ('financial

In [15]:
## KEYBERT

In [16]:
from keybert import KeyBERT

# KeyBERT model built from the named sentence-embedding checkpoint.
kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')

# Print the extracted keywords for each article, numbered from 1.
for article_no, article_text in enumerate(array_text, start=1):
    keywords = kw_extractor.extract_keywords(article_text, stop_words='english')
    print("Keywords of article", str(article_no), "\n", keywords)

Keywords of article 1 
 [('online', 0.3912), ('login', 0.3902), ('browser', 0.3886), ('internet', 0.3823), ('password', 0.3581)]
Keywords of article 2 
 [('password', 0.2511), ('bank', 0.1386), ('laptop', 0.1043), ('remote', 0.0515), ('passcode', -0.0204)]
Keywords of article 3 
 [('fraudsters', 0.2314), ('cybercriminals', 0.2023), ('crime', 0.0689), ('pandemic', 0.0572), ('fraud', 0.0535)]
Keywords of article 4 
 [('arrested', 0.2387), ('dubai', 0.1978), ('businessman', 0.1837), ('2019', 0.1582), ('extradited', 0.1553)]
Keywords of article 5 
 [('corruption', 0.2124), ('crime', 0.1375), ('fraud', 0.1335), ('accountant', 0.122), ('annually', 0.0894)]
Keywords of article 6 
 [('fintechs', 0.3499), ('online', 0.2507), ('2021', 0.2434), ('neobank', 0.2361), ('atm', 0.2258)]
Keywords of article 7 
 [('fraud', 0.1893), ('pandemic', 0.1464), ('fraudulent', 0.1326), ('fintech', 0.1166), ('fingerprinting', 0.0882)]
Keywords of article 8 
 [('fraudsters', 0.2297), ('facebook', 0.1398), ('epidem

In [17]:
## RAKE

In [18]:
from rake_nltk import Rake

rake_nltk_var = Rake()

# Print the five top-ranked phrases for each article, numbered from 1.
for article_no, article_text in enumerate(array_text, start=1):
    rake_nltk_var.extract_keywords_from_text(article_text)
    ranked_phrases = rake_nltk_var.get_ranked_phrases()
    keyword_extracted = ranked_phrases[:5]
    print("Keywords of article", str(article_no), "\n", keyword_extracted)

Keywords of article 1 
 ['enjoyable customised experience whilst allowing u', '799 1585 59 data protection policy', 'team handling personal data must ensure', 'data protection policy ensures exporta publishing', 'may also store information including ip address']
Keywords of article 2 
 ['real bank would never ask', 'nt ask', 'wa aware', 'time passcode', 'remote access']
Keywords of article 3 
 ['group operational risk team ha extended', 'based analytic technique including predictive modeling', 'monitoring wa done across multiple channel', 'company experiencing six fraud incident', 'network link analysis amongst others']
Keywords of article 4 
 ['former madhya pradesh chief minister kamal nath', 'also sought bail cancellation stating', 'alleged bank loan fraud case', '600 crore scam case relating', 'bank loan case related']
Keywords of article 5 
 ['climate change fraud ha always existed', 'auditor must remain professionally sceptical', 'new normal following covid', 'king abdullah finan

In [19]:
# GENSIM

In [20]:
# NOTE(review): gensim.summarization is reported to be absent from gensim 4.x;
# confirm the installed gensim version before re-running this cell.
from gensim.summarization import keywords

# Print gensim's keyword output for each article, numbered from 1.
for article_no, article_text in enumerate(array_text, start=1):
    extracted = keywords(article_text)
    print("Keywords of article", str(article_no), "\n", extracted)

Keywords of article 1 
 information
informing
informed
personal data
include
includes
site
gtr
exporta
cookie
cooky
service
personally
use
useful
www
market including magazine
marketing
purpose
protection
protects
protected
protect
protecting
privacy policy
user
collect
collecting
collected
collection
company
website
customised
customise
browser
technical
secure
security
securely
certain
time
content
legal
record
recorded
recording
business
law
lawful
track
tracking
provider
provide
providing
provided
store
stored
process
processed
processing
receiving
received
transferred
transfer
receive enewsletters
necessary consent
unauthorised
enewsletter
practice
event
customer
statement
seeking
loss
kingdom
access
accessed
accessible
rate
news
new
area
disclose
disclosing
disclosed
handle
handled
handling
reasonably
reasonable
reason
individual
finance
pixel
publication
public
subject
london
training
place
placed
conference
offering
offer
described
relevant
Keywords of article 2 
 remote
Keywor