In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
# Build one record per (case, subject) pair from the SFC enforcement-news
# listing pages for 2017-2019 (first four result pages of each year).
LISTING_URL = "https://www.sfc.hk/edistributionWeb/gateway/EN/news-and-announcements/news/enforcement-news/"
LISTING_COLUMNS = ['Date', 'Subject', 'Action', 'URL']

rows = []
for year in range(2017, 2020):
    for page_num in range(1, 5):
        url = f"{LISTING_URL}?year={year}&page={page_num}"

        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
        # Name the parser explicitly: otherwise BeautifulSoup picks whichever
        # parser happens to be installed, so results can differ per machine.
        doc = BeautifulSoup(response.text, "html.parser")

        # The first <tr> is skipped (presumably the table header -- confirm
        # against the live page).
        for case in doc.find_all('tr')[1:]:
            links = case.find_all('a')
            # The last link is the headline; every link before it is a subject.
            # A row with fewer than two links therefore contributes nothing.
            if len(links) < 2:
                continue

            # These are the same for every subject of the case, so compute once.
            date = case.find('td').text.strip()
            headline = links[-1]
            action = headline.text.strip()
            href = headline['href']

            for subject in links[:-1]:
                rows.append({
                    'Date': date,
                    'Subject': subject.text.strip(),
                    'Action': action,
                    'URL': href,
                })

# Passing the columns explicitly keeps the frame well-formed (and the
# following column selection valid) even when nothing was scraped.
df = pd.DataFrame(rows, columns=LISTING_COLUMNS)

In [3]:
# Keep the listing columns in a fixed, readable order.
df = df.loc[:, ['Date', 'Subject', 'Action', 'URL']]

# Scraping each page

In [4]:
def scrape_page(row):
    """Fetch one enforcement-news article and extract its summary paragraph.

    Parameters
    ----------
    row : mapping
        Must provide a ``'URL'`` entry; its value is appended to the
        enforcement-news base path to form the article address. When used
        with ``DataFrame.apply(..., axis=1)`` this is a row of the frame.

    Returns
    -------
    pandas.Series
        A single entry ``'Summary'`` holding the stripped text of the second
        ``<p>`` element on the page, or an empty string when the page has
        fewer than two ``<p>`` elements.
    """
    url = f"https://www.sfc.hk/edistributionWeb/gateway/EN/news-and-announcements/news/enforcement-news/{row['URL']}"

    # A timeout keeps one stalled request from blocking the whole apply().
    response = requests.get(url, timeout=30)
    # Explicit parser so the result does not depend on which optional parsers
    # are installed on the machine running the notebook.
    doc = BeautifulSoup(response.text, "html.parser")

    paragraphs = doc.find_all('p')
    # Error pages or layout changes may leave fewer than two paragraphs; fall
    # back to '' so one bad page does not abort the scrape and later string
    # matching on the column still works.
    summary = paragraphs[1].text.strip() if len(paragraphs) > 1 else ''

    return pd.Series({'Summary': summary})

In [5]:
# Visit every case page once; each row yields a one-column Series ('Summary'),
# so the result is a frame aligned on df's index.
scraped_df = df.apply(func=scrape_page, axis="columns")

In [6]:
# Attach the scraped columns by row index. Any stale copies of those columns
# are dropped first so that re-running this cell refreshes them instead of
# producing suffixed duplicates (Summary_x / Summary_y).
df = (
    df.drop(columns=scraped_df.columns, errors='ignore')
      .merge(scraped_df, left_index=True, right_index=True)
)

In [7]:
# Keyword-based categorisation. Rules are applied top to bottom and a later
# match overwrites an earlier one, so the ORDER of this table is significant.
# Each rule is (column to search, regex pattern, category label).
court_key = ["Tribunal", "Court"]
category_rules = [
    # NOTE: the 'Targetting' spelling is kept as-is because the label is
    # written to the exported CSV and may be relied on by its consumers.
    ('Action', "restriction notices", 'Targetting clients'),
    ('Action', "regulatory breaches", 'Regulatory breaches'),
    *[('Summary', w, 'Court') for w in court_key],
    ('Summary', "regulatory breaches", 'Regulatory breaches'),
    # Raw string: in a normal string "\b" is a backspace character, not a
    # regex word boundary, so the pattern could never match.
    ('Action', r"ban\b", 'Banned from industry'),
    ('Action', "sponsor failures", 'Sponsor failures'),
    ('Action', "suspend", 'License suspension'),
    ('Action', "revoke", 'License revocation'),
    ('Action', "short selling", 'Illegal short selling'),
    ('Action', "anti-money laundering", 'AML related'),
    ('Action', "cross trade related failures", 'Cross trade related failures'),
    ('Summary', "proper records", 'Record failure'),
    ('Action', "trading systems", 'Trading system failure'),
    ('Summary', "required registration", 'Without registration'),
    ('Summary', "internal control", 'Internal control failure'),
]

for column, pattern, label in category_rules:
    # na=False: a missing text cell counts as "no match" instead of producing
    # a NaN in the mask, which .loc refuses to index with.
    matches = df[column].str.contains(pattern, na=False)
    df.loc[matches, 'Category'] = label

In [8]:
# Extract the fine (in dollars) from the headline text.
#
# A dollar figure is digits with optional thousands separators and decimals.
# Capturing only that -- rather than "anything up to the last digit" -- keeps
# stray words or later numbers in the headline out of the float conversion.
_AMOUNT = r"\d[\d,]*(?:\.\d+)?"

# The leading ".*" is greedy, so when a headline mentions several amounts the
# last one is used.
# Raw strings are required: "\$" in a normal string is an invalid escape.
fine_millions = df.Action.str.extract(rf".*\$\s*({_AMOUNT})\s*million", expand=False)
fine_dollars = df.Action.str.extract(rf".*\$\s*({_AMOUNT})", expand=False)

fine_millions = pd.to_numeric(fine_millions.str.replace(',', '', regex=False), errors='coerce')
fine_dollars = pd.to_numeric(fine_dollars.str.replace(',', '', regex=False), errors='coerce')

# Scale only the figures that were actually quoted in millions, instead of
# guessing the unit from the magnitude (which mis-scales a plain "$800" and
# fails to scale "$1,500 million"). Rows without any amount stay NaN.
df['Fine'] = (fine_millions * 1000000).fillna(fine_dollars)

In [9]:
# Derive the penalty length in MONTHS.
#   1. "for <n> month(s)" in the summary paragraph,
#   2. otherwise "for <n> year(s)" in the headline, converted to months,
#   3. otherwise "for life" in the headline, stored as the sentinel 1000.
_NUMBER_WORDS = {
    'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6,
    'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12,
}
_LIFE_SENTINEL = 1000


def _period_to_number(raw):
    """Convert extracted period tokens (number words or digit strings) to floats.

    Tokens that are neither a known number word nor numeric become NaN.
    """
    tokens = raw.str.strip().str.lower()
    from_words = pd.to_numeric(tokens.map(_NUMBER_WORDS), errors='coerce')
    from_digits = pd.to_numeric(tokens, errors='coerce')
    return from_words.fillna(from_digits)


# Capture a single word/number token rather than arbitrary text, so the column
# ends up purely numeric. The greedy ".*" prefers the last occurrence on a line.
months = _period_to_number(df.Summary.str.extract(r".*for\s+(\w+)\s+month", expand=False))
years = _period_to_number(df.Action.str.extract(r".*for\s+(\w+)\s+year", expand=False))
# na=False: a missing headline counts as "not a life ban".
is_life = df.Action.str.contains(r"for\s+life", na=False)

# Only values that were actually expressed in years are multiplied by 12;
# month-based values and the life sentinel are left untouched.
penalty_months = months.fillna(years * 12)
penalty_months = penalty_months.mask(penalty_months.isna() & is_life, _LIFE_SENTINEL)
df['Penalty_Period'] = penalty_months

In [10]:
# Persist the per-case analysis table next to the notebook, without the index column.
df.to_csv(path_or_buf="All-analysis by case.csv", index=False)