In [0]:
# RESTART KERNEL AFTER RUNNING THIS CELL
!pip install pandas --upgrade

# IMPORT MODULES
import urllib.request
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from ast import literal_eval
# MOUNT GOOGLE DRIVE AT /content/drive (presumably only works inside Google Colab)
# NOTE(review): the data-loading cell reads "news_train_df.p" and
# "market_train_df.h5" through relative paths rather than from /content/drive;
# confirm whether this mount is actually required.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# GET UNIQUE ASSETCODES FROM UNION OF NEWS DATA AND MARKET DATA

# READ NEWS DATA AND MARKET DATA
# NOTE(review): unpickling can execute arbitrary code; only load
# "news_train_df.p" when it comes from a trusted source.
news_train_df = pd.read_pickle("news_train_df.p")
market_train_df = pd.read_hdf("market_train_df.h5")

# GET ASSET CODES MENTIONED IN THE NEWS DATA
# Each 'assetCodes' cell is parsed with literal_eval into a collection of codes.
# Iterating over the column values (instead of looking rows up by the labels
# 0..n-1) keeps this working when the frame does not have a default index.
uniqueCodes = set()
for raw_codes in news_train_df['assetCodes']:
    uniqueCodes.update(literal_eval(raw_codes))

# ADD ASSET CODES FROM MARKET DATA
uniqueCodes.update(market_train_df['assetCode'])

# GET UNIQUE ASSET CODES
# Sorted so that the row order of df is the same on every run; set iteration
# order for strings changes between interpreter sessions, and the scraped
# tables further down are matched to df purely by row position.
# key=str keeps the sort from failing if a non-string value (e.g. NaN) is present.
codeList = sorted(uniqueCodes, key=str)

# PUT LIST INTO A DATAFRAME
df = pd.DataFrame(codeList, columns = ["AssetCode"])
# df.to_csv('AssetCodes.csv', header=False, index=False)

**Scraping from Thomson Reuters**

In [0]:
df['ReutersURL'] = 'https://www.reuters.com/finance/stocks/company-profile/' + df['AssetCode'].astype(str)

# SCRAPE DESCRIPTION FROM REUTERS
# Exactly one entry is appended per URL (blank on failure) so that row i of
# ReutersData lines up with row i of df.
reuters_description = []

for url in df['ReutersURL']:
    try:
        request = urllib.request.Request(url)
        # The with-block closes the connection even when parsing fails; the
        # timeout keeps one unresponsive request from stalling the whole loop.
        with urllib.request.urlopen(request, timeout=30) as response:
            # Decode with the charset the server declares (UTF-8 if none).
            # 'unicode_escape' would read the bytes as Latin-1 and interpret
            # backslash sequences, which garbles non-ASCII page text.
            charset = response.headers.get_content_charset() or 'utf-8'
            html = response.read().decode(charset, errors='replace')
        soup = BeautifulSoup(html, "html.parser")
        container = soup.find_all(name="div", attrs={"class":"moduleBody"})

        # First line of the text inside the second "moduleBody" div
        description = container[1].text.strip().split("\n")[0]

    except Exception:
        # Best effort: any HTTP or parsing failure yields a blank description.
        # Catching Exception (not a bare except) still lets a kernel
        # interrupt stop the loop.
        description = ""

    reuters_description.append(description)

ReutersData = pd.DataFrame(reuters_description, columns = ["ReutersDescription"])
# ReutersData.to_csv('ReutersData.csv', header=False, index=False)

**Scraping from Yahoo Finance**

In [0]:
# READ UNIQUE ASSETCODES FROM UNION OF NEWS DATA AND MARKET DATA
# df = pd.read_csv("AssetCodes.csv", names = ["AssetCode"])

# EXTRACT STOCK TICKER FOR YAHOO FINANCE
# The ticker is the part of the asset code before the first "."
df['YahooTicker'] = df['AssetCode'].str.split(".").str[0]

# GENERATE YAHOO FINANCE URL
df['YahooURL'] = "https://sg.finance.yahoo.com/quote/"+ df['YahooTicker'].astype(str) + "/profile?p=" + df['YahooTicker'].astype(str)

# SCRAPE DESCRIPTION, SECTOR, INDUSTRY, EMPLOYEES FROM YAHOO FINANCE
yahoo_description = []
yahoo_sector = []
yahoo_industry = []
yahoo_employees = []

for url in df['YahooURL']:
    try:
        request = urllib.request.Request(url)
        # The with-block closes the connection even when parsing fails; the
        # timeout keeps one unresponsive request from stalling the whole loop.
        with urllib.request.urlopen(request, timeout=30) as response:
            # Decode with the charset the server declares (UTF-8 if none);
            # 'unicode_escape' would garble non-ASCII page text.
            charset = response.headers.get_content_charset() or 'utf-8'
            html = response.read().decode(charset, errors='replace')
        soup = BeautifulSoup(html, "html.parser")
        container = soup.find_all(name="div", attrs={"class":"Mb(25px)"})

        # COMPANY DESCRIPTION
        container1 = soup.find_all(name="section", attrs={"class":"quote-sub-section Mt(30px)"})
        description = container1[0].find_all(name = "p", attrs={"class":"Mt(15px) Lh(1.6)"})[0].text.strip()

        # COMPANY SECTOR, INDUSTRY, EMPLOYEES
        # These are read from the first three "Fw(600)" spans of the first
        # "Mb(25px)" div, in that order.
        profile_spans = container[0].find_all(name="span", attrs={"class":"Fw(600)"})
        sector = profile_spans[0].text.strip()
        industry = profile_spans[1].text.strip()
        employees = profile_spans[2].text.strip()

    except Exception:
        # Best effort: if any field cannot be extracted, the whole row is blank.
        # Catching Exception (not a bare except) still lets a kernel
        # interrupt stop the loop.
        description = sector = industry = employees = ""

    # Append only after the try block so that every URL contributes exactly
    # one entry to each list. Appending inside the try left the lists with
    # different lengths whenever a later field failed after an earlier one
    # had already been appended.
    yahoo_description.append(description)
    yahoo_sector.append(sector)
    yahoo_industry.append(industry)
    yahoo_employees.append(employees)

# PUT SCRAPED DATA INTO DATAFRAME
YahooData = pd.DataFrame(
    {'YahooDescription': yahoo_description,
     'YahooSector': yahoo_sector,
     'YahooIndustry': yahoo_industry,
     'YahooEmployees': yahoo_employees
    })

# PUT TO CSV
# YahooData.to_csv('YahooData.csv', header=True, index=False)


**Scraping from FinViz**

In [0]:
# GENERATE FINVIZ URL - USES THE SAME TICKER AS YAHOO FINANCE
df['FinvizURL'] = "https://finviz.com/quote.ashx?t=" + df['YahooTicker'].astype(str)

# SCRAPE DESCRIPTION, SECTOR, INDUSTRY, EMPLOYEES FROM FINVIZ
finviz_description = []
finviz_sector = []
finviz_industry = []
finviz_employees = []

for url in df['FinvizURL']:
    try:
        request = urllib.request.Request(url)
        # The with-block closes the connection even when parsing fails; the
        # timeout keeps one unresponsive request from stalling the whole loop.
        with urllib.request.urlopen(request, timeout=30) as response:
            # Decode with the charset the server declares (UTF-8 if none);
            # 'unicode_escape' would garble non-ASCII page text.
            charset = response.headers.get_content_charset() or 'utf-8'
            html = response.read().decode(charset, errors='replace')
        soup = BeautifulSoup(html, "html.parser")

        # COMPANY DESCRIPTION
        description = soup.find("td", attrs={"class":"fullview-profile"}).text.strip()

        # COMPANY SECTOR AND INDUSTRY
        # Taken from the 14th and 15th "tab-link" anchors on the page.
        # NOTE(review): these fixed positions depend on the page layout - confirm.
        tab_links = soup.find_all("a", attrs={"class":"tab-link"})
        sector = tab_links[13].text.strip()
        industry = tab_links[14].text.strip()

        # COMPANY EMPLOYEES
        # First "snapshot-td2" cell of the 9th "table-dark-row" row (text is not stripped)
        employees = soup.find_all("tr", attrs={"class":"table-dark-row"})[8].find_all("td", attrs={"class":"snapshot-td2"})[0].text

    except Exception:
        # Best effort: if any field cannot be extracted, the whole row is blank.
        # Catching Exception (not a bare except) still lets a kernel
        # interrupt stop the loop.
        description = sector = industry = employees = ""

    # Append only after the try block so that every URL contributes exactly
    # one entry to each list. Appending inside the try left the lists with
    # different lengths whenever a later field failed after an earlier one
    # had already been appended.
    finviz_description.append(description)
    finviz_sector.append(sector)
    finviz_industry.append(industry)
    finviz_employees.append(employees)

# PUT SCRAPED DATA INTO DATAFRAME
FinvizData = pd.DataFrame(
    {'FinvizDescription': finviz_description,
     'FinvizSector': finviz_sector,
     'FinvizIndustry': finviz_industry,
     'FinvizEmployees': finviz_employees
    })

# PUT TO CSV
# FinvizData.to_csv('FinvizData.csv', header=True, index=False)

**Scraping from SEC**

In [0]:
# GENERATE SEC URL - USES THE SAME TICKER AS YAHOO FINANCE
df['SECURL'] = "https://www.sec.gov/cgi-bin/browse-edgar?CIK="+ df['YahooTicker'].astype(str) + "&owner=exclude&action=getcompany"

# SCRAPE INDUSTRY FROM SEC
# Exactly one entry is appended per URL (blank on failure) so that row i of
# SECData lines up with row i of df.
sec_industry = []

# Iterate over the 'SECURL' column created above (the previous 'SECSite'
# name did not exist in df and raised KeyError).
for url in df['SECURL']:
    try:
        request = urllib.request.Request(url)
        # The with-block closes the connection even when parsing fails; the
        # timeout keeps one unresponsive request from stalling the whole loop.
        with urllib.request.urlopen(request, timeout=30) as response:
            # Decode with the charset the server declares (UTF-8 if none);
            # 'unicode_escape' would garble non-ASCII page text.
            charset = response.headers.get_content_charset() or 'utf-8'
            html = response.read().decode(charset, errors='replace')
        soup = BeautifulSoup(html, "html.parser")

        # COMPANY INDUSTRY
        # Text of the first link inside the "identInfo" paragraph of the
        # "companyInfo" div.
        industryCode = soup.find("div", attrs={"class":"companyInfo"}).find("p", attrs={"class":"identInfo"}).find("a").text.strip()

    except Exception:
        # Best effort: any HTTP or parsing failure yields a blank entry.
        # Catching Exception (not a bare except) still lets a kernel
        # interrupt stop the loop.
        industryCode = ""

    sec_industry.append(industryCode)

# Build the frame from sec_industry (the previous 'SEC_industry' name was
# never defined and raised NameError).
SECData = pd.DataFrame(sec_industry, columns = ["SECIndustry"])
# SECData.to_csv('SECData.csv', header=False, index=False)

**Combined Industry Data**

In [0]:
# COMBINE RESULTS FOR DESCRIPTION TOGETHER
# WITH DESCRIPTION FROM REUTERS AS THE BASE, FOLLOWED BY YAHOO AND FINVIZ
#
# A value counts as missing when it is NaN or the empty string: the scraping
# cells store "" for a failed lookup, while NaN appears when the tables are
# reloaded from CSV. mask() turns "" into NaN so fillna() can apply the
# fallbacks row by row (matched on the index), in priority order.
reuters_desc = ReutersData['ReutersDescription']
yahoo_desc = YahooData['YahooDescription']
finviz_desc = FinvizData['FinvizDescription']

# Write the combined column back in one assignment. The previous loop
# assigned a single scalar to the entire column and referenced an undefined
# name (Finviz) for the FinViz fallback.
ReutersData['ReutersDescription'] = (
    reuters_desc.mask(reuters_desc == "")
    .fillna(yahoo_desc.mask(yahoo_desc == ""))
    .fillna(finviz_desc.mask(finviz_desc == ""))
)

description = ReutersData['ReutersDescription']

In [0]:
# COMBINE RESULTS FOR SECTOR TOGETHER
# WITH SECTOR FROM YAHOO AS THE BASE, FOLLOWED BY FINVIZ
#
# A value counts as missing when it is NaN or the empty string: the scraping
# cells store "" for a failed lookup, while NaN appears when the tables are
# reloaded from CSV. mask() turns "" into NaN so fillna() can apply the
# fallback row by row (matched on the index).
yahoo_sector_col = YahooData['YahooSector']
finviz_sector_col = FinvizData['FinvizSector']

# Assign the whole column at once instead of YahooData['YahooSector'][i] = ...;
# that chained assignment is not guaranteed to write through to YahooData.
# The column is updated in place because the industry cell reads it as a fallback.
YahooData['YahooSector'] = (
    yahoo_sector_col.mask(yahoo_sector_col == "")
    .fillna(finviz_sector_col.mask(finviz_sector_col == ""))
)

sector = YahooData['YahooSector']

In [0]:
# COMBINE RESULTS FOR INDUSTRY TOGETHER
# WITH INDUSTRY FROM SEC AS THE BASE, FOLLOWED BY YAHOO AND FINVIZ, AND THE SECTORS
#
# A value counts as missing when it is NaN or the empty string: the scraping
# cells store "" for a failed lookup, while NaN appears when the tables are
# reloaded from CSV. mask() turns "" into NaN so fillna() can apply the
# fallbacks row by row (matched on the index), in priority order.
sec_industry_col = SECData['SECIndustry']
yahoo_industry_col = YahooData['YahooIndustry']
finviz_industry_col = FinvizData['FinvizIndustry']
yahoo_sector_col = YahooData['YahooSector']

# Assign the whole column at once instead of SECData['SECIndustry'][i] = ...;
# that chained assignment is not guaranteed to write through to SECData.
# The FinViz fallback reads FinvizData (the previous loop tested an undefined
# name, Finviz, and raised NameError).
SECData['SECIndustry'] = (
    sec_industry_col.mask(sec_industry_col == "")
    .fillna(yahoo_industry_col.mask(yahoo_industry_col == ""))
    .fillna(finviz_industry_col.mask(finviz_industry_col == ""))
    .fillna(yahoo_sector_col.mask(yahoo_sector_col == ""))
)

industry = SECData['SECIndustry']

In [0]:
# COMBINE RESULTS FOR EMPLOYEES TOGETHER
# WITH EMPLOYEES FROM YAHOO AS THE BASE, FOLLOWED BY FINVIZ
#
# A value counts as missing when it is NaN or the empty string: the scraping
# cells store "" for a failed lookup, while NaN appears when the tables are
# reloaded from CSV. mask() turns "" into NaN so fillna() can apply the
# fallback row by row (matched on the index).
yahoo_employees_col = YahooData['YahooEmployees']
finviz_employees_col = FinvizData['FinvizEmployees']

# Assign the whole column at once instead of YahooData['YahooEmployees'][i] = ...;
# that chained assignment is not guaranteed to write through to YahooData.
YahooData['YahooEmployees'] = (
    yahoo_employees_col.mask(yahoo_employees_col == "")
    .fillna(finviz_employees_col.mask(finviz_employees_col == ""))
)

employees = YahooData['YahooEmployees']

In [0]:
# COMBINED RESULTS
# Assemble the three combined series into one table (columns in the order
# Description, Industry, Employees) and export it without a header row or
# index column.
combined_columns = {
    'Description': description,
    'Industry': industry,
    'Employees': employees,
}
IndustryData = pd.DataFrame(combined_columns)
IndustryData.to_csv('IndustryData.csv', index=False, header=False)