# Information retrieval
The data comes from TXT, CSV and JSON files, each of which contains the tickers of one stock exchange. The exchanges covered are NASDAQ, NYSE, AMEX, the Frankfurt Stock Exchange (FSE), Bolsa de Madrid, the London SE, the Toronto SE and the Tokyo SE.

In [1]:
import os
import json
import re
import csv
import requests
from bs4 import BeautifulSoup


# Browser-like User-Agent header passed to requests.get when scraping.
headers1 = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/71.0.3578.98 Safari/537.36'
    ),
}
# alternative header
headers2 = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    ),
}

### Data preprocessing: get the ticker data to download

In [10]:
# NASDAQ, NYSE, AMEX
def _read_tickers(exchange):
    """Return the parsed content of the ticker JSON file for *exchange*."""
    with open(f'02_data_ticker/{exchange}.json') as f:
        return json.load(f)


# Same read order as before: AMEX, then NYSE, then NASDAQ.
amex = _read_tickers('amex')
nyse = _read_tickers('nyse')
nasdaq = _read_tickers('nasdaq')

In [11]:
# Frankfurt Stock Exchange (FSE)
# Builds `fse` as a list of [name, ticker] pairs read from the ticker file.
fse = []
# newline='' is what the csv module recommends for files handed to csv.reader
with open('02_data_ticker/frankfurt.csv', newline='') as f:
    reader = csv.reader(f)
    # omit the header row (the default keeps an empty file from raising)
    next(reader, None)

    for row in reader:
        try:
            # the ';'-separated record is expected in the first cell of the
            # row; reading it inside the try lets a blank line be reported
            # instead of aborting the whole cell
            entry = row[0].split(';')
            # add ".de" otherwise it is not found on Yahoo Finance
            ticker = entry[1] + ".de"
            fse.append([entry[2], ticker])

        except IndexError as e:
            # blank line or too few fields: report the row and carry on
            print(e, row)

In [12]:
# Bolsa de Madrid
# Builds `madrid` as a list of [name, ticker] pairs read from the ticker file.
madrid = []
# newline='' is what the csv module recommends for files handed to csv.reader
with open('02_data_ticker/madrid.csv', newline='') as f:
    reader = csv.reader(f)
    # omit the header row (the default keeps an empty file from raising)
    next(reader, None)

    for row in reader:
        try:
            # the ';'-separated record is expected in the first cell of the
            # row; reading it inside the try lets a blank line be reported
            # instead of aborting the whole cell
            entry = row[0].split(';')
            # drop the last character of the fifth field and add the ".mc"
            # suffix (presumably the field carries a trailing marker in the
            # source file - TODO confirm)
            ticker = entry[4][:-1] + ".mc"
            madrid.append([entry[0], ticker])

        except IndexError as e:
            # blank line or too few fields: report the row and carry on
            print(e, row)

In [27]:
# London Stock Exchange (LSE)
# Builds `lse` as a list of [name, ticker] pairs read from the ticker file.
lse = []
# newline='' is what the csv module recommends for files handed to csv.reader
with open('02_data_ticker/london.csv', newline='') as f:
    reader = csv.reader(f)
    # omit the header row (the default keeps an empty file from raising)
    next(reader, None)

    for row in reader:
        try:
            # the ';'-separated record is expected in the first cell of the
            # row; reading it inside the try lets a blank line be reported
            # instead of aborting the whole cell
            entry = row[0].split(';')
            # drop the last character of the fifth field and add the ".l"
            # suffix (presumably the field carries a trailing marker in the
            # source file - TODO confirm)
            ticker = entry[4][:-1] + ".l"
            lse.append([entry[0], ticker])

        except IndexError as e:
            # blank line or too few fields: report the row and carry on
            print(e, row)

In [14]:
# Toronto Stock Exchange
# Collects [name, ticker] pairs from the tab-separated ticker file.
toronto = []
with open('02_data_ticker/toronto_se.txt') as f:
    lines = csv.reader(f)
    next(lines)  # skip the first row
    for row in lines:
        try:
            fields = row[0].split("\t")
            # keep only what precedes the first ':' in the symbol column
            symbol = fields[0].partition(':')[0]
            toronto.append([fields[1], symbol + ".TO"])
        except IndexError as e:
            print(e, row)

In [15]:
# Tokyo Stock Exchange
# Collects [name, ticker] pairs; the ticker is the first column without its
# first three characters, followed by ".T".
tokyo = []
with open('02_data_ticker/tokyo_se.csv') as f:
    records = csv.reader(f)
    next(records)  # skip the first row
    for row in records:
        try:
            code, name = row[0], row[1]
        except IndexError as e:
            print(e, row)
        else:
            tokyo.append([name, code[3:] + ".T"])

## Download the sector and business description of each ticker from Yahoo Finance

In [None]:
# Scrape the Yahoo Finance profile page of every ticker in `tickers` and
# store name, ticker, sector and business description in a JSON file.
#
# options for tickers: amex, nyse, nasdaq, fse, madrid, lse, toronto, tokyo
tickers = fse
# Label used for the output file name; change it together with `tickers`.
# (The list itself cannot be used in the file name.)
tickers_name = 'frankfurt'
url = 'https://finance.yahoo.com/quote/{}/profile?p={}'
data = []
# number of tickers added so far; starts at 0 so the summary below is exact
count = 0

# compiled once: locates the <script> element that embeds the page data
pattern = re.compile(r'\s--\sData\s--\s')

for row in tickers:
    try:
        symbol = row[1]
        # url profile website is scraped; the timeout keeps one stalled
        # request from hanging the whole run
        response = requests.get(url.format(symbol, symbol), headers=headers1, timeout=30)

        soup = BeautifulSoup(response.text, 'html.parser')
        script_data = soup.find('script', text=pattern).contents[0]
        # cut the JSON object out of the script text: it starts two
        # characters before "context" and the last 12 characters are dropped
        start = script_data.find("context")-2
        json_data = json.loads(script_data[start:-12])

        profile = json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['assetProfile']
        sector = profile['sector']
        description = profile['longBusinessSummary']

    except (KeyError, IndexError, AttributeError, TypeError, ValueError,
            requests.RequestException) as e:
        # KeyError/TypeError: profile data missing; AttributeError/IndexError:
        # the data script was not found in the page; ValueError: the embedded
        # JSON could not be parsed; RequestException: the request failed.
        # Any of them skips this ticker instead of aborting the run.
        print("not found: ", row[0], f"({type(e).__name__})")
        continue

    data.append({"name": row[0], "ticker": row[1], 'sector': sector,"description": description})
    print("added: ", row[0])
    count += 1

print("Results:", len(tickers), "tickers" , count, "added and", len(tickers)-count, "not found")

# save the data list into a JSON file; create the folder first so the
# scraped results are not lost to a missing directory
os.makedirs('data', exist_ok=True)
with open(f'data/{tickers_name}.json', 'w') as f:
    json.dump(data, f)

## Inspection of the downloaded data

In [17]:
# Load one extracted file, then show its container type and first record.
file = 'london'
with open(f'04_extracted_data/{file}.json', encoding='utf8') as f:
    data = json.loads(f.read())

print(type(data))
data[0]

<class 'list'>


{'name': 'ABCAM PLC',
 'ticker': 'ABC.l',
 'sector': 'Healthcare',
 'description': 'Abcam plc, a life science company, focuses on identifying, developing, and distributing reagents and tools for scientific research, diagnostics, and drug discovery. Its principal products include primary and secondary antibodies; conjugated antibodies and conjugation kits; singleplex and multiplex immunoassays; proteins and peptides that include cytokines; edited cell lines and lysates; and various other products, including cellular activity kits, miRNA kits, biochemicals, and cell signaling pathway tools. The company serves scientists and researchers in academic institutions and research institutes, as well as in pharmaceutical, biotechnology, and diagnostics companies. It has operations in the Americas, Europe, the Middle East, Africa, China, Japan, and rest of the Asia Pacific. The company sells its products online. Abcam plc was incorporated in 1998 and is headquartered in Cambridge, the United King

## Consolidate datasets

In [33]:
# Merge the records of every JSON file in the extracted-data folder into
# `final_list`.
# The folder is given relative to the working directory, as the inspection
# cell already does, so the cell does not depend on one user's machine.
path_to_json = '04_extracted_data'

final_list = []

# sorted() makes the processing order reproducible; os.listdir returns the
# entries in arbitrary order
for file_name in sorted(os.listdir(path_to_json)):
    if not file_name.endswith('.json'):
        continue
    print(file_name)
    # same encoding the inspection cell uses for these files
    with open(os.path.join(path_to_json, file_name), encoding='utf8') as json_file:
        data = json.load(json_file)
        final_list.extend(data)

print("extracted company descriptions and sectors: ", len(final_list))

amex.json
frankfurt.json
london.json
madrid.json
nasdaq1.json
nasdaq2.json
nyse.json
tokyo.json
toronto.json
extracted company descriptions and sectors:  9856


In [34]:
# Persist the consolidated records as a single JSON file.
with open('data_raw.json', 'w') as f:
    f.write(json.dumps(final_list))