# Information retrieval
The data comes from TXT, CSV and JSON files, each of which contains all tickers listed on one stock exchange. The exchanges covered are NASDAQ, NYSE, AMEX, Frankfurt (FSE), Bolsa de Madrid, London (LSE), Toronto and Tokyo.

In [39]:
import os
import json
import re
import csv
import requests
from bs4 import BeautifulSoup


# Browser-like User-Agent headers for the HTTP requests made in this notebook.
# Chrome 71 on Windows 10:
headers1 = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/71.0.3578.98 Safari/537.36'
    ),
}
# Edge 18 on Windows 10:
headers2 = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    ),
}

### Data preprocessing: get the ticker data to download

In [32]:
# NASDAQ, NYSE, AMEX
def _load_exchange(exchange):
    """Return the parsed content of ``data_ticker/<exchange>.json``."""
    # JSON text is UTF-8; naming the encoding keeps the result independent of
    # the platform's default encoding and matches how the downloaded JSON
    # files are read back later in this notebook.
    with open(f"data_ticker/{exchange}.json", encoding="utf8") as f:
        return json.load(f)


amex = _load_exchange('amex')
nyse = _load_exchange('nyse')
nasdaq = _load_exchange('nasdaq')

In [None]:
# Frankfurt Stock Exchange (FSE)
# Builds `fse` as a list of [name, ticker] pairs from the Frankfurt CSV.
fse = []
# newline="" is what the csv module asks for when it is given a file object.
with open("data_ticker/frankfurt.csv", newline="") as f:
    reader = csv.reader(f)
    # omit the header row
    next(reader)

    for row in reader:
        try:
            # The reader uses the default ',' dialect, so row[0] holds the
            # line up to its first ','; the ';'-separated fields are split
            # by hand. Indexing row[0] inside the try means a blank line
            # (yielded by the reader as []) is reported and skipped instead
            # of aborting the whole cell.
            # NOTE(review): a ',' inside a field cuts row[0] short - consider
            # csv.reader(f, delimiter=';') after checking the data.
            entry = row[0].split(';')
            # add ".de", otherwise the ticker is not found on Yahoo Finance
            ticker = entry[1] + ".de"
            fse.append([entry[2], ticker])
        except IndexError as e:
            # malformed or too-short row: show it and carry on
            print(e, row)

In [19]:
# Bolsa de Madrid
# Builds `madrid` as a list of [name, ticker] pairs from the Madrid CSV.
madrid = []
# newline="" is what the csv module asks for when it is given a file object.
with open("data_ticker/madrid.csv", newline="") as f:
    reader = csv.reader(f)
    # omit the header row
    next(reader)

    for row in reader:
        try:
            # The reader uses the default ',' dialect, so row[0] holds the
            # line up to its first ','; the ';'-separated fields are split
            # by hand. Indexing row[0] inside the try means a blank line
            # (yielded by the reader as []) is reported and skipped instead
            # of aborting the whole cell.
            entry = row[0].split(';')
            # The fifth field minus its last character is the symbol
            # (presumably a trailing marker in the source data - TODO
            # confirm); ".mc" is the suffix appended for this exchange.
            ticker = entry[4][:-1] + ".mc"
            madrid.append([entry[0], ticker])
        except IndexError as e:
            # malformed or too-short row: show it and carry on
            print(e, row)

In [21]:
# London Stock Exchange (LSE)
# Builds `lse` as a list of [name, ticker] pairs from the London CSV.
lse = []
# newline="" is what the csv module asks for when it is given a file object.
with open("data_ticker/london.csv", newline="") as f:
    reader = csv.reader(f)
    # omit the header row
    next(reader)

    for row in reader:
        try:
            # The reader uses the default ',' dialect, so row[0] holds the
            # line up to its first ','; the ';'-separated fields are split
            # by hand. Indexing row[0] inside the try means a blank line
            # (yielded by the reader as []) is reported and skipped instead
            # of aborting the whole cell.
            entry = row[0].split(';')
            # The fifth field minus its last character is the symbol
            # (presumably a trailing marker in the source data - TODO
            # confirm).
            # NOTE(review): Yahoo Finance normally lists London symbols with
            # the ".L" suffix - confirm that ".e" actually resolves.
            ticker = entry[4][:-1] + ".e"
            lse.append([entry[0], ticker])
        except IndexError as e:
            # malformed or too-short row: show it and carry on
            print(e, row)

In [25]:
# Toronto Stock Exchange (TSE)
# Collects [name, ticker] pairs from the tab-separated Toronto listing.
toronto = []
with open("data_ticker/toronto_se.txt") as f:
    reader = csv.reader(f)
    # skip the first row
    next(reader)
    for row in reader:
        try:
            entry = row[0].split("\t")
            raw_symbol, name = entry[0], entry[1]
        except IndexError as e:
            # row without the two expected columns: show it and move on
            print(e, row)
            continue
        # keep only what precedes the first ':' and add the ".TO" suffix
        ticker = raw_symbol.partition(':')[0] + ".TO"
        toronto.append([name, ticker])

In [27]:
# Tokyo Stock Exchange (TSE)
# Collects [name, ticker] pairs from the Tokyo CSV.
tokyo = []
with open("data_ticker/tokyo_se.csv") as f:
    reader = csv.reader(f)
    # skip the first row
    next(reader)
    for row in reader:
        try:
            code, name = row[0], row[1]
        except IndexError as e:
            # row with fewer than two columns: show it and move on
            print(e, row)
            continue
        # drop the first three characters of the code and add the ".T" suffix
        ticker = code[3:] + ".T"
        tokyo.append([name, ticker])

## Download the ticker variables from Yahoo Finance

In [None]:
# amex, nyse, nasdaq, fse, madrid, lse, toronto, tokyo
# Change these two lines together: `tickers` is the list that gets scraped,
# `exchange_name` names the JSON file the results are saved to.
exchange_name = 'amex'
tickers = amex
url = 'https://finance.yahoo.com/quote/{}/profile?p={}'
# The <script> whose text matches this marker is the one the page data is
# parsed from; compiled once instead of once per ticker.
pattern = re.compile(r'\s--\sData\s--\s')
data = []
count = 0  # number of tickers successfully added


def _fetch_profile(symbol):
    """Return the ``assetProfile`` mapping scraped from the profile page of *symbol*.

    Raises:
        KeyError: the page has no matching data script or lacks the expected keys.
        ValueError: the embedded data could not be parsed as JSON.
        requests.RequestException: the HTTP request failed or timed out.
    """
    # a timeout keeps one stalled connection from hanging the whole run
    response = requests.get(url.format(symbol, symbol), headers=headers1, timeout=30)

    soup = BeautifulSoup(response.text, 'html.parser')
    script = soup.find('script', string=pattern)
    if script is None:
        # e.g. unknown symbol: the page carries no data script
        raise KeyError(symbol)
    script_data = script.contents[0]
    # Parsing starts two characters before the first "context" and stops 12
    # characters before the end of the script text (presumably to strip the
    # JavaScript around the JSON object - TODO confirm).
    start = script_data.find("context") - 2
    json_data = json.loads(script_data[start:-12])
    return json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['assetProfile']


for row in tickers:
    name, symbol = row[0], row[1]
    try:
        profile = _fetch_profile(symbol)
        sector = profile['sector']
        description = profile['longBusinessSummary']
    except (KeyError, ValueError, requests.RequestException):
        # missing page, missing fields, unparsable data or network failure:
        # skip this ticker rather than losing everything collected so far
        print("not found: ", name)
        continue

    data.append({"name": name, "ticker": symbol, 'sector': sector, "description": description})
    print("added: ", name)
    count += 1

print("Results:", len(tickers), "tickers", count, "added and", len(tickers) - count, "not found")

# save the data list into a JSON file named after the exchange
os.makedirs('data', exist_ok=True)
with open(f'data/{exchange_name}.json', 'w', encoding='utf8') as f:
    json.dump(data, f)

## Validation of the downloaded data

In [None]:
# Read one downloaded file back and inspect its container type and first record.
with open('data/tokyo.json', encoding='utf8') as json_file:
    data = json.loads(json_file.read())

print(type(data))
data[0]

## Consolidate datasets

In [None]:
# Folder with the per-exchange JSON files. Relative, like the other data
# paths in this notebook, so it does not depend on one user's machine.
path_to_json = 'data/'
# The consolidated file is written into this same folder; it is skipped here
# so that re-running the cell does not merge earlier output into itself.
consolidated_name = 'data_complete.json'

final_list = []

# sorted() makes the merge order reproducible (os.listdir order is arbitrary)
for file_name in sorted(os.listdir(path_to_json)):
    if not file_name.endswith('.json') or file_name == consolidated_name:
        continue
    print(file_name)
    with open(os.path.join(path_to_json, file_name), encoding='utf8') as json_file:
        data = json.load(json_file)
        final_list.extend(data)

print(len(final_list))

In [None]:
# Persist the merged records as a single JSON document.
with open('data/data_complete.json', 'w') as out_file:
    out_file.write(json.dumps(final_list))