# Solution - Pogány László

The reason for choosing a __Jupyter/IPython Notebook__ over a plain Python project is its built-in support for self-documentation. 
Although solutions implemented this way can't be integrated directly into existing operational systems (without further work), it is much easier to demonstrate the whole process and the implemented functionality step by step.

This solution installs the external libraries, downloads the required historical data automatically and computes the relevant metrics.

## Installing prerequisites

In [None]:
!python --version
# Python 3.5.2 :: Anaconda 4.2.0 (64-bit)

The __pandas__ library is used for storing the data in dataframes.
<br />
The __beautifulsoup4__ library is used for scraping data from the web.
<br />
The __pandas-datareader__ library is used for downloading __Yahoo Finance__ and __Google Finance__ datasets.<br />
Further information: https://github.com/pydata/pandas-datareader

In [None]:
# 'datetime' is part of the Python standard library, so it is not installed through pip.
# 'beautifulsoup4' is the package that provides the 'bs4' module; a separate 'bs4' install is redundant.
!pip install requests
!pip install numpy
!pip install scipy
!pip install pandas
!pip install pandas-datareader
!pip install intervaltree
!pip install beautifulsoup4
# 'lxml' is the parser passed to BeautifulSoup(html, 'lxml') by the Wikipedia scraper below
!pip install lxml

## Downloader and preprocessor

Predefined directory names for the application.
The downloads and the calculated outputs are placed into special directories inside the repository.

In [None]:
import os

# constants and directory structure for the application
# The output root is assembled with os.path.join/os.pardir instead of a literal '..\out', so it points
# to the "out" folder of the parent directory on every platform (a backslash is a path separator
# only on Windows; elsewhere the literal would name a directory called '..\out').
OUTPUT_DIRECTORY_PATH   = os.path.join(os.pardir, 'out')
LOG_DIRECTORY_PATH      = os.path.join(OUTPUT_DIRECTORY_PATH, "logs")
DOWNLOAD_DIRECTORY_NAME = os.path.join(OUTPUT_DIRECTORY_PATH, "data")
DATABASE_DIRECTORY_NAME = os.path.join(OUTPUT_DIRECTORY_PATH, "meta")

# file name of the sqlite database which is placed into DATABASE_DIRECTORY_NAME
DATABASE_FILE_NAME      = 'metadata.db'

def createDirectory(path):
    """ Creates the directory given by 'path' together with its missing parent directories

    The call is idempotent: an already existing directory is left untouched. Passing 'exist_ok'
    to os.makedirs replaces the separate os.path.exists() test, so there is no window between
    the check and the creation in which another process could create the directory.

    Raises FileExistsError if 'path' exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)
        
createDirectory(OUTPUT_DIRECTORY_PATH)
createDirectory(LOG_DIRECTORY_PATH)
createDirectory(DOWNLOAD_DIRECTORY_NAME)
createDirectory(DATABASE_DIRECTORY_NAME)

The __PlatformLogger__ function provides logging services for the modules/classes. <br />
The __Utils__ and __InputPreprocessor__ classes implement common data-manipulation functionality in the form of static helper functions.

In [None]:
from datetime import datetime
import doctest
import logging
import logging.handlers

doctest.testmod()


# cache of the already configured loggers, keyed by module name
loggers = {}
def PlatformLogger(moduleName):
    """ Provides logging functionalities.

    The function configures the log service and the log handlers, then returns the logger object
    which belongs to the given module name. Repeated calls with the same name return the same,
    already configured logger.

    Three handlers are attached to a freshly configured logger:
      - a console handler for messages of level INFO and above,
      - a file handler writing messages of level WARN and above into 'error.log',
      - a rotating file handler writing every message (DEBUG and above) into 'event.log'.
    Both log files are placed into LOG_DIRECTORY_PATH.
    """

    cachedLogger = loggers.get(moduleName)
    if cachedLogger is not None:
        return cachedLogger

    # create logger with moduleName parameter
    logger = logging.getLogger(moduleName)
    logger.setLevel(logging.DEBUG)

    # logging.getLogger returns the same object for the same name, therefore the handlers are
    # attached only once; adding them again would write every message multiple times
    # (e.g. after the notebook cell which resets the 'loggers' cache is executed again)
    if not logger.handlers:

        # create console handler with a higher log level
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO)

        # create file handlers: warnings and errors go to error.log, everything goes to event.log
        fh = logging.FileHandler(os.path.join(LOG_DIRECTORY_PATH, 'error.log'))
        fr = logging.handlers.RotatingFileHandler(os.path.join(LOG_DIRECTORY_PATH, 'event.log'))
        fh.setLevel(logging.WARN)
        fr.setLevel(logging.DEBUG)

        # create formatter and add it to the handlers
        formatter = logging.Formatter('[%(asctime)s] - %(name)s - %(levelname)s - %(message)s')
        for handler in (fh, fr, ch):
            handler.setFormatter(formatter)
            logger.addHandler(handler)

    # the logger is stored under the module's name (not under a constant key),
    # so the next call with the same name finds it in the cache
    loggers[moduleName] = logger

    return logger



class Utils(object):
    """ The class contains a collection of useful functions in relation to minor data manipulations """


    @staticmethod
    def checkDatetimeParam(dt, paramName):
        """ Validates that the 'dt' parameter is a datetime object

        Subclasses of datetime are accepted too. The check raises the AssertionError explicitly
        (instead of using an assert statement), so it stays active when Python runs with -O.

        >>> Utils.checkDatetimeParam(datetime(2016,1,1), 'startDateTime')
        >>> Utils.checkDatetimeParam('2016-01-01', 'startDateTime')
        Traceback (most recent call last):
          ...
        AssertionError: startDateTime parameter is not datetime, it is a(n) <class 'str'>
        """

        if not isinstance(dt, datetime):
            raise AssertionError(paramName + " parameter is not datetime, it is a(n) %s" % type(dt))


    @staticmethod
    def convertDatetimeStrFormat(dtString, fromPattern, toPattern):
        """ The method converts a datetime object represented in formatted string to another formatted string 

        The 'dtString' is parsed according to 'fromPattern' and it is formatted according to 'toPattern'.

        >>> Utils.convertDatetimeStrFormat('1987-08-14', "%Y-%m-%d", "%d/%m/%Y")
        '14/08/1987'
        >>> Utils.convertDatetimeStrFormat('5/1/1990', "%d/%m/%Y", "%Y-%m-%d")
        '1990-01-05'
        """

        return datetime.strptime(dtString, fromPattern).strftime(toPattern)


    @staticmethod
    def convertDateTimeToString(dt, datetime_pattern = "%Y%m%d"):
        """ Converter for datetime types

        In case of datetime parameter returns a formatted datetime string, otherwise returns the original.
        Any other type than datetime or string (or their subclasses) raises an AssertionError; the error is
        raised explicitly, so the validation stays active when Python runs with -O.

        >>> Utils.convertDateTimeToString(datetime(2016,1,2,3,4,5))
        '20160102'
        >>> Utils.convertDateTimeToString(datetime(2016,1,2,3,4,5), "%Y%m%d-%H%M%S")
        '20160102-030405'
        >>> Utils.convertDateTimeToString('2016-01-01')
        '2016-01-01'
        >>> Utils.convertDateTimeToString(2016)
        Traceback (most recent call last):
          ...
        AssertionError: dt parameter is not string nor datetime, it is a(n) <class 'int'>
        """

        # datetime values are formatted, strings are returned unchanged
        if isinstance(dt, datetime):
            return dt.strftime(datetime_pattern)
        if isinstance(dt, str):
            return dt

        raise AssertionError("dt parameter is not string nor datetime, it is a(n) %s" % type(dt))



class InputPreprocessor(object):
    """ The class contains a collection of functions in relation to preprocessing of the historical data

    NOTE: the class is a placeholder at this point, it does not define any function yet.
    """
    
    

The download of the datasets is implemented through __DownloadAdapter__ instances.
<br />
The __YahooFinanceDownloadAdapter__ and __GoogleFinanceDownloadAdapter__ classes implement the datasource-specific download logic.

In [None]:
from abc import ABC, abstractmethod
from datetime import datetime
import requests
import pandas as pd
import pandas_datareader as pdr
import io
import doctest

doctest.testmod()


class DownloadAdapter(ABC):
    """ Common base of the datasource specific download adapters

    Every concrete adapter has to override the abstract 'downloaderFunction' method;
    the override contains the real download logic of the given datasource.
    """

    def __init__(self, datasourceName):
        super().__init__()
        # name of the datasource which is served by the adapter (e.g. 'yahoo')
        self.datasourceName = datasourceName


    @abstractmethod
    def downloaderFunction(self, symbolCode, startDateTime, endDateTime):
        """ Download entry point which is called by the data manager

        The base implementation does nothing and returns None.
        """



class YahooFinanceDownloadAdapter(DownloadAdapter):
    """ Download adapter of the Yahoo Finance datasource (registered under the name 'yahoo') """

    def __init__(self):
        super().__init__('yahoo')


    def downloaderFunction(self, symbolCode, startDateTime, endDateTime):
        """ Downloads the historical data of a symbol from Yahoo Finance

        Both interval borders have to be datetime objects. The result of 'pandas_datareader' is returned
        without its 'Adj Close' column, sorted by its index.
        """

        # the interval borders are validated first
        Utils.checkDatetimeParam(startDateTime, 'startDateTime')
        Utils.checkDatetimeParam(endDateTime, 'endDateTime')

        history = pdr.get_data_yahoo(symbolCode, start=startDateTime, end=endDateTime)

        # the adjusted close price is not needed, it is removed from the table
        del history['Adj Close']

        # ordering the rows by the index (in place)
        history.sort_index(inplace=True)

        return history
        


class GoogleFinanceDownloadAdapter(DownloadAdapter):
    """ Specifies Google Finance downloading functionalities by implementing DownloadAdapter 

    This class implements an URL based solution. The reason of not using 'pandas_datareader' library here is that
    it seems not to work on older data. According to the experiments a templated direct URL can download much more
    historical data.
    """

    def __init__(self):
        super().__init__('google')


    # NOTE(review): DOWNLOAD_URL_PATTERN is not used by downloaderFunction. Its "q=%s:%s" part takes two
    # values (presumably exchange and symbol - to be confirmed), while the request below sends the symbol only.
    DOWNLOAD_URL_PATTERN = "http://finance.google.ca/finance/historical?q=%s:%s&startdate=%s&enddate=%s&output=csv"
    DATE_QUERY_PATTERN   = "%m+%d+%Y"

    # upper limit (in seconds) of waiting for the server, without it a stalled connection blocks forever
    REQUEST_TIMEOUT_SEC  = 30


    def downloaderFunction(self, symbolCode, startDateTime, endDateTime):
        """ This method downloads the historical data from Google Finance

        The CSV response is loaded into a pandas dataframe, its first column is parsed as date ('%d-%b-%y')
        and used as index, and the rows are sorted by this index.

        Raises AssertionError if an interval border is not a datetime object, and a 'requests' exception
        if the server does not answer in time or answers with an HTTP error status.

        >>> gfda = GoogleFinanceDownloadAdapter()
        >>> resultSet = gfda.downloaderFunction("GOOGL", datetime(2016,1,1), datetime(2017,1,1))
        >>> len(resultSet)
        252
        >>> resultSet = gfda.downloaderFunction("GOOGL", datetime(1990,1,1), datetime(2017,1,1))
        >>> len(resultSet)
        3114
        """

        from urllib.parse import quote

        # checking parameter type
        Utils.checkDatetimeParam(startDateTime, 'startDateTime')
        Utils.checkDatetimeParam(endDateTime, 'endDateTime')

        # getting the data 
        startdate = startDateTime.strftime(self.DATE_QUERY_PATTERN)
        enddate = endDateTime.strftime(self.DATE_QUERY_PATTERN)

        # the symbol is percent-encoded, otherwise a character like '&' or '#' would break the query string
        stock_url = "http://finance.google.ca/finance/historical?q=" + \
                    quote(symbolCode, safe=':') + "&startdate=" + startdate + "&enddate=" + enddate + "&output=csv"
        response = requests.get(stock_url, timeout=self.REQUEST_TIMEOUT_SEC)

        # an HTTP error is reported here, instead of parsing the error page as CSV
        response.raise_for_status()
        raw_response = response.content

        # reading data into pandas dataframe
        data = pd.read_csv(io.StringIO(raw_response.decode('utf-8')))

        # identifying the name of the first column
        keyColumnName = data.columns[0]    # According to the downloaded CSV this should be a column named 'Date'

        # formatting Date column
        def changeDateStr(string):
            return datetime.strptime(string, '%d-%b-%y')
        data[keyColumnName] = data[keyColumnName].apply(changeDateStr)

        # use Date column as key
        data.set_index(keyColumnName, inplace=True)

        # sorting table
        data.sort_index(inplace=True)

        return data
        


The global function __getSP400ListFromWikipedia()__ fetches the current list of SP400 companies from the web.

The reason for using a web scraper is that the symbols of the SP400 change over time and the script always needs the current values.
<br />
The current values can be found here: <https://en.wikipedia.org/wiki/List_of_S%26P_400_companies>

In [None]:
# a fresh list of 400 companies (changes by time, needs to be scraped at runtime)
URL_WIKIPEDIA_SP400 = "https://en.wikipedia.org/wiki/List_of_S%26P_400_companies"

In [None]:
from bs4 import BeautifulSoup
import urllib.request
import doctest

doctest.testmod()


def getSP400ListFromWikipedia():
    """ Returns current SP400 symbol codes from Wikipedia

    Downloads HTML content, selects the first 'wikitable sortable' table and returns the values from the first
    column of the selected table. The rows without a data cell (e.g. the header row) are skipped. The symbol
    is the text of the link inside the cell; if a cell has no link, the stripped text of the cell is used.

    Raises IndexError if the page does not contain the expected table.

    >>> len(getSP400ListFromWikipedia())
    400
    """

    TABLE_NUMBER = 0   # the first table's content should be downloaded from wikipedia

    # downloading html content, the connection is released before the parsing starts
    with urllib.request.urlopen(URL_WIKIPEDIA_SP400) as response:
        html = response.read()

    # parsing html content
    htmlSoup = BeautifulSoup(html, 'lxml')

    # finding table in html code
    tables = htmlSoup.findAll('table', class_='wikitable sortable')
    if len(tables) <= TABLE_NUMBER:
        raise IndexError("no 'wikitable sortable' table with index %d was found at %s"
                         % (TABLE_NUMBER, URL_WIKIPEDIA_SP400))

    # filtering relevant values from table (the table is searched directly, it is not parsed again)
    symbols = []
    for row in tables[TABLE_NUMBER]('tr'):
        firstCell = row.td
        if firstCell is None:
            continue
        if firstCell.a is not None:
            symbols.append(firstCell.a.text)
        else:
            # a cell without link must not break the whole scraping
            cellText = firstCell.get_text(strip=True)
            if cellText:
                symbols.append(cellText)

    return symbols
        

## Data layer

An instance of the __DataManager__ class can download historical data by using the __DownloadAdapter__'s method.
<br />
The downloaded historical data are loaded into pandas dataframes, the contents of the dataframes are saved into CSV files in ___data___ directory.
<br />
Because the download process sometimes fails and not all of the ticker symbols can be found in both datasources, a retry logic is implemented: it tries to download the files several times and gives up afterwards.
<br />
According to the task's criteria each OHLC record should be downloaded exactly once, therefore metadata about the properties of each download is kept in an __sqlite__ database. The data is downloaded only if there was no download before for the given symbol name, datasource and interval. The downloaded CSV files are kept separately in the _data_ folder (the file names are the keys, which contain the symbol name, the interval and the datasource) until a new download process identifies their segmentation and merges them into a common file (if the segments overlap in time).

In [None]:
from datetime import datetime, timedelta
import itertools
import doctest
import random
import sqlite3
import os
doctest.testmod()


class DataManager:
    """ Downloads and manages historical OHLC and volume data

    Stores the downloaded history as CSV files locally (in DOWNLOAD_DIRECTORY_NAME)
    Stores the metadata of each download (symbol, datasource, file name, interval) in a local sqlite database

    TODO: Makes queries in the database before downloading, checks the already stored data and downloads only
          the missing records ('downloadSymbolData' does not consult the database yet)
    """

    FILE_NAME_PATTERN = "%s.%s_%s.%s.csv"

    def __init__(self):
        self.logger = PlatformLogger('DataManager')
        self.dbPath = os.path.join(DATABASE_DIRECTORY_NAME, DATABASE_FILE_NAME)
        self.__initializeDataBase()



    def __initializeDataBase(self):
        """ Creates the METADATA table in the sqlite database if it does not exist yet """

        conn = sqlite3.connect(self.dbPath)
        try:
            conn.execute('''create table if not exists METADATA
                    (ID              INTEGER PRIMARY KEY,
                     SYMBOL_CODE     CHAR(10)   NOT NULL,
                     DATASOURCE      TEXT       NOT NULL,
                     FILE_NAME       TEXT       NOT NULL,
                     INTERVAL_START  TEXT       NOT NULL,
                     INTERVAL_END    TEXT       NOT NULL);''')
        finally:
            # the connection is closed even if the statement fails
            conn.close()



    def saveDataframeToCsv(self, symbol, dataframe, startDateTime, endDateTime, datasource):
        """ Saves the dataframe as a CSV file into the download directory and returns the path of the file

        The file name is generated from the symbol, the interval borders and the datasource
        according to FILE_NAME_PATTERN.
        """

        fileName = DataManager.FILE_NAME_PATTERN % (symbol,
                                        Utils.convertDateTimeToString(startDateTime),
                                        Utils.convertDateTimeToString(endDateTime),
                                        datasource)
        filePath = os.path.join(DOWNLOAD_DIRECTORY_NAME, fileName)
        dataframe.to_csv(filePath, sep=',', encoding='utf-8')
        return filePath



    def saveMetadataToDatabase(self, symbol, datasource, filename, intervalStart, intervalEnd):
        """ Inserts one record about a finished download into the METADATA table

        The values are passed as query parameters (they are not concatenated into the SQL text),
        therefore a value containing an apostrophe can neither break nor alter the statement.
        """

        conn = sqlite3.connect(self.dbPath)
        try:
            conn.execute("INSERT INTO METADATA (SYMBOL_CODE, DATASOURCE, FILE_NAME, INTERVAL_START, INTERVAL_END) "
                         "VALUES (?, ?, ?, ?, ?)",
                         (symbol, datasource, filename, intervalStart, intervalEnd))
            conn.commit()
        finally:
            conn.close()



    def queryDatabase(self, queryString):
        """ Executes the given SQL query on the metadata database and returns every row of the result

        NOTE: the query string is executed as it is, it must not be assembled from untrusted input.
        """

        conn = sqlite3.connect(self.dbPath)
        try:
            return conn.execute(queryString).fetchall()
        finally:
            conn.close()



    @staticmethod
    def calculateIntervalsToDownload(plannedIntervalToDownload, alreadyStoredIntervals):
        """ Generates time intervals which were not downloaded before

        Gets an interval (fromDatetime, toDatetime), checks the already downloaded intervals and computes the
        intervals which were never downloaded before. Each stored interval is extended by one day on both ends
        before it is cut out from the planned interval. The remaining intervals are returned as a sorted list
        of 'intervaltree' Interval objects.

        The method does not use the instance, therefore it is a static method: it can be called on the class
        and on an instance as well.
        """

        # the 'intervaltree' package is installed with the prerequisites of the notebook
        from intervaltree import Interval, IntervalTree

        deltatime = timedelta(days=1)
        ti = IntervalTree([Interval(plannedIntervalToDownload[0], plannedIntervalToDownload[1])])
        for storedInterval in alreadyStoredIntervals:
            ti.chop(storedInterval[0] - deltatime, storedInterval[1] + deltatime)
        return sorted(ti)



    def downloadSymbolData(self, listOfSymbols, adapterList, intervalStart, intervalEnd, maxRetryCnt = 10):
        """ Tries to download data for ticker symbols

        The function generates download tasks for each (symbol, adapter, max_retry) triplets and according to 
        the generated task list it tries to download the historical data. The downloaded data is saved into
        the 'data' directory, the metadata (e.g. interval borders, filename etc.) is saved into database.

        A failed task is put back into the task list with a decremented retry count until its retries run out;
        the failure is logged as a warning.

        Raises ValueError if the names of the adapters are not unique.
        """

        adapterNames = [adapter.datasourceName for adapter in adapterList]

        # if the adapters are not uniquely named raise error
        if len(adapterNames) != len(set(adapterNames)):
            raise ValueError('The adapters are not unique.')

        adaptersByName = dict(zip(adapterNames, adapterList))

        # generate the tasks for downloading
        dataToDownload = set(itertools.product(listOfSymbols, adapterNames, [maxRetryCnt]))

        # executing the download tasks, where a download task looks like:
        # (symbolCode, datasource, cntOfRemainingRetries)    e.g. ('AAPL', 'google', 5)
        while dataToDownload:

            # selecting a random task
            # (random.sample does not accept a set on newer Python versions, a tuple is used for the choice)
            selectedTask = random.choice(tuple(dataToDownload))
            symbol, datasource, retryCnt = selectedTask
            dataToDownload.discard(selectedTask)

            try:
                # execute the download process with the proper adapter and save file and metadata
                adapter = adaptersByName[datasource]
                data = adapter.downloaderFunction(symbol, intervalStart, intervalEnd)
                filePath = self.saveDataframeToCsv(symbol, data, intervalStart, intervalEnd, datasource)
                self.saveMetadataToDatabase(symbol, datasource, filePath,
                                Utils.convertDateTimeToString(intervalStart),
                                Utils.convertDateTimeToString(intervalEnd))
                self.logger.info("downloaded: " + str((symbol, datasource)) + "\t" + filePath)

            except Exception:
                # 'Exception' is caught instead of a bare except, so KeyboardInterrupt and SystemExit
                # can still stop the loop

                # if the selected download task was not executed, the retry count is decremented
                if (retryCnt > 1):
                    dataToDownload.add((symbol, datasource, retryCnt-1))
                counter = maxRetryCnt - retryCnt + 1
                self.logger.warning("failed to download [" + str(counter) + "/" + str(maxRetryCnt) + "]: "
                               + str((symbol, datasource)))
                # the reason of the failure is kept on debug level
                self.logger.debug("reason of the failed download: " + str((symbol, datasource)), exc_info=True)



    def downloadData(self, downloaderFunction, datasource=None):
        """ Returns a method with curried parameters which can download historical data

        The 'downloaderFunction' has to accept four positional parameters:
        (symbolCode, startDateTime, endDateTime, datasource).
        """

        def downloader(symbolCode, startDateTime, endDateTime):
            """ Calls the fixed downloaderFunction with the fixed datasource as its last parameter """
            return downloaderFunction(symbolCode, startDateTime, endDateTime, datasource)

        return downloader



    def getData(self, symbolOrSymbolList, startDateTime, endDateTime):
        """ Not implemented yet, the method does nothing and returns None """
        pass
    


## Downloading data

In [None]:
# interval border constants
DATA_START          = datetime(2016,1,1)
DATA_END            = datetime(2017,1,1)
DATA_VARIANCE_START = datetime(2016,2,11)
DATA_VARIANCE_END   = datetime(2016,11,8)
DATA_MINMAX_START   = datetime(2016,1,18)    # including
DATA_MINMAX_END     = datetime(2016,10,18)   # excluding
DATA_STD_START      = datetime(2016,4,17)
DATA_STD_END        = datetime(2016,12,15)

In [None]:
# execution test
listOfSymbols = getSP400ListFromWikipedia()
truncList1 = listOfSymbols[:3]
truncList2 = listOfSymbols[10:13]

gfda = GoogleFinanceDownloadAdapter()
yfda = YahooFinanceDownloadAdapter()

dm1 = DataManager()
dm1.downloadSymbolData(truncList1, [gfda, yfda], DATA_START, DATA_END, 5)
dm2 = DataManager()
dm2.downloadSymbolData(truncList2, [gfda, yfda], DATA_START, DATA_END, 5)

In [None]:
# listing the stored metadata; 'dm1' is used because no variable named 'dm' is defined above
# (dm1 and dm2 are opened on the same database file, so either of them returns the same records)
dm1.queryDatabase("select * from METADATA")

## Processing