# Solution - Pogány László

This solution installs the external libraries, downloads the required historical data automatically and computes the relevant metrics.

## Installing prerequisites

In [None]:
!python --version
# Python 3.5.2 :: Anaconda 4.2.0 (64-bit)

The __pandas__ library is used for storing the data in dataframes.
<br />
The __beautifulsoup4__ library is used for scraping data from the web.
<br />
The __pandas-datareader__ library is used for downloading __Yahoo Finance__ and __Google Finance__ datasets.<br />
Further information: https://github.com/pydata/pandas-datareader

In [None]:
!pip install numpy
!pip install scipy
!pip install pandas
!pip install pandas-datareader
!pip install beautifulsoup4

## Downloader and preprocessor

The __Utils__ and __InputPreprocessor__ classes implement common functionality for data manipulation. They define some static helper functions.

In [None]:
from datetime import datetime
import doctest
doctest.testmod()


class Utils:
    """ Namespace for small, stateless helpers used in minor data manipulations """

    @staticmethod
    def convertDatetimeStrFormat(dtString, fromPattern, toPattern):
        """ Re-renders a date/time string written in one pattern into another pattern

        dtString    -- the textual date/time to convert
        fromPattern -- the strptime pattern dtString is currently written in
        toPattern   -- the strftime pattern of the returned string

        A ValueError coming from datetime.strptime is propagated when dtString does not match fromPattern.

        >>> Utils.convertDatetimeStrFormat('1987-08-14', "%Y-%m-%d", "%d/%m/%Y")
        '14/08/1987'
        >>> Utils.convertDatetimeStrFormat('5/1/1990', "%d/%m/%Y", "%Y-%m-%d")
        '1990-01-05'
        """

        # step 1: text -> datetime, step 2: datetime -> text in the requested layout
        parsedMoment = datetime.strptime(dtString, fromPattern)
        return parsedMoment.strftime(toPattern)



class InputPreprocessor(object):
    """ Intended home for the functions that preprocess the historical data

    No function is defined in this class body yet.
    """
    
    

The datasets are downloaded through a __DownloadAdapter__ instance.
<br />
The __YahooFinanceDownloadAdapter__ and __GoogleFinanceDownloadAdapter__ classes implement the data-source-specific download logic.

In [None]:
from abc import ABC, abstractmethod
from datetime import datetime
import urllib.request
import pandas as pd
import io
import doctest
doctest.testmod()


class DownloadAdapter(ABC):
    """ Common interface of the download adapters

    A concrete adapter derives from this class and overrides the abstract
    'downloaderFunction' method with the code that performs the real download.
    Because that method is abstract, this base class cannot be instantiated itself.
    """

    def __init__(self):
        super().__init__()

    @abstractmethod
    def downloaderFunction(self, symbolCode, startDateTime, endDateTime, datasource=None, other=None):
        """ Download hook to be overridden by subclasses; it is the method the data manager is meant to call

        The base implementation does nothing and returns None.
        """



class YahooFinanceDownloadAdapter(DownloadAdapter):
    """ Specifies Yahoo Finance downloading functionalities by implementing DownloadAdapter

    The download itself is not implemented yet: 'downloaderFunction' is a stub
    that performs no work and returns None.
    """

    def __init__(self):
        super().__init__()

    def downloaderFunction(self, symbolCode, startDateTime, endDateTime, datasource=None, other=None):
        """ Placeholder for the Yahoo Finance download; currently returns None

        'datasource' and 'other' default to None exactly like in the abstract
        DownloadAdapter.downloaderFunction signature, so every call that is valid
        against the base class is also valid against this adapter.
        """
        # TODO: implement the Yahoo Finance download
        return None



class GoogleFinanceDownloadAdapter(DownloadAdapter):
    """ Specifies Google Finance downloading functionalities by implementing DownloadAdapter

    The class implements a URL based solution. The reason for not using the 'pandas_datareader' library here is
    that it seems not to work on older data. According to the experiments a templated direct URL can download
    much more historical data.
    """

    # query template; the placeholders are filled with: 'other', symbol code, start date, end date
    DOWNLOAD_URL_PATTERN = "http://finance.google.ca/finance/historical?q=%s:%s&startdate=%s&enddate=%s&output=csv"
    # NOTE: strftime renders '%b' with the month abbreviation of the current locale
    DATE_QUERY_PATTERN   = "%b+%d+%Y"
    # layout of the values in the first column of the downloaded CSV
    CSV_DATE_PATTERN     = "%d-%b-%y"

    def __init__(self):
        super().__init__()

    def downloaderFunction(self, symbolCode, startDateTime, endDateTime, datasource=None, other=None):
        """ This method downloads the historical data from Google Finance

        symbolCode    -- symbol to query, e.g. "GOOGL"
        startDateTime -- start of the queried interval, a datetime
        endDateTime   -- end of the queried interval, a datetime
        datasource    -- not used by this adapter
        other         -- prefix placed before the symbol in the query ("NASDAQ" gives "NASDAQ:GOOGL"); required,
                         the None default only mirrors the abstract DownloadAdapter signature

        Returns the downloaded CSV as a pandas dataframe that is indexed by its first column, whose values
        are converted to datetimes.

        Raises AssertionError when a date parameter is not a datetime and ValueError when 'other' is None.

        >>> gfda = GoogleFinanceDownloadAdapter()
        >>> resultSet = gfda.downloaderFunction("GOOGL", datetime(2016,1,1), datetime(2017,1,1), None, "NASDAQ")
        >>> len(resultSet)
        252
        >>> resultSet = gfda.downloaderFunction("GOOGL", datetime(1990,1,1), datetime(2017,1,1), None, "NASDAQ")
        >>> len(resultSet)
        3114
        """

        def checkDatetimeParam(dt, paramName):
            # raised explicitly instead of using 'assert', so the check is not stripped by 'python -O';
            # the exception type and message are the ones the assert used to produce
            if not isinstance(dt, datetime):
                raise AssertionError(paramName + " parameter is not datetime, it is a(n) %s" % type(dt))

        # checking parameters
        checkDatetimeParam(startDateTime, 'startDateTime')
        checkDatetimeParam(endDateTime, 'endDateTime')
        if other is None:
            # without this guard the query would silently become "None:<symbol>"
            raise ValueError("other parameter is required, it is the prefix of the queried symbol (e.g. 'NASDAQ')")

        # converting datetime into query format
        startDateStr = startDateTime.strftime(self.DATE_QUERY_PATTERN)
        endDateStr   = endDateTime.strftime(self.DATE_QUERY_PATTERN)

        # downloading data by using the templated URL; the connection is released before the parsing starts
        csvUrl = self.DOWNLOAD_URL_PATTERN % (other, symbolCode, startDateStr, endDateStr)
        with urllib.request.urlopen(csvUrl) as response:
            csvText = response.read().decode("utf-8")

        # reading data into pandas dataframe
        data = pd.read_csv(io.StringIO(csvText), sep=',')

        # identifying the name of the first column
        keyColumnName = data.columns[0]    # According to the downloaded CSV this should be a column named 'Date'

        # formatting Date column (whole column at once instead of one strptime call per row)
        data[keyColumnName] = pd.to_datetime(data[keyColumnName], format=self.CSV_DATE_PATTERN)

        # use Date column as key
        data.set_index(keyColumnName, inplace=True)

        return data

The global function __getSP400ListFromWikipedia()__ scrapes the current list of S&P 400 companies from the web.

A web scraper is used because the symbols of the S&P 400 change over time and the script always needs the current values.
<br />
The current values may be found here: <https://en.wikipedia.org/wiki/List_of_S%26P_400_companies>

In [None]:
# Page listing the current S&P 400 companies; the list changes over time, so it has to be scraped at runtime
URL_WIKIPEDIA_SP400 = "https://en.wikipedia.org/wiki/List_of_S%26P_400_companies"

In [None]:
from bs4 import BeautifulSoup
import urllib.request
import doctest
doctest.testmod()


def getSP400ListFromWikipedia():
    """ Returns current SP400 symbol codes from Wikipedia

    Downloads the HTML content of URL_WIKIPEDIA_SP400, selects the first table matching the
    'wikitable sortable' class and returns the link text found in the first <td> cell of each of its rows.
    Rows that contain no <td> cell are skipped.

    Raises IndexError when the page contains no matching table.

    >>> len(getSP400ListFromWikipedia())
    400
    """

    TABLE_NUMBER = 0   # the first table's content should be downloaded from wikipedia

    # downloading html content; the connection is released before the parsing starts
    with urllib.request.urlopen(URL_WIKIPEDIA_SP400) as response:
        html = response.read()

    # parsing html content (the constructor always returns a soup object, so no None check is needed)
    htmlSoup = BeautifulSoup(html, 'lxml')

    # finding table in html code; the tag is searched directly, re-parsing its markup is unnecessary
    symbolTable = htmlSoup.find_all('table', class_='wikitable sortable')[TABLE_NUMBER]

    # filtering relevant values from table
    return [row.td.a.text for row in symbolTable.find_all('tr') if row.td]
        

## Data layer

An instance of the __DataManager__ class can download historical data by using the __DownloadAdapter__'s method.
<br />
The downloaded historical data is loaded into pandas dataframes.
The dataframes are saved in the form of CSV files and SQL data tables.

In [None]:
from datetime import datetime
import doctest
doctest.testmod()


class DataManager:
    """ Downloads and manages historical OHLC and volume data

    Implemented so far: the date / file name helpers and the factory of curried downloaders.
    'saveDataframeToCsv' and 'getData' are still stubs.

    Intended behaviour:
    Downloads each record exactly only once
    Stores the downloaded history as CSV files locally

    TODO: Stores the downloaded history in SQL database locally
    TODO: Makes queries in the database after already stored data, if data is not present downloads it
    """

    # Exposed as the class attribute 'pdr', as before. None of the methods below uses it, so a missing
    # pandas_datareader installation must not prevent the class and its pure helpers from being defined;
    # in that case the attribute is None.
    try:
        import pandas_datareader as pdr
    except ImportError:
        pdr = None

    def __init__(self):
        pass

    def convertDateTimeToString(self, dt, datetime_pattern="%Y%m%d"):
        """ Converter for datetime types

        In case of datetime parameter returns a formatted datetime string, otherwise returns the original

        dt               -- a datetime (formatted with datetime_pattern) or a string (returned unchanged)
        datetime_pattern -- strftime pattern applied to datetime values

        Raises AssertionError when dt is neither a string nor a datetime.

        >>> x = DataManager()
        >>> x.convertDateTimeToString(datetime(2016,1,2,3,4,5))
        '20160102'
        >>> x.convertDateTimeToString(datetime(2016,1,2,3,4,5), "%Y%m%d-%H%M%S")
        '20160102-030405'
        >>> x.convertDateTimeToString('2016-01-01')
        '2016-01-01'
        >>> x.convertDateTimeToString(2016)
        Traceback (most recent call last):
          ...
        AssertionError: dt parameter is not string nor datetime, it is a(n) <class 'int'>
        """

        # conversion with formatting datetime to string if needed
        if isinstance(dt, datetime):
            return dt.strftime(datetime_pattern)
        if isinstance(dt, str):
            return dt

        # raised explicitly instead of using 'assert', so the check is not stripped by 'python -O';
        # the exception type and message are the ones the assert used to produce
        raise AssertionError("dt parameter is not string nor datetime, it is a(n) %s" % type(dt))

    def generateCsvFileName(self, dataSource, symbolCode, startDateTime, endDateTime,
                            filename_pattern="%s_%s_start=%s_end=%s.download.csv"):
        """ CSV file name generator

        Uses the data source, the symbol code, a starting and an ending date to generate a filename.
        The dates may be strings or datetimes, they are converted by 'convertDateTimeToString'.

        Raises AssertionError when dataSource is not a string or when a date is neither a string nor a datetime.

        >>> x = DataManager()
        >>> x.generateCsvFileName("google", "AAPL", "2016-01-01", "2017-01-01")
        'google_AAPL_start=2016-01-01_end=2017-01-01.download.csv'
        >>> x.generateCsvFileName("google", "AAPL", "2016-01-01", "2017-01-01", "download.%s.%s.%s.%s.csv")
        'download.google.AAPL.2016-01-01.2017-01-01.csv'
        >>> x.generateCsvFileName("google", "AAPL", datetime(2016,1,1), datetime(2017,1,1))
        'google_AAPL_start=20160101_end=20170101.download.csv'
        >>> x.generateCsvFileName(1000, "AAPL", "2016-01-01", "2017-01-01")
        Traceback (most recent call last):
          ...
        AssertionError: dataSource parameter is not string, it is a(n) <class 'int'>
        """

        # checking parameter type (explicit raise: survives 'python -O', same exception as before)
        if not isinstance(dataSource, str):
            raise AssertionError("dataSource parameter is not string, it is a(n) %s" % type(dataSource))

        # converting dates to string
        startDateTimeStr = self.convertDateTimeToString(startDateTime)
        endDateTimeStr   = self.convertDateTimeToString(endDateTime)

        return filename_pattern % (dataSource, symbolCode, startDateTimeStr, endDateTimeStr)

    def downloadData(self, downloaderFunction, datasource=None, other=None):
        """ Returns a function with curried parameters which can download historical data

        downloaderFunction -- callable taking (symbolCode, startDateTime, endDateTime, datasource[, other])
        datasource         -- value fixed as the fourth argument of every call
        other              -- optional value fixed as the fifth argument; when it is None the callable is
                              invoked with four arguments only, exactly as before this parameter existed

        The returned function takes (symbolCode, startDateTime, endDateTime) and returns whatever
        downloaderFunction returns.
        """

        def downloader(symbolCode, startDateTime, endDateTime):
            """ Calls downloaderFunction with the fixed datasource (and other) values """
            if other is None:
                # four-argument form, kept for callables that have no 'other' parameter
                return downloaderFunction(symbolCode, startDateTime, endDateTime, datasource)
            return downloaderFunction(symbolCode, startDateTime, endDateTime, datasource, other)

        return downloader

    def saveDataframeToCsv(self, symbol, dataframe, startDateTime=None, endDateTime=None):
        """ Not implemented yet: does nothing and returns None """
        pass

    def getData(self, symbolOrSymbolList, startDateTime, endDateTime):
        """ Not implemented yet: does nothing and returns None """
        pass
    



## Processing

In [None]:
# Interval border constants.
# Each *_START / *_END pair delimits one date range. Judging by the names they are presumably the ranges of the
# mean, variance, min/max and standard deviation computations -- TODO confirm against the code that uses them.
# Only the MINMAX pair states whether its borders are inclusive; the other pairs do not say.
DATA_MEAN_START     = datetime(2016,1,1)
DATA_MEAN_END       = datetime(2017,1,1)
DATA_VARIANCE_START = datetime(2016,2,11)
DATA_VARIANCE_END   = datetime(2016,11,8)
DATA_MINMAX_START   = datetime(2016,1,18)    # inclusive border
DATA_MINMAX_END     = datetime(2016,10,18)   # exclusive border
DATA_STD_START      = datetime(2016,4,17)
DATA_STD_END        = datetime(2016,12,15)