In [1]:
from nyse_dates_prds import *
from sys import stdout
import cPickle as pickle
import pandas as pd
import numpy as np
import datetime
import urllib
import time

# Marker printed once the imports above have executed
print "LOADED"

LOADED


In [2]:
def returns_create(highlowclose, periods, start):
    """
    Build a table of percentage returns for every symbol.

    A return is the closing price divided by the closing price a number of
    rows earlier, minus 1. The lags used are 1, 2, 3, 4 and 5 followed by
    the entries of ``periods[name][0]``; the first 32 lags give the 32
    output columns (0..31). Warm-up NaNs and positive infinities (division
    by a zero close) are stored as 0.

    Returns a dict mapping each symbol to its DataFrame sliced with
    ``[start:]``, because this is used as an incremental update.
    """
    minute_lags = [1, 2, 3, 4, 5]
    results = {}
    for position, symbol in enumerate(highlowclose.keys()):
        closes = highlowclose[symbol]['Closes']
        # The short 1-5 lags are specific to returns; other indicators skip them
        lags = minute_lags + periods[symbol][0]

        table = pd.DataFrame()
        for column in range(32):
            ratio = closes / closes.shift(lags[column])
            change = (ratio - 1.).fillna(0)
            table[column] = change.replace([np.inf], 0)

        # Keep only the rows from the update point onwards
        results[symbol] = table[start:]
        stdout.write("\r%d" % position)
        stdout.flush()
    print("RET DONE")
    return results

def per_create(highlowclose, periods, start):
    """
    Build the "PER" indicator table for every symbol.

    Each value is the closing price divided by the change in closing price
    over a lag (close minus the close ``lag`` rows earlier). The 32 lags
    are the first 32 entries of ``periods[name][0]``. Rows where the price
    did not change, and the warm-up rows, are stored as 0.

    Returns a dict mapping each symbol to its DataFrame sliced with
    ``[start:]``.
    """
    results = {}
    for position, symbol in enumerate(highlowclose.keys()):
        closes = highlowclose[symbol]['Closes']
        lags = periods[symbol][0]

        table = pd.DataFrame()
        for column in range(32):
            move = closes - closes.shift(lags[column])
            # A zero move becomes NaN so the division never hits zero;
            # the NaN is turned back into 0 afterwards
            denominator = move.replace(0, np.nan)
            table[column] = (closes / denominator).fillna(0)

        results[symbol] = table[start:]
        stdout.write("\r%d" % position)
        stdout.flush()
    print("PER DONE")
    return results

def kdo_create(highlowclose, periods, start):
    """
    Build the stochastic oscillator (%D) table for every symbol.

    Column ``x`` reads two window lengths from ``periods[name][1]``: entry
    ``x`` (short) and entry ``x + 1`` (long). %K is
    100 * (close - rolling low) / (rolling high - rolling low) over the
    long window, with a zero range stored as 0. The stored value is %D,
    the rolling mean of %K over the short window, warm-up NaNs set to 0.

    Returns a dict mapping each symbol to its DataFrame sliced with
    ``[start:]``.
    """
    results = {}
    for position, symbol in enumerate(highlowclose.keys()):
        bars = highlowclose[symbol]
        highs, lows = bars['Highs'], bars['Lows']
        closes = bars['Closes']
        windows = periods[symbol][1]

        table = pd.DataFrame()
        for column in range(32):
            d_window = windows[column]
            k_window = windows[column + 1]

            # Extremes over the long window
            highest = highs.rolling(window=k_window, center=False).max()
            lowest = lows.rolling(window=k_window, center=False).min()

            above_low = closes - lowest
            # A flat window has no range; NaN avoids dividing by zero
            spread = (highest - lowest).replace(0., np.nan)

            percent_k = ((above_low / spread) * 100.).fillna(0.)
            smoothed = percent_k.rolling(window=d_window, center=False).mean()
            table[column] = smoothed.fillna(0.)

        results[symbol] = table[start:]
        stdout.write("\r%d" % position)
        stdout.flush()
    print("KDO DONE")
    return results

def cci_create(highlowclose, periods, start):
    """
    Build the commodity channel index table for every symbol.

    For each of the first 32 windows in ``periods[name][0]`` the value is
    (typical - rolling mean of typical) / (0.015 * rolling standard
    deviation of typical), computed on the 'Typical' column. Rows where
    the denominator is zero, and the warm-up rows, are stored as 0.

    Returns a dict mapping each symbol to its DataFrame sliced with
    ``[start:]``.
    """
    scale = 0.015
    results = {}
    for position, symbol in enumerate(highlowclose.keys()):
        # 'Typical' is the (high + low + close) / 3 price of each row
        typical = highlowclose[symbol]['Typical']
        windows = periods[symbol][0]

        table = pd.DataFrame()
        for column in range(32):
            span = windows[column]
            roller_std = typical.rolling(window=span, center=False).std()
            roller_mean = typical.rolling(window=span, center=False).mean()
            deviation = typical - roller_mean
            # Zero denominators become NaN so the division is safe
            denominator = (scale * roller_std).replace(0., np.nan)
            table[column] = (deviation / denominator).fillna(0.)

        results[symbol] = table[start:]
        stdout.write("\r%d" % position)
        stdout.flush()
    print("CCI DONE")
    return results

def vol_create(highlowclose, periods, start):
    """
    Build the volatility table for every symbol.

    Column ``x`` reads two entries of ``periods[name][2]``: entry ``x`` is
    the lag used for the return (close / earlier close - 1, NaN as 0) and
    entry ``x + 3`` is the window over which the rolling standard
    deviation of those returns is taken. The deviation is multiplied by
    the square root of the window length. Warm-up rows stay NaN.

    Returns a dict mapping each symbol to its DataFrame sliced with
    ``[start:]``.
    """
    results = {}
    for position, symbol in enumerate(highlowclose.keys()):
        closes = highlowclose[symbol]['Closes']
        spans = periods[symbol][2]

        table = pd.DataFrame()
        for column in range(32):
            # Short span for the returns, longer one for the deviation
            lag = spans[column]
            window = spans[column + 3]
            returns = (closes / closes.shift(lag) - 1.).fillna(0.)
            deviation = returns.rolling(window=window, center=False).std()
            table[column] = deviation * np.sqrt(window)

        results[symbol] = table[start:]
        stdout.write("\r%d" % position)
        stdout.flush()
    print("VOL DONE")
    return results

def bol_create(highlowclose, periods, start):
    """
    Build the modified Bollinger band table for every symbol.

    For each of the first 32 windows in ``periods[name][0]`` the rolling
    mean and rolling standard deviation of the close are taken (warm-up
    NaNs as 0). The upper band is mean + 2 * deviation. No lower band is
    computed; the stored value is
    (close - mean) / (upper band - mean), with a zero denominator stored
    as 0.

    Returns a dict mapping each symbol to its DataFrame sliced with
    ``[start:]``.
    """
    results = {}
    for position, symbol in enumerate(highlowclose.keys()):
        closes = highlowclose[symbol]['Closes']
        windows = periods[symbol][0]

        table = pd.DataFrame()
        for column in range(32):
            span = windows[column]
            roller_mean = closes.rolling(window=span, center=False).mean()
            roller_std = closes.rolling(window=span, center=False).std()
            centre = roller_mean.fillna(0)
            spread = roller_std.fillna(0)

            # Only the upper band is needed for this variant
            upper_band = centre + spread * 2.
            distance = closes - centre
            band_width = (upper_band - centre).replace(0., np.nan)
            table[column] = (distance / band_width).fillna(0.)

        results[symbol] = table[start:]
        stdout.write("\r%d" % position)
        stdout.flush()
    print("BOL DONE")
    return results

def mom_create(highlowclose, periods, start):
    """
    Build the momentum table for every symbol.

    For each of the first 32 lags in ``periods[name][0]`` the value is the
    change in close over the lag, divided by the earlier close and
    multiplied by 100. Warm-up NaNs are stored as 0.

    Returns a dict mapping each symbol to its DataFrame sliced with
    ``[start:]``.
    """
    results = {}
    for position, symbol in enumerate(highlowclose.keys()):
        closes = highlowclose[symbol]['Closes']
        lags = periods[symbol][0]

        table = pd.DataFrame()
        for column in range(32):
            earlier = closes.shift(lags[column])
            percent_move = ((closes - earlier) / earlier) * 100.
            table[column] = percent_move.fillna(0.)

        results[symbol] = table[start:]
        stdout.write("\r%d" % position)
        stdout.flush()
    print("MOM DONE")
    return results

def sma_create(highlowclose, periods, start):
    """
    Build the simple moving average table for every symbol.

    For each of the first 32 windows in ``periods[name][0]`` the value is
    the rolling mean of the closing price, with warm-up NaNs stored as 0.

    Returns a dict mapping each symbol to its DataFrame sliced with
    ``[start:]``.
    """
    results = {}
    for position, symbol in enumerate(highlowclose.keys()):
        closes = highlowclose[symbol]['Closes']
        windows = periods[symbol][0]

        table = pd.DataFrame()
        for column in range(32):
            roller = closes.rolling(window=windows[column], center=False)
            table[column] = roller.mean().fillna(0.)

        results[symbol] = table[start:]
        stdout.write("\r%d" % position)
        stdout.flush()
    print("SMA DONE")
    return results

def aro_create(highlowclose, periods, start):
    """
    Build the aroon-style indicator table for every symbol.

    For each of the first 32 windows in ``periods[name][0]`` the closing
    prices are rolled over the window and ``aro_apply`` is evaluated on
    every window, receiving the window length as its extra argument.
    Nothing is filled in, so warm-up rows keep whatever the rolling apply
    produces for them.

    Returns a dict mapping each symbol to its DataFrame sliced with
    ``[start:]``.
    """
    results = {}
    for position, symbol in enumerate(highlowclose.keys()):
        closes = highlowclose[symbol]['Closes']
        windows = periods[symbol][0]

        table = pd.DataFrame()
        for column in range(32):
            span = windows[column]
            roller = closes.rolling(window=span, center=False)
            # One call of aro_apply per window
            table[column] = roller.apply(aro_apply, args=(span,))

        results[symbol] = table[start:]
        stdout.write("\r%d" % position)
        stdout.flush()
    print("ARO DONE")
    return results
def aro_apply(df, prd):
    """
    Aroon-style value for one rolling window.

    ``df`` is the window of closing prices and ``prd`` the window length.
    The upper band is (prd - position of the maximum) / prd * 100 and the
    lower band is the same with the position of the minimum; the result
    is upper minus lower.
    """
    # NOTE(review): argmax/argmin are used here as "periods since the
    # extreme". If the window is ordered oldest-first, the position counts
    # from the oldest row instead, which flips the sign relative to the
    # textbook aroon oscillator -- confirm this is intended before changing.
    span = float(prd)
    upper_band = 100. * ((prd - df.argmax()) / span)
    lower_band = 100. * ((prd - df.argmin()) / span)
    return upper_band - lower_band

def mac_create(highlowclose, periods, start):
    """
    Build the two MACD-based indicator tables for every symbol.

    Column ``x`` reads three spans from ``periods[name][2]``: entry ``x``
    (small), entry ``x + 1`` (mid) and entry ``x + 3`` (large). The MACD
    line is the exponentially weighted mean of the close over the large
    span minus the one over the mid span. The second indicator divides
    the MACD line by its own exponentially weighted mean over the small
    span and subtracts 1. No NaN or infinity handling is applied.

    Returns two dicts, each mapping a symbol to a DataFrame sliced with
    ``[start:]``: first the MACD line, then the ratio indicator.
    """
    macd_by_symbol, ratio_by_symbol = {}, {}
    for position, symbol in enumerate(highlowclose.keys()):
        macd_table, ratio_table = pd.DataFrame(), pd.DataFrame()
        closes = highlowclose[symbol]['Closes']
        spans = periods[symbol][2]

        for column in range(32):
            small_span, mid_span, large_span = (spans[column],
                                                spans[column + 1],
                                                spans[column + 3])

            # ewm weights recent rows more heavily than older ones
            mid_avg = closes.ewm(ignore_na=False, span=mid_span,
                                 min_periods=0, adjust=True).mean()
            large_avg = closes.ewm(ignore_na=False, span=large_span,
                                   min_periods=0, adjust=True).mean()
            macd_line = large_avg - mid_avg
            signal = macd_line.ewm(ignore_na=False, span=small_span,
                                   min_periods=0, adjust=True).mean()

            # Both the raw line and its ratio to the signal are kept
            macd_table[column] = macd_line
            ratio_table[column] = (macd_line / signal) - 1.

        macd_by_symbol[symbol] = macd_table[start:]
        ratio_by_symbol[symbol] = ratio_table[start:]
        stdout.write("\r%d" % position)
        stdout.flush()
    print("MAC DONE")
    return macd_by_symbol, ratio_by_symbol
    
def adx_create(highlowclose, periods, start):
    """
    Build the average directional index (ADX) tables for every symbol.

    For each of the first 32 periods in ``periods[name][0]`` this computes a
    true range, the positive and negative directional movements, the
    directional index (DX) and its exponentially weighted mean (ADX).

    Returns a tuple of three dicts:
      * symbol -> ADX DataFrame sliced with ``[start:]``
      * symbol -> DX DataFrame sliced with ``[start:]``
      * (symbol + column number) -> DataFrame holding the true range and the
        filtered plus/minus directional movements (columns 0, 1, 2), kept
        as precalculated values for the real-time calculations.
    """
    tr_df_dict, adx_dict, adx_dict2, count = {}, {}, {}, 0
    for name in highlowclose.keys():
        adx_df, dx_df = pd.DataFrame(), pd.DataFrame()
        
        high, low = highlowclose[name]['Highs'], highlowclose[name]['Lows']
        close     = highlowclose[name]['Closes']
        prds      = periods[name][0]
        
        for x in range(32):
            trpm_df = pd.DataFrame()
            num     = prds[x]
            
            tr_df      = pd.DataFrame()
            # True range: row-wise maximum of the three differences below.
            # NOTE(review): the differences against the shifted close are
            # signed, not absolute values -- confirm this is intended.
            tr_df[0]   = (high - low)
            tr_df[1]   = (high - close.shift(num)).fillna(0.)
            tr_df[2]   = (low  - close.shift(num)).fillna(0.)
            true_range = tr_df.max(axis=1)
            
            # Raw directional movements over the period (warm-up NaNs as 0)
            plus_dm   = (high - high.shift(num)).fillna(0.)
            minus_dm  = (low.shift(num) - low).fillna(0.)
            # Keep a movement only where it is >= the opposite one, else 0.
            # The second comparison sees the plus_dm already filtered on the
            # line above, not the raw values.
            plus_dm   = pd.Series(np.where(plus_dm >= minus_dm, plus_dm, 0.),index=plus_dm.index)
            minus_dm  = pd.Series(np.where(minus_dm >= plus_dm, minus_dm, 0.),index=plus_dm.index)
            # Negative movements are clamped to 0
            plus_dm[plus_dm < 0]   = 0.
            minus_dm[minus_dm < 0] = 0.
        
            # Collect the true range and both directional movements so they
            # can serve as precalculated values for the real-time calculations
            trpm_df[0] = true_range
            trpm_df[1] = plus_dm
            trpm_df[2] = minus_dm
            
            # Dictionary key is the symbol followed by the column number x
            tr_name = name+str(x)
            # Only the rows from position len - num - 1 onwards are stored
            tr_df_dict[tr_name] = trpm_df[len(true_range)-num-1:]
            
            # Exponentially weighted means (span = num) of the true range
            # and of the positive/negative directional movements
            atr    = true_range.ewm(ignore_na=False, span=num, min_periods=0, adjust=True).mean()
            pos_dm = plus_dm.ewm(ignore_na=False, span=num, min_periods=0, adjust=True).mean()
            neg_dm = minus_dm.ewm(ignore_na=False, span=num, min_periods=0, adjust=True).mean()

            # Directional indicators as a percentage of the average true
            # range; positive infinity from a zero divisor is stored as 0
            pos_di  = ((pos_dm / atr) * 100.).replace([np.inf],0.)
            neg_di  = ((neg_dm / atr) * 100.).replace([np.inf],0.)
            # Directional index; only positive infinity is replaced, so a
            # 0 / 0 row stays NaN
            dx_df[x] = (abs(pos_di - neg_di) / (pos_di + neg_di)).replace([np.inf],0.)
            
            # ADX: exponentially weighted mean of the DX, scaled by 100
            adx_df[x] = dx_df[x].ewm(ignore_na=False, span=num, min_periods=0, 
                                     adjust=True).mean() * 100.
        
        adx_dict[name] = adx_df[start:]
        adx_dict2[name] = dx_df[start:]
        # Progress counter, rewritten in place on one line
        stdout.write("\r%d" % count)
        stdout.flush()
        count += 1
    print "ADX DONE"
    return adx_dict, adx_dict2, tr_df_dict

def rsi_create(highlowclose, periods, start):
    """
    Build the relative strength index (RSI) table for every symbol.

    Column ``x`` reads two entries of ``periods[name][2]``: entry ``x`` is
    the lag of the closing-price difference and entry ``x + 3`` is the
    rolling window over which gains and losses are averaged. With
    RS = mean gain / mean loss, the stored value is 100 - 100 / (1 + RS).

    A window that contains no loss has RS = +inf and therefore an RSI of
    100. (Earlier the infinite RS was replaced by 0, which reported such
    an all-gain window as RSI 0, the same as an all-loss window.) Windows
    with neither gains nor losses, and the warm-up rows, stay NaN.

    Parameters:
        highlowclose: dict of symbol -> DataFrame with a 'Closes' column.
        periods: dict of symbol -> sequence whose item 2 holds at least
                 35 period lengths.
        start: slice start applied to every resulting DataFrame.

    Returns a dict mapping each symbol to its RSI DataFrame (columns
    0..31) sliced with ``[start:]``.
    """
    rsi_dict = {}
    for count, name in enumerate(highlowclose.keys()):
        rsi_df = pd.DataFrame()
        close = highlowclose[name]['Closes']
        prds = periods[name][2]

        for x in range(32):
            # Small period for the price change, large one for the averages
            num = prds[x]
            num2 = prds[x + 3]
            deltas = (close - close.shift(num)).fillna(0.)

            # Gains keep the positive changes, losses the negative ones
            up = deltas.clip(lower=0)
            down = deltas.clip(upper=0)

            rolup = up.rolling(window=num2, center=False).mean()
            roldown = down.rolling(window=num2, center=False).mean().abs()

            # RS is left infinite when there are no losses: 100 / (1 + inf)
            # is 0, which gives the correct RSI of 100
            rel_strength = rolup / roldown
            rsi_df[x] = 100. - (100. / (1. + rel_strength))

        rsi_dict[name] = rsi_df[start:]
        stdout.write("\r%d" % count)
        stdout.flush()
    print("RSI DONE")
    return rsi_dict
   
# Marker printed once the indicator functions above have been defined
print "LOADED"

LOADED


In [6]:
def resample_intraday(hlc):
    """
    Align every symbol's intraday rows to the top of each minute.

    Each trading day is resampled on its own rather than resampling the
    whole frame at once, because a single resample would add rows for the
    hours outside the trading session and would silently invent data for
    days that are missing from the source. Resampling per day only spans
    that day's first to last timestamp, so a missing day is not filled in
    unnoticed.

    Parameters:
        hlc: dict of symbol -> DataFrame indexed by timestamps (the index
             must be unnamed, since duplicates are dropped through the
             'index' column produced by ``reset_index``).

    Returns a dict of symbol -> resampled DataFrame. A symbol with no
    trading days in range maps to an empty DataFrame.

    Raises KeyError when a trading day between the first and last
    timestamp has no rows at all.
    """
    new_hlc = {}
    for count, (key, value) in enumerate(hlc.items()):
        # Drop duplicated timestamps, keeping the most recent row
        value = value.reset_index().drop_duplicates(subset='index',
                                                     keep='last').set_index('index')
        # First and last calendar dates covered by this symbol
        index = value.index
        start = datetime.datetime.strptime(str(index[0])[:10], '%Y-%m-%d').date()
        end = datetime.datetime.strptime(str(index[-1])[:10], '%Y-%m-%d').date()
        # Trading days between those two dates
        rs2 = NYSE_tradingdays2(start, end)

        # Collect the per-day frames and concatenate once at the end;
        # appending inside the loop re-copies every earlier row each day.
        resampled_days = []
        for trading_day in rs2[:]:
            # Trading days come with a 00:00:00 time; keep the date part
            date = str(trading_day)[:10]
            day = value.loc[date]
            # Snap to whole minutes: forward fill inside the day, then
            # back fill whatever is still missing at its start
            resampled_days.append(day.resample('T').pad().fillna(method='bfill'))

        if resampled_days:
            new_hlc[key] = pd.concat(resampled_days)
        else:
            new_hlc[key] = pd.DataFrame()
        stdout.write("\r%d" % count)
        stdout.flush()
    return new_hlc

def add_new_data(intra_data):
    """
    Download and prepare the intraday data recorded since the last update.

    The last timestamp of one (arbitrary) symbol in ``intra_data`` is taken
    as the date of the previous update, so all symbols are expected to
    share the same last update date. Data is requested from the trading
    day after that date up to the date returned by ``today_func``. The
    downloaded minute highs/lows are turned into running day highs/lows by
    ``adjust_bonnet_high_lows``, the string timestamps are converted to
    pandas Timestamps and a 'Typical' column, (high + low + close) / 3, is
    added.

    Parameters:
        intra_data: dict of symbol -> historical DataFrame with a
                    timestamp index.

    Returns ``(intra_data, new_adj)``: the untouched historical dict and a
    dict of symbol -> DataFrame holding the new rows.
    """
    # Any symbol will do, as long as all share the same last update date
    comp_key = next(iter(intra_data))
    # Date of the last row already stored
    prev_date = str(intra_data[comp_key].index[-1])[:10]
    start = datetime.datetime.strptime(prev_date, '%Y-%m-%d').date()
    # The end is simply one year later, a convenient date in the future.
    # Feb 29 has no counterpart in the following year, so fall back to
    # Feb 28 instead of failing on an invalid date.
    try:
        end = start.replace(year=start.year + 1)
    except ValueError:
        end = start.replace(year=start.year + 1, day=28)
    rs2 = NYSE_tradingdays2(start, end)
    # First day to download: the trading day after the last update
    from_date = str(rs2[1])[:10]
    today = str(datetime.date.today())

    # Ask for one trading day beyond today, otherwise today's data is
    # not included in the response
    to_date = today_func(today, rs2)
    data, small_lst = retrieve_bonnet_data(from_date, to_date, intra_data)
    # Minute highs/lows -> running day highs/lows
    adj = adjust_bonnet_high_lows(data, small_lst)

    new_adj = {}
    for key, value in adj.items():
        # String timestamps -> pandas Timestamps
        value.index = [pd.Timestamp(stamp) for stamp in value.index]
        value['Typical'] = (value['Highs'] + value['Lows'] + value['Closes']) / 3.
        new_adj[key] = value
    return intra_data, new_adj

def today_func(today, rs2):
    """
    Return the trading day that follows ``today``.

    The data source only returns rows up to the end of the day before the
    requested end date, so the request has to name the next trading day in
    order to include today's rows.

    When ``today`` is not itself a trading day (weekend or holiday) the
    search moves forward one calendar day at a time to the next trading
    day and returns the trading day after that one. Real date arithmetic
    is used, so the search works across month and year boundaries.

    Parameters:
        today: date string in 'YYYY-MM-DD' format.
        rs2: sequence of trading days whose ``str()`` starts with
             'YYYY-MM-DD'.

    Returns the date string of the following trading day, or None (after
    printing "TO_DATE FAIL") when ``rs2`` holds no trading day on or after
    ``today`` or no trading day after the one found.
    """
    trading_days = [str(day)[:10] for day in rs2]
    # First position of every trading day for constant-time lookups
    position = {}
    for idx, day in enumerate(trading_days):
        position.setdefault(day, idx)

    if trading_days:
        # ISO date strings sort chronologically, so max() is the last day
        last_day = max(trading_days)
        current = datetime.datetime.strptime(today, '%Y-%m-%d').date()
        while str(current) <= last_day:
            idx = position.get(str(current))
            if idx is not None:
                if idx + 1 < len(trading_days):
                    return trading_days[idx + 1]
                # Matched the final entry: nothing follows it
                break
            current += datetime.timedelta(days=1)

    print("TO_DATE FAIL")
    return None

def retrieve_bonnet_data(fromd, today, intra):
    """
    Download one-minute quotes from The Bonnot Gang site and parse them.

    A page is requested for every ticker of the built-in list that is a
    key of ``intra``, plus the three indexes DJI, GSPC and IXIC, which are
    always requested and stored under their Yahoo names ('^DJI', ...).
    Tickers that are neither requested nor an index are skipped; before,
    such a ticker reused the URL of the previous ticker (or failed on an
    unbound variable when it was the first one).

    Every row of a page is expected to look like
    ``date time;open;high;low;close;...`` with decimal commas. Rows are
    located by their ``YYYY-MM-DD`` date, so any year is accepted. The
    hour of each timestamp is shifted back by 4.

    Parameters:
        fromd: first date to request, 'YYYY-MM-DD'.
        today: end date to request, 'YYYY-MM-DD'.
        intra: container of the symbols to update (tested with ``in``).

    Returns ``(complete_list, small_lst)``: a list of DataFrames indexed
    by 'Time' with 'Highs', 'Lows' and 'Closes' columns, and the list of
    symbol names in the same order. A page without rows gives an empty
    DataFrame.

    Raises ValueError (after printing the symbol) when a price cannot be
    converted to float.
    """
    import re

    # NOTE THAT THE TICKERS USED TO RETRIEVE HAVE 3 DIFFERENCES FROM THE
    # YAHOO TICKERS: DJI, GSPC, IXIC DON'T INCLUDE THE ^ PRIOR TO THE SYMBOL
    tickers  = ['BPOP','FITB','HBAN','CMCSA','EBAY',
                      'AAPL','AMAT','BRCD','CSCO','GOOG','INTC',
                      'LVLT','MSFT','MU','NVDA','ORCL','QCOM',
                      'SIRI','WIN','YHOO','BHP','BP',
                      'RIO','XOM','GE','F','MO','XRX','GS','JPM',
                      'LYG','MS','RF','USB','WFC','MRK','PFE','LMT',
                      'MGM','AMD','GLW','HPQ','S','T',
                      'USO', 'GLD', 'SPY','DJI', 'GSPC', 'IXIC']
    index_symbols = ('DJI', 'GSPC', 'IXIC')

    pages = {}
    small_lst = []
    for name in tickers:
        page = 'http://www.thebonnotgang.com/quotes/q.php?timeframe=1m&dayFrom='+\
               fromd+'&dayTo='+today+'&symbol='+name
        if name in intra:
            pass
        elif name in index_symbols:
            # Indexes go back to their Yahoo name for the dictionary
            name = '^' + name
        else:
            # Not requested by the caller: nothing to download
            continue

        small_lst.append(name)
        pages[name] = urllib.urlopen(page).read()

    # Reduce each page to the timestamp plus float highs, lows and closes
    row_start = re.compile(r'\d{4}-\d{2}-\d{2}')
    complete_list = []
    for each in small_lst:
        current = pages[each]
        df_list = []

        match = row_start.search(current)
        while match is not None:
            start = match.start()
            date_end = current.find(';', start)
            open_end = current.find(';', date_end + 1)
            high_end = current.find(';', open_end + 1)
            low_end = current.find(';', high_end + 1)
            close_end = current.find(';', low_end + 1)

            date = current[start:date_end]
            try:
                high = float(current[open_end + 1:high_end].replace(',', '.'))
                low = float(current[high_end + 1:low_end].replace(',', '.'))
                close = float(current[low_end + 1:close_end].replace(',', '.'))
            except ValueError:
                # Say which symbol's page could not be parsed
                print(each)
                raise

            # Shift the hour back by 4, keeping two digits
            hour = int(date[11:13]) - 4
            if hour < 10:
                hour = '0' + str(hour)
            else:
                hour = str(hour)
            date = date[0:11] + hour + date[13:]

            df_list.append([date, high, low, close])

            # No separator after the close means this was the last row
            if close_end == -1:
                break
            match = row_start.search(current, close_end)

        df = pd.DataFrame(df_list, columns=['Time','Highs','Lows','Closes'])
        df = df.set_index('Time')
        complete_list.append(df)
    return complete_list, small_lst

def adjust_bonnet_high_lows(adjusted_intra, tickers):
    """
    Turn minute highs/lows into running day highs/lows.

    Within each calendar day every row's high becomes the highest high
    seen so far that day and every low the lowest low seen so far; closes
    are left untouched. A new day starts whenever the 'YYYY-MM-DD' part of
    the timestamp differs from the previous row's. The whole date is
    compared, so consecutive rows that share a day number but lie in
    different months are not merged into one day.

    Parameters:
        adjusted_intra: list of DataFrames with 'Highs', 'Lows' and
                        'Closes' columns, indexed by timestamp strings.
        tickers: symbol names, in the same order as ``adjusted_intra``.

    Returns a dict of symbol -> DataFrame with float 'Highs', 'Lows' and
    'Closes' columns and an index named 'Time'.
    """
    new_adjusted_intra = {}
    for position, df in enumerate(adjusted_intra):
        highs = df['Highs'].values.tolist()
        lows = df['Lows'].values.tolist()
        closes = df['Closes'].values.tolist()
        index = df.index

        for x in range(1, len(highs)):
            if index[x][:10] == index[x - 1][:10]:
                # The previous row already holds the day's running extreme,
                # so one comparison replaces re-scanning the whole day
                highs[x] = max(highs[x - 1], highs[x])
                lows[x] = min(lows[x - 1], lows[x])

        # Built column-wise so the price columns keep a numeric dtype
        adjusted = pd.DataFrame({'Highs': highs, 'Lows': lows, 'Closes': closes},
                                index=pd.Index(index, name='Time'),
                                columns=['Highs', 'Lows', 'Closes'])
        new_adjusted_intra[tickers[position]] = adjusted
    return new_adjusted_intra

# Marker printed once the data retrieval functions above have been defined
print "LOADED"

LOADED


In [7]:
def get_new_data(update=False):
    """
    Load the stored intraday history and fetch the data recorded since.

    The complete high/low/close/typical dictionary is read from
    'Pickles/pickleadjustedintracomplete.pickle'. With ``update=True`` the
    symbols already present in 'Pickles/onlyupdatedintra.pickle' are left
    out, so only the remaining symbols are refreshed. The selected history
    goes to ``add_new_data`` and the new rows it returns are resampled
    with ``resample_intraday``.

    Parameters:
        update: when equal to True, skip symbols that were already updated.

    Returns ``(intra, resampled_adj)``: the historical dict that was
    passed on and the dict of resampled new rows.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The with blocks close the files even when loading fails.
    with open('Pickles/pickleadjustedintracomplete.pickle', 'rb') as handle:
        new_hlc = pickle.load(handle)

    if update == True:
        with open('Pickles/onlyupdatedintra.pickle', 'rb') as handle:
            already_updated = pickle.load(handle)
        # Keep only the symbols that still need updating
        hlc = dict((key, val) for key, val in new_hlc.items()
                   if key not in already_updated)
    else:
        hlc = new_hlc

    intra, adj = add_new_data(hlc)
    resampled_adj = resample_intraday(adj)
    return intra, resampled_adj

def update_hlc(intra, resampled_adj):
    """
    Append the newly resampled data to the historical data, build a shortened
    dictionary for indicator calculations, and work out the look-back periods
    for every symbol.

    Side effects
    ------------
    * Every entry of `intra` is replaced in place by the combined
      (historical + new) frame restricted to the Highs/Lows/Closes/Typical
      columns.
    * The combined dictionary is written to
      'Pickles/pickleadjustedintracomplete.pickle'.

    Parameters
    ----------
    intra : dict
        Symbol -> historical frame containing the four columns listed above.
    resampled_adj : dict
        Symbol -> new frame containing the same four columns. Must hold every
        key found in `intra`.

    Returns
    -------
    tuple
        (intra, highlowclose, prd_dict, start2) where `highlowclose` holds
        each combined frame sliced from the shortened start date onward with
        duplicate index entries removed (last one kept), `prd_dict` is the
        value returned by create_prd_lst2(), and `start2` is the first ten
        characters of the first index entry of one of the new frames.

    Raises
    ------
    IndexError
        If fewer trading days than the largest look-back offset are present
        for a symbol, or if `resampled_adj` is empty.
    """
    hlc_cols = ['Highs', 'Lows', 'Closes', 'Typical']

    # Current day, and the same calendar day two years earlier so there is
    # enough data to calculate new indicator values since the last update.
    end = datetime.date.today()
    try:
        start = end.replace(year=end.year - 2)
    except ValueError:
        # Feb 29 has no counterpart two years back; the former string-based
        # date arithmetic crashed here. Fall back to Feb 28.
        start = end.replace(year=end.year - 2, day=28)

    # Version1 of trading days function is used for period calculation, other
    # used for getting the date 2 years ago to give us a shortened version of
    # our highlowclose dictionary.
    rs    = NYSE_tradingdays()
    rs2   = NYSE_tradingdays2(start, end)
    short = str(rs2[0])[:10]

    # First append our new highs/lows/closes/typicals for each stock to the
    # historical data (pd.concat is what DataFrame.append did internally).
    for key in list(intra.keys()):
        intra[key] = pd.concat([intra[key][hlc_cols],
                                resampled_adj[key][hlc_cols]])

    # Then keep only the data from `short` onward, dropping repeated index
    # entries in favour of the most recently appended row.
    highlowclose = {}
    for key, value in intra.items():
        recent = value[short:].reset_index()
        recent = recent.drop_duplicates(subset='index', keep='last')
        highlowclose[key] = recent.set_index('index')

    # day_offsets is in number of days, while the plnums are in minutes.
    # You need many different period lengths for each indicator.
    day_offsets = [1,  2,  3,  5,  8,  10, 12, 14, 16, 20, 25,  30,  40,  50,  80,  125]
    plnums      = [10, 14, 16, 18, 20, 25, 30, 40, 50, 75, 100, 125, 150, 200, 250, 300]
    plnums2     = [8, 10, 14, 16, 18, 20, 25, 30, 40, 50, 75,  100, 125, 150, 200, 250, 300, 350]
    plnums3     = [6, 8, 10, 14, 16, 18, 20, 25, 30, 40, 50,  75,  100, 125, 150, 200, 250, 300, 350]

    prds_dates = {}
    for key in highlowclose.keys():
        # Collect, walking rs from position 257 down to 111, the days that are
        # actually present in this symbol's frame, since some days are missing
        # from our data source.
        # NOTE(review): presumably rs[257] is the most recent trading day, so
        # available[1] is "one day ago" -- confirm against nyse_dates_prds.
        available = []
        for x in range(257, 110, -1):
            try:
                dt = str(rs[x])[:10]
                # Lookup is done only for its failure mode: it raises when the
                # day is absent from the frame.
                highlowclose[key][dt]
            except Exception:
                # Best-effort skip of missing days; unlike a bare `except:`
                # this no longer hides KeyboardInterrupt/SystemExit.
                continue
            available.append(dt)

        if len(available) <= day_offsets[-1]:
            raise IndexError("only %d trading days available for %s; need "
                             "more than %d" % (len(available), key,
                                               day_offsets[-1]))

        # Get the dates that lie each offset's number of days in the past.
        prds_dates[key] = [available[offset] for offset in day_offsets]

    # Call our function that creates our period dictionary that converts the
    # days into number of minutes and adds that to each of the three plnums.
    prd_dict = create_prd_lst2(highlowclose, prds_dates, plnums, plnums2, plnums3)

    # Get the starting date of our new values from the first new frame.
    comp_key = list(resampled_adj.keys())[0]
    start2   = str(resampled_adj[comp_key].index[0])[:10]

    # Dump our new combined dictionary into a pickle file; the context manager
    # closes the handle even if pickling fails.
    with open('Pickles/pickleadjustedintracomplete.pickle', 'wb') as opp:
        pickle.dump(intra, opp)

    return intra, highlowclose, prd_dict, start2

def update_indicators(highlowclose, prd_dict, start2):
    """
    Run every indicator builder over the shortened high/low/close dictionary.

    Each *_create function receives the same three arguments: the shortened
    dictionary, the periods dictionary, and the date from which the new values
    start. The builders are invoked in a fixed order.

    Returns
    -------
    tuple
        (adx_d2, adx_d3, short_lst): the second and third results of
        adx_create(), followed by a list of the remaining thirteen indicator
        results ordered adx, aro, cci, bol, kdo, mac, mac (second result),
        mom, per, returns, rsi, sma, vol.
    """
    # All builders share one signature, so bundle the arguments once
    shared = (highlowclose, prd_dict, start2)

    returns_vals              = returns_create(*shared)
    per_vals                  = per_create(*shared)
    cci_vals                  = cci_create(*shared)
    vol_vals                  = vol_create(*shared)
    bol_vals                  = bol_create(*shared)
    mom_vals                  = mom_create(*shared)
    sma_vals                  = sma_create(*shared)
    kdo_vals                  = kdo_create(*shared)
    mac_vals, mac_vals2       = mac_create(*shared)
    rsi_vals                  = rsi_create(*shared)
    adx_vals, adx_2nd, adx_3rd = adx_create(*shared)
    aro_vals                  = aro_create(*shared)

    # The second and third adx results are returned separately because they
    # are not stored in the same directory as the other indicators
    return adx_2nd, adx_3rd, [
        adx_vals, aro_vals, cci_vals, bol_vals, kdo_vals, mac_vals, mac_vals2,
        mom_vals, per_vals, returns_vals, rsi_vals, sma_vals, vol_vals,
    ]

# Cell-execution marker: printed once the definitions in this cell have run.
print "LOADED"

LOADED


In [8]:
def update_newbase_files(ind_vals, intra, piece=14):
    """
    Combine the new indicator values of every symbol into one frame and append
    it to that symbol's stored base file.

    For each key of `intra`, the frames found under that key in each
    dictionary of `ind_vals` are joined column-wise (gaps are back-filled,
    then forward-filled), appended to the frame stored in
    'NewBase/<symbol>/<symbol>_df<piece>.pickle', de-duplicated on the index
    (last row wins) and written back to the same file.

    Parameters
    ----------
    ind_vals : list of dict
        Indicator dictionaries in the order adx, aro, cci, bol, kdo, mac,
        mactwo, mom, per, rets, rsi, sma, vol. Each maps symbol -> frame with
        32 columns. The frames' column labels are overwritten in place.
    intra : dict
        Only its keys (the symbols to update) are used.
    piece : int, optional
        Number of the base-file piece to extend. Defaults to 14, the piece
        that was previously hard-coded; raise it once the last piece becomes
        too large and a new one is started.
    """
    nms = ['adx', 'aro', 'cci', 'bol', 'kdo', 'mac', 'mactwo',
           'mom', 'per', 'rets', 'rsi', 'sma', 'vol']
    for name in intra.keys():
        new_df = pd.DataFrame()

        for dic, nm in zip(ind_vals, nms):
            # Rename each column as a combination of the stock symbol, the
            # indicator abbreviation, and the period number
            dic[name].columns = [name + '_' + nm + str(x) for x in range(32)]

            # Combine the indicators together, filling gaps after every join
            new_df = pd.concat([new_df, dic[name]], axis=1).bfill().ffill()

        # Path of the piece of the indicator dataframe that receives the new
        # values; the same path is used for reading and for writing back.
        path = 'NewBase/' + name + '/' + name + '_df' + str(piece) + '.pickle'

        # Context managers close the handle even if (un)pickling fails
        with open(path, 'rb') as opp:
            old = pickle.load(opp)

        # pd.concat is what DataFrame.append did internally
        new_comb = pd.concat([old, new_df])
        # Drop any duplicate rows that mysteriously showed up several times
        # during testing
        new_comb = new_comb.reset_index().drop_duplicates(subset='index',
                                                    keep='last').set_index('index')

        # Dump the new indicator dataframe in place of the old one
        with open(path, 'wb') as opp:
            pickle.dump(new_comb, opp)
    return

def update_adx23short(new_intra, adx_d2, adx_d3):
    """
    Extend the stored adx_d2 dictionary and the shortened highlowclose
    dictionary with new data, and store the freshly calculated adx_d3 values.

    Reads 'NewBase/ADXD/adx_d2.pickle' and 'Pickles/shortpickleintra.pickle',
    then writes 'Pickles/shortpickleintra2.pickle',
    'NewBase/ADXD/adx_d22.pickle' and 'NewBase/ADXD/adx_d32.pickle'.
    NOTE(review): the output names differ from the input names, so the files
    that were read are left untouched -- confirm this is intended.

    Parameters
    ----------
    new_intra : dict
        Symbol (Yahoo naming) -> new data to append to the stored short frame.
    adx_d2 : dict
        Symbol (Yahoo naming) -> new data to append to the stored adx_d2 entry.
    adx_d3 : dict
        Keyed by symbol + period number (0-31); copied as-is under the
        converted symbol name, replacing rather than extending older values.

    The appended objects are assumed to be pandas frames/series (they were
    previously combined with their `.append` method) -- TODO confirm.
    """
    # Google server uses different names for these below symbols compared to yahoo symbols
    ticks  = {'LMT':'NYSE:LMT', 'USO':'NYSEARCA:USO', 'GLD':'NYSEARCA:GLD',
              'SPY':'NYSEARCA:SPY', '^DJI':'INDEXDJX:.DJI', '^GSPC':'INDEXSP:.INX',
              '^IXIC':'INDEXNASDAQ:.IXIC'}

    # Open our old adx_d2 and shortened highlowclose dictionary, for updating.
    # Context managers close each handle even if unpickling fails.
    with open('NewBase/ADXD/adx_d2.pickle', 'rb') as opp:
        d2 = pickle.load(opp)
    with open('Pickles/shortpickleintra.pickle', 'rb') as opp:
        short = pickle.load(opp)

    new_short, new_d2, new_d3 = {}, {}, {}
    for name in new_intra.keys():
        # Shortened hlc dictionary uses Google symbols rather than Yahoo symbols
        # since it is used for our realtime data retrieval; symbols without a
        # mapping keep their name.
        name2 = ticks.get(name, name)

        # Append new data to our old data
        new_short[name2] = pd.concat([short[name2], new_intra[name]])
        new_d2[name2]    = pd.concat([d2[name2], adx_d2[name]])

        # adx_d3 doesn't need to be added to old adx_d3 since it's completely
        # updated, meaning we replace the old one with this
        for w in range(32):
            new_d3[name2+str(w)] = adx_d3[name+str(w)]

    # Dump all our new dictionaries. Each file is opened only right before it
    # is written, so a failure while dumping one dictionary no longer leaves
    # the remaining files truncated to zero bytes.
    outputs = [('Pickles/shortpickleintra2.pickle', new_short),
               ('NewBase/ADXD/adx_d22.pickle', new_d2),
               ('NewBase/ADXD/adx_d32.pickle', new_d3)]
    for path, payload in outputs:
        with open(path, 'wb') as opp:
            pickle.dump(payload, opp)
    return

# Cell-execution marker: printed once the definitions in this cell have run.
print "LOADED"

LOADED


In [None]:
# Driver cell: runs the complete catch-up update, one step per call.
#
# Step 1 - load the stored data and fetch/resample what is new.
# If you have already updated for dividends/splits after missing data and only
# need to update the stocks that were not covered by that run, uncomment the
# first get_new_data call and comment out the second one.
#old_intra, resampled_adj = get_new_data(update=True)
old_intra, resampled_adj = get_new_data()
# Step 2 - pass the two dictionaries returned by get_new_data() to update_hlc().
intra, highlowclose, prd_dict, start2 = update_hlc(old_intra, resampled_adj)
# It returns the new combined dictionary, a shortened dictionary, the period
# dictionary, and the date from which the update starts.
# Step 3 - calculate the indicator values from that date onward.
adx_d2, adx_d3, ind_vals = update_indicators(highlowclose, prd_dict, start2)
# Step 4 - finally store the new indicator values with the two functions below.
update_newbase_files(ind_vals, intra)
update_adx23short(resampled_adj, adx_d2, adx_d3)