In [1]:
from nyse_dates_prds import *
from sys import stdout 
import cPickle as pickle
import pandas as pd
import numpy as np
import os

# Notebook cell marker: confirms the import cell ran to completion
print("LOADED")

LOADED


In [2]:
def returns_create(highlowclose, periods, dmp_method):
    """
    Build the period-return indicator for every company in highlowclose.

    For each look-back length the 'Closes' series is divided by itself shifted
    back by that many rows and 1 is subtracted, i.e. the fractional change over
    the look-back. The look-backs are 1, 2, 3, 4, 5 followed by the entries of
    periods[name][0]; the first 32 of them are used. NaN results (rows with no
    earlier value) and +inf results are stored as 0.

    dmp_method == 0 passes the per-company frames to create_newbase_files with
    first=True; any other value passes them to dump_indicator_data.
    """
    rets_dict = {}
    for count, name in enumerate(highlowclose.keys()):
        closes    = highlowclose[name]['Closes']
        lookbacks = [1, 2, 3, 4, 5] + periods[name][0]

        # One column per look-back, labelled 0..31
        frame = pd.DataFrame()
        for col in range(32):
            change     = (closes / closes.shift(lookbacks[col])) - 1.
            frame[col] = change.fillna(0).replace([np.inf], 0)
        rets_dict[name] = frame

        # Progress: rewrite the same console line with the zero-based index of
        # the company that was just processed
        stdout.write("\r%d" % count)
        stdout.flush()

    # 0 -> write straight into the newbase files; otherwise go through the
    # intermediary per-indicator dump
    if dmp_method != 0:
        dump_indicator_data(rets_dict, 'rets')
    else:
        create_newbase_files(rets_dict, 'rets', highlowclose, first=True)
    print("RETS DONE")

def per_create(highlowclose, periods, dmp_method):
    """
    Build the 'per' indicator for every company in highlowclose.

    For each of the first 32 look-backs in periods[name][0] the value is the
    closing price divided by the change in closing price over that look-back
    (close minus the close that many rows earlier). Rows where that change is
    zero or not yet available are stored as 0.

    dmp_method == 0 passes the per-company frames to create_newbase_files; any
    other value passes them to dump_indicator_data.
    """
    per_dict = {}
    for count, name in enumerate(highlowclose.keys()):
        closes    = highlowclose[name]['Closes']
        lookbacks = periods[name][0]

        frame = pd.DataFrame()
        for col in range(32):
            # A zero change is turned into NaN first so the division never
            # divides by zero; the resulting NaN is then written out as 0
            change     = closes - closes.shift(lookbacks[col])
            frame[col] = (closes / change.replace(0, np.nan)).fillna(0)
        per_dict[name] = frame

        # Progress: zero-based index of the company just processed
        stdout.write("\r%d" % count)
        stdout.flush()

    # 0 -> write straight into the newbase files; otherwise go through the
    # intermediary per-indicator dump
    if dmp_method != 0:
        dump_indicator_data(per_dict, 'per')
    else:
        create_newbase_files(per_dict, 'per', highlowclose)
    print("PER DONE")

def kdo_create(highlowclose, periods, dmp_method):
    """
    Build the stochastic oscillator (%D) indicator for every company.

    For column x two window lengths are read from periods[name][1]: entry x
    (the smoothing window) and entry x+1 (the high/low window). %K is
    100 * (close - rolling lowest low) / (rolling highest high - rolling lowest
    low) over the high/low window, with 0 wherever the range is zero or not yet
    available. %D, the value that is stored, is the rolling mean of %K over the
    smoothing window with NaN written as 0.

    dmp_method == 0 passes the per-company frames to create_newbase_files; any
    other value passes them to dump_indicator_data.
    """
    d_dict = {}
    for count, name in enumerate(highlowclose.keys()):
        bars      = highlowclose[name]
        highs     = bars['Highs']
        lows      = bars['Lows']
        closes    = bars['Closes']
        lookbacks = periods[name][1]

        frame = pd.DataFrame()
        for col in range(32):
            d_window, k_window = lookbacks[col], lookbacks[col + 1]

            # Extremes over the high/low window
            highest = highs.rolling(window=k_window, center=False).max()
            lowest  = lows.rolling(window=k_window, center=False).min()

            # A zero high-low range becomes NaN so %K falls back to 0 there
            above_low = closes - lowest
            spread    = (highest - lowest).replace(0., np.nan)
            k_line    = ((above_low / spread) * 100.).fillna(0.)

            # Only the smoothed %D line is kept
            d_line     = k_line.rolling(window=d_window, center=False).mean()
            frame[col] = d_line.fillna(0.)
        d_dict[name] = frame

        # Progress: zero-based index of the company just processed
        stdout.write("\r%d" % count)
        stdout.flush()

    # 0 -> write straight into the newbase files; otherwise go through the
    # intermediary per-indicator dump
    if dmp_method != 0:
        dump_indicator_data(d_dict, 'kdo')
    else:
        create_newbase_files(d_dict, 'kdo', highlowclose)
    print("KD DONE")

def cci_create(highlowclose, periods, dmp_method):
    """
    Build the commodity channel index style indicator for every company.

    For each of the first 32 windows in periods[name][0] the value is
    (typical - rolling mean of typical) / (0.015 * rolling standard deviation
    of typical), with 0 wherever the denominator is zero or the window is not
    yet full. Note that the rolling standard deviation is used as the
    dispersion measure.

    dmp_method == 0 passes the per-company frames to create_newbase_files; any
    other value passes them to dump_indicator_data.
    """
    # Scaling constant applied to the dispersion term
    constant = 0.015

    cci_dict = {}
    for count, name in enumerate(highlowclose.keys()):
        # 'Typical' is read as-is; presumably (high + low + close) / 3 as
        # prepared upstream - confirm against the code that builds this column
        typical = highlowclose[name]['Typical']
        windows = periods[name][0]

        frame = pd.DataFrame()
        for col in range(32):
            roller = typical.rolling(window=windows[col], center=False)

            deviation = typical - roller.mean()
            # Zero dispersion becomes NaN so the division yields NaN, not inf
            scaled_std = (constant * roller.std()).replace(0., np.nan)

            frame[col] = (deviation / scaled_std).fillna(0.)
        cci_dict[name] = frame

        # Progress: zero-based index of the company just processed
        stdout.write("\r%d" % count)
        stdout.flush()

    # 0 -> write straight into the newbase files; otherwise go through the
    # intermediary per-indicator dump
    if dmp_method != 0:
        dump_indicator_data(cci_dict, 'cci')
    else:
        create_newbase_files(cci_dict, 'cci', highlowclose)
    print("CCI DONE")

def vol_create(highlowclose, periods, dmp_method):
    """
    Build the volatility indicator for every company in highlowclose.

    For column x two lengths are read from periods[name][2]: entry x (the
    return look-back) and entry x+3 (the volatility window). Returns are
    close / close shifted by the look-back, minus 1, with NaN set to 0. The
    stored value is the rolling standard deviation of those returns over the
    volatility window multiplied by the square root of the window length.
    Rows before the window is full stay NaN.

    dmp_method == 0 passes the per-company frames to create_newbase_files; any
    other value passes them to dump_indicator_data.
    """
    vol_dict = {}
    for count, name in enumerate(highlowclose.keys()):
        closes  = highlowclose[name]['Closes']
        lengths = periods[name][2]

        frame = pd.DataFrame()
        for col in range(32):
            ret_shift, vol_window = lengths[col], lengths[col + 3]

            period_rets = (closes / closes.shift(ret_shift) - 1.).fillna(0.)
            rolling_std = period_rets.rolling(window=vol_window, center=False).std()
            frame[col]  = rolling_std * np.sqrt(vol_window)
        vol_dict[name] = frame

        # Progress: zero-based index of the company just processed
        stdout.write("\r%d" % count)
        stdout.flush()

    # 0 -> write straight into the newbase files; otherwise go through the
    # intermediary per-indicator dump
    if dmp_method != 0:
        dump_indicator_data(vol_dict, 'vol')
    else:
        create_newbase_files(vol_dict, 'vol', highlowclose)
    print("VOL DONE")

def bol_create(highlowclose, periods, dmp_method):
    """
    Build the modified Bollinger band indicator for every company.

    For each of the first 32 windows in periods[name][0] the rolling mean and
    rolling standard deviation of the closes are taken (NaN set to 0), the
    upper band is mean + 2 * std, and the stored value is
    (close - mean) / (upper band - mean). Wherever that denominator is zero
    the value is 0.

    dmp_method == 0 passes the per-company frames to create_newbase_files; any
    other value passes them to dump_indicator_data.
    """
    bol_dict = {}
    for count, name in enumerate(highlowclose.keys()):
        closes  = highlowclose[name]['Closes']
        windows = periods[name][0]

        frame = pd.DataFrame()
        for col in range(32):
            roller    = closes.rolling(window=windows[col], center=False)
            roll_mean = roller.mean().fillna(0)
            roll_std  = roller.std().fillna(0)

            # Only the upper band is needed for this variant of the indicator
            upper_band = roll_mean + roll_std * 2.
            distance   = closes - roll_mean
            band_width = (upper_band - roll_mean).replace(0., np.nan)

            frame[col] = (distance / band_width).fillna(0.)
        bol_dict[name] = frame

        # Progress: zero-based index of the company just processed
        stdout.write("\r%d" % count)
        stdout.flush()

    # 0 -> write straight into the newbase files; otherwise go through the
    # intermediary per-indicator dump
    if dmp_method != 0:
        dump_indicator_data(bol_dict, 'bol')
    else:
        create_newbase_files(bol_dict, 'bol', highlowclose)
    print("BOL DONE")

def mom_create(highlowclose, periods, dmp_method):
    """
    Build the momentum indicator for every company in highlowclose.

    For each of the first 32 look-backs in periods[name][0] the value is the
    percentage change of the close versus the close that many rows earlier:
    (close - earlier close) / earlier close * 100, with NaN stored as 0.

    dmp_method == 0 passes the per-company frames to create_newbase_files; any
    other value passes them to dump_indicator_data.
    """
    mom_dict = {}
    for count, name in enumerate(highlowclose.keys()):
        closes    = highlowclose[name]['Closes']
        lookbacks = periods[name][0]

        frame = pd.DataFrame()
        for col in range(32):
            earlier    = closes.shift(lookbacks[col])
            pct_change = ((closes - earlier) / earlier) * 100.
            frame[col] = pct_change.fillna(0.)
        mom_dict[name] = frame

        # Progress: zero-based index of the company just processed
        stdout.write("\r%d" % count)
        stdout.flush()

    # 0 -> write straight into the newbase files; otherwise go through the
    # intermediary per-indicator dump
    if dmp_method != 0:
        dump_indicator_data(mom_dict, 'mom')
    else:
        create_newbase_files(mom_dict, 'mom', highlowclose)
    print("MOM DONE")

def sma_create(highlowclose, periods, dmp_method):
    """
    Build the simple moving average indicator for every company.

    For each of the first 32 windows in periods[name][0] the value is the
    rolling mean of the closes over that window; rows before the window is
    full are stored as 0.

    dmp_method == 0 passes the per-company frames to create_newbase_files; any
    other value passes them to dump_indicator_data.
    """
    sma_dict = {}
    for count, name in enumerate(highlowclose.keys()):
        closes  = highlowclose[name]['Closes']
        windows = periods[name][0]

        frame = pd.DataFrame()
        for col in range(32):
            average    = closes.rolling(window=windows[col], center=False).mean()
            frame[col] = average.fillna(0.)
        sma_dict[name] = frame

        # Progress: zero-based index of the company just processed
        stdout.write("\r%d" % count)
        stdout.flush()

    # 0 -> write straight into the newbase files; otherwise go through the
    # intermediary per-indicator dump
    if dmp_method != 0:
        dump_indicator_data(sma_dict, 'sma')
    else:
        create_newbase_files(sma_dict, 'sma', highlowclose)
    print("SMA DONE")

def aro_create(highlowclose, periods, dmp_method):
    """
    Build the aroon indicator for every company in highlowclose.

    For each of the first 32 windows in periods[name][0], aro_apply is run
    over every rolling window of closes of that length (the window length is
    also passed to it as an extra argument) and its result is stored. Rows
    before the window is full are left as produced by the rolling apply.

    dmp_method == 0 passes the per-company frames to create_newbase_files; any
    other value passes them to dump_indicator_data.
    """
    aro_dict = {}
    for count, name in enumerate(highlowclose.keys()):
        closes  = highlowclose[name]['Closes']
        windows = periods[name][0]

        frame = pd.DataFrame()
        for col in range(32):
            length     = windows[col]
            roller     = closes.rolling(window=length, center=False)
            frame[col] = roller.apply(aro_apply, args=(length,))
        aro_dict[name] = frame

        # Progress: zero-based index of the company just processed
        stdout.write("\r%d" % count)
        stdout.flush()

    # 0 -> write straight into the newbase files; otherwise go through the
    # intermediary per-indicator dump
    if dmp_method != 0:
        dump_indicator_data(aro_dict, 'aro')
    else:
        create_newbase_files(aro_dict, 'aro', highlowclose)
    print("ARO DONE")
def aro_apply(df, prd):
    """
    Aroon oscillator (aroon up minus aroon down) for one rolling window.

    df  : the values of one window, oldest first (an array or a Series; it is
          converted with np.asarray so both work).
    prd : the period length used as the denominator.

    Aroon up is ((prd - periods since the window high) / prd) * 100 and aroon
    down is the same for the window low. argmax/argmin return the position
    counted from the OLDEST value, so the number of periods since the extreme
    is (last position - argmax/argmin). Using the raw position instead, as
    this function previously did, flips the sign of the oscillator.
    When an extreme occurs more than once the earliest occurrence is used.
    """
    window = np.asarray(df)
    newest = len(window) - 1

    # Age of each extreme, 0 meaning the extreme is the most recent value
    since_high = newest - window.argmax()
    since_low  = newest - window.argmin()

    up   = ((prd - since_high) / float(prd)) * 100.
    down = ((prd - since_low) / float(prd)) * 100.
    return up - down

def mac_create(highlowclose, periods, dmp_method):
    """
    Build the two MACD based indicators for every company in highlowclose.

    For column x three spans are read from periods[name][2]: entries x (small),
    x+1 (medium) and x+3 (large). The first indicator is the large-span
    exponentially weighted mean of the closes minus the medium-span one. The
    second is that difference divided by its own small-span exponentially
    weighted mean, minus 1. No NaN or inf clean-up is applied to either.

    Both per-company dictionaries are passed together, as a two-element list
    under the name 'mactwo', to create_newbase_files when dmp_method == 0 and
    to dump_indicator_data otherwise.
    """
    mac_dict, mac_dict2 = {}, {}
    for count, name in enumerate(highlowclose.keys()):
        closes = highlowclose[name]['Closes']
        spans  = periods[name][2]

        line_frame, ratio_frame = pd.DataFrame(), pd.DataFrame()
        for col in range(32):
            small_span, mid_span, large_span = spans[col], spans[col + 1], spans[col + 3]

            # Exponentially weighted means give recent values more weight
            mid_avg   = closes.ewm(ignore_na=False, span=mid_span,
                                   min_periods=0, adjust=True).mean()
            large_avg = closes.ewm(ignore_na=False, span=large_span,
                                   min_periods=0, adjust=True).mean()
            macd_line = large_avg - mid_avg

            # Signal line: the macd line smoothed with the small span
            signal = macd_line.ewm(ignore_na=False, span=small_span,
                                   min_periods=0, adjust=True).mean()

            line_frame[col]  = macd_line
            ratio_frame[col] = (macd_line / signal) - 1.

        mac_dict[name]  = line_frame
        mac_dict2[name] = ratio_frame

        # Progress: zero-based index of the company just processed
        stdout.write("\r%d" % count)
        stdout.flush()

    # 0 -> write straight into the newbase files; otherwise go through the
    # intermediary per-indicator dump
    if dmp_method != 0:
        dump_indicator_data([mac_dict, mac_dict2], 'mactwo')
    else:
        create_newbase_files([mac_dict, mac_dict2], 'mactwo', highlowclose)
    print("MAC DONE")
    
def adx_create(highlowclose, periods, short, dmp_method):
    """
    Build the average directional index (ADX) indicator for every company and
    persist two helper dictionaries used for precalculation elsewhere.

    highlowclose : dict of per-company frames with 'Highs', 'Lows', 'Closes'.
    periods      : dict whose periods[name][0] supplies the 32 period lengths.
    short        : start label used to trim the stored directional-index frame
                   (dx_df[short:]).
    dmp_method   : 0 passes the ADX frames to create_newbase_files, any other
                   value passes them to dump_indicator_data.

    Side effects: creates 'NewBase/ADXD/' if needed and writes
    'adx_d2.pickle' (trimmed directional index per company) and
    'adx_d3.pickle' (true range / +DM / -DM tails per company and period)
    into it. The keys of both pickles use the alternate ticker from tickers2
    where one exists.

    NOTE(review): update_adx23short in this file reads 'NewBase/ADXD3/...'
    while this function writes 'NewBase/ADXD/...' - confirm which is intended.
    """
    # Tickers whose symbol differs between data sources: key is the symbol
    # used in highlowclose, value is the symbol used for the stored pickles
    tickers2      = {'USO':'NYSEARCA:USO','GLD':'NYSEARCA:GLD',
                     'SPY':'NYSEARCA:SPY','^DJI':'INDEXDJX:.DJI',
                     '^GSPC':'INDEXSP:.INX','^IXIC':'INDEXNASDAQ:.IXIC',
                     'LMT':'NYSE:LMT'}

    tr_df_dict, adx_dict, adx_dict2, count = {}, {}, {}, 0
    for name in highlowclose.keys():
        name2 = tickers2.get(name, name)
        adx_df, dx_df = pd.DataFrame(), pd.DataFrame()

        high   = highlowclose[name]['Highs']
        low    = highlowclose[name]['Lows']
        close  = highlowclose[name]['Closes']
        prds   = periods[name][0]

        for x in range(32):
            trpm_df = pd.DataFrame()
            num     = prds[x]

            # True range: row-wise maximum of high-low, high minus the close
            # num rows earlier, and low minus the close num rows earlier.
            # NOTE(review): the textbook true range takes absolute values of
            # the two close-based differences; as written the third column can
            # only win when low > high. Left unchanged because these values
            # are persisted for other code - confirm before altering.
            tr_df      = pd.DataFrame()
            tr_df[0]   = (high - low)
            tr_df[1]   = (high - close.shift(num)).fillna(0.)
            tr_df[2]   = (low  - close.shift(num)).fillna(0.)
            true_range = tr_df.max(axis=1)

            # Raw directional movements over num rows
            plus_dm   = (high - high.shift(num)).fillna(0.)
            minus_dm  = (low.shift(num) - low).fillna(0.)

            # Keep +DM only where it is >= -DM (else 0), then keep -DM only
            # where it is >= the already filtered +DM (else 0)
            plus_dm   = pd.Series(np.where(plus_dm >= minus_dm, plus_dm, 0.),index=plus_dm.index)
            minus_dm  = pd.Series(np.where(minus_dm >= plus_dm, minus_dm, 0.),index=plus_dm.index)

            # Negative movements count as no movement
            plus_dm[plus_dm < 0]   = 0.
            minus_dm[minus_dm < 0] = 0.

            # Columns 0/1/2 = true range / +DM / -DM, stored for later
            # precalculation
            trpm_df[0] = true_range
            trpm_df[1] = plus_dm
            trpm_df[2] = minus_dm

            # Key is the (alternate) ticker followed by the column number;
            # only the last num+1 rows are kept
            tr_name = name2+str(x)
            tr_df_dict[tr_name] = trpm_df[len(true_range)-num-1:]

            # Exponentially weighted means of the true range and of both
            # directional movements
            atr    = true_range.ewm(ignore_na=False, span=num, min_periods=0, adjust=True).mean()
            pos_dm = plus_dm.ewm(ignore_na=False, span=num, min_periods=0, adjust=True).mean()
            neg_dm = minus_dm.ewm(ignore_na=False, span=num, min_periods=0, adjust=True).mean()

            # Directional indicators; a division by zero (inf) is stored as 0
            pos_di  = ((pos_dm / atr) * 100.).replace([np.inf,-np.inf],0.)
            neg_di  = ((neg_dm / atr) * 100.).replace([np.inf,-np.inf],0.)

            # Directional index; inf is stored as 0
            dx_df[x] = (abs(pos_di - neg_di) / (pos_di + neg_di)).replace([np.inf,-np.inf],0.)

            # ADX: exponentially weighted mean of the directional index, in percent
            adx_df[x] = dx_df[x].ewm(ignore_na=False, span=num, min_periods=0,
                                     adjust=True).mean() * 100.

        adx_dict[name] = adx_df
        adx_dict2[name2] = dx_df[short:]

        # Progress: zero-based index of the company just processed
        stdout.write("\r%d" % count)
        stdout.flush()
        count += 1

    base = 'NewBase/ADXD/'
    if not os.path.exists(base):
        os.makedirs(base)

    # Context managers guarantee the handles are closed even if a dump raises
    with open('NewBase/ADXD/adx_d2.pickle','wb') as opp:
        pickle.dump(adx_dict2, opp)
    with open('NewBase/ADXD/adx_d3.pickle','wb') as opp2:
        pickle.dump(tr_df_dict, opp2)

    # If dmp_method is 0, we update the newbase files directly, else we do our
    # intermediary step that is better for time, but uses more space temporarily
    if dmp_method == 0:
        create_newbase_files(adx_dict, 'adx', highlowclose)
    else:
        dump_indicator_data(adx_dict, 'adx')
    print("ADX DONE")
    return

def rsi_create(highlowclose, periods, dmp_method):
    """
    Build the relative strength index (RSI) indicator for every company.

    For column x two lengths are read from periods[name][2]: entry x (the
    look-back for the price change) and entry x+3 (the averaging window).
    Price changes are split into gains and losses, both are averaged with a
    rolling mean over the averaging window, and
    RSI = 100 - 100 / (1 + average gain / average loss).

    A window with gains and no losses gives an infinite ratio and therefore
    RSI 100. (The ratio used to be replaced by 0 in that case, which reported
    RSI 0 for the strongest possible up-move.) A window with neither gains nor
    losses, and rows before the averaging window is full, remain NaN.

    dmp_method == 0 passes the per-company frames to create_newbase_files; any
    other value passes them to dump_indicator_data.
    """
    rsi_dict, count = {}, 0
    for name in highlowclose.keys():
        rsi_df = pd.DataFrame()

        close   = highlowclose[name]['Closes']
        prds    = periods[name][2]

        for x in range(32):
            # Small length for the price change, large one for the averaging
            num  = prds[x]
            num2 = prds[x+3]

            # Price change over the small length
            deltas  = (close - close.shift(num)).fillna(0.)

            # Gains keep the positive changes, losses keep the negative ones
            up, down = deltas.copy(), deltas.copy()
            up[up < 0]     = 0
            down[down > 0] = 0

            # Average gain and average loss (loss as a positive number)
            rolup_df   =  up.rolling(window=num2, center=False).mean()
            roldown_df = (down.rolling(window=num2, center=False).mean()).abs()

            # Relative strength. +inf (no losses in the window) is kept on
            # purpose: 100 / (1 + inf) is 0, so the RSI below becomes 100.
            rol_updown = rolup_df / roldown_df
            rsi_df[x] = (100. - (100. / (1. + rol_updown))).replace([np.inf,-np.inf],0.)

        rsi_dict[name]  = rsi_df

        # Progress: zero-based index of the company just processed
        stdout.write("\r%d" % count)
        stdout.flush()
        count += 1

    # If dmp_method is 0, we update the newbase files directly, else we do our
    # intermediary step that is better for time, but uses more space temporarily
    if dmp_method == 0:
        create_newbase_files(rsi_dict, 'rsi', highlowclose)
    else:
        dump_indicator_data(rsi_dict, 'rsi')
    print("RSI DONE")
    return

# Notebook cell marker: confirms the indicator functions were defined
print("LOADED")

LOADED


In [4]:
def update_indicators(highlowclose, prd_dict, short, dmp_method, update=False):
    """
    Run every indicator builder in turn and, if requested, combine the results.

    highlowclose : dict of per-company frames (highs/lows/closes/typicals).
    prd_dict     : dict holding the period lengths for each company.
    short        : forwarded to adx_create only, which uses it to trim the
                   helper dictionary it stores.
    dmp_method   : forwarded to every builder. 0 makes each builder add its
                   indicator straight to the newbase files; 1 makes them write
                   intermediary per-indicator data which is combined here
                   afterwards via combined_company_indicators. Per the
                   original author the intermediary route trades temporary
                   disk space for much lower memory use.
    update       : only consulted when dmp_method is 1. When it equals False
                   combined_company_indicators is called without arguments,
                   otherwise it is given highlowclose (the shortened
                   dictionary used when only updating).
    """
    shared_args = (highlowclose, prd_dict, dmp_method)

    # Builders that share one signature, in the order they must run
    # (returns_create goes first: it creates the newbase files)
    for builder in (returns_create, per_create, cci_create, vol_create,
                    bol_create, mom_create, sma_create, kdo_create,
                    mac_create, rsi_create):
        builder(*shared_args)

    # adx needs the extra `short` argument; aroon runs last
    adx_create(highlowclose, prd_dict, short, dmp_method)
    aro_create(*shared_args)

    # Only the intermediary route needs the final combine step
    if dmp_method != 1:
        return
    if update == False:
        combined_company_indicators()
    else:
        combined_company_indicators(highlowclose)

# Notebook cell marker: confirms update_indicators was defined
print("LOADED")

LOADED


In [10]:
def create_newbase_files(dic, nm, intra, first=False):
    """
    Add one indicator to every company's newbase files.

    Each company's data is kept as 15 pickled dataframes (parts 0-14) under
    'NewBase/<company>/<company>_df<part>.pickle', split on fixed date ranges
    so only one part has to be in memory at a time.

    dic   : dict company -> indicator frame with 32 columns, or, when nm is
            'mactwo', a two-element list of such dicts. The frames' columns
            are renamed IN PLACE to '<company>_<nm><0..31>' (for 'mactwo' the
            first dict's columns become '<company>_mac<0..31>').
    nm    : indicator name used in the column names.
    intra : dict whose keys are the companies to process.
    first : when True the part files are created (and the directory if
            missing); otherwise each existing part file is loaded, the new
            columns are joined on, gaps are back- then forward-filled, and
            the file is rewritten.
    """
    # (start, end) labels of the 15 parts, both inclusive; None leaves the
    # last part open ended.
    # NOTE(review): parts 7 and 8 leave 2014-10-10..2014-10-15 uncovered, so
    # rows in that range are dropped - confirm this is intentional.
    part_bounds = [
        ('2013-01-08', '2013-04-08'),
        ('2013-04-09', '2013-07-08'),
        ('2013-07-09', '2013-10-08'),
        ('2013-10-09', '2014-01-08'),
        ('2014-01-09', '2014-04-10'),
        ('2014-04-11', '2014-07-10'),
        ('2014-07-11', '2014-10-09'),
        ('2014-10-16', '2015-01-09'),
        ('2015-01-12', '2015-04-13'),
        ('2015-04-14', '2015-07-13'),
        ('2015-07-14', '2015-10-12'),
        ('2015-10-13', '2016-01-12'),
        ('2016-01-13', '2016-04-21'),
        ('2016-04-22', '2016-07-08'),
        ('2016-07-11', None),
    ]

    # Each name is a separate company
    for name in intra.keys():
        if nm != 'mactwo':
            # Single indicator: rename the numbered columns to named ones
            dic[name].columns = [name + '_' + nm + str(x) for x in range(32)]
            new_df = dic[name]
        else:
            # The mac indicator comes as two frames that are stored side by side
            dic[0][name].columns = [name + '_mac' + str(x) for x in range(32)]
            dic[1][name].columns = [name + '_' + nm + str(x) for x in range(32)]
            new_df = pd.concat([dic[0][name], dic[1][name]], axis=1).bfill().ffill()

        # Cut the indicator frame into its 15 date-range parts
        parts = [new_df[start:end] for start, end in part_bounds]

        base = 'NewBase/' + name + '/'

        if first == True:
            # First indicator: create the directory and the part files
            if not os.path.exists(base):
                os.makedirs(base)
            for y, part in enumerate(parts):
                with open(base + name + '_df' + str(y) + '.pickle', 'wb') as opp:
                    pickle.dump(part, opp)
        else:
            # Later indicators: join the new columns onto each stored part.
            # `with` closes the handle even if loading or dumping raises.
            for y, part in enumerate(parts):
                path = base + name + '_df' + str(y) + '.pickle'
                with open(path, 'rb') as opp:
                    old = pickle.load(opp)

                comb = pd.concat([old, part], axis=1).bfill().ffill()

                with open(path, 'wb') as opp:
                    pickle.dump(comb, opp)
    return

def update_adx23short(resampled_adj, adx_d2, adx_d3):
    """
    Bring the stored adx helper dictionaries and the shortened highlowclose
    dictionary up to date with newly computed rows.

    resampled_adj : dict company -> new high/low/close/typical rows.
    adx_d2        : dict company -> new rows for the stored adx_d2 frames.
    adx_d3        : dict '<company><0..31>' -> replacement adx_d3 entries.

    Reads 'NewBase/ADXD3/adx_d2.pickle' and 'Pickles/shortpickleintra.pickle',
    appends the new rows to the stored frames, and rewrites both files plus
    'NewBase/ADXD3/adx_d3.pickle'. The stored dictionaries are keyed by the
    alternate ticker from `ticks` where one exists, the inputs by the plain
    ticker. Only the companies present in resampled_adj end up in the
    rewritten files; adx_d3 entries are replaced, not appended to.

    NOTE(review): adx_create in this file writes its pickles under
    'NewBase/ADXD/' while this function uses 'NewBase/ADXD3/' - confirm the
    two are meant to differ.
    """
    # Tickers whose symbol differs between the two data sources: key is the
    # symbol used by the inputs, value the symbol used in the stored pickles
    ticks   = {'LMT':'NYSE:LMT', 'USO':'NYSEARCA:USO', 'GLD':'NYSEARCA:GLD',
               'SPY':'NYSEARCA:SPY', '^DJI':'INDEXDJX:.DJI',
               '^GSPC':'INDEXSP:.INX', '^IXIC':'INDEXNASDAQ:.IXIC'}

    # Load the stored adx dictionary and the shortened highlowclose dictionary.
    # `with` closes each handle even if unpickling raises.
    with open('NewBase/ADXD3/adx_d2.pickle','rb') as opp:
        d2 = pickle.load(opp)
    with open('Pickles/shortpickleintra.pickle','rb') as opp2:
        short = pickle.load(opp2)

    new_short, new_d2, new_d3 = {}, {}, {}
    for name in resampled_adj.keys():
        name2 = ticks.get(name, name)

        # Stored rows first, new rows after them (pd.concat is the
        # replacement for the DataFrame.append call used previously)
        new_short[name2] = pd.concat([short[name2], resampled_adj[name]])
        new_d2[name2]    = pd.concat([d2[name2], adx_d2[name]])

        # Re-key the adx_d3 entries to the stored ticker naming
        for w in range(32):
            new_d3[name2+str(w)] = adx_d3[name+str(w)]

    # Write everything back to its file
    with open('Pickles/shortpickleintra.pickle','wb') as opp:
        pickle.dump(new_short, opp)
    with open('NewBase/ADXD3/adx_d2.pickle','wb') as opp2:
        pickle.dump(new_d2, opp2)
    with open('NewBase/ADXD3/adx_d3.pickle','wb') as opp3:
        pickle.dump(new_d3, opp3)
    return

def update_for_spldivs(dmp_method, update=False):
    """
    Recalculate the indicator values from the adjusted high/low/close/typical
    data, for example after a dividend or stock split.

    The pickled intraday dictionary is loaded, each company's dataframe is
    reduced to the Highs/Lows/Closes/Typical columns with duplicate index
    values removed, a list of period dates is built per company, and the
    result is handed to create_prd_lst2() and then update_indicators().

    dmp_method -- passed straight through to update_indicators().
    update     -- when false (the default) the complete dictionary in
                  'Pickles/pickleadjustedintracomplete.pickle' is loaded. When
                  true, only the partial dictionary in
                  'Pickles/onlyupdateintra.pickle' (the companies needing the
                  update) is loaded. Also passed through to update_indicators().

    Raises IndexError when a company has too few of the candidate trading
    dates available to fill every day offset.
    """
    # If we are just updating one or more companies for a split/dividend, then we
    # only use the partial highlowclose dictionary containing only the companies
    # needing the update, rather than updating every company.
    if update:
        path = 'Pickles/onlyupdateintra.pickle'
    else:
        path = 'Pickles/pickleadjustedintracomplete.pickle'
    # The with-block closes the file even if unpickling fails.
    with open(path, 'rb') as opp:
        intra = pickle.load(opp)

    # Trading dates from our stock dating function. The first entry, cut down to
    # its first 10 characters, is what update_indicators() receives as `short`.
    rs       = NYSE_tradingdays()
    short    = str(rs[0])[:10]
    hlc_cols = ['Highs','Lows','Closes','Typical']

    # The day offsets are in days, ie. 1day, 2day, etc.. They are used as positions
    # into each company's list of available dates below.
    day_offsets = [1,  2,  3,  5,  8,  10, 12, 14, 16, 20, 25,  30,  40,  50,  80,  125]
    # The 3 plnums lists are in stock minutes, ie. 10min, 14min, etc..
    plnums   = [10, 14, 16, 18, 20, 25, 30, 40, 50, 75, 100, 125, 150, 200, 250, 300]
    plnums2  = [8,  10, 14, 16, 18, 20, 25, 30, 40, 50, 75,  100, 125, 150, 200, 250, 300, 350]
    plnums3  = [6, 8, 10, 14, 16, 18, 20, 25, 30, 40, 50,  75,  100, 125, 150, 200, 250, 300, 350]
    needed   = max(day_offsets) + 1

    prds_dates = {}
    for key in intra.keys():
        # Make sure there are no duplicate index values and order the columns in hlc_cols order
        intra[key] = intra[key][hlc_cols].reset_index().drop_duplicates(
                                subset='index',keep='last').set_index('index')
        frame = intra[key]

        # Walk rs[257] down to rs[111] and keep only the dates this company actually
        # has, since some dates are missing for some companies. Skipping the missing
        # ones keeps "N days back" accurate for every company.
        tl = []
        for x in range(257, 110, -1):
            try:
                dt = str(rs[x])[:10]
                # NOTE(review): presumably a row lookup by date string on a datetime
                # index, used purely as an existence check -- confirm against the
                # pandas version in use.
                frame[dt]
            except Exception:
                # Date not available for this company (or rs is shorter than
                # expected): skip it. Unlike a bare except, Ctrl-C still works.
                continue
            tl.append(dt)

        if len(tl) < needed:
            raise IndexError("%s has only %d of the %d trading dates needed to "
                             "build its period list" % (key, len(tl), needed))
        prds_dates[key] = [tl[offset] for offset in day_offsets]

    # Pass all of this to our period creation function which calculates all of our period lengths
    # for each company.
    prd_dict = create_prd_lst2(intra, prds_dates, plnums, plnums2, plnums3)
    # Then update all of our indicators
    update_indicators(intra, prd_dict, short, dmp_method, update)
    return

def _dump_company_indicator(key, indic, frame):
    """
    Rename the 32 columns of frame to '<key>_<indic><n>' and pickle its rows
    from 2013-01-08 onward to 'Companies/<key>/<key>_<indic>.pickle'.
    """
    # Make sure our companies directory exists, creating it if not
    base = 'Companies/'+key+'/'
    if not os.path.exists(base):
        os.makedirs(base)

    # Rename our columns from numbers to names. This renames the caller's
    # dataframe in place, which is what the function has always done.
    frame.columns = [key+'_'+indic+str(x) for x in range(32)]

    # The with-block guarantees the file is closed even if the dump fails.
    with open(base+key+'_'+indic+'.pickle', 'wb') as opp:
        pickle.dump(frame['2013-01-08':], opp)


def dump_indicator_data(dic, name):
    """
    Dump one indicator for every company into the Companies/ directory.

    Each company gets its own directory, 'Companies/<company>/', created on
    demand, holding one pickle file per indicator named
    '<company>_<indicator>.pickle'. Before dumping, the 32 columns of each
    dataframe are renamed to '<company>_<indicator><n>' with n running from
    0 to 31, and only the rows from 2013-01-08 onward are stored (the author
    notes that this is when data from every company begins).

    dic  -- for every indicator except 'mactwo', a dictionary mapping the
            company name to its 32-column indicator dataframe. For 'mactwo'
            (Moving Average Convergence Divergence, which produces two
            indicators) a pair of such dictionaries: dic[0] is dumped under
            the 'mac' abbreviation and dic[1] under 'mactwo', with the
            company names taken from dic[0].
    name -- the indicator abbreviation, like 'rsi' for relative strength
            index.

    All of these files are later combined to create the company indicator
    dataframes that are used for predictions.
    """
    if name != 'mactwo':
        for key in dic:
            _dump_company_indicator(key, name, dic[key])
    else:
        for key in dic[0].keys():
            # Look both dataframes up first so a company missing from the second
            # dictionary fails before anything is written for it.
            val, val2 = dic[0][key], dic[1][key]
            _dump_company_indicator(key, 'mac', val)
            _dump_company_indicator(key, name, val2)
    return

def combined_company_indicators(highlowclose=None):
    """
    Combine each company's indicator files into one dataframe and store it
    in 15 date-ranged pieces.

    For every company, the 13 indicator pickles in 'Companies/<company>/'
    are loaded and joined side by side, and gaps are filled backward and
    then forward. The combined dataframe is then cut into 15 consecutive
    date ranges so that later steps only have to hold one piece in memory
    at a time. The pieces are dumped to
    'NewBase/<company>/<company>_df<n>.pickle' with n running from 0 to 14;
    the directory is created if it does not exist.

    highlowclose -- optional dictionary whose keys are the companies to
                    process. When it is None (the default) the full list
                    of project tickers below is used instead.
    """
    # The stocks used for my project
    ticks   = ['BPOP','FITB','HBAN','CMCSA','EBAY',
               'AAPL','AMAT','BRCD','CSCO','GOOG','INTC',
               'LVLT','MSFT','MU','NVDA','ORCL','QCOM',
               'SIRI','WIN','YHOO','BHP','BP',
               'RIO','XOM','GE','F','MO','XRX','GS','JPM',
               'LYG','MS','RF','USB','WFC','MRK','PFE','LMT',
               'MGM','AMD','GLW','HPQ','S','T',
               '^GSPC','^IXIC','^DJI','GLD','USO','SPY']
    # The indicator abreviations
    lst = ['rsi', 'vol', 'sma', 'cci', 'per', 'mom', 'bol',
           'aro', 'mac', 'mactwo', 'adx', 'kdo', 'rets']

    # Inclusive (start, stop) bounds of the 15 pieces; None leaves the last one open.
    # NOTE(review): 2014-10-10 through 2014-10-15 fall between the 7th and 8th
    # ranges, so those rows are not written to any piece -- confirm that this
    # gap is intentional.
    bounds = [('2013-01-08', '2013-04-08'), ('2013-04-09', '2013-07-08'),
              ('2013-07-09', '2013-10-08'), ('2013-10-09', '2014-01-08'),
              ('2014-01-09', '2014-04-10'), ('2014-04-11', '2014-07-10'),
              ('2014-07-11', '2014-10-09'), ('2014-10-16', '2015-01-09'),
              ('2015-01-12', '2015-04-13'), ('2015-04-14', '2015-07-13'),
              ('2015-07-14', '2015-10-12'), ('2015-10-13', '2016-01-12'),
              ('2016-01-13', '2016-04-21'), ('2016-04-22', '2016-07-08'),
              ('2016-07-11', None)]

    # If a highlowclose dictionary was passed, only its companies are processed,
    # rather than every company in ticks. The identity test avoids comparing a
    # container against None with ==.
    if highlowclose is not None:
        ticks = highlowclose.keys()

    for key in ticks:
        folder = 'Companies/'+key+'/'
        # Load every indicator file in that company's directory. The with-block
        # closes each file even if unpickling fails.
        frames = []
        for indic in lst:
            with open(folder+key+'_'+indic+'.pickle', 'rb') as opp:
                frames.append(pickle.load(opp))

        # Join all indicators column-wise in one go, then fill each column's gaps
        # with the next value and, where there is none, the previous value.
        new_df = pd.concat(frames, axis=1).bfill().ffill()

        # Create the indicator dataframe directory if not already created
        base = 'NewBase/'+key+'/'
        if not os.path.exists(base):
            os.makedirs(base)

        # Dump each part as its own file
        for x, (start, stop) in enumerate(bounds):
            with open(base+key+'_df'+str(x)+'.pickle', 'wb') as opp:
                pickle.dump(new_df[start:stop], opp)
    return
            
print "LOADED"

LOADED


In [None]:
"""
The dump method decides whether each company's data is first stored temporarily in a folder
called Companies/, where each subfolder is a company and each company folder holds one file
per indicator, before all of a company's indicators are finally combined and stored as 15
different combined dataframes with roughly 3 months of combined data in each.

If you choose the other option, you don't use the temporary storage, and instead go straight
to dumping and adding to the dump files of the combined data.

The former method is better if you have a lot of available storage space. Each company takes
up about 3GB of data, so if you're working with 50 companies you need 150GB on top of the
150GB for combining them. You can delete the Companies/ folder afterwards, unless you want to
keep it to be able to look at each individual company's individual indicators. The latter
method is better if you don't have a lot of storage space, but it takes far longer
(e.g. on 50 companies, it took 3 hrs for the first method and 24 hrs for the second).

dmp_method = 0 is the latter method (i.e. less storage, much more computing time)
dmp_method = 1 is the former method (i.e. more storage, much less computing time)
"""
# 1 selects the temporary Companies/ storage (more disk space, much less computing time).
dmp_method = 1
update_for_spldivs(dmp_method)
# If just updating, rather than initially creating the indicators, uncomment the call below
# and then comment out the call above.
#update_for_spldivs(dmp_method, update=True)