In [1]:
import numpy as np
import pandas as pd
import math as mth
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import time

In [2]:
# Price history of all companies; the 'date' column is parsed into datetimes
sandp500 = (
    pd.read_csv('sandp500/all_stocks_5yr.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

# Price history of a single company (AAPL), loaded from its own file
company = (
    pd.read_csv('sandp500/individual_stocks_5yr/AAPL_data.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [3]:
#Function that returns specific time-range stockvalues for a particular company
def period(df, initial, final):
    """Return the rows of ``df`` whose 'date' lies between ``initial`` and ``final``.

    The frame is indexed by its 'date' column, sliced by label (both end
    points are part of the selection) and the 'date' column is then restored,
    so the result carries a fresh 0..n-1 index.  ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'date' column.
    initial, final :
        First and last date of the requested range.

    Returns
    -------
    pandas.DataFrame
        The selected rows, with 'date' as the first column.
    """
    indexed_by_date = df.set_index(['date'])
    selected = indexed_by_date.loc[initial:final]
    return selected.reset_index()

# (1) AD Analysis

In [4]:
# https://www.investopedia.com/terms/a/accumulationdistribution.asp
# A/D = Previous A/D + CMFV, where CMFV = volume * ((c - l) - (h - c))/(h - l)

def AD (p):
    """Add an 'Accum_Distri' column to the module-level ``sp500`` frame.

    For every company in the module-level ``all_companies`` the money-flow
    volume is computed on a rolling window of ``p`` rows:

        v * ((c - l) - (h - c)) / (h - l)

    where ``v`` is the summed volume of the window, ``h`` / ``l`` its highest
    high / lowest low and ``c`` the close of the window's last row.  The
    running total of that series is stored.  The first ``p - 1`` rows of each
    company stay NaN because their window is incomplete.

    Parameters
    ----------
    p : int
        Window length, counted in rows of a company's price history.

    Returns
    -------
    None
        ``sp500`` is modified in place.
    """
    for x in all_companies:
        temp = sp500.loc[x]

        v = temp['volume'].rolling(window = p).sum()
        h = temp['high'].rolling(window = p).max()
        l = temp['low'].rolling(window = p).min()
        # The rolling results are already NaN on the first p-1 rows, so the
        # raw close can be used as is; a NaN-padded copy is not needed (and
        # would have the wrong length for a company with fewer than p-1 rows).
        c = temp['close']

        money_flow = v * (((c - l) - (h - c))/(h - l))

        # Assign a plain array.  When the value is a DataFrame, .loc aligns it
        # on column labels; its column is not called 'Accum_Distri', so NaN
        # would be written instead of the computed values.
        sp500.loc[x,'Accum_Distri'] = np.asarray(money_flow.cumsum())

#p = 14: Time period (in days)
#     initial = company['date'][0]
#     final = initial + pd.Timedelta(p, unit='d')
#     itr = 1 # To start the process for the 1st iteration
#     output = []
#     last_elem = company['date'].iloc[-1]

#     while final <= last_elem:
#         temp = period(company, initial, final) #A pandas frame with only the stocks in the particular time range
    
        
#         if itr == 1:
#             v = temp['volume'].sum()
#             c = temp['close'][len(temp)-1]
#             h = temp['high'].max()
#             l = temp['low'].min()
#             AD = v * (((c - l) - (h - c))/(h - l))
#         else:
#             v = temp['volume'].sum()
#             AD += v * (((c - l) - (h - c))/(h - l))
#             c = temp['close'][len(temp)-1]
#             h = temp['high'].max()
#             l = temp['low'].min()
#             output.append(AD)

#         initial += pd.Timedelta(1, unit='d')
#         final += pd.Timedelta(1, unit='d')
#         itr = 0

# (2) GOPALAKRISHNAN RANGE INDEX (GAPO Index)

In [5]:
# Gopalakrishnan range index for every company in sp500, using rolling windows of p rows
def GAPO (p):
    """Add a 'GAPO' column to the module-level ``sp500`` frame.

    For every company in the module-level ``all_companies`` the value is

        log(highest high - lowest low) / log(p)

    with both extremes taken over a rolling window of ``p`` rows.  The first
    ``p - 1`` rows of each company stay NaN because their window is
    incomplete.

    Parameters
    ----------
    p : int
        Window length in rows; it is also the base of the logarithm.

    Returns
    -------
    None
        ``sp500`` is modified in place.
    """
    for x in all_companies:
        temp = sp500.loc[x]

        hh = temp['high'].rolling(window = p).max()
        ll = temp['low'].rolling(window = p).min()

        gapo = np.log(hh-ll)/np.log(p)

        # Assign a plain array.  When the value is a DataFrame, .loc aligns it
        # on column labels; its column is not called 'GAPO', so NaN would be
        # written instead of the computed values.
        sp500.loc[x,'GAPO'] = np.asarray(gapo)

# (3) LINEAR REGRESSION FORECAST (LRF)

In [6]:
# Standard code for working on specific time-range stockvalues for a particular company
def LRF (company, p):
    """Fit a straight line to the high/low average over consecutive date windows.

    The history is cut into windows that each span ``p`` calendar days after
    their first date (both end dates included); the next window starts the
    day after the previous one ends.  A final window that would reach past
    the last date in ``company`` is not processed.  Inside each window a
    linear regression of (high + low) / 2 against the date is fitted.

    Parameters
    ----------
    company : pandas.DataFrame
        Price history with 'date', 'high' and 'low' columns; 'date' must hold
        datetimes.
    p : int
        Window span in calendar days.

    Returns
    -------
    list
        Three lists of equal length, concatenated over all windows:
        ``[date as days since the Unix epoch, (high + low) / 2, fitted value]``.
    """
    output = [[],[],[]]
    if len(company) == 0:
        return output

    dates = company['date']
    # Positional access for both ends, so the function also works when the
    # frame's index does not start at label 0 (e.g. after filtering).
    initial = dates.iloc[0]
    final = initial + pd.Timedelta(p, unit='d')
    last_elem = dates.iloc[-1]

    while final <= last_elem:
        temp = period(company, initial, final) #A pandas frame with only the stocks in the particular time range

        # A window without any rows has nothing to fit; skip it instead of
        # letting the regression fail.
        if not temp.empty:
            average = (temp['high'] + temp['low'])/2.
            # Timestamp.value is nanoseconds since the epoch -> convert to days
            x = np.array([stamp.value/10**9/86400 for stamp in temp['date']])
            output[0].extend(x)
            output[1].extend(average)

            answer = LinearRegression().fit(x.reshape(-1, 1), np.array(average).reshape(-1, 1))
            output[2].extend(answer.coef_[0][0]*x + answer.intercept_[0])

        initial = final + pd.Timedelta(1, unit='d')
        final = initial + pd.Timedelta(p, unit='d')

    return output

# (4) On Balance Volume (OBV)

In [7]:
# Cumulative signed volume (OBV-style) for every company in sp500, using rolling windows of p rows
def OBV (p):
    """Add an 'On_Bal_Vol' column to the module-level ``sp500`` frame.

    For every company in the module-level ``all_companies`` the volume is
    summed over a rolling window of ``p`` rows.  From row position ``p``
    onwards that windowed volume is

    * negated when the close is lower than on the previous row,
    * set to 0 when the close is unchanged,
    * kept as is otherwise.

    The running total of the resulting series is stored.  The first ``p - 1``
    rows of each company stay NaN because their window is incomplete.

    Parameters
    ----------
    p : int
        Window length, counted in rows of a company's price history.

    Returns
    -------
    None
        ``sp500`` is modified in place.
    """
    for x in all_companies:
        temp = sp500.loc[x]
        v = temp['volume'].rolling(window = p).sum()

        # Row-to-row change of the close.  The sign rule only applies from
        # position p onwards, so the earlier rows are blanked out: comparisons
        # against NaN are False and leave the volume untouched.
        step = temp['close'].diff()
        step.iloc[:p] = np.nan
        falling = step < 0
        flat = step == 0

        # Vectorised replacement of the former per-row loop, which relied on
        # integer positional access into a date-indexed Series.
        signed = v.mask(falling, -v).mask(flat, 0.0)

        # Assign a plain array.  When the value is a DataFrame, .loc aligns it
        # on column labels; its column is not called 'On_Bal_Vol', so NaN
        # would be written instead of the computed values.
        sp500.loc[x,'On_Bal_Vol'] = np.asarray(signed.cumsum())
#         else:
#             if temp['close'][len(temp)-1] > c:
#                 obv += v
#             elif temp['close'][len(temp)-1] < c:
#                 obv -= v
#             else:
#                 obv = obv
                
#         output.append(obv)

#         initial += pd.Timedelta(1, unit='d')
#         final += pd.Timedelta(1, unit='d')
        
#         c = temp['close'][len(temp)-1]
#         itr = 0

# (5) Relative Strength Index (RSI)

In [8]:
# Relative Strength Index for every company in sp500, using rolling windows of p rows
def RSI (p):
    """Add an 'RSI' column to the module-level ``sp500`` frame.

    For every company in the module-level ``all_companies``:

        rs  = mean gain over p rows / mean absolute loss over p rows
        rsi = 100 - 100 / (1 + rs)

    where gains and losses are the positive and negative row-to-row changes
    of the close (the other sign counted as 0).  The first ``p`` rows of each
    company stay NaN: the first change is undefined and a window needs ``p``
    defined changes.

    Parameters
    ----------
    p : int
        Window length, counted in rows of a company's price history.

    Returns
    -------
    None
        ``sp500`` is modified in place.
    """
    for x in all_companies:
        temp = sp500.loc[x]

        difference = temp['close'].diff()
        up, down = difference.copy(), difference.copy()
        up [up < 0] = 0      # keep only the gains
        down [down > 0] = 0  # keep only the losses
        rs = up.rolling(window = p).mean() / down.abs().rolling(window = p).mean()
        rsi = 100.0 - (100.0 / (1.0 + rs))

        # Assign a plain array.  When the value is a DataFrame, .loc aligns it
        # on column labels; its column is not called 'RSI', so NaN would be
        # written instead of the computed values.
        sp500.loc[x,'RSI'] = np.asarray(rsi)
    
#     initial = company['date'][0]
#     final = initial + pd.Timedelta(p, unit='d')
#     output = []
#     itr = 1
#     rsi = 0
#     loss = []
#     profit = []
#     last_elem = company['date'].iloc[-1]

#     while final <= last_elem:
#         temp = period(company, initial, final) #A pandas frame with only the stocks in the particular time range
        
#         for row in range(1, len(temp)):
#             if temp['close'][row] < temp['close'][row-1]:
#                 loss.append(100*(temp['close'][row-1] - temp['close'][row])/temp['close'][row-1])
#             else:
#                 profit.append(100*(temp['close'][row] - temp['close'][row-1])/temp['close'][row-1])
        
#         if loss == []:
#             loss = [0]
#         if profit == []:
#             profit = [0]
            
#         loss = np.mean(loss)/p
#         profit = np.mean(profit)/p
        
#         if itr == 1:
#             rsi = 100 - (100 / (1+profit/loss) )
#         else:
#             rsi = 100 - (100 / (1 + ((prev_profit)*(p-1) + profit)/((prev_loss)*(p-1) + loss)))
            
#         output.append(rsi)

#         initial += pd.Timedelta(1, unit='d')
#         final += pd.Timedelta(1, unit='d')
#         itr = 0
#         prev_loss = loss
#         loss = []
#         prev_profit = profit
#         profit = []

#     return output

# (6) Weighted Close (WC)

In [9]:
# Note there is no time granularity for WC analysis
def WC(p):
    """Add a 'Weighted_Close' column to the module-level ``sp500`` frame.

    For every company in the module-level ``all_companies`` and every row:

        (high + low + 2 * close) / 4

    Parameters
    ----------
    p : int
        Not used by this feature; accepted so that all feature functions can
        be called the same way.

    Returns
    -------
    None
        ``sp500`` is modified in place.
    """
    for x in all_companies:
        temp = sp500.loc[x]
        output = (temp['high'] + temp['low'] + 2*temp['close'])/4

        # Assign a plain array.  When the value is a DataFrame, .loc aligns it
        # on column labels; its column is not called 'Weighted_Close', so NaN
        # would be written instead of the computed values.
        sp500.loc[x,'Weighted_Close'] = np.asarray(output)

# (7) Williams %R Formula (WRF)

In [10]:
def WRF (p):
    """Add a 'Will_R_Form' (Williams %R) column to the module-level ``sp500`` frame.

    For every company in the module-level ``all_companies``:

        (highest high - close) / (highest high - lowest low) * (-100)

    with both extremes taken over a rolling window of ``p`` rows and the
    close taken from the window's last row.  The first ``p - 1`` rows of each
    company stay NaN because their window is incomplete.

    Parameters
    ----------
    p : int
        Window length, counted in rows of a company's price history.

    Returns
    -------
    None
        ``sp500`` is modified in place.
    """
    for x in all_companies:
        temp = sp500.loc[x]

        hh = temp['high'].rolling(window = p).max()
        ll = temp['low'].rolling(window = p).min()
        # The rolling extremes are already NaN on the first p-1 rows, so the
        # raw close can be used as is; a NaN-padded copy is not needed.
        close = temp['close']

        wr = (hh-close)/(hh-ll)*(-100)

        # Assign a plain array.  When the value is a DataFrame, .loc aligns it
        # on column labels; its column is not called 'Will_R_Form', so NaN
        # would be written instead of the computed values.
        sp500.loc[x,'Will_R_Form'] = np.asarray(wr)

# (8) Aroon Oscillator (AO)

In [11]:
def AO (p):
    """Add an 'Aaron_Osc' (Aroon oscillator) column to the module-level ``sp500`` frame.

    For every company in the module-level ``all_companies`` a window of ``p``
    rows is moved over the history one row at a time.  Two counters hold the
    number of rows elapsed since the tracked high (``ctr1``) and the tracked
    low (``ctr2``).  Each step yields

        up   = 100 * (p - ctr1) / p
        down = 100 * (p - ctr2) / p
        osc  = up - down

    The first ``p - 1`` rows of each company stay NaN because their window is
    incomplete.

    Parameters
    ----------
    p : int
        Window length, counted in rows of a company's price history.

    Returns
    -------
    None
        ``sp500`` is modified in place.
    """
    for x in all_companies:
        stocks = sp500.loc[x]
        output = [np.nan] * (p - 1)
        first_run = True
        ctr1 = 0
        ctr2 = 0
        hh = 0
        ll = 0
        itr = p

        while itr <= len(stocks):
            temp = stocks.iloc[itr - p: itr]
            # Re-scan the window on the first step and whenever the tracked
            # extreme has reached an age of p rows.  The age limit used to be
            # the literal 25, which is only right for p == 25; with any other
            # window the counters ran past p and the oscillator left the
            # -100..100 range.
            if first_run or ctr1 == p:
                hh = temp['high'].max()
                # One less than the true age; the comparison below adds the
                # missing step (or resets the counter to 0).
                ctr1 = p - temp['high'].values.argmax() - 2

            if first_run or ctr2 == p:
                ll = temp['low'].min()
                ctr2 = p - temp['low'].values.argmin() - 2

            if temp['high'].iloc[-1] >= hh:
                ctr1 = 0
                hh = temp['high'].iloc[-1]
            else:
                ctr1 += 1

            if temp['low'].iloc[-1] <= ll:
                ctr2 = 0
                ll = temp['low'].iloc[-1]
            else:
                ctr2 += 1

            a_up = 100*(p-ctr1)/p
            a_down = 100*(p-ctr2)/p
            a_osc = a_up - a_down
            output.append(a_osc)
            itr += 1
            first_run = False

        # A company with fewer than p-1 rows would otherwise get more NaN
        # padding than it has rows.  A plain array is assigned because .loc
        # aligns a DataFrame value on column labels; its column is not called
        # 'Aaron_Osc', so NaN would be written instead of the computed values.
        sp500.loc[x,'Aaron_Osc'] = np.asarray(output[:len(stocks)], dtype=float)

# (9) Donchian Width

In [12]:
def DW (p):
    """Add a 'Donchian_Width' column to the module-level ``sp500`` frame.

    For every company in the module-level ``all_companies`` the value is the
    highest high minus the lowest low of a rolling window of ``p`` rows.  The
    first ``p - 1`` rows of each company stay NaN because their window is
    incomplete.

    Parameters
    ----------
    p : int
        Window length, counted in rows of a company's price history.

    Returns
    -------
    None
        ``sp500`` is modified in place.
    """
    for x in all_companies:
        temp = sp500.loc[x]

        hh = temp['high'].rolling(window = p).max()
        ll = temp['low'].rolling(window = p).min()

        width = hh - ll

        # Assign a plain array.  When the value is a DataFrame, .loc aligns it
        # on column labels; its column is not called 'Donchian_Width', so NaN
        # would be written instead of the computed values.
        sp500.loc[x,'Donchian_Width'] = np.asarray(width)

# (10) True Range

In [13]:
def TR (p):
    """Add a 'True_Range' column to the module-level ``sp500`` frame.

    For every company in the module-level ``all_companies`` and every row the
    value is ``high - low`` of that row; the previous close is not taken into
    account.

    Parameters
    ----------
    p : int
        Not used by this feature; accepted so that all feature functions can
        be called the same way.

    Returns
    -------
    None
        ``sp500`` is modified in place.
    """
    for x in all_companies:
        temp = sp500.loc[x]
        day_range = temp['high'] - temp['low']

        # Assign a plain array.  When the value is a DataFrame, .loc aligns it
        # on column labels; its column is not called 'True_Range', so NaN
        # would be written instead of the computed values.
        sp500.loc[x,'True_Range'] = np.asarray(day_range)

# "MAIN FUNCTION"

In [14]:
# Working copy of the raw data, indexed by (company name, date); the feature
# functions add their columns to this frame.
sp500 = sandp500.set_index(['Name','date'])
all_companies = sp500.index.get_level_values(0).unique() # names of all companies

p = 14 # Window length handed to every feature function

# Features to compute, in this order (AO is currently left out)
all_features = [AD,WRF,WC,GAPO,OBV,RSI,DW,TR]

for feature in all_features:
    started = time.perf_counter()
    # AO is always run with a window of 25; every other feature uses p
    window = 25 if feature == AO else p
    feature(window)
    finished = time.perf_counter()

    print("{0} Done!".format(str(feature)))
    print("Execution time: {0} seconds\n".format(finished - started))

print(sp500)

<function AD at 0x000001E1BF52D0D0> Done!
Execution time: 7.4925305 seconds

<function WRF at 0x000001E1BF52D158> Done!
Execution time: 6.680782399999999 seconds

<function WC at 0x000001E1BF52D950> Done!
Execution time: 5.256540900000001 seconds

<function GAPO at 0x000001E1BF52DB70> Done!
Execution time: 5.558692000000001 seconds

<function OBV at 0x000001E1BA3E6C80> Done!
Execution time: 27.304009999999998 seconds

<function RSI at 0x000001E1BA3E6E18> Done!
Execution time: 7.949425499999997 seconds

<function DW at 0x000001E1BF51D0D0> Done!
Execution time: 5.5314486999999914 seconds

<function TR at 0x000001E1BF51D400> Done!
Execution time: 4.885769699999997 seconds

                  open     high      low  close    volume  Accum_Distri  \
Name date                                                                 
AAL  2013-02-08  15.07  15.1200  14.6300  14.75   8407500           NaN   
     2013-02-11  14.89  15.0100  14.2600  14.46   8882000           NaN   
     2013-02-12  14.4

In [15]:
# plt.plot(sp500.loc['AAL','DW'])
# plt.show()

# if analysis == 'LRF':
#     output = LRF(company, p)
#     plt.plot(output[0], output[1], label='High-Low Average')
#     plt.plot(output[0], output[2], label='Linear Reg Fit')
#     plt.xlabel('Time in Days', size = 20)
#     plt.ylabel('Price (in USD)', size = 20)
#     plt.title('Stocks for {0}'.format(company['Name'].iloc[0]), size =30)
#     plt.legend()