In [1]:
# General imports and reading the possible trades files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from numpy.random import choice
from numpy.random import randint

from util import get_stock_n_smooth
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

import gc; gc.collect()

train_df = pd.read_csv('../data/train_possible_trades.csv')
test_df = pd.read_csv('../data/test_possible_trades.csv')

print(f"Train row count: {len(train_df)}")
print(f"Test  row count: {len(test_df)}")

Train row count: 4403
Test  row count: 6508


In [2]:
# Directory holding the per-ticker CSV files that get_stock_n_smooth() reads.
# The file name is appended directly to this string (f'{YFLOAD_PATH}{ticker}.csv'),
# so the trailing '/' is required.
# NOTE(review): absolute, machine-specific path -- confirm before running elsewhere.
YFLOAD_PATH = '/Users/frkornet/CDA/Project/stockie/data/yfin/'

In [3]:
ls ../python/util.py

../python/util.py


In [4]:
def stats_table(df, days_cutoff):
    """
    Print two tables of trade outcomes split at a holding-period cutoff.

    The first table holds the number of trades with positive, negative and
    zero ``gain_pct`` for ``trading_days < days_cutoff`` and for
    ``trading_days >= days_cutoff``, plus row and column totals. The second
    table shows the same cells as a percentage of the grand total.

    NOTE: a later cell redefines ``stats_table`` with a ``verbose`` flag and
    return values; when the notebook is run top to bottom that later
    definition is the one in effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``gain_pct`` and ``trading_days``.
    days_cutoff : number
        Threshold on ``trading_days`` separating the two table rows.

    Returns
    -------
    None. The tables are written to stdout.
    """
    pos_df = df[['gain_pct', 'trading_days']].loc[df.gain_pct > 0].groupby(by='trading_days').count()
    pos_df.columns = ['pos_counts']
    loss_df = df[['gain_pct', 'trading_days']].loc[df.gain_pct < 0].groupby(by='trading_days').count()
    loss_df.columns = ['neg_counts']
    zero_df = df[['gain_pct', 'trading_days']].loc[df.gain_pct == 0].groupby(by='trading_days').count()
    zero_df.columns = ['zero_counts']
    # The three frames are stacked (not joined): every row carries exactly one
    # non-NaN count, and the column sums below skip the NaNs.
    temp = pd.concat([loss_df, pos_df, zero_df], join="outer").reset_index()
    cols = [ 'pos_counts', 'neg_counts', 'zero_counts' ]

    # Column sums on each side of the cutoff, computed once and reused by
    # both the count table and the percentage table.
    rows = [
        (f'< {days_cutoff} days\t', temp[cols].loc[temp.trading_days < days_cutoff].sum()),
        (f'>= {days_cutoff} days\t', temp[cols].loc[temp.trading_days >= days_cutoff].sum()),
    ]

    totals = np.zeros((4,))
    print('         \t pos     neg    zero    total')
    print('         \t======  ======  =====   ======')
    for label, counts in rows:
        print(label, end='')
        row_total = 0
        for i, num in enumerate(counts):
            print(num, end='\t')
            row_total += num
            totals[i] += num
        print(row_total)
        totals[3] += row_total

    print('         \t------\t------\t-----\t------')
    print('         \t', end='')
    for num in totals:
        print(num, end='\t')

    print('')
    print('')

    grand_total = totals[3]
    totals_pct = np.zeros((4,))
    print('         \t pos     neg    zero    total')
    print('         \t======  ======  =====   ======')
    for label, counts in rows:
        print(label, end='')
        row_pct = 0
        for i, num in enumerate(counts):
            # With no counted trades the share is reported as 0 instead of 0/0 = NaN.
            num_pct = (num / grand_total) * 100 if grand_total else 0.0
            print(f'{num_pct:.2f}', end='\t')
            row_pct += num_pct
            totals_pct[i] += num_pct
        print(f'{row_pct:.2f}')
        totals_pct[3] += row_pct

    print('         \t------\t------\t-----\t------')
    print('         \t', end='')
    for num in totals_pct:
        print(f'{num:.2f}', end='\t')

In [5]:
def _print_stats_table(stats, days_cutoff, spec):
    """Print one 3x4 stats table, formatting every cell with format spec ``spec``."""
    print('         \t pos     neg    zero    total')
    print('         \t======  ======  =====   ======')
    print(f'< {days_cutoff} days\t', end='')
    for value in stats[0]:
        print(format(value, spec), end='\t')

    print(f'\n>= {days_cutoff} days\t', end='')
    for value in stats[1]:
        print(format(value, spec), end='\t')

    print('\n         \t------\t------\t-----\t------')
    print('         \t', end='')
    for value in stats[2]:
        print(format(value, spec), end='\t')


def stats_table(df, days_cutoff, verbose=False):
    """
    Count trades by outcome and holding period, as counts and percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``gain_pct`` and ``trading_days``. Rows where
        either value is NaN are not counted in any cell.
    days_cutoff : number
        Threshold on ``trading_days`` separating the first two table rows.
    verbose : bool, default False
        When true, both tables are also printed to stdout.

    Returns
    -------
    (stats_cnt, stats_pct) : tuple of numpy.ndarray, each of shape (3, 4)
        Rows: ``trading_days < days_cutoff``, ``trading_days >= days_cutoff``,
        column totals. Columns: positive, negative, zero ``gain_pct``, row
        total. ``stats_pct`` is ``stats_cnt`` as a percentage of the grand
        total; it is all zeros when no trade is counted.
    """
    gain = df['gain_pct'].to_numpy()
    days = df['trading_days'].to_numpy()
    outcome_masks = (gain > 0, gain < 0, gain == 0)
    period_masks = (days < days_cutoff, days >= days_cutoff)

    stats_cnt = np.zeros((3, 4))
    for r, in_period in enumerate(period_masks):
        for c, has_outcome in enumerate(outcome_masks):
            stats_cnt[r, c] = np.count_nonzero(in_period & has_outcome)
    stats_cnt[:2, 3] = stats_cnt[:2, :3].sum(axis=1)   # row totals
    stats_cnt[2] = stats_cnt[:2].sum(axis=0)           # column totals + grand total

    grand_total = stats_cnt[2, 3]
    if grand_total:
        stats_pct = (stats_cnt / grand_total) * 100
    else:
        # No counted trades: report 0 % everywhere instead of 0/0 = NaN.
        stats_pct = np.zeros_like(stats_cnt)

    if not verbose:
        return stats_cnt, stats_pct

    _print_stats_table(stats_cnt, days_cutoff, '.0f')
    print('\n\n\n')
    _print_stats_table(stats_pct, days_cutoff, '.2f')
    return stats_cnt, stats_pct

In [6]:
stats_table(test_df, 60, True)

         	 pos     neg    zero    total
< 60 days	4581	1134	24	5739	
>= 60 days	138	630	1	769	
         	------	------	-----	------
         	4719	1764	25	6508	



         	 pos     neg    zero    total
< 60 days	70.39	17.42	0.37	88.18	
>= 60 days	2.12	9.68	0.02	11.82	
         	------	------	-----	------
         	72.51	27.11	0.38	100.00	

(array([[4.581e+03, 1.134e+03, 2.400e+01, 5.739e+03],
        [1.380e+02, 6.300e+02, 1.000e+00, 7.690e+02],
        [4.719e+03, 1.764e+03, 2.500e+01, 6.508e+03]]),
 array([[7.03902889e+01, 1.74247081e+01, 3.68776890e-01, 8.81837738e+01],
        [2.12046712e+00, 9.68039336e+00, 1.53657037e-02, 1.18162262e+01],
        [7.25107560e+01, 2.71051014e+01, 3.84142594e-01, 1.00000000e+02]]))

In [7]:
stats_table(test_df, 55, True)

         	 pos     neg    zero    total
< 55 days	4546	1046	24	5616	
>= 55 days	173	718	1	892	
         	------	------	-----	------
         	4719	1764	25	6508	



         	 pos     neg    zero    total
< 55 days	69.85	16.07	0.37	86.29	
>= 55 days	2.66	11.03	0.02	13.71	
         	------	------	-----	------
         	72.51	27.11	0.38	100.00	

(array([[4.546e+03, 1.046e+03, 2.400e+01, 5.616e+03],
        [1.730e+02, 7.180e+02, 1.000e+00, 8.920e+02],
        [4.719e+03, 1.764e+03, 2.500e+01, 6.508e+03]]),
 array([[6.98524892e+01, 1.60725261e+01, 3.68776890e-01, 8.62937923e+01],
        [2.65826675e+00, 1.10325753e+01, 1.53657037e-02, 1.37062077e+01],
        [7.25107560e+01, 2.71051014e+01, 3.84142594e-01, 1.00000000e+02]]))

In [8]:
stats_table(train_df, 60, True)

         	 pos     neg    zero    total
< 60 days	4403	0	0	4403	
>= 60 days	0	0	0	0	
         	------	------	-----	------
         	4403	0	0	4403	



         	 pos     neg    zero    total
< 60 days	100.00	0.00	0.00	100.00	
>= 60 days	0.00	0.00	0.00	0.00	
         	------	------	-----	------
         	100.00	0.00	0.00	100.00	

(array([[4403.,    0.,    0., 4403.],
        [   0.,    0.,    0.,    0.],
        [4403.,    0.,    0., 4403.]]), array([[100.,   0.,   0., 100.],
        [  0.,   0.,   0.,   0.],
        [100.,   0.,   0., 100.]]))

In [9]:
stats_table(train_df, 55, True)

         	 pos     neg    zero    total
< 55 days	4403	0	0	4403	
>= 55 days	0	0	0	0	
         	------	------	-----	------
         	4403	0	0	4403	



         	 pos     neg    zero    total
< 55 days	100.00	0.00	0.00	100.00	
>= 55 days	0.00	0.00	0.00	0.00	
         	------	------	-----	------
         	100.00	0.00	0.00	100.00	

(array([[4403.,    0.,    0., 4403.],
        [   0.,    0.,    0.,    0.],
        [4403.,    0.,    0., 4403.]]), array([[100.,   0.,   0., 100.],
        [  0.,   0.,   0.,   0.],
        [100.,   0.,   0., 100.]]))

In [10]:
def get_stock_n_smooth(ticker, period, path=None):
    """
    Load the stored, already smoothed price history of ``ticker`` from disk.

    Copy of what is in util.py, except this version reads what has been read
    from yfinance and stored on file. The stored version is smoothed already,
    and reading from disk should be much faster as it avoids the expensive
    smoothing operation.

    Parameters
    ----------
    ticker : str
        Ticker symbol; the file ``f'{path}{ticker}.csv'`` is read.
    period : any
        Not used by this file-based version; kept so the call signature
        matches the util.py function.
    path : str, optional
        Directory prefix of the CSV files. It is concatenated directly with
        the file name, so it must end with a path separator. Defaults to
        ``YFLOAD_PATH``.

    Returns
    -------
    (success, hist)
        ``hist`` is the DataFrame indexed by its ``Date`` column, or ``None``
        when the file could not be read. The index keeps the name ``Date`` so
        that ``hist.reset_index()`` restores a ``Date`` column. ``success`` is
        True only when ``hist`` has more than 5 * 252 rows (roughly five years
        of trading days).
    """
    gc.collect()
    base = YFLOAD_PATH if path is None else path
    try:
        hist = pd.read_csv(f'{base}{ticker}.csv').set_index('Date')
    except FileNotFoundError:
        print(f'Failed to find {ticker}.csv in {base}!')
        return False, None
    except Exception as err:
        # Unreadable or malformed file (e.g. no Date column). Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # through, so a long simulation can still be stopped.
        print(f'Failed to load {ticker}.csv from {base}: {err!r}')
        return False, None

    success = len(hist) > 5 * 252
    return success, hist

In [11]:
# def random_stock(tickers):
#     while True:
#         ticker = choice(tickers)
#         success, hist = get_stock_n_smooth(ticker, '10y')
#         if success == False:
#             continue
        
#         n_len = len(hist)
#         if n_len < 5 * 252:
#             continue
        
#         train_len = int(n_len * 0.7)
#         test_len  = n_len - train_len
        
#         return ticker, hist.reset_index(), train_len, test_len

In [12]:
hist_dict = dict()        # ticker -> price history that loaded successfully
_failed_tickers = set()   # tickers whose history could not be loaded or is too short


def random_stock(tickers):
    """
    Draw a random ticker that has a usable price history.

    Histories are loaded through ``get_stock_n_smooth`` on first use and kept
    in ``hist_dict``. Tickers that fail to load, or that have fewer than
    5 * 252 rows, are remembered in ``_failed_tickers`` so they are not read
    from disk again on later draws; re-run this cell (or clear that set) to
    retry them.

    Parameters
    ----------
    tickers : sequence of str
        Candidate ticker symbols; one is drawn uniformly with
        ``numpy.random.choice`` per attempt.

    Returns
    -------
    (ticker, hist, train_len, test_len)
        ``hist`` is a fresh ``reset_index()`` copy of the cached history,
        ``train_len`` is ``int(0.7 * len(hist))`` and ``test_len`` the
        remainder.

    Raises
    ------
    RuntimeError
        When every ticker in ``tickers`` has turned out to be unusable
        (previously this looped forever).
    """
    while True:
        ticker = choice(tickers)
        if ticker in _failed_tickers:
            continue

        hist = hist_dict.get(ticker)
        if hist is None:
            success, hist = get_stock_n_smooth(ticker, '10y')
            if not success or len(hist) < 5 * 252:
                _failed_tickers.add(ticker)
                if all(t in _failed_tickers for t in tickers):
                    raise RuntimeError('None of the given tickers has a usable price history')
                continue
            hist_dict[ticker] = hist.copy()

        n_len = len(hist)
        train_len = int(n_len * 0.7)
        test_len  = n_len - train_len

        return ticker, hist.reset_index(), train_len, test_len

In [13]:
# Sorted so the list order is the same in every kernel session: set iteration
# order of strings varies between runs, which would make seeded
# choice(tickers) draws non-reproducible. Missing ticker values are dropped
# because they cannot be ordered against strings.
tickers = sorted(set(train_df.ticker.dropna()) | set(test_df.ticker.dropna()))
len(tickers)

542

In [67]:
def simulate_close_price(simulations,  days, daily_ret, seed=12345, ticker_pool=None):
    """
    Simulate how the average of Open, High and Low relates to Close.

    Each simulation draws ``days`` random (stock, row) pairs via
    ``random_stock`` and computes ``(Open + High + Low) / (3 * Close)`` for
    each. Draw ``j`` gets weight ``(1 + daily_ret) ** j``. Rows whose Close is
    not positive (including NaN) or whose ratio is not finite are reported
    and skipped.

    Parameters
    ----------
    simulations : int
        Number of simulation runs.
    days : int
        Number of random draws per run.
    daily_ret : float
        Growth rate used for the per-draw weights; 0 gives equal weights.
    seed : int or None, default 12345
        Seed for numpy's global random state, applied at the start of the
        call. ``None`` leaves the current random state untouched.
    ticker_pool : sequence of str, optional
        Tickers to draw from; defaults to the notebook-level ``tickers``.

    Returns
    -------
    (mean_ratio_list, median_ratio_list) : two lists with one float per run
        The mean entry is the weighted average of the ratios used in that
        run. The median entry is the median of the individually scaled terms
        ``weight * ratio * n_used / total_weight``.
        NOTE(review): for ``daily_ret != 0`` that median mostly reflects the
        weight distribution rather than the ratios; it is not a weighted
        median.
    """
    if seed is not None:
        np.random.seed(seed)
    pool = tickers if ticker_pool is None else ticker_pool

    mean_ratio_list, median_ratio_list = [], []
    for i in range(simulations):
        ratio_list = []
        tot_weight = 0
        for j in range(days):
            ticker, hist, _, _ = random_stock(pool)
            idx = np.random.choice(len(hist))

            close = hist.Close.iloc[idx]
            if not close > 0.0:
                print("Encountered zero Close for:", ticker,  close, "simulation run:", i,")")
                continue

            ratio = ( hist.Open.iloc[idx] + hist.High.iloc[idx] + hist.Low.iloc[idx]) / \
                ( 3 * close )
            if not np.isfinite(ratio):
                # A missing Open/High/Low would otherwise turn the whole run into NaN.
                print("Encountered non-finite Open/High/Low for:", ticker, "simulation run:", i)
                continue

            weight = (1+daily_ret) ** j
            tot_weight += weight
            ratio_list.append(weight * ratio)

        ratios = np.array(ratio_list)
        # Scale by the number of draws actually used (not by `days`), so the
        # mean stays a weighted average when some draws were skipped.
        ratios = (ratios / tot_weight) * len(ratio_list)

        mean_ratio_list.append(np.mean(ratios))
        median_ratio_list.append(np.median(ratios))

    return mean_ratio_list, median_ratio_list

In [72]:
%time mean_ratio_list, median_ratio_list = simulate_close_price(simulations=1000, days=500, daily_ret=0.4)

Encountered zero Close for: LBAI nan simulation run: 431 )
Encountered zero Close for: VSH nan simulation run: 741 )
Encountered zero Close for: CAH nan simulation run: 987 )
CPU times: user 6min 17s, sys: 331 ms, total: 6min 18s
Wall time: 6min 18s


In [73]:
mean_ratio_list

[1.0041492298256782,
 0.996264433644047,
 0.9991218396394722,
 1.0036149692897167,
 0.9958163429555704,
 1.0040279943977968,
 1.0076492982938594,
 0.9979037691474655,
 1.0001377665275029,
 1.0061255799494613,
 1.0041114354933471,
 0.9948290731388797,
 0.9853839992550253,
 0.9986248223731489,
 0.9984263752702615,
 1.0022471510466833,
 0.9988128352800972,
 1.0063365459346387,
 1.0046589635607774,
 0.9933149901060941,
 1.0170435862709373,
 1.0049140580467348,
 1.0022331425340458,
 1.0053971373068775,
 0.9941507402235041,
 0.9993143526969548,
 0.9996028195926051,
 1.0008709362872257,
 1.0075040420039814,
 1.0000416153856075,
 0.9983385181910109,
 0.9946861090097742,
 1.011795791776337,
 1.0002997760027599,
 1.011524272375744,
 0.9843727218189969,
 0.9962037688006559,
 1.0119155961075406,
 1.0148050426613948,
 0.9969041075922294,
 1.0007800463643306,
 1.0032156511803842,
 0.9977474718867752,
 1.0236628898723423,
 1.013120952393194,
 1.0085544918588814,
 0.9971873684402588,
 0.99288190158371

In [74]:
np.mean(mean_ratio_list), np.median(mean_ratio_list), np.min(mean_ratio_list), np.max(mean_ratio_list), np.std(mean_ratio_list)

(1.0002022084600772,
 1.000080868073534,
 0.9618972812074102,
 1.0483613044725248,
 0.006999765102815904)

In [75]:
np.mean(median_ratio_list), np.median(median_ratio_list), np.min(median_ratio_list), np.max(median_ratio_list), np.std(median_ratio_list)

(5.038595052014058e-35,
 5.036862974652892e-35,
 4.2534357505276993e-35,
 5.907579496597117e-35,
 7.709480272374991e-37)

In [None]:
# Summarise the mean ratios after dropping the runs that came out as NaN.
# The runs to drop are taken from the data itself rather than from a
# hard-coded list of positions, which goes stale whenever the simulation changes.
mean_ratios = np.array(mean_ratio_list)
nan_mask = np.isnan(mean_ratios)
idx = np.where(nan_mask)   # positions of the NaN runs, shown by the next cell
mratios = mean_ratios[~nan_mask].tolist()

np.mean(mratios), np.median(mratios), np.min(mratios), np.max(mratios), np.std(mratios)

In [None]:
idx

In [None]:
idx = np.where(np.isnan(np.array(median_ratio_list)))
idx

In [None]:
# Summarise the median ratios after dropping the runs that came out as NaN.
# The runs to drop are taken from the data itself rather than from a
# hard-coded list of positions.
median_ratios = np.array(median_ratio_list)
mratios = median_ratios[~np.isnan(median_ratios)].tolist()

np.mean(mratios), np.median(mratios), np.min(mratios), np.max(mratios), np.std(mratios)

In [None]:
# Sorted so the list order is the same in every kernel session: set iteration
# order of strings varies between runs, which would make seeded
# choice(tickers) draws non-reproducible. Missing ticker values are dropped
# because they cannot be ordered against strings.
tickers = sorted(set(train_df.ticker.dropna()) | set(test_df.ticker.dropna()))
len(tickers)

In [None]:
# Draw one random stock and show its name, history length and train/test split sizes.
ticker, hist, train_len, test_len = random_stock(tickers)
ticker, len(hist), train_len, test_len

In [None]:
# Empirical distribution of holding periods in the test trades:
# tdl holds the distinct trading_days values, p_tdl the matching probabilities.
trades_per_period = test_df.groupby(by='trading_days')['ticker'].count()   # grouped once, reused below
tdl   = trades_per_period.index.tolist()
som   = trades_per_period.sum()
p_tdl = np.array(trades_per_period / som)

In [None]:
len(tdl), len(p_tdl)

In [None]:
def random_trading_days(test_len, tdl, p_tdl):
    """
    Draw a random start offset and holding period that fit in the test window.

    Parameters
    ----------
    test_len : int
        Length of the test window, in rows.
    tdl : sequence of int
        Candidate holding periods (trading days).
    p_tdl : sequence of float
        Probability of each entry of ``tdl``.

    Returns
    -------
    (sday, tday)
        ``sday`` is drawn uniformly from ``[0, test_len)`` and ``tday`` from
        ``tdl`` with probabilities ``p_tdl``; pairs are redrawn until
        ``sday + tday < test_len``.

    Raises
    ------
    ValueError
        When no holding period with non-zero probability is shorter than
        ``test_len``; the redraw loop could never finish in that case.
    """
    # sday == 0 is the most permissive start, so a period fits iff it is < test_len.
    if not any(p > 0 and d < test_len for d, p in zip(tdl, p_tdl)):
        raise ValueError(f'no holding period in tdl fits in a test window of {test_len} days')

    while True:
        sday = randint(0, test_len)
        tday = choice(a=tdl, size=1, p=p_tdl)[0]

        if sday + tday < test_len:
            return sday, tday

In [None]:
# One simulated trade: pick a random stock, then a start offset and holding
# period inside its test window, and express the result as a one-row frame.
ticker, hist, train_len, test_len = random_stock(tickers)
sday, tday = random_trading_days(test_len, tdl, p_tdl)

# Row positions of the buy and the sell within the full history.
buy_pos  = train_len + sday
sell_pos = buy_pos + tday

buy_date, sell_date   = (str(hist['Date'].iloc[pos])[:10] for pos in (buy_pos, sell_pos))
buy_close, sell_close = (float(hist['Close'].iloc[pos]) for pos in (buy_pos, sell_pos))

# Total gain and the equivalent compounded daily return, both turned into percentages.
gain_pct  = (sell_close - buy_close) / buy_close
daily_ret = (1 + gain_pct) ** (1/tday) - 1.0
gain_pct, daily_ret = gain_pct * 100, daily_ret * 100

dict_df = dict([
    ('buy_close', [buy_close]), ('buy_date', [buy_date]), ('daily_return', [daily_ret]),
    ('gain_pct', [gain_pct]), ('sell_close', [sell_close]), ('sell_date', [sell_date]),
    ('ticker', [ticker]), ('trading_days', [tday]),
])
df = pd.DataFrame.from_dict(dict_df)
df

In [None]:
def simulate_trades(nr_trades):
    """
    Simulate ``nr_trades`` random buy/sell trades in the test windows.

    For every trade a random stock is drawn with ``random_stock(tickers)`` and
    a start offset plus holding period with
    ``random_trading_days(test_len, tdl, p_tdl)``. The stock is bought at the
    Close of row ``train_len + sday`` and sold at the Close ``tday`` rows later.

    Parameters
    ----------
    nr_trades : int
        Number of trades to simulate.

    Returns
    -------
    pandas.DataFrame
        One row per trade with the columns ``buy_close``, ``buy_date``,
        ``daily_return``, ``gain_pct``, ``sell_close``, ``sell_date``,
        ``ticker`` and ``trading_days``. ``gain_pct`` and ``daily_return``
        are percentages. Rows are numbered 0..nr_trades-1 (the former
        row-by-row concat left every row with index 0), and the columns are
        present even when ``nr_trades`` is 0.
    """
    columns = ['buy_close', 'buy_date', 'daily_return', 'gain_pct',
               'sell_close', 'sell_date', 'ticker', 'trading_days']
    # Collect plain records and build the frame once; growing a DataFrame
    # with pd.concat inside the loop is quadratic in the number of trades.
    records = []
    for t in tqdm(range(nr_trades), 'Simulations:'):

        ticker, hist, train_len, test_len = random_stock(tickers)
        sday, tday = random_trading_days(test_len, tdl, p_tdl)

        buy_date   = str(hist['Date'].iloc[train_len+sday])[:10]
        buy_close  = float(hist['Close'].iloc[train_len+sday])

        sell_date  = str(hist['Date'].iloc[train_len+sday+tday])[:10]
        sell_close = float(hist['Close'].iloc[train_len+sday+tday])

        gain_pct   = ((sell_close - buy_close) / buy_close)
        daily_ret  = (1 + gain_pct) ** (1/tday) - 1.0
        gain_pct  *= 100
        daily_ret *= 100

        records.append({
            'buy_close': buy_close, 'buy_date': buy_date, 'daily_return': daily_ret,
            'gain_pct': gain_pct, 'sell_close': sell_close, 'sell_date': sell_date,
            'ticker': ticker, 'trading_days': tday,
        })

    return pd.DataFrame(records, columns=columns)

In [None]:
%time sim_df = simulate_trades(len(test_df))
sim_df

In [None]:
stats_table(sim_df, 55)

In [None]:
stats_table(test_df, 55)

In [None]:
len(test_df)

In [None]:
len(test_df.ticker.unique())

In [None]:
len(sim_df.ticker.unique())

In [None]:
len(tickers)

In [None]:
%time sim_df = simulate_trades(len(test_df))
stats_table(sim_df, 55)

In [None]:
%time sim_df = simulate_trades(len(test_df))
stats_table(sim_df, 55)

In [None]:
pwd

In [None]:
fnm = '/Users/frkornet/CDA/Project/stockie/data/negtrades/test_possible_trades_05.csv'
negtrades_test_possible_trades = pd.read_csv(fnm)
negtrades_test_possible_trades

In [None]:
stats_table(negtrades_test_possible_trades, 55)

In [None]:
new_test_df = pd.read_csv('/Users/frkornet/CDA/Project/stockie/data/test_possible_trades.csv')
new_test_df

In [None]:
stats_table(new_test_df, 55, verbose=True)

In [None]:
len(test_df)

In [None]:
from itertools import chain, combinations

def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)  # allows duplicate elements
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))

# Indicator names whose larger combinations are enumerated below.
stuff = ['MACD', 'PctDiff', 'StdDev', 'RSI', 'WPR', 'MFI', 'BBP']

# Smallest subset size that is kept. The filter used to read `len(p) > 4`
# while the summary line claimed "3 or more"; the message is now generated
# from the same threshold the filter uses.
min_len = 5
combs = [p for p in powerset(stuff) if len(p) >= min_len]
count_f = len(combs)
for p in combs:
    print(p, len(p))
print(f"Combinations with {min_len} or more elements: {count_f}")

In [None]:
combs

In [None]:
def stats_table2(df, days_cutoff):
    """
    Print the count and percentage tables for ``df`` and return both arrays.

    This is ``stats_table`` with printing always switched on; it delegates to
    that function instead of carrying a second copy of the same code.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``gain_pct`` and ``trading_days``.
    days_cutoff : number
        Threshold on ``trading_days`` separating the first two table rows.

    Returns
    -------
    (stats_cnt, stats_pct)
        The two 3x4 arrays returned by ``stats_table``.
    """
    return stats_table(df, days_cutoff, verbose=True)

In [None]:
# Print the tables via stats_table2, then dump the two returned arrays again
# as bare tab-separated rows: counts first, percentages second.
stats_cnt, stats_pct = stats_table2(new_test_df, 55)
print('\n\n\n')
for i in range(3):
    line = '\t\t'
    for j in range(4):
        line += f'{stats_cnt[i,j]:.0f}\t'
    print(line)

print('')

for i in range(3):
    line = '\t\t'
    for j in range(4):
        line += f'{stats_pct[i,j]:.2f}\t'
    print(line)

In [None]:
stats_table(new_test_df, 55)

In [None]:
# NOTE(review): leftover scratch copied from the body of the first stats_table
# definition. It was saved with inconsistent indentation (a syntax error); the
# indentation is repaired here to match the equivalent loop in that function.
# It still reads temp, cols, days_cutoff, total_lt_xd and totals, which no cell
# visible in this notebook defines at notebook scope, so it will raise a
# NameError unless those are defined first. Calling stats_table(df, days_cutoff)
# gives the same numbers.
stats_cnt = np.zeros((3,4))
for i, num in enumerate(temp[cols].loc[temp.trading_days < days_cutoff].sum()):
    print(num, end='\t')
    total_lt_xd += num
    totals[i] += num
print(total_lt_xd)
totals[3] += total_lt_xd