In [1]:
from backtest import ticker_stats, smooth, features, stringify, print_ticker_heading, balanced_scorecard, \
                     determine_minima_n_maxima, align_minima_n_maxima, plot_trades, split_data
import pandas as pd
import numpy as np

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.datasets import load_breast_cancer, load_iris, make_moons, make_circles, make_classification
from sklearn.linear_model import LogisticRegression
from category_encoders import WOEEncoder

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import MinMaxScaler, KBinsDiscretizer, FunctionTransformer
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
%matplotlib inline

from scipy.signal import savgol_filter, argrelmin, argrelmax

import gc; gc.enable()

In [2]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [3]:
DATAPATH = '/Users/frkornet/Flatiron/Stock-Market-Final-Project/data/'
sdf = pd.read_csv(f'{DATAPATH}optimal_params.csv')

sdf = sdf.loc[sdf.up3_tpr > 0.0].reset_index()
if 'index' in sdf.columns:
    del sdf['index']
sdf.head(20)

Unnamed: 0,NAME_OF_ISSUER,CUSIP,VALUE,SHRS_OR_PRN_AMT,SOLE_VOTING_AUTH,NO_VOTING_AUTH,TICKER,up3_period,up3_lags,up3_tpr,...,up8_lags,up8_tpr,up10_period,up10_lags,up10_tpr,naive1_tpr,naive3_tpr,naive5_tpr,naive10_tpr,naive8_tpr
0,ADURO BIOTECH INC,00739L101,2902,2738200,2590838,147362,ADRO,15y,13,0.9452,...,135102030,0.9873,3y,135,0.9595,0,0.4286,0.4091,0.4286,0.474
1,BMC STK HLDGS INC,05591B109,34168,1305100,1175955,129145,BMCH,3y,13,0.9878,...,135,1.0,3y,13,1.0,0,0.5519,0.5584,0.6104,0.6104
2,CVR ENERGY INC,12662P108,92673,2104769,2104769,0,CVI,8y,13,0.8873,...,13510203045,1.0,5y,13,0.9831,0,0.4026,0.3831,0.3117,0.3636
3,GRACO INC,384109104,8329,180900,72895,108005,GGG,5y,13,0.9053,...,1351020,1.0,15y,135,0.9898,0,0.5519,0.6104,0.5649,0.5584
4,GULF RESOURCES INC,40251W309,1276,1830162,1828362,1800,GURE,5y,13,0.7941,...,13,0.9615,15y,13,1.0,0,0.3377,0.3182,0.2857,0.2792
5,HANCOCK WHITNEY CORPORATION,410120109,9687,252962,252962,0,HWC,5y,13,0.8846,...,13510203045,0.988,3y,13,1.0,0,0.487,0.487,0.4545,0.474
6,MBIA INC,55262C100,2108,228374,146742,81632,MBI,8y,13,0.8824,...,13,1.0,3y,13,0.9865,0,0.4091,0.4286,0.3831,0.4026
7,PALO ALTO NETWORKS INC,697435105,1192028,5848150,5622490,225660,PANW,3y,13510,0.9495,...,13,0.9908,5y,13,1.0,0,0.6234,0.6494,0.5974,0.6299
8,SI BONE INC,825704109,723,40900,40900,0,SIBN,15y,1351020304560,0.8933,...,13,0.9684,15y,13,0.9775,0,0.4935,0.526,0.513,0.5519
9,SPECTRUM PHARMACEUTICALS INC,84763A108,22936,2765017,2747310,17707,SPPI,3y,13510,0.9125,...,13510,1.0,10y,135,0.9722,0,0.4675,0.4351,0.3831,0.3961


In [4]:
tickers = sdf.TICKER.to_list()
print(tickers)

['ADRO', 'BMCH', 'CVI', 'GGG', 'GURE', 'HWC', 'MBI', 'PANW', 'SIBN', 'SPPI']


In [5]:
min_indices, max_indices = determine_minima_n_maxima(tickers, False)

tickers= ['ADRO', 'BMCH', 'CVI', 'GGG', 'GURE', 'HWC', 'MBI', 'PANW', 'SIBN', 'SPPI']


In [6]:
min_indices, max_indices = align_minima_n_maxima(tickers, min_indices, max_indices, True)

Ticker: ADRO min_id= 23 max_id= 42
      ADRO min_id= 23 max_id= 42 
Ticker: BMCH min_id= 26 max_id= 105
      BMCH min_id= 26 max_id= 105 
Ticker: CVI min_id= 92 max_id= 21
      CVI min_id= 92 max_id= 115  (*)
Ticker: GGG min_id= 88 max_id= 50
      GGG min_id= 88 max_id= 111  (*)
Ticker: GURE min_id= 109 max_id= 24
      GURE min_id= 109 max_id= 115  (*)
Ticker: HWC min_id= 133 max_id= 29
      HWC min_id= 133 max_id= 216  (*)
Ticker: MBI min_id= 84 max_id= 48
      MBI min_id= 84 max_id= 172  (*)
Ticker: PANW min_id= 106 max_id= 36
      PANW min_id= 106 max_id= 152  (*)
Ticker: SIBN min_id= 22 max_id= 53
      SIBN min_id= 22 max_id= 53 
Ticker: SPPI min_id= 87 max_id= 7
      SPPI min_id= 87 max_id= 113  (*)


In [7]:
# suppress output of plotting all the trades as this is quite a lot...
# plot_trades(tickers, min_indices, max_indices)

In [62]:
def get_signals(hist, target, threshold):
    # NB: we do not include smooth in data!
    data = hist[['Close', 'Open', 'Low', 'High']]
    data = features(data, hist, target)

    used_cols = [c for c in data.columns.tolist() if c not in [target]]
    X, y, X_train, X_test, y_train, y_test = split_data(data, used_cols, target, 0.7)

    encoder   = WOEEncoder()
    binner    = KBinsDiscretizer(n_bins=5, encode='ordinal')
    objectify = FunctionTransformer(func=stringify, check_inverse=False, validate=False)
    imputer   = SimpleImputer(strategy='constant', fill_value=0.0)
    clf       = LogisticRegression(class_weight='balanced', random_state=42)

    pipe = make_pipeline(binner, objectify, encoder, imputer, clf)
    pipe.fit(X_train, y_train.values)

    signals = (pipe.predict_proba(X_test)  > threshold).astype(int)[:,1]
    return signals

In [71]:
BUY = 1
SELL = 2

def merge_buy_n_sell_signals(buy_signals, sell_signals):
    
    assert len(buy_signals) == len(sell_signals), "buy_signal and sell_signal lengths different!"
    
    buy_n_sell = [0] * len(buy_signals)
    length     = len(buy_n_sell)
    i          = 0
    state      = SELL
    
    while i < length:
        if state == SELL and buy_signals[i] == 1:
            state = BUY
            buy_n_sell[i] = 1
        
        elif state == BUY and sell_signals[i] == 1:
            state = SELL
            buy_n_sell[i] = 2
            continue
        
        i = i + 1
    
    return buy_n_sell

In [72]:
def extract_trades(hist, buy_n_sell, verbose):
    test_start_at = len(hist) - len(buy_n_sell)
    
    state       = SELL
    
    cols = ['buy_date', 'buy_close', 'sell_date', 'sell_close', 'gain_pct',
            'trading_days', 'daily_return' ]
    possible_trades_df = pd.DataFrame(columns=cols)
    
    for i, b_or_s in enumerate(buy_n_sell):
        
        if b_or_s == BUY:
            buy_id    = test_start_at + i
            buy_close = hist.Close.iloc[buy_id]
            buy_date  = hist.index[buy_id]
            state = SELL
            
        if b_or_s == SELL:
            sell_id    = test_start_at + i
            sell_close = hist.Close.iloc[sell_id]
            sell_date  = hist.index[sell_id] 
            
            gain = sell_close - buy_close
            gain_pct = round( (gain / buy_close)*100, 2)
            
            trading_days = sell_id - buy_id
            
            daily_return = (1+gain_pct/100) ** (1/trading_days) - 1
            daily_return = round(daily_return * 100, 2)
            
            trade_dict = {'buy_date'    : [buy_date],  'buy_close'    : [buy_close],
                         'sell_date'    : [sell_date], 'sell_close'   : [sell_close],
                         'gain_pct'     : [gain_pct],  'trading_days' : [trading_days],
                         'daily_return' : [daily_return] }
            possible_trades_df = pd.concat([possible_trades_df, 
                                           pd.DataFrame(trade_dict)])
            
            #$print("buy_id=",  buy_id,  "buy_close=",  buy_close,  "buy_date=", buy_date)
            #print("sell_id=", sell_id, "sell_close=", sell_close, "sell_date=", sell_date)
            #print("gain=", gain, f"gain_pct={gain_pct}%")
            #print("trading_days=", trading_days)
            #print(f"daily compounded return={daily_return}%")
            #print('')
    
    if verbose == True:
        print("****EXTRACT_TRADES****")
        display(possible_trades_df)
    
    return possible_trades_df

In [85]:
def predict_minima_n_maxima(tickers, threshold, verbose):
    
    print("tickers=", tickers)
    target = 'target'
    
    cols = ['buy_date', 'buy_close', 'sell_date', 'sell_close', 'gain_pct',
        'trading_days', 'daily_return' ]
    possible_trades_df = pd.DataFrame(columns=cols)
    
    for ticker in tickers:

        # free up memory
        gc.collect()

        if verbose == True:
            print_ticker_heading(ticker)

        # get stock data and smooth the Close curve
        hist = ticker_stats(ticker, 3, False)
        hist = smooth(hist)

        # get the buy signals
        hist[target] = 0
        min_ids = argrelmin(hist.smooth.values)[0].tolist()
        hist[target].iloc[min_ids] = 1        
        buy_signals = get_signals(hist, target, threshold)
        #print("buy_signals=", buy_signals, '\n')

        # get the sell signals
        hist[target] = 0
        max_ids = argrelmax(hist.smooth.values)[0].tolist()
        hist[target].iloc[max_ids] = 1
        sell_signals = get_signals(hist, target, threshold)
        #print("sell_signals=", sell_signals, '\n')
        
        # merge the buy and sell signals
        buy_n_sell = merge_buy_n_sell_signals(buy_signals, sell_signals)
        # print("buy_n_sell=", buy_n_sell, '\n')
        
        # extract trades
        ticker_df = extract_trades(hist, buy_n_sell, verbose)
        possible_trades_df = pd.concat([possible_trades_df, ticker_df])
    
    possible_trades_df.trading_days = possible_trades_df.trading_days.astype(int)
    return possible_trades_df

In [86]:
possible_trades_df = predict_minima_n_maxima(tickers, 0.5, False)
possible_trades_df

tickers= ['ADRO', 'BMCH', 'CVI', 'GGG', 'GURE', 'HWC', 'MBI', 'PANW', 'SIBN', 'SPPI']


Unnamed: 0,buy_date,buy_close,sell_date,sell_close,gain_pct,trading_days,daily_return
0,2018-12-06,2.88,2019-02-06,3.44,19.44,41,0.43
0,2019-03-11,3.98,2019-03-15,4.12,3.52,4,0.87
0,2019-05-24,3.2,2019-12-02,1.19,-62.81,132,-0.75
0,2018-04-16,18.75,2018-06-04,20.85,11.2,34,0.31
0,2018-07-03,20.95,2018-08-30,22.4,6.92,41,0.16
0,2018-09-25,20.0,2018-11-06,17.74,-11.3,30,-0.4
0,2018-11-23,16.22,2018-12-06,16.63,2.53,8,0.31
0,2018-12-26,15.18,2019-01-11,16.48,8.56,11,0.75
0,2019-03-28,17.49,2019-04-23,20.44,16.87,17,0.92
0,2019-06-04,20.85,2019-06-13,20.4,-2.16,7,-0.31


In [87]:
possible_trades_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95 entries, 0 to 0
Data columns (total 7 columns):
buy_date        95 non-null datetime64[ns]
buy_close       95 non-null float64
sell_date       95 non-null datetime64[ns]
sell_close      95 non-null float64
gain_pct        95 non-null float64
trading_days    95 non-null int64
daily_return    95 non-null float64
dtypes: datetime64[ns](2), float64(4), int64(1)
memory usage: 5.9 KB


In [88]:
possible_trades_df.describe()

Unnamed: 0,buy_close,sell_close,gain_pct,trading_days,daily_return
count,95.0,95.0,95.0,95.0,95.0
mean,28.656842,29.808,2.329158,26.242105,0.468316
std,37.984046,40.872878,11.302668,26.827945,1.043421
min,2.88,1.19,-62.81,1.0,-0.75
25%,8.325,8.605,-0.025,6.0,0.0
50%,18.64,19.03,2.75,19.0,0.23
75%,40.12,41.655,6.695,38.0,0.695
max,214.61,227.78,27.52,132.0,8.54


In [92]:
pd.set_option("display.max_rows", 200)
possible_trades_df.sort_values(by=['buy_date', 'gain_pct'], ascending=[True, False])

Unnamed: 0,buy_date,buy_close,sell_date,sell_close,gain_pct,trading_days,daily_return
0,2017-03-15,16.6,2017-05-01,18.11,9.1,32,0.27
0,2017-03-22,9.05,2017-07-19,9.15,1.1,82,0.01
0,2017-04-04,42.47,2017-05-04,45.22,6.48,21,0.3
0,2017-04-07,8.93,2017-04-10,9.03,1.12,1,1.12
0,2017-04-19,8.7,2017-04-20,8.72,0.23,1,0.23
0,2017-04-21,8.53,2017-04-24,8.71,2.11,1,2.11
0,2017-04-28,8.4,2017-06-14,8.66,3.1,32,0.1
0,2017-05-08,6.12,2017-07-14,7.53,23.04,47,0.44
0,2017-05-19,43.82,2017-05-23,44.8,2.24,2,1.11
0,2017-06-05,16.57,2017-06-27,17.5,5.61,16,0.34


In [80]:
1-23/96

0.7604166666666666

In [82]:
possible_trades_df.buy_date.min(), possible_trades_df.buy_date.max()

(Timestamp('2017-03-15 00:00:00'), Timestamp('2020-01-28 00:00:00'))