In [1]:
import pandas as pd
import numpy as np
import time
from psycopg2 import sql
import psycopg2
import secret

# Connect to database and get df
**My dataset consists of stocks in the S&P500**
```
id |symbol|
---+------+
504|A     |
505|AAL   |
506|AAP   |
507|AAPL  |
```

In [2]:
def get_df(stock_id):
    conn = psycopg2.connect(
    dbname=secret.details['dbname'],
    user=secret.details['user'],
    password=secret.details['password'],
    host=secret.details['host'],
    port=secret.details['port']
    )
    # Create a cursor object to execute SQL statements
    cursor = conn.cursor()
    cursor.execute(
        sql.SQL("select * from stock_price where stock_id = %s order by date asc;"),
        [stock_id]
    )
    data = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    df = pd.DataFrame(data, columns=columns)
    df = df.sort_values(by='date')
    df = df.reset_index(drop=True)
    cursor.close()
    conn.close()
    return df

In [3]:
df = get_df(505)

In [4]:
display(df.head())

Unnamed: 0,id,date,stock_id,open,high,low,close,volume,vol_ma_50,prc_ma_50,earnings,candle_type,low_tl,high_tl,volume_criteria,start_point_low,start_point_high
0,1112742,2014-02-20,505,34.43,35.67,34.43,35.66,9539448,13237427.74,29.75,0,1,0,0,0,0,0
1,1112743,2014-02-21,505,35.78,36.45,35.75,36.17,7821334,12537203.76,29.98,0,1,0,1,0,0,0
2,1112744,2014-02-24,505,36.2,36.66,36.02,36.65,7022370,12309845.18,30.22,0,1,1,0,0,0,0
3,1112745,2014-02-25,505,36.63,37.15,36.51,37.0,8991389,11721759.18,30.44,0,1,1,1,0,0,0
4,1112746,2014-02-26,505,37.12,37.28,36.58,36.75,10516864,11539437.84,30.67,0,0,1,1,0,0,0


# Create Variables needed to calculate trendlines

In [5]:
num_rows = len(df)
high_arr = df['high'].values.astype(np.float64)
close_arr = df['close'].values.astype(np.float64)
trendline_breach_df = pd.DataFrame(columns=['start', 'end', 'breach', 'gradient', 'intercept'])

# Use pandas filtering to get eligible starting points indexes

1. earnings == 0: no earnings on that day
2. high_tl == 1: trendline connecting highs
3. volume_criteria == 1: volume on that day is higher that the 50 day moving average

In [6]:
def get_starting_points(df): #High Trendlines
    c1 = df['earnings']  == 0
    c2 = df['high_tl']  == 1
    c3 = df['volume_criteria']  == 1
    combined = c1 & c2 & c3
    return df[combined].index.to_numpy()

In [7]:
starting_points = get_starting_points(df)

In [8]:
display(starting_points[:10])

array([ 8, 10, 28, 29, 33, 34, 35, 36, 37, 38], dtype=int64)

# Main Loop to calculate eligible trendlines
## Some functions required to calculate trendlines

In [9]:
def get_gradient_arr(start_point_x, start_point_y, x_arr, y_arr):
    gradient_arr = (y_arr - start_point_y) / (x_arr - start_point_x)
    return gradient_arr

def get_intercept_arr(start_point_x, start_point_y, gradient_arr):
    intercept_arr = start_point_y - (gradient_arr * start_point_x)
    return intercept_arr

def create_discrete_trendline_y_arr(gradient, intercept, x_arr):
    trendline_y_arr = (gradient * x_arr) + intercept
    return trendline_y_arr

def arr_compare_close_arr_to_2D_array(df_arr, n_dim_trendline_arr):
    comparison = df_arr > n_dim_trendline_arr
    index_arr = np.argmax(comparison, axis=1)
    return index_arr

## Steps:
1. Loop through each eligible starting point `def get_starting_points(df)`
2. Get the high price of the starting point
3. Create high price array and index array from the starting point + 1 to the end of the price data
4. Bulk calculate gradients and intercepts from the starting point to every other point in the future and store it in a array
5. Pre Create a 2D array to store all the trendlines 'y' data for the purpose of comparing to the high price
6. Compare high price to trendline 'y' data for each day. If all high price < trendline 'y' data, the trendline is eligible
7. Further check when this trendline has been breached. It is breach when close > trendline 'y'
8. Store the data for eligible trendlines in a df with columns 'start', 'end', 'breach', 'intercept', 'gradient'

In [10]:
start_time = time.time()

for start_point_index in range(len(starting_points)):
    start_point_x = starting_points[start_point_index]
    start_point_y = high_arr[start_point_x]

    df_x_arr = np.arange(start_point_x+1, num_rows)
    df_y_arr = high_arr[start_point_x+1:]

    trendline_gradient_arr = get_gradient_arr(start_point_x, start_point_y, df_x_arr, df_y_arr)
    trendline_intercept_arr = get_intercept_arr(start_point_x, start_point_y, trendline_gradient_arr)

    num_trendlines = len(trendline_gradient_arr)
    trendline_len = len(trendline_gradient_arr) + 1

    all_trendline_arrays = np.full((num_trendlines, trendline_len), np.inf)
    high_arr_for_comparison = high_arr[start_point_x:]

    for trendline_index in range(num_trendlines):
        end_point_x = start_point_x + (trendline_index + 1)
        trendline_x_arr = np.arange(start_point_x, end_point_x + 1)
        trendline_y_arr = create_discrete_trendline_y_arr(trendline_gradient_arr[trendline_index], trendline_intercept_arr[trendline_index], trendline_x_arr)

        #To ensure comparison is always false for the first and last point (Due to floating point precision)
        trendline_y_arr[0] = np.inf
        trendline_y_arr[-1] = np.inf

        all_trendline_arrays[trendline_index][:len(trendline_y_arr)] = trendline_y_arr

    comparison_arr = arr_compare_close_arr_to_2D_array(high_arr_for_comparison, all_trendline_arrays)
    indices_where_no_breach = np.where(comparison_arr == 0)[0]

    no_breach_gradient_arr = trendline_gradient_arr[indices_where_no_breach]
    no_breach_intercept_arr = trendline_intercept_arr[indices_where_no_breach]
    no_breach_num_trendlines = len(no_breach_gradient_arr)
    no_breach_trenline_len = num_rows - start_point_x

    no_breach_trendline_arrays = np.full((no_breach_num_trendlines, no_breach_trenline_len), np.inf)
    close_arr_for_comparison = close_arr[start_point_x:]

    for no_breach_trendline_index in range(no_breach_num_trendlines):
        no_breach_end_point_x = num_rows
        no_breach_trendline_x_arr = np.arange(start_point_x, end_point_x + 1)
        no_breach_trendline_y_arr = create_discrete_trendline_y_arr(no_breach_gradient_arr[no_breach_trendline_index], no_breach_intercept_arr[no_breach_trendline_index], no_breach_trendline_x_arr)

        no_breach_trendline_arrays[no_breach_trendline_index][:len(no_breach_trendline_y_arr)] = no_breach_trendline_y_arr

    no_breach_comparison_arr = arr_compare_close_arr_to_2D_array(close_arr_for_comparison, no_breach_trendline_arrays)

    no_breach_indices_adjusted = indices_where_no_breach + 1 + start_point_x
    close_breach_indices_adjusted = no_breach_comparison_arr + start_point_x

    no_breach_start_arr = np.full((len(no_breach_indices_adjusted), ), start_point_x)

    trendline_breach_df_temp = pd.DataFrame({
        'start': no_breach_start_arr,
        'end': no_breach_indices_adjusted,
        'breach': close_breach_indices_adjusted,
        'gradient': no_breach_gradient_arr,
        'intercept': no_breach_intercept_arr
    })

    trendline_breach_df = pd.concat([trendline_breach_df, trendline_breach_df_temp])

print("--- %s seconds ---" % (time.time() - start_time))

  trendline_breach_df = pd.concat([trendline_breach_df, trendline_breach_df_temp])


--- 20.08881163597107 seconds ---


In [11]:
display(trendline_breach_df.head())

Unnamed: 0,start,end,breach,gradient,intercept
0,8,9,10,0.57,32.9
1,8,10,8,0.935,29.98
0,10,11,54,-0.04,39.73
1,10,12,10,0.275,36.58
0,28,29,28,0.71,18.31


# Calculate number of touches for each trendline

## Steps
1. Pre create 2D array to store trendline 'y' data. Fill it with np.inf. This data will span the whole length of the df.
2. Calculate trendline 'y' data from start point to breach point, store it in an array
3. Populate the 2D array with the trendline 'y' data, starting from the start point onwards. The rest of the elements in the array will remain as inf.
4. Bulk compare this 2D array with df['high'].values data. Set a threshold for what considers a 'touch' on the trendline.

In [12]:
trendline_breach_df['breach_diff'] = trendline_breach_df['breach'] - trendline_breach_df['end']
trendline_breach_df = trendline_breach_df[trendline_breach_df['breach_diff'] > 0]

In [13]:
display(trendline_breach_df.head())

Unnamed: 0,start,end,breach,gradient,intercept,breach_diff
0,8,9,10,0.57,32.9,1
0,10,11,54,-0.04,39.73,43
0,29,30,34,-0.46,52.24,4
1,29,34,39,-0.38,49.92,5
2,29,35,42,-0.253333,46.246667,7


In [14]:
num_eligible_trendlines = len(trendline_breach_df)

In [15]:
eligible_trendlines_arr = np.full((num_eligible_trendlines, num_rows), np.inf)

In [16]:
eligible_trendline_start_arr = trendline_breach_df['start'].values
eligible_trendline_breach_arr = trendline_breach_df['breach'].values
eligible_trendline_gradient_arr = trendline_breach_df['gradient'].values
eligible_trendline_intercept_arr = trendline_breach_df['intercept'].values

In [17]:
for eligible_trendline_start_point_index in range(num_eligible_trendlines):
    eligible_trendline_start_point_x = eligible_trendline_start_arr[eligible_trendline_start_point_index]
    eligible_trendline_breach_point_x = eligible_trendline_breach_arr[eligible_trendline_start_point_index]

    eligible_trendline_x_arr = np.arange(eligible_trendline_start_point_x, eligible_trendline_breach_point_x + 1)
    eligible_trendline_y_arr = create_discrete_trendline_y_arr(eligible_trendline_gradient_arr[eligible_trendline_start_point_index], eligible_trendline_intercept_arr[eligible_trendline_start_point_index], eligible_trendline_x_arr)

    eligible_trendlines_arr[eligible_trendline_start_point_index][eligible_trendline_start_point_x:eligible_trendline_breach_point_x + 1] = eligible_trendline_y_arr

In [18]:
display(eligible_trendlines_arr[-1])

array([inf, inf, inf, ..., inf, inf, inf])

In [19]:
def check_num_touches_on_trendline(high_arr, eligible_trendlines_arr, threshold):
    pct_diff = (eligible_trendlines_arr - high_arr) / high_arr
    pct_diff = np.abs(pct_diff)
    pct_diff = pct_diff < threshold
    num_touches = np.sum(pct_diff, axis=1)
    return num_touches

In [20]:
num_touches = check_num_touches_on_trendline(high_arr, eligible_trendlines_arr, 0.01)

In [21]:
display(num_touches)

array([2, 8, 4, ..., 2, 3, 3])

In [22]:
trendline_breach_df['num_touches'] = num_touches

In [23]:
display(trendline_breach_df.head())

Unnamed: 0,start,end,breach,gradient,intercept,breach_diff,num_touches
0,8,9,10,0.57,32.9,1,2
0,10,11,54,-0.04,39.73,43,8
0,29,30,34,-0.46,52.24,4,4
1,29,34,39,-0.38,49.92,5,4
2,29,35,42,-0.253333,46.246667,7,5


In [24]:
trendline_breach_df.to_csv('trendline_breach_df_new.csv', index=False)