# Import packages


In [1]:
import numpy as np
import pylab as pl
from numpy import fft
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
import datetime
from dateutil.relativedelta import relativedelta


In [2]:
def data_to_harmonics_function(data_stock):
    """Split each window's closing prices into individual cosine harmonics.

    For every key of ``data_stock`` the ``'Close'`` values are detrended by
    subtracting the slope term of a first-degree polynomial fit, transformed
    with an FFT, and every strictly positive frequency bin is converted into
    a cosine wave evaluated over twice the input length.

    Parameters
    ----------
    data_stock : mapping
        ``{name: table}`` where ``table['Close']`` is a 1-D sequence of prices.

    Returns
    -------
    dict
        ``{name: {k: ndarray}}``. ``k`` is the position of the harmonic in
        the positive-frequency spectrum (0 is the lowest frequency) and each
        array has length ``2 * len(table['Close'])``. The inner dict is
        filled in order of decreasing spectral amplitude.
    """
    harmonics = {}
    for name in data_stock:
        prices = np.array(data_stock[name]['Close'])
        n_data = prices.size
        time_data = np.arange(0, n_data)

        # Detrend: only the slope of the linear fit is removed. The constant
        # offset ends up in the zero-frequency bin, which is discarded below.
        slope = np.polyfit(time_data, prices, 1)[0]
        data_notrend = prices - slope * time_data

        # Keep the strictly positive frequencies of the spectrum.
        data_freqdom = fft.fft(data_notrend, n=n_data)
        frequence = fft.fftfreq(n_data)
        positive = frequence > 0
        f_positive = frequence[positive]
        data_freqdom_positive = data_freqdom[positive]
        amplitudes = np.absolute(data_freqdom_positive)

        # Rank the positive-frequency bins by amplitude, highest first.
        # The ranking must read the positive-frequency amplitudes: the full
        # spectrum is shifted by one bin (its element 0 is the DC term), so
        # indexing it with these positions ranks every harmonic by its
        # neighbour's amplitude. A stable ascending sort followed by a
        # reversal keeps the tie order of list.sort() + list.reverse().
        indexes = np.argsort(amplitudes, kind='stable')[::-1].tolist()

        # Evaluate over twice the observed length.
        time_transfer = np.arange(0, 2 * n_data)

        # NOTE(review): harmonics are keyed by frequency position, not by
        # amplitude rank, so a consumer that looks up keys 0..n-1 receives
        # the n lowest frequencies regardless of the ranking above - confirm
        # that this is the intended selection.
        window_harmonics = {}
        for k in indexes:
            ampli = amplitudes[k] / n_data
            phase = np.angle(data_freqdom_positive[k])
            window_harmonics[k] = ampli * \
                np.cos(2 * np.pi * f_positive[k] * time_transfer + phase)
        harmonics[name] = window_harmonics
    return harmonics


In [3]:
def _mark_peaks_valleys(frame, pv_range):
    """Add 'peaks' and 'valleys' columns to ``frame`` from its 'Close' column.

    Both columns are NaN except at rows detected as a local maximum
    (respectively minimum), where they hold the 'Close' value.
    """
    close = frame['Close']
    n_rows = len(close)
    peaks = np.full(n_rows, np.nan)
    valleys = np.full(n_rows, np.nan)

    # Rows closer than pv_range to the start are compared against a fixed
    # window covering the first 2 * pv_range + 1 rows.
    head = close.iloc[0:pv_range * 2 + 1]
    head_max, head_min = head.max(), head.min()

    for idx in range(n_rows):
        value = close.iloc[idx]     # positional, works for any index type
        if idx < pv_range:
            # Only the head window applies here. Slicing with the negative
            # start idx - pv_range would wrap around to the tail of a short
            # series and compare the row with unrelated values.
            window_max, window_min = head_max, head_min
        else:
            # NOTE(review): the window spans pv_range rows before idx but
            # only pv_range - 1 rows after it (exclusive stop) - confirm
            # whether idx + pv_range + 1 was intended.
            window = close.iloc[idx - pv_range:idx + pv_range]
            window_max, window_min = window.max(), window.min()
        if value == window_max:
            peaks[idx] = value
        if value == window_min:
            valleys[idx] = value

    frame['peaks'] = peaks
    frame['valleys'] = valleys


def find_pv_function(data, pv_range):
    """Mark local peaks and valleys of 'Close' in every table of ``data``.

    Parameters
    ----------
    data : dict
        Either ``{name: DataFrame}`` or ``{name: {key: DataFrame}}``. Every
        DataFrame must have a 'Close' column and is modified in place.
    pv_range : int
        Number of neighbouring rows used on the left of a row when testing
        whether it is an extremum.

    Returns
    -------
    None
        The 'peaks' and 'valleys' columns are written into the tables.
    """
    # Kept from the original implementation: disables pandas'
    # chained-assignment warning for the whole session.
    pd.options.mode.chained_assignment = None
    for key in data:
        entry = data[key]
        # Dispatch on the container type instead of catching every exception
        # of a first attempt, so real errors are no longer hidden.
        if isinstance(entry, pd.DataFrame):
            _mark_peaks_valleys(entry, pv_range)
        else:
            for sub_key in entry:
                _mark_peaks_valleys(entry[sub_key], pv_range)


In [4]:
def mix_harmonics(harmonics, n_harm_lower_limit, n_harm_upper_limit):
    """Sum the first ``n`` harmonics of every window for a range of ``n``.

    Parameters
    ----------
    harmonics : dict
        ``{name: {k: ndarray}}`` with integer keys ``0, 1, 2, ...``.
    n_harm_lower_limit, n_harm_upper_limit : int
        Inclusive range of harmonic counts to build a mixed signal for.

    Returns
    -------
    dict
        ``{name: {n_harm: DataFrame}}`` where the DataFrame has one column
        ``'Close'`` holding the second half of the sum of harmonics
        ``0 .. n_harm - 1``.
    """
    processed_signal = {}
    for name in harmonics:
        waves = harmonics[name]
        mixes = {}
        running_sum = None
        n_added = 0
        for n_harm in range(n_harm_lower_limit, n_harm_upper_limit + 1):
            if running_sum is None:
                running_sum = np.zeros(len(waves[0]))
            # Extend the running sum instead of restarting from zero for
            # every n_harm; the additions happen in the same order, so the
            # floating-point result is unchanged.
            while n_added < n_harm:
                running_sum += waves[n_added]
                n_added += 1
            half = int(len(running_sum) / 2)
            # Copy the slice: running_sum keeps being updated afterwards.
            mixes[n_harm] = pd.DataFrame(
                {'Close': running_sum[half:].copy()})
        processed_signal[name] = mixes
    return processed_signal


In [5]:
def _extrema_positions(column):
    """Return (row positions, index labels) of the non-NaN entries of ``column``."""
    mask = column.notna().to_numpy()
    return np.flatnonzero(mask), column.index[mask]


def _nearest_leads(signal_positions, data_positions):
    """For each signal position return ``nearest data position - signal position``.

    When two data positions are equally close, the later one is used.
    """
    leads = []
    for position in signal_positions:
        offsets = data_positions - position
        distances = np.abs(offsets)
        # argmin on the reversed array finds the LAST minimum.
        last_min = len(distances) - 1 - int(np.argmin(distances[::-1]))
        leads.append(offsets[last_min])
    return np.asarray(leads, dtype='float64')


def find_pv_lead_function(data, processed_signal):
    """Annotate every mixed signal with the lead to the nearest real extremum.

    For each signal table ``processed_signal[d][p]`` two columns are written
    in place:

    * ``'pv'``   - ``'peak'`` or ``'valley'`` on rows where the signal has a
      peak / valley, NaN elsewhere (a valley overrides a peak on the same row);
    * ``'lead'`` - on those rows, the signed row distance from the signal
      extremum to the closest extremum of the same kind in ``data[d]``
      (positive when the data extremum comes later), NaN elsewhere.

    Parameters
    ----------
    data : dict
        ``{d: DataFrame}`` with 'peaks' and 'valleys' columns.
    processed_signal : dict
        ``{d: {p: DataFrame}}`` with 'peaks' and 'valleys' columns; every
        index label of ``data[d]`` must exist in each signal table.

    Returns
    -------
    None
    """
    for d in data:
        for p in processed_signal[d]:
            signal = processed_signal[d][p]
            signal['pv'] = pd.Series(dtype='str')
            signal['lead'] = pd.Series(dtype='float64')
            # Restrict the signal to the rows present in the data table.
            aligned = signal.loc[list(data[d].index)]

            # Peaks first, then valleys, so valleys win on shared rows.
            for column, tag in (('peaks', 'peak'), ('valleys', 'valley')):
                data_positions, _ = _extrema_positions(data[d][column])
                signal_positions, signal_labels = _extrema_positions(
                    aligned[column])
                # Nothing to annotate, or nothing to measure against.
                if len(signal_positions) == 0 or len(data_positions) == 0:
                    continue
                # Write through a single .loc[rows, column] call: chained
                # assignment (frame[col].loc[rows] = ...) does not update the
                # frame when pandas Copy-on-Write is active.
                signal.loc[signal_labels, 'lead'] = _nearest_leads(
                    signal_positions, data_positions)
                signal.loc[signal_labels, 'pv'] = tag


In [6]:
def get_fit_error_function(processed_signal, fit_method):
    """Average the 'lead' column of every mixed signal.

    Parameters
    ----------
    processed_signal : dict
        ``{name: {n_harm: DataFrame}}``; each DataFrame has a 'lead' column.
    fit_method : {'mean', 'abs'}
        ``'mean'`` averages the signed leads, ``'abs'`` averages their
        absolute values. NaN leads are ignored in both cases.

    Returns
    -------
    dict
        ``{name: {n_harm: error}}``.

    Raises
    ------
    ValueError
        If ``fit_method`` is not one of the supported names (previously this
        surfaced as an unbound local variable error).
    """
    errors = {}
    for name in processed_signal:
        errors[name] = {}
        for n_harm in processed_signal[name]:
            leads = processed_signal[name][n_harm]['lead'].dropna()
            if fit_method == 'mean':
                error = leads.mean()
            elif fit_method == 'abs':
                error = leads.abs().mean()
            else:
                raise ValueError(
                    "fit_method must be 'mean' or 'abs', got %r" % (fit_method,))
            errors[name][n_harm] = error
    return errors


In [7]:
def fit_error_task(processed_signal, errors):
    """Pick, per window, the harmonic count whose error is closest to zero.

    Parameters
    ----------
    processed_signal : dict
        Only its keys are used, to select which entries of ``errors`` to scan.
    errors : dict
        ``{name: {n_harm: error}}``.

    Returns
    -------
    tuple of dict
        ``(best_fit_harm, best_error)``: the harmonic count with the smallest
        absolute error for each name, and that absolute error.
    """
    best_fit_harm, best_error = {}, {}
    for name in processed_signal:
        magnitudes = pd.Series(errors[name]).abs()
        best_fit_harm[name] = magnitudes.idxmin()
        best_error[name] = magnitudes.min()
    return best_fit_harm, best_error


In [8]:
def get_first_lead_function(processed_signal):
    """Collect the first annotated extremum of every mixed signal.

    Parameters
    ----------
    processed_signal : dict
        ``{name: {n_harm: DataFrame}}`` with 'lead' and 'pv' columns.

    Returns
    -------
    tuple of dict
        ``(first_date, lead, pv)``, each ``{name: {n_harm: value}}``: the
        index label of the first row whose 'lead' is not NaN, and the 'lead'
        and 'pv' values of that row.
    """
    first_date, lead, pv = {}, {}, {}
    for name in processed_signal:
        first_date[name], lead[name], pv[name] = {}, {}, {}
        for n_harm in processed_signal[name]:
            frame = processed_signal[name][n_harm]
            # Label of the earliest row carrying a lead value.
            first_label = list(frame['lead'].dropna().index)[0]
            first_row = frame.loc[first_label]
            first_date[name][n_harm] = first_label
            lead[name][n_harm] = first_row['lead']
            pv[name][n_harm] = first_row['pv']
    return first_date, lead, pv


In [9]:
def get_first_lead_function_best_fit(processed_signal, best_fit_harm):
    """Collect the first annotated extremum of each window's best-fit signal.

    Parameters
    ----------
    processed_signal : dict
        ``{name: {n_harm: DataFrame}}`` with 'lead' and 'pv' columns.
    best_fit_harm : dict
        ``{name: n_harm}`` selecting which signal to read for each name.

    Returns
    -------
    tuple of dict
        ``(first_date, lead, pv)``, each ``{name: value}``: the index label
        of the first row whose 'lead' is not NaN in the selected signal, and
        the 'lead' and 'pv' values of that row.
    """
    first_date, lead, pv = {}, {}, {}
    for name in processed_signal:
        frame = processed_signal[name][best_fit_harm[name]]
        # Label of the earliest row carrying a lead value.
        first_label = list(frame['lead'].dropna().index)[0]
        first_row = frame.loc[first_label]
        first_date[name] = first_label
        lead[name] = first_row['lead']
        pv[name] = first_row['pv']
    return first_date, lead, pv


In [10]:
def load_data(stock_name, date_predict_start, data_range, slide_range, n_slide):
    """Download price history and cut it into sliding train / test windows.

    Parameters
    ----------
    stock_name : str
        Ticker symbol passed to ``yf.Ticker``.
    date_predict_start : str
        First prediction date, formatted ``'%Y-%m-%d'``.
    data_range : int
        Used twice: as a number of days when computing the download range,
        and as the number of rows in each window.
    slide_range : int
        Used twice: as a number of days when computing the download range,
        and as the row offset between consecutive windows.
    n_slide : int
        Number of windows to build (must be at least 1).

    Returns
    -------
    tuple
        ``(train_data, test_data, test_data_all, test_data_start_list)``:
        two dicts ``{'data_<i>': DataFrame}`` with a fresh 0-based index, the
        full test download with an added 'count' column (0, 1, 2, ...), and
        the original index label of the first row of every test window.

    Raises
    ------
    ValueError
        If ``n_slide`` is smaller than 1.
    IndexError
        If a test window is empty (not enough downloaded rows).
    """
    if n_slide < 1:
        raise ValueError('n_slide must be at least 1')

    first_predict_start = datetime.datetime.strptime(
        date_predict_start, '%Y-%m-%d')
    # The download range covers 2 * n_slide steps of slide_range days; only
    # the first and last dates of that schedule are needed.
    last_predict_start = first_predict_start + \
        relativedelta(days=+slide_range * (n_slide * 2 - 1))
    first_data_start = first_predict_start - relativedelta(days=+data_range)
    last_predict_end = last_predict_start + relativedelta(days=+data_range)

    # NOTE(review): the download range is measured in days while the windows
    # below are cut by row count. If the history has fewer rows than days
    # (presumably one row per trading day - confirm), a training window of
    # data_range rows reaches past date_predict_start and overlaps the test
    # period. Verify that this is intended.
    ticker = yf.Ticker(stock_name)
    train_data_all = ticker.history(
        start=first_data_start, end=last_predict_start)
    test_data_all = ticker.history(
        start=first_predict_start, end=last_predict_end)
    test_data_all['count'] = range(len(test_data_all))

    train_data = {}
    test_data = {}
    test_data_start_list = []
    for i in range(n_slide):
        key = 'data_' + str(i)
        rows = slice(i * slide_range, i * slide_range + data_range)
        train_data[key] = train_data_all.iloc[rows].reset_index(drop=True)
        test_window = test_data_all.iloc[rows]
        # Remember the original label before the index is reset.
        test_data_start_list.append(test_window.index[0])
        test_data[key] = test_window.reset_index(drop=True)
    return train_data, test_data, test_data_all, test_data_start_list


In [11]:
def train_model(processed_signal, fit_method):
    """Select the best harmonic count per window and read its first lead.

    Parameters
    ----------
    processed_signal : dict
        ``{name: {n_harm: DataFrame}}`` annotated with 'lead' and 'pv'.
    fit_method : str
        Forwarded to ``get_fit_error_function``.

    Returns
    -------
    tuple
        ``(best_fit_harm, best_error, first_date, lead, pv)``, each a dict
        keyed by window name.
    """
    fit_errors = get_fit_error_function(processed_signal, fit_method)
    harm_by_window, error_by_window = fit_error_task(
        processed_signal, fit_errors)
    first_lead = get_first_lead_function_best_fit(
        processed_signal, harm_by_window)
    first_date, lead, pv = first_lead
    return harm_by_window, error_by_window, first_date, lead, pv


In [12]:
def evaluate_model(processed_signal, test_data_start_list, test_data_all, best_fit_harm, best_error, first_date, lead, pv):
    """Build the per-window result table and the overall mean absolute lead.

    Parameters
    ----------
    processed_signal : dict
        Only its keys are used; one table row is created per key.
    test_data_start_list : list
        Index label in ``test_data_all`` of each window's first row, in the
        same order as the keys of ``processed_signal``.
    test_data_all : DataFrame
        Must contain a 'count' column used to step from a start row to the
        target row.
    best_fit_harm, best_error, first_date, lead, pv : dict
        Per-window values copied into the table.

    Returns
    -------
    tuple
        ``(result_table, final_error)`` where ``final_error`` is the mean of
        the absolute 'lead' values rounded to two decimals.
    """
    column_names = [
        'start_date', 'target_date_after_start', 'target_date', 'lead', 'pv', 'error', 'best_fit']
    result_table = pd.DataFrame(columns=column_names)
    for window in processed_signal:
        result_table.loc[window, 'error'] = round(best_error[window], 2)
        result_table.loc[window, 'best_fit'] = best_fit_harm[window]
        result_table.loc[window, 'target_date_after_start'] = first_date[window]
        result_table.loc[window, 'lead'] = lead[window]
        result_table.loc[window, 'pv'] = pv[window]
    result_table['start_date'] = test_data_start_list

    row_counter = test_data_all['count']
    for window in result_table.index:
        # Counter value of the window's first row, shifted by the offset of
        # the first predicted extremum.
        start_count = row_counter.loc[result_table.loc[window, 'start_date']]
        wanted_count = start_count + \
            result_table.loc[window, 'target_date_after_start']
        target_date = test_data_all.loc[row_counter == wanted_count].index[0]
        result_table.loc[window, 'target_date'] = datetime.datetime.strftime(
            target_date, '%Y-%m-%d')

    absolute_leads = [abs(value) for value in result_table['lead']]
    final_error = round(sum(absolute_leads) / len(result_table['lead']), 2)
    return result_table, final_error


In [13]:
def main_funtion(
    stock_name, date_predict_start, data_range, slide_range,
        n_slide, pv_range, n_harm_lower_limit, n_harm_upper_limit, fit_method):
    """Run the whole pipeline for one ticker and print the evaluation.

    Downloads the data, marks peaks / valleys, builds the mixed harmonic
    signals, selects the best harmonic count per window and prints the
    overall error followed by the result table. Nothing is returned.
    """
    # Step 1 - load data
    loaded = load_data(
        stock_name, date_predict_start, data_range, slide_range, n_slide)
    train_data, test_data, test_data_all, test_data_start_list = loaded

    # Step 2 - preprocessing: mark extrema in the raw windows
    for raw_windows in (train_data, test_data):
        find_pv_function(raw_windows, pv_range)

    # Step 3 - build model: harmonic mixes and their leads vs. the test data
    processed_signal = mix_harmonics(
        data_to_harmonics_function(train_data),
        n_harm_lower_limit, n_harm_upper_limit)
    find_pv_function(processed_signal, pv_range)
    find_pv_lead_function(test_data, processed_signal)

    # Step 4 - train model
    best_fit_harm, best_error, first_date, lead, pv = train_model(
        processed_signal, fit_method)

    # Step 5 - evaluate model
    result_table, final_error = evaluate_model(
        processed_signal=processed_signal,
        test_data_start_list=test_data_start_list,
        test_data_all=test_data_all,
        best_fit_harm=best_fit_harm,
        best_error=best_error,
        first_date=first_date,
        lead=lead,
        pv=pv)
    print('final_error = ', final_error)
    print(result_table)

    # Step 6 - predict (not implemented)

    # return result_table, final_error


In [14]:
# stock_name = "^GSPC"
# date_predict_start = '2021-01-01'
# data_range = 200
# slide_range = 10
# n_slide = 24
# pv_range = 2
# n_harm_lower_limit = 20
# n_harm_upper_limit = 40
# fit_method = 'mean'
# train_data = {}
# test_data = {}
# date_predict_start = datetime.datetime.strptime(
#     date_predict_start, '%Y-%m-%d')  # ex.'2021-01-01'
# date_data_start_list = []
# date_predict_start_list = []
# date_predict_end_list = []
# for i in range(n_slide*2):
#     date_data_start = date_predict_start - \
#         relativedelta(days=+data_range)  # ex.'2020-07-01'
#     date_predict_end = date_predict_start + \
#         relativedelta(days=+data_range)  # ex.'2021-07-01'
#     date_data_start_list.append(date_data_start)
#     date_predict_start_list.append(date_predict_start)
#     date_predict_end_list.append(date_predict_end)
#     date_data_start = date_data_start + \
#         relativedelta(days=+slide_range)  # ex.'2020-07-15'
#     date_predict_start = date_predict_start + \
#         relativedelta(days=+slide_range)  # ex.'2021-01-15'

# train_data_all = yf.Ticker(stock_name).history(
#     start=date_data_start_list[0], end=date_predict_start_list[-1])
# test_data_all = yf.Ticker(stock_name).history(
#     start=date_predict_start_list[0], end=date_predict_end_list[-1])
# test_data_all['count'] = range(len(test_data_all))
# test_data_start_list = []
# for i in range(n_slide):
#     train_data['data_' + str(i)] = train_data_all.iloc[i *
#                                                        slide_range:i*slide_range+data_range]
#     train_data['data_' + str(i)] = train_data['data_' +
#                                               str(i)].reset_index(drop=True)
#     test_data['data_' + str(i)] = test_data_all.iloc[i *
#                                                      slide_range:i*slide_range+data_range]
#     test_data_start_list.append(test_data['data_' + str(i)].index[0])
#     test_data['data_' + str(i)] = test_data['data_' +
#                                             str(i)].reset_index(drop=True)


In [15]:
# Run configuration for the pipeline.
stock_name = "^GSPC"                # ticker symbol to download
date_predict_start = '2021-01-01'   # first prediction date, '%Y-%m-%d'
data_range = 200                    # download span and rows per window
slide_range = 10                    # offset between consecutive windows
n_slide = 24                        # number of sliding windows
pv_range = 2                        # neighbourhood for peak / valley detection
n_harm_lower_limit = 20             # smallest number of harmonics to mix
n_harm_upper_limit = 40             # largest number of harmonics to mix
fit_method = 'mean'                 # 'mean' or 'abs'
main_funtion(
    stock_name=stock_name,
    date_predict_start=date_predict_start,
    data_range=data_range,
    slide_range=slide_range,
    n_slide=n_slide,
    pv_range=pv_range,
    n_harm_lower_limit=n_harm_lower_limit,
    n_harm_upper_limit=n_harm_upper_limit,
    fit_method=fit_method)


final_error =  1.42
        start_date target_date_after_start target_date lead      pv error  \
data_0  2020-12-31                       0  2020-12-31  1.0  valley  0.03   
data_1  2021-01-15                       1  2021-01-19 -1.0  valley  0.02   
data_2  2021-02-01                       2  2021-02-03 -2.0  valley  0.05   
data_3  2021-02-16                       0  2021-02-16  4.0  valley  0.03   
data_4  2021-03-02                       0  2021-03-02  2.0  valley   0.0   
data_5  2021-03-16                       0  2021-03-16  3.0  valley   0.0   
data_6  2021-03-30                       0  2021-03-30  0.0  valley   0.0   
data_7  2021-04-14                       1  2021-04-15 -1.0  valley  0.02   
data_8  2021-04-28                       0  2021-04-28  2.0  valley  0.03   
data_9  2021-05-12                       0  2021-05-12  2.0    peak   0.0   
data_10 2021-05-26                       0  2021-05-26  0.0  valley  0.03   
data_11 2021-06-10                       0  2021-06-10  