In [1]:
import pandas as pd
from pmdarima import auto_arima
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import numpy as np
import glob

In [2]:
# Load the OHLCV table; the CSV must provide at least the 'Date', 'Ticker' and 'Open' columns
data = pd.read_csv("sp100_ohlcv_2018_2023.csv")
data['Date'] = pd.to_datetime(data['Date'])

# Order rows chronologically within each ticker so that "next row" means "next trading day"
data = data.sort_values(by=['Ticker', 'Date'])

# Movement label: 1 when the same ticker's next open is strictly higher than this open, else 0.
# NOTE: the last row of every ticker has no next open (NaN), the comparison is False there,
# so that row is labelled 0 rather than missing.
next_open = data.groupby('Ticker')['Open'].shift(-1)
data['Movement'] = (next_open > data['Open']).astype(int)

In [3]:
# Preview the first 20 rows to eyeball the sort order and the new 'Movement' column
data.head(n=20)

Unnamed: 0.1,Unnamed: 0,Ticker,Date,Open,High,Low,Close,Volume,Movement
1508,1739,AAPL,2018-01-02,170.16,172.3,169.26,172.26,25048048,1
1507,1738,AAPL,2018-01-03,172.53,174.55,171.96,172.23,28819653,1
1506,1737,AAPL,2018-01-04,172.54,173.47,172.08,173.03,22211345,1
1505,1736,AAPL,2018-01-05,173.44,175.37,173.05,175.0,23016177,1
1504,1735,AAPL,2018-01-08,174.35,175.61,173.93,174.35,20134092,1
1503,1734,AAPL,2018-01-09,174.55,175.06,173.41,174.33,21262614,0
1502,1733,AAPL,2018-01-10,173.16,174.3,173.0,174.29,23589129,1
1501,1732,AAPL,2018-01-11,174.59,175.49,174.49,175.28,17523256,1
1500,1731,AAPL,2018-01-12,176.18,177.36,175.65,177.09,25039531,1
1499,1730,AAPL,2018-01-16,177.9,179.39,176.14,176.19,29159005,0


In [4]:
# Parameters
window_size = 60  # Number of leading test observations used as warm-up before scoring starts
train_start = "2018-01-01"  # Inclusive lower bound of the training period
train_end = "2022-12-31"    # Inclusive upper bound of the training period
test_start = "2023-01-01"   # Inclusive lower bound of the test period
group_size = 10             # Approximate number of tickers processed per group/cell

# Get unique tickers
tickers = data['Ticker'].unique()

# Split tickers into groups of roughly `group_size`.
# np.array_split needs at least one section, so clamp the count: with fewer than
# `group_size` tickers the integer division would be 0 and array_split would raise.
# When the ticker count is not a multiple of `group_size`, the leading groups get one extra ticker.
n_groups = max(1, len(tickers) // group_size)
groups = np.array_split(tickers, n_groups)

In [5]:
# Sanity check: number of ticker groups produced by np.array_split
len(groups)

10

In [None]:

# Walk-forward ARIMA evaluation for ticker group 1.
# For every ticker: fit auto_arima on the training period, then step through the test
# period one day at a time, feeding each observed open to the model and scoring the
# direction of its one-step-ahead forecast. Results are written to a per-group CSV.
group_1 = groups[0]  # First ticker group (group sizes can differ by one, see np.array_split)
results = []  # One {'Ticker', 'Accuracy'} record per ticker

# Process tickers
for ticker in tqdm(group_1, desc="Processing Group 1"):
    stock_data = data[data['Ticker'] == ticker]

    # Train-test split
    train_data = stock_data[(stock_data['Date'] >= train_start) & (stock_data['Date'] <= train_end)]
    test_data = stock_data[(stock_data['Date'] >= test_start)]

    predictions = []
    true_labels = []

    # Initial training phase
    try:
        model = auto_arima(
            train_data['Open'],
            seasonal=False,
            max_p=7,  # Constrain ARIMA parameters
            max_d=1,
            max_q=7,
            trace=False,
            error_action="ignore",
            suppress_warnings=True,
            stepwise=True
        )
    except Exception as e:
        print(f"Error training initial ARIMA model for {ticker}: {e}")
        # Record the failure so the ticker is not silently missing from the CSV.
        results.append({'Ticker': ticker, 'Accuracy': None})
        continue

    test_opens = test_data['Open'].to_numpy()
    test_moves = test_data['Movement'].to_numpy()

    # Walk-forward phase: at step i the model has seen every open up to i - 1
    # and forecasts the open at i.
    for i in range(1, len(test_opens)):
        last_open = test_opens[i - 1]

        try:
            # Feed EVERY observation (including warm-up ones) so the series the model
            # sees stays contiguous; pass a 1-element list because update expects array-like.
            model.update([last_open])

            if i < window_size:
                continue  # Warm-up: keep the model current but do not score yet

            # Predict the next day's price; np.asarray makes the [0] lookup positional
            # even if predict returns a pandas Series with a non-zero index.
            forecast = np.asarray(model.predict(n_periods=1))
            if forecast.size == 0:
                continue

            predicted_up = int(forecast[0] > last_open)
        except Exception as e:
            print(f"Error updating ARIMA model for {ticker} at iteration {i}: {e}")
            continue

        # The forecast is for the move from day i - 1 to day i, which is exactly what
        # 'Movement' of row i - 1 encodes (next open > this open). Both lists are
        # appended together so they can never get out of step.
        predictions.append(predicted_up)
        true_labels.append(int(test_moves[i - 1]))

    # Evaluate accuracy for this ticker
    if predictions:
        accuracy = accuracy_score(true_labels, predictions)
        results.append({'Ticker': ticker, 'Accuracy': accuracy})
    else:
        results.append({'Ticker': ticker, 'Accuracy': None})

# Convert to DataFrame and save results for this group
results_df = pd.DataFrame(results)
results_df.to_csv("arima_results_group_1.csv", index=False)
print("Results for Group 1 saved.")

Processing Group 1: 100%|██████████| 11/11 [01:04<00:00,  5.83s/it]

Results for Group 1 saved.





In [10]:
# Walk-forward ARIMA evaluation for ticker group 2.
# For every ticker: fit auto_arima on the training period, then step through the test
# period one day at a time, feeding each observed open to the model and scoring the
# direction of its one-step-ahead forecast. Results are written to a per-group CSV.
group_2 = groups[1]  # Second ticker group (group sizes can differ by one, see np.array_split)
results = []  # One {'Ticker', 'Accuracy'} record per ticker

# Process tickers
for ticker in tqdm(group_2, desc="Processing Group 2"):
    stock_data = data[data['Ticker'] == ticker]

    # Train-test split
    train_data = stock_data[(stock_data['Date'] >= train_start) & (stock_data['Date'] <= train_end)]
    test_data = stock_data[(stock_data['Date'] >= test_start)]

    predictions = []
    true_labels = []

    # Initial training phase
    try:
        model = auto_arima(
            train_data['Open'],
            seasonal=False,
            max_p=7,  # Constrain ARIMA parameters
            max_d=1,
            max_q=7,
            trace=False,
            error_action="ignore",
            suppress_warnings=True,
            stepwise=True
        )
    except Exception as e:
        print(f"Error training initial ARIMA model for {ticker}: {e}")
        # Record the failure so the ticker is not silently missing from the CSV.
        results.append({'Ticker': ticker, 'Accuracy': None})
        continue

    test_opens = test_data['Open'].to_numpy()
    test_moves = test_data['Movement'].to_numpy()

    # Walk-forward phase: at step i the model has seen every open up to i - 1
    # and forecasts the open at i.
    for i in range(1, len(test_opens)):
        last_open = test_opens[i - 1]

        try:
            # Feed EVERY observation (including warm-up ones) so the series the model
            # sees stays contiguous; pass a 1-element list because update expects array-like.
            model.update([last_open])

            if i < window_size:
                continue  # Warm-up: keep the model current but do not score yet

            # Predict the next day's price; np.asarray makes the [0] lookup positional
            # even if predict returns a pandas Series with a non-zero index.
            forecast = np.asarray(model.predict(n_periods=1))
            if forecast.size == 0:
                continue

            predicted_up = int(forecast[0] > last_open)
        except Exception as e:
            print(f"Error updating ARIMA model for {ticker} at iteration {i}: {e}")
            continue

        # The forecast is for the move from day i - 1 to day i, which is exactly what
        # 'Movement' of row i - 1 encodes (next open > this open). Both lists are
        # appended together so they can never get out of step.
        predictions.append(predicted_up)
        true_labels.append(int(test_moves[i - 1]))

    # Evaluate accuracy for this ticker
    if predictions:
        accuracy = accuracy_score(true_labels, predictions)
        results.append({'Ticker': ticker, 'Accuracy': accuracy})
    else:
        results.append({'Ticker': ticker, 'Accuracy': None})

# Convert to DataFrame and save results for this group
results_df = pd.DataFrame(results)
results_df.to_csv("arima_results_group_2.csv", index=False)
print("Results for Group 2 saved.")

Processing Group 2: 100%|██████████| 10/10 [01:05<00:00,  6.54s/it]

Results for Group 2 saved.





In [11]:
# Walk-forward ARIMA evaluation for ticker group 3.
# For every ticker: fit auto_arima on the training period, then step through the test
# period one day at a time, feeding each observed open to the model and scoring the
# direction of its one-step-ahead forecast. Results are written to a per-group CSV.
group_3 = groups[2]  # Third ticker group (group sizes can differ by one, see np.array_split)
results = []  # One {'Ticker', 'Accuracy'} record per ticker

# Process tickers
for ticker in tqdm(group_3, desc="Processing Group 3"):
    stock_data = data[data['Ticker'] == ticker]

    # Train-test split
    train_data = stock_data[(stock_data['Date'] >= train_start) & (stock_data['Date'] <= train_end)]
    test_data = stock_data[(stock_data['Date'] >= test_start)]

    predictions = []
    true_labels = []

    # Initial training phase
    try:
        model = auto_arima(
            train_data['Open'],
            seasonal=False,
            max_p=7,  # Constrain ARIMA parameters
            max_d=1,
            max_q=7,
            trace=False,
            error_action="ignore",
            suppress_warnings=True,
            stepwise=True
        )
    except Exception as e:
        print(f"Error training initial ARIMA model for {ticker}: {e}")
        # Record the failure so the ticker is not silently missing from the CSV.
        results.append({'Ticker': ticker, 'Accuracy': None})
        continue

    test_opens = test_data['Open'].to_numpy()
    test_moves = test_data['Movement'].to_numpy()

    # Walk-forward phase: at step i the model has seen every open up to i - 1
    # and forecasts the open at i.
    for i in range(1, len(test_opens)):
        last_open = test_opens[i - 1]

        try:
            # Feed EVERY observation (including warm-up ones) so the series the model
            # sees stays contiguous; pass a 1-element list because update expects array-like.
            model.update([last_open])

            if i < window_size:
                continue  # Warm-up: keep the model current but do not score yet

            # Predict the next day's price; np.asarray makes the [0] lookup positional
            # even if predict returns a pandas Series with a non-zero index.
            forecast = np.asarray(model.predict(n_periods=1))
            if forecast.size == 0:
                continue

            predicted_up = int(forecast[0] > last_open)
        except Exception as e:
            print(f"Error updating ARIMA model for {ticker} at iteration {i}: {e}")
            continue

        # The forecast is for the move from day i - 1 to day i, which is exactly what
        # 'Movement' of row i - 1 encodes (next open > this open). Both lists are
        # appended together so they can never get out of step.
        predictions.append(predicted_up)
        true_labels.append(int(test_moves[i - 1]))

    # Evaluate accuracy for this ticker
    if predictions:
        accuracy = accuracy_score(true_labels, predictions)
        results.append({'Ticker': ticker, 'Accuracy': accuracy})
    else:
        results.append({'Ticker': ticker, 'Accuracy': None})

# Convert to DataFrame and save results for this group
results_df = pd.DataFrame(results)
results_df.to_csv("arima_results_group_3.csv", index=False)
print("Results for Group 3 saved.")

Processing Group 3: 100%|██████████| 10/10 [02:38<00:00, 15.81s/it]

Results for Group 3 saved.





In [12]:
# Walk-forward ARIMA evaluation for ticker group 4.
# For every ticker: fit auto_arima on the training period, then step through the test
# period one day at a time, feeding each observed open to the model and scoring the
# direction of its one-step-ahead forecast. Results are written to a per-group CSV.
group_4 = groups[3]  # Fourth ticker group (group sizes can differ by one, see np.array_split)
results = []  # One {'Ticker', 'Accuracy'} record per ticker

# Process tickers
for ticker in tqdm(group_4, desc="Processing Group 4"):
    stock_data = data[data['Ticker'] == ticker]

    # Train-test split
    train_data = stock_data[(stock_data['Date'] >= train_start) & (stock_data['Date'] <= train_end)]
    test_data = stock_data[(stock_data['Date'] >= test_start)]

    predictions = []
    true_labels = []

    # Initial training phase
    try:
        model = auto_arima(
            train_data['Open'],
            seasonal=False,
            max_p=7,  # Constrain ARIMA parameters
            max_d=1,
            max_q=7,
            trace=False,
            error_action="ignore",
            suppress_warnings=True,
            stepwise=True
        )
    except Exception as e:
        print(f"Error training initial ARIMA model for {ticker}: {e}")
        # Record the failure so the ticker is not silently missing from the CSV.
        results.append({'Ticker': ticker, 'Accuracy': None})
        continue

    test_opens = test_data['Open'].to_numpy()
    test_moves = test_data['Movement'].to_numpy()

    # Walk-forward phase: at step i the model has seen every open up to i - 1
    # and forecasts the open at i.
    for i in range(1, len(test_opens)):
        last_open = test_opens[i - 1]

        try:
            # Feed EVERY observation (including warm-up ones) so the series the model
            # sees stays contiguous; pass a 1-element list because update expects array-like.
            model.update([last_open])

            if i < window_size:
                continue  # Warm-up: keep the model current but do not score yet

            # Predict the next day's price; np.asarray makes the [0] lookup positional
            # even if predict returns a pandas Series with a non-zero index.
            forecast = np.asarray(model.predict(n_periods=1))
            if forecast.size == 0:
                continue

            predicted_up = int(forecast[0] > last_open)
        except Exception as e:
            print(f"Error updating ARIMA model for {ticker} at iteration {i}: {e}")
            continue

        # The forecast is for the move from day i - 1 to day i, which is exactly what
        # 'Movement' of row i - 1 encodes (next open > this open). Both lists are
        # appended together so they can never get out of step.
        predictions.append(predicted_up)
        true_labels.append(int(test_moves[i - 1]))

    # Evaluate accuracy for this ticker
    if predictions:
        accuracy = accuracy_score(true_labels, predictions)
        results.append({'Ticker': ticker, 'Accuracy': accuracy})
    else:
        results.append({'Ticker': ticker, 'Accuracy': None})

# Convert to DataFrame and save results for this group
results_df = pd.DataFrame(results)
results_df.to_csv("arima_results_group_4.csv", index=False)
print("Results for Group 4 saved.")

Processing Group 4: 100%|██████████| 10/10 [01:37<00:00,  9.76s/it]

Results for Group 4 saved.





In [13]:
# Walk-forward ARIMA evaluation for ticker group 5.
# For every ticker: fit auto_arima on the training period, then step through the test
# period one day at a time, feeding each observed open to the model and scoring the
# direction of its one-step-ahead forecast. Results are written to a per-group CSV.
group_5 = groups[4]  # Fifth ticker group (group sizes can differ by one, see np.array_split)
results = []  # One {'Ticker', 'Accuracy'} record per ticker

# Process tickers
for ticker in tqdm(group_5, desc="Processing Group 5"):
    stock_data = data[data['Ticker'] == ticker]

    # Train-test split
    train_data = stock_data[(stock_data['Date'] >= train_start) & (stock_data['Date'] <= train_end)]
    test_data = stock_data[(stock_data['Date'] >= test_start)]

    predictions = []
    true_labels = []

    # Initial training phase
    try:
        model = auto_arima(
            train_data['Open'],
            seasonal=False,
            max_p=7,  # Constrain ARIMA parameters
            max_d=1,
            max_q=7,
            trace=False,
            error_action="ignore",
            suppress_warnings=True,
            stepwise=True
        )
    except Exception as e:
        print(f"Error training initial ARIMA model for {ticker}: {e}")
        # Record the failure so the ticker is not silently missing from the CSV.
        results.append({'Ticker': ticker, 'Accuracy': None})
        continue

    test_opens = test_data['Open'].to_numpy()
    test_moves = test_data['Movement'].to_numpy()

    # Walk-forward phase: at step i the model has seen every open up to i - 1
    # and forecasts the open at i.
    for i in range(1, len(test_opens)):
        last_open = test_opens[i - 1]

        try:
            # Feed EVERY observation (including warm-up ones) so the series the model
            # sees stays contiguous; pass a 1-element list because update expects array-like.
            model.update([last_open])

            if i < window_size:
                continue  # Warm-up: keep the model current but do not score yet

            # Predict the next day's price; np.asarray makes the [0] lookup positional
            # even if predict returns a pandas Series with a non-zero index.
            forecast = np.asarray(model.predict(n_periods=1))
            if forecast.size == 0:
                continue

            predicted_up = int(forecast[0] > last_open)
        except Exception as e:
            print(f"Error updating ARIMA model for {ticker} at iteration {i}: {e}")
            continue

        # The forecast is for the move from day i - 1 to day i, which is exactly what
        # 'Movement' of row i - 1 encodes (next open > this open). Both lists are
        # appended together so they can never get out of step.
        predictions.append(predicted_up)
        true_labels.append(int(test_moves[i - 1]))

    # Evaluate accuracy for this ticker
    if predictions:
        accuracy = accuracy_score(true_labels, predictions)
        results.append({'Ticker': ticker, 'Accuracy': accuracy})
    else:
        results.append({'Ticker': ticker, 'Accuracy': None})

# Convert to DataFrame and save results for this group
results_df = pd.DataFrame(results)
results_df.to_csv("arima_results_group_5.csv", index=False)
print("Results for Group 5 saved.")

Processing Group 5: 100%|██████████| 10/10 [01:40<00:00, 10.09s/it]

Results for Group 5 saved.





In [14]:
# Walk-forward ARIMA evaluation for ticker group 6.
# For every ticker: fit auto_arima on the training period, then step through the test
# period one day at a time, feeding each observed open to the model and scoring the
# direction of its one-step-ahead forecast. Results are written to a per-group CSV.
group_6 = groups[5]  # Sixth ticker group (group sizes can differ by one, see np.array_split)
results = []  # One {'Ticker', 'Accuracy'} record per ticker

# Process tickers
for ticker in tqdm(group_6, desc="Processing Group 6"):
    stock_data = data[data['Ticker'] == ticker]

    # Train-test split
    train_data = stock_data[(stock_data['Date'] >= train_start) & (stock_data['Date'] <= train_end)]
    test_data = stock_data[(stock_data['Date'] >= test_start)]

    predictions = []
    true_labels = []

    # Initial training phase
    try:
        model = auto_arima(
            train_data['Open'],
            seasonal=False,
            max_p=7,  # Constrain ARIMA parameters
            max_d=1,
            max_q=7,
            trace=False,
            error_action="ignore",
            suppress_warnings=True,
            stepwise=True
        )
    except Exception as e:
        print(f"Error training initial ARIMA model for {ticker}: {e}")
        # Record the failure so the ticker is not silently missing from the CSV.
        results.append({'Ticker': ticker, 'Accuracy': None})
        continue

    test_opens = test_data['Open'].to_numpy()
    test_moves = test_data['Movement'].to_numpy()

    # Walk-forward phase: at step i the model has seen every open up to i - 1
    # and forecasts the open at i.
    for i in range(1, len(test_opens)):
        last_open = test_opens[i - 1]

        try:
            # Feed EVERY observation (including warm-up ones) so the series the model
            # sees stays contiguous; pass a 1-element list because update expects array-like.
            model.update([last_open])

            if i < window_size:
                continue  # Warm-up: keep the model current but do not score yet

            # Predict the next day's price; np.asarray makes the [0] lookup positional
            # even if predict returns a pandas Series with a non-zero index.
            forecast = np.asarray(model.predict(n_periods=1))
            if forecast.size == 0:
                continue

            predicted_up = int(forecast[0] > last_open)
        except Exception as e:
            print(f"Error updating ARIMA model for {ticker} at iteration {i}: {e}")
            continue

        # The forecast is for the move from day i - 1 to day i, which is exactly what
        # 'Movement' of row i - 1 encodes (next open > this open). Both lists are
        # appended together so they can never get out of step.
        predictions.append(predicted_up)
        true_labels.append(int(test_moves[i - 1]))

    # Evaluate accuracy for this ticker
    if predictions:
        accuracy = accuracy_score(true_labels, predictions)
        results.append({'Ticker': ticker, 'Accuracy': accuracy})
    else:
        results.append({'Ticker': ticker, 'Accuracy': None})

# Convert to DataFrame and save results for this group
results_df = pd.DataFrame(results)
results_df.to_csv("arima_results_group_6.csv", index=False)
print("Results for Group 6 saved.")

Processing Group 6: 100%|██████████| 10/10 [02:33<00:00, 15.31s/it]

Results for Group 6 saved.





In [15]:
# Walk-forward ARIMA evaluation for ticker group 7.
# For every ticker: fit auto_arima on the training period, then step through the test
# period one day at a time, feeding each observed open to the model and scoring the
# direction of its one-step-ahead forecast. Results are written to a per-group CSV.
group_7 = groups[6]  # Seventh ticker group (group sizes can differ by one, see np.array_split)
results = []  # One {'Ticker', 'Accuracy'} record per ticker

# Process tickers
for ticker in tqdm(group_7, desc="Processing Group 7"):
    stock_data = data[data['Ticker'] == ticker]

    # Train-test split
    train_data = stock_data[(stock_data['Date'] >= train_start) & (stock_data['Date'] <= train_end)]
    test_data = stock_data[(stock_data['Date'] >= test_start)]

    predictions = []
    true_labels = []

    # Initial training phase
    try:
        model = auto_arima(
            train_data['Open'],
            seasonal=False,
            max_p=7,  # Constrain ARIMA parameters
            max_d=1,
            max_q=7,
            trace=False,
            error_action="ignore",
            suppress_warnings=True,
            stepwise=True
        )
    except Exception as e:
        print(f"Error training initial ARIMA model for {ticker}: {e}")
        # Record the failure so the ticker is not silently missing from the CSV.
        results.append({'Ticker': ticker, 'Accuracy': None})
        continue

    test_opens = test_data['Open'].to_numpy()
    test_moves = test_data['Movement'].to_numpy()

    # Walk-forward phase: at step i the model has seen every open up to i - 1
    # and forecasts the open at i.
    for i in range(1, len(test_opens)):
        last_open = test_opens[i - 1]

        try:
            # Feed EVERY observation (including warm-up ones) so the series the model
            # sees stays contiguous; pass a 1-element list because update expects array-like.
            model.update([last_open])

            if i < window_size:
                continue  # Warm-up: keep the model current but do not score yet

            # Predict the next day's price; np.asarray makes the [0] lookup positional
            # even if predict returns a pandas Series with a non-zero index.
            forecast = np.asarray(model.predict(n_periods=1))
            if forecast.size == 0:
                continue

            predicted_up = int(forecast[0] > last_open)
        except Exception as e:
            print(f"Error updating ARIMA model for {ticker} at iteration {i}: {e}")
            continue

        # The forecast is for the move from day i - 1 to day i, which is exactly what
        # 'Movement' of row i - 1 encodes (next open > this open). Both lists are
        # appended together so they can never get out of step.
        predictions.append(predicted_up)
        true_labels.append(int(test_moves[i - 1]))

    # Evaluate accuracy for this ticker
    if predictions:
        accuracy = accuracy_score(true_labels, predictions)
        results.append({'Ticker': ticker, 'Accuracy': accuracy})
    else:
        results.append({'Ticker': ticker, 'Accuracy': None})

# Convert to DataFrame and save results for this group
results_df = pd.DataFrame(results)
results_df.to_csv("arima_results_group_7.csv", index=False)
print("Results for Group 7 saved.")

Processing Group 7: 100%|██████████| 10/10 [01:08<00:00,  6.81s/it]

Results for Group 7 saved.





In [16]:
# Walk-forward ARIMA evaluation for ticker group 8.
# For every ticker: fit auto_arima on the training period, then step through the test
# period one day at a time, feeding each observed open to the model and scoring the
# direction of its one-step-ahead forecast. Results are written to a per-group CSV.
group_8 = groups[7]  # Eighth ticker group (group sizes can differ by one, see np.array_split)
results = []  # One {'Ticker', 'Accuracy'} record per ticker

# Process tickers
for ticker in tqdm(group_8, desc="Processing Group 8"):
    stock_data = data[data['Ticker'] == ticker]

    # Train-test split
    train_data = stock_data[(stock_data['Date'] >= train_start) & (stock_data['Date'] <= train_end)]
    test_data = stock_data[(stock_data['Date'] >= test_start)]

    predictions = []
    true_labels = []

    # Initial training phase
    try:
        model = auto_arima(
            train_data['Open'],
            seasonal=False,
            max_p=7,  # Constrain ARIMA parameters
            max_d=1,
            max_q=7,
            trace=False,
            error_action="ignore",
            suppress_warnings=True,
            stepwise=True
        )
    except Exception as e:
        print(f"Error training initial ARIMA model for {ticker}: {e}")
        # Record the failure so the ticker is not silently missing from the CSV.
        results.append({'Ticker': ticker, 'Accuracy': None})
        continue

    test_opens = test_data['Open'].to_numpy()
    test_moves = test_data['Movement'].to_numpy()

    # Walk-forward phase: at step i the model has seen every open up to i - 1
    # and forecasts the open at i.
    for i in range(1, len(test_opens)):
        last_open = test_opens[i - 1]

        try:
            # Feed EVERY observation (including warm-up ones) so the series the model
            # sees stays contiguous; pass a 1-element list because update expects array-like.
            model.update([last_open])

            if i < window_size:
                continue  # Warm-up: keep the model current but do not score yet

            # Predict the next day's price; np.asarray makes the [0] lookup positional
            # even if predict returns a pandas Series with a non-zero index.
            forecast = np.asarray(model.predict(n_periods=1))
            if forecast.size == 0:
                continue

            predicted_up = int(forecast[0] > last_open)
        except Exception as e:
            print(f"Error updating ARIMA model for {ticker} at iteration {i}: {e}")
            continue

        # The forecast is for the move from day i - 1 to day i, which is exactly what
        # 'Movement' of row i - 1 encodes (next open > this open). Both lists are
        # appended together so they can never get out of step.
        predictions.append(predicted_up)
        true_labels.append(int(test_moves[i - 1]))

    # Evaluate accuracy for this ticker
    if predictions:
        accuracy = accuracy_score(true_labels, predictions)
        results.append({'Ticker': ticker, 'Accuracy': accuracy})
    else:
        results.append({'Ticker': ticker, 'Accuracy': None})

# Convert to DataFrame and save results for this group
results_df = pd.DataFrame(results)
results_df.to_csv("arima_results_group_8.csv", index=False)
print("Results for Group 8 saved.")

Processing Group 8: 100%|██████████| 10/10 [02:53<00:00, 17.39s/it]

Results for Group 8 saved.





In [18]:
# Walk-forward ARIMA evaluation for ticker group 9.
# For every ticker: fit auto_arima on the training period, then step through the test
# period one day at a time, feeding each observed open to the model and scoring the
# direction of its one-step-ahead forecast. Results are written to a per-group CSV.
group_9 = groups[8]  # Ninth ticker group (group sizes can differ by one, see np.array_split)
results = []  # One {'Ticker', 'Accuracy'} record per ticker

# Process tickers
for ticker in tqdm(group_9, desc="Processing Group 9"):
    stock_data = data[data['Ticker'] == ticker]

    # Train-test split
    train_data = stock_data[(stock_data['Date'] >= train_start) & (stock_data['Date'] <= train_end)]
    test_data = stock_data[(stock_data['Date'] >= test_start)]

    predictions = []
    true_labels = []

    # Initial training phase
    try:
        model = auto_arima(
            train_data['Open'],
            seasonal=False,
            max_p=7,  # Constrain ARIMA parameters
            max_d=1,
            max_q=7,
            trace=False,
            error_action="ignore",
            suppress_warnings=True,
            stepwise=True
        )
    except Exception as e:
        print(f"Error training initial ARIMA model for {ticker}: {e}")
        # Record the failure so the ticker is not silently missing from the CSV.
        results.append({'Ticker': ticker, 'Accuracy': None})
        continue

    test_opens = test_data['Open'].to_numpy()
    test_moves = test_data['Movement'].to_numpy()

    # Walk-forward phase: at step i the model has seen every open up to i - 1
    # and forecasts the open at i.
    for i in range(1, len(test_opens)):
        last_open = test_opens[i - 1]

        try:
            # Feed EVERY observation (including warm-up ones) so the series the model
            # sees stays contiguous; pass a 1-element list because update expects array-like.
            model.update([last_open])

            if i < window_size:
                continue  # Warm-up: keep the model current but do not score yet

            # Predict the next day's price; np.asarray makes the [0] lookup positional
            # even if predict returns a pandas Series with a non-zero index.
            forecast = np.asarray(model.predict(n_periods=1))
            if forecast.size == 0:
                continue

            predicted_up = int(forecast[0] > last_open)
        except Exception as e:
            print(f"Error updating ARIMA model for {ticker} at iteration {i}: {e}")
            continue

        # The forecast is for the move from day i - 1 to day i, which is exactly what
        # 'Movement' of row i - 1 encodes (next open > this open). Both lists are
        # appended together so they can never get out of step.
        predictions.append(predicted_up)
        true_labels.append(int(test_moves[i - 1]))

    # Evaluate accuracy for this ticker
    if predictions:
        accuracy = accuracy_score(true_labels, predictions)
        results.append({'Ticker': ticker, 'Accuracy': accuracy})
    else:
        results.append({'Ticker': ticker, 'Accuracy': None})

# Convert to DataFrame and save results for this group
results_df = pd.DataFrame(results)
results_df.to_csv("arima_results_group_9.csv", index=False)
print("Results for Group 9 saved.")

Processing Group 9: 100%|██████████| 10/10 [03:50<00:00, 23.02s/it]

Results for Group 9 saved.





In [19]:
# Walk-forward ARIMA evaluation for ticker group 10.
# For every ticker: fit auto_arima on the training period, then step through the test
# period one day at a time, feeding each observed open to the model and scoring the
# direction of its one-step-ahead forecast. Results are written to a per-group CSV.
group_10 = groups[9]  # Tenth ticker group (group sizes can differ by one, see np.array_split)
results = []  # One {'Ticker', 'Accuracy'} record per ticker

# Process tickers
for ticker in tqdm(group_10, desc="Processing Group 10"):
    stock_data = data[data['Ticker'] == ticker]

    # Train-test split
    train_data = stock_data[(stock_data['Date'] >= train_start) & (stock_data['Date'] <= train_end)]
    test_data = stock_data[(stock_data['Date'] >= test_start)]

    predictions = []
    true_labels = []

    # Initial training phase
    try:
        model = auto_arima(
            train_data['Open'],
            seasonal=False,
            max_p=7,  # Constrain ARIMA parameters
            max_d=1,
            max_q=7,
            trace=False,
            error_action="ignore",
            suppress_warnings=True,
            stepwise=True
        )
    except Exception as e:
        print(f"Error training initial ARIMA model for {ticker}: {e}")
        # Record the failure so the ticker is not silently missing from the CSV.
        results.append({'Ticker': ticker, 'Accuracy': None})
        continue

    test_opens = test_data['Open'].to_numpy()
    test_moves = test_data['Movement'].to_numpy()

    # Walk-forward phase: at step i the model has seen every open up to i - 1
    # and forecasts the open at i.
    for i in range(1, len(test_opens)):
        last_open = test_opens[i - 1]

        try:
            # Feed EVERY observation (including warm-up ones) so the series the model
            # sees stays contiguous; pass a 1-element list because update expects array-like.
            model.update([last_open])

            if i < window_size:
                continue  # Warm-up: keep the model current but do not score yet

            # Predict the next day's price; np.asarray makes the [0] lookup positional
            # even if predict returns a pandas Series with a non-zero index.
            forecast = np.asarray(model.predict(n_periods=1))
            if forecast.size == 0:
                continue

            predicted_up = int(forecast[0] > last_open)
        except Exception as e:
            print(f"Error updating ARIMA model for {ticker} at iteration {i}: {e}")
            continue

        # The forecast is for the move from day i - 1 to day i, which is exactly what
        # 'Movement' of row i - 1 encodes (next open > this open). Both lists are
        # appended together so they can never get out of step.
        predictions.append(predicted_up)
        true_labels.append(int(test_moves[i - 1]))

    # Evaluate accuracy for this ticker
    if predictions:
        accuracy = accuracy_score(true_labels, predictions)
        results.append({'Ticker': ticker, 'Accuracy': accuracy})
    else:
        results.append({'Ticker': ticker, 'Accuracy': None})

# Convert to DataFrame and save results for this group
results_df = pd.DataFrame(results)
results_df.to_csv("arima_results_group_10.csv", index=False)
print("Results for Group 10 saved.")

Processing Group 10: 100%|██████████| 10/10 [03:03<00:00, 18.34s/it]

Results for Group 10 saved.





In [20]:
# Read all per-group result CSVs.
# glob returns paths in arbitrary order, so sort them to make the combined file
# reproducible; sorting by (length, name) yields natural order (group_2 before group_10).
csv_files = sorted(glob.glob("arima_results_group_*.csv"), key=lambda path: (len(path), path))
if not csv_files:
    # pd.concat on an empty list fails with an unhelpful message; say what is actually missing.
    raise FileNotFoundError(
        "No 'arima_results_group_*.csv' files found; run the group cells first."
    )
all_results = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)

# Save combined results
all_results.to_csv("arima_results_combined.csv", index=False)
print("Combined results saved to 'arima_results_combined.csv'.")

Combined results saved to 'arima_results_combined.csv'.
