# Technical Analysis Indicator Price Prediction
The goal of this project is to analyze the predictive power of the top 10 most popular TA indicators and see how well they do to predict price over a 30 day period. I am going to find the value of the indicators on day 1 (30 trading days ago) and then find the daily closing price for 30 days later and measure how well the indicator predicted the price.  

first we'll find the top 500 stocks by market cap from nasdaq and pull them into a dataframe


In [1]:
# Importing pandas library for data manipulation and analysis
import pandas as pd

# Load the CSV file into a DataFrame
csv_file_path = '/Users/evancallaghan/Downloads/nasdaq_screener_1726538993372.csv' 
df = pd.read_csv(csv_file_path)

# Inspect the DataFrame to understand its structure
print(df.head())

# Filter DataFrame to only show the columns 'Symbol', 'Name', and 'Market Cap'
df = df[['Symbol', 'Name', 'Market Cap']]

# Convert 'Market Cap' to numeric if it's not already
# Remove commas, dollar signs, and replace these symbols with empty spaces
df['Market Cap'] = df['Market Cap'].replace({'\$': '', ',': ''}, regex=True).astype(float)

# Sort the DataFrame by Market Cap in descending order
df_sorted = df.sort_values(by='Market Cap', ascending=False).head(5000)                                                                        
df_sorted.head()


  Symbol                                               Name Last Sale  \
0      A             Agilent Technologies Inc. Common Stock   $138.31   
1     AA                    Alcoa Corporation Common Stock     $34.50   
2   AACG   ATA Creativity Global American Depositary Shares   $0.5025   
3   AACT  Ares Acquisition Corporation II Class A Ordina...    $10.80   
4   AADI                  Aadi Bioscience Inc. Common Stock     $1.88   

   Net Change % Change    Market Cap        Country  IPO Year    Volume  \
0      1.0000   0.728%  3.974029e+10  United States    1999.0    887040   
1      1.9800   6.089%  8.912735e+09  United States    2016.0  10730428   
2     -0.0275  -5.189%  1.608006e+07          China    2008.0     25043   
3      0.0200   0.186%  0.000000e+00            NaN    2023.0     35074   
4      0.0800   4.444%  4.627589e+07  United States       NaN     81942   

        Sector                                          Industry  
0  Industrials  Biotechnology: Laboratory A

Unnamed: 0,Symbol,Name,Market Cap
15,AAPL,Apple Inc. Common Stock,3288959000000.0
4208,MSFT,Microsoft Corporation Common Stock,3206167000000.0
4559,NVDA,NVIDIA Corporation Common Stock,2864613000000.0
2819,GOOG,Alphabet Inc. Class C Capital Stock,1957167000000.0
2820,GOOGL,Alphabet Inc. Class A Common Stock,1945719000000.0


In [2]:
# Reset the index of the DataFrame and drop the old index
df_sorted.reset_index(drop=True, inplace=True)

# Update the index to start from 1 instead of 0
df_sorted.index = df_sorted.index + 1

# Display the first few rows of the updated DataFrame
df_sorted.head()

Unnamed: 0,Symbol,Name,Market Cap
1,AAPL,Apple Inc. Common Stock,3288959000000.0
2,MSFT,Microsoft Corporation Common Stock,3206167000000.0
3,NVDA,NVIDIA Corporation Common Stock,2864613000000.0
4,GOOG,Alphabet Inc. Class C Capital Stock,1957167000000.0
5,GOOGL,Alphabet Inc. Class A Common Stock,1945719000000.0


remove all stocks except common stocks

In [3]:
# Ensure there are no leading or trailing whitespaces in the 'Name' column
df_sorted['Name'] = df_sorted['Name'].str.strip()

# List of terms to filter out
terms_to_drop = ["Capital Stock", "Depository Shares", "Global Notes", "ADS", 
                 "Registry Shares", "Depositary Shares"
]

# Create a regex pattern to match any of the terms
# //b ensures that the match occues only at the start or end of a word
# pipe '|' ensures that if any of the terms in 'terms_to_drop' are seen, 
# there is a match
pattern = '|'.join([f"\\b{term}\\b" for term in terms_to_drop])

# Apply filtering based on the updated pattern
df_filtered = df_sorted[~df_sorted['Name'].str.contains(pattern, case=False, 
                                                        na=False)
]

# Display the filtered DataFrame
df_filtered.head()

Unnamed: 0,Symbol,Name,Market Cap
1,AAPL,Apple Inc. Common Stock,3288959000000.0
2,MSFT,Microsoft Corporation Common Stock,3206167000000.0
3,NVDA,NVIDIA Corporation Common Stock,2864613000000.0
5,GOOGL,Alphabet Inc. Class A Common Stock,1945719000000.0
6,AMZN,Amazon.com Inc. Common Stock,1940525000000.0


In [7]:
# Reset the index of the DataFrame and drop the old index
df_filtered.reset_index(drop=True, inplace=True)

# Update the index to start from 1 instead of 0
df_filtered.index = df_filtered.index + 1

# Display the first few rows of the updated DataFrame
df_filtered.head()

Unnamed: 0,Symbol,Name,Market Cap
1,AAPL,Apple Inc. Common Stock,3288959000000.0
2,MSFT,Microsoft Corporation Common Stock,3206167000000.0
3,NVDA,NVIDIA Corporation Common Stock,2864613000000.0
4,GOOGL,Alphabet Inc. Class A Common Stock,1945719000000.0
5,AMZN,Amazon.com Inc. Common Stock,1940525000000.0


below are the 10 technical indicators we are going to use for this project.
1. Relative Strength Index (RSI)
2. Moving Average Convergence Divergence (MACD)
3. Stochastic Oscillator
4. Simple Moving Average (SMA)
5. Exponential Moving Average (EMA)
6. Volume Weighted Average Price (VWAP)
7. Bollinger Bands
8. Average True Range (ATR)
9. Fibonacci Retracement 

In [None]:
# There is a CSV file I have provided so this code does not need to be run again
# Computationally intensive

import yfinance as yf
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# Function to download stock data for a single stock
def download_stock_data(ticker):
    try:
        data = yf.download(ticker, start="2022-02-10", end="2025-02-10", interval="1d")[['Close', 'High', 'Low', 'Volume']]
        if data.empty:
            print(f"Warning: No data found for {ticker} (possibly due to non-trading days like weekends or holidays)")
            return None  # Return None if the data is empty
        return data
    except Exception as e:
        print(f"Error downloading data for {ticker}: {e}")
        return None  # Return None if there is any error (e.g., stock not found)

# List of tickers from your df_filtered dataframe
tickers = df_filtered['Symbol'].tolist()

# Batch size for processing tickers in chunks
batch_size = 100

# Create a function to download data for a batch of tickers in parallel
def download_batch(batch_tickers):
    with ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(download_stock_data, batch_tickers))
    # Remove None values from the results
    return [result for result in results if result is not None]

# Loop through the tickers in batches
for i in range(0, len(tickers), batch_size):
    batch_tickers = tickers[i:i + batch_size]
    results = download_batch(batch_tickers)

    if results:  # Check if results are not empty
        # Combine all individual stock data into a single dataframe
        df_batch = pd.concat(results, keys=batch_tickers)

        # Save the data to CSV for the current batch
        df_batch.to_csv(f'/content/drive/MyDrive/stock_data_yahoo_{i // batch_size}.csv')
        print(f"Downloaded batch {i // batch_size} and saved to CSV")
    else:
        print(f"Batch {i // batch_size} has no data. Skipping...")
        # Optionally, log the tickers that failed for this batch
        print(f"Failed tickers in batch {i // batch_size}: {batch_tickers}")


In [10]:
import pandas as pd

# Initialize an empty list to hold DataFrames
df_list = []

# List of specific file indices
file_indices = [0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 27, 28, 29, 30, 45, 44, 26, 18, 46]

# Loop through the specific CSV file indices
for i in file_indices:
    # Construct the file path for each batch
    csv_file_path = f'/Users/evancallaghan/flatiron_ds/phase_5/capstone_project/stock_data_yahoo_{i}.csv'

    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_file_path)

    # Rename 'Price' column to 'Symbol'
    df = df.rename(columns={'Price': 'Symbol'})

    # Append the DataFrame to the list
    df_list.append(df)

# Concatenate all DataFrames in the list along the rows (axis=0)
df_all = pd.concat(df_list, ignore_index=True)

# Display the first few rows of the merged DataFrame
df_all.head()


  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)
  df = pd.read_csv(csv_file_path)


Unnamed: 0,Symbol,Unnamed: 1,Close,Close.1,High,High.1,Low,Low.1,Volume,Volume.1,...,Low.21,Volume.21,Close.22,High.22,Low.22,Volume.22,Close.23,High.23,Low.23,Volume.23
0,Ticker,,CCCS,POST,CCCS,POST,CCCS,POST,CCCS,POST,...,WTS,WTS,,,,,,,,
1,,Date,,,,,,,,,...,,,,,,,,,,
2,AAPL,2022-02-10,10.520000457763672,69.12957763671875,10.729999542236328,70.2225112915039,10.199999809265137,68.92015838623047,1037700.0,642524.0,...,,,,,,,,,,
3,AAPL,2022-02-11,10.09000015258789,69.76439666748047,10.489999771118164,70.5235595703125,10.020000457763672,68.95942687988281,480300.0,492169.0,...,,,,,,,,,,
4,AAPL,2022-02-14,10.220000267028809,71.02094268798828,10.460000038146973,71.27617645263672,9.970000267028809,69.64659881591797,724400.0,672473.0,...,,,,,,,,,,
