<a href="https://colab.research.google.com/github/diejo57/Quant-Analysis/blob/main/TSLA_daily_returns_1000shares.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import yfinance as yf
from datetime import date
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # Import numpy for calculations

print("✅ Using pre-installed libraries. No installation needed.")

✅ Using pre-installed libraries. No installation needed.


In [8]:
# --- Load the tickers from the provided CSV file ---
try:
    screener_df = pd.read_csv('/content/nasdaq_screener_1758363072383.csv')
    nasdaq_tickers = screener_df['Symbol'].head(1500).tolist()
    if 'TSLA' in nasdaq_tickers:
        nasdaq_tickers.remove('TSLA')
    print(f"Loaded {len(nasdaq_tickers)} tickers from the CSV file.")
except FileNotFoundError:
    print("Error: The file 'nasdaq_screener_1758363072383.csv' was not found.")
    nasdaq_tickers = []

# --- Define our essential tickers ---
additional_tickers = [
    'TSLA', '^GSPC', '^IXIC', '^VIX', '^TNX', 'DX-Y.NYB', 'CL=F', 'BTC-USD',
]

# --- Combine lists and clean symbols ---
all_tickers = list(set(nasdaq_tickers + additional_tickers))
cleaned_tickers = [ticker.replace('/', '-').replace('.', '-') for ticker in all_tickers]

print(f"Assembled and cleaned a final list of {len(cleaned_tickers)} unique tickers for analysis.")

Loaded 1500 tickers from the CSV file.
Assembled and cleaned a final list of 1508 unique tickers for analysis.


In [9]:
# --- Define the key dates for our test ---
prediction_date = date(2025, 9, 19)
end_of_training_data = date(2025, 9, 18)
start_date = end_of_training_data - pd.DateOffset(years=5)

# --- Download and Clean Data ---
if cleaned_tickers:
    raw_data = yf.download(
        tickers=cleaned_tickers,
        start=start_date,
        end=prediction_date + pd.DateOffset(days=1),
        group_by='ticker',
        threads=True,
        progress=True
    )

    # --- Robust Data Cleaning ---
    data = raw_data.stack(level=0)['Close'].unstack()
    data.fillna(method='ffill', inplace=True)
    min_valid_obs = int(len(data) * 0.90)
    data.dropna(axis=1, thresh=min_valid_obs, inplace=True)
    data.dropna(inplace=True)

    print(f"✅ Successfully downloaded and cleaned data for {data.shape[1]} assets.")

  raw_data = yf.download(
[                       0%                       ]  7 of 1508 completedERROR:yfinance:HTTP Error 404: 
[*********************100%***********************]  1508 of 1508 completed
ERROR:yfinance:
97 Failed downloads:
ERROR:yfinance:['ASB^F', 'ABR^E', 'BFS^E', 'BC^A', 'AHT^I', 'BC^C', 'ARES^B', 'ALL^H', 'ATCO^H', 'ALB^A', 'CDR^B', 'CIM^A', 'AHT^F', 'BCV^A', 'AHT^G', 'AHL^D', 'BAC^P', 'ANG^D', 'AGM^G', 'CFG^E', 'CMS^C', 'AMH^H', 'BA^A', 'C^N', 'BML^J', 'BAC^O', 'BHR^B', 'AGM^E', 'AHT^D', 'BML^G', 'CIM^D', 'ADC^A', 'ALTG^A', 'ABR^F', 'BAC^M', 'BOH^B', 'AMH^G', 'ACR^C', 'BML^L', 'AHH^A', 'BAC^B', 'CHMI^A', 'ALL^B', 'CDR^C', 'BIP^B', 'CMS^B', 'BANC^F', 'AGM^H', 'ANG^B', 'ACR^D', 'ABR^D', 'BK^K', 'BOH^A', 'BHR^D', 'AUB^A', 'AXS^E', 'BFS^D', 'CFR^B', 'CHMI^B', 'CFG^H', 'CADE^A', 'AHL^F', 'ALL^J', 'ATH^B', 'CMA^B', 'CNO^A', 'CIM^B', 'CIO^A', 'ALL^I', 'DX-Y-NYB', 'BW^A', 'BAC^S', 'ATH^E', 'ATH^A', 'BML^H', 'BAC^N', 'ATH^D', 'BAC^K', 'CMRE^D', 'CMRE^B', 'BAC^Q', 'AGM^D', 

✅ Successfully downloaded and cleaned data for 979 assets.


  data.fillna(method='ffill', inplace=True)


In [10]:
# --- Create a DataFrame to hold the indicators ---
indicators = pd.DataFrame(index=data.index)

# --- 1. Simple Moving Average (SMA) ---
indicators['TSLA_SMA50'] = data['TSLA'].rolling(window=50).mean()

# --- 2. Bollinger Bands ---
window = 20
sma_20 = data['TSLA'].rolling(window=window).mean()
std_dev = data['TSLA'].rolling(window=window).std()
indicators['TSLA_BB_UPPER'] = sma_20 + (std_dev * 2)
indicators['TSLA_BB_LOWER'] = sma_20 - (std_dev * 2)

# --- 3. Relative Strength Index (RSI) ---
delta = data['TSLA'].diff()
gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
rs = gain / loss
indicators['TSLA_RSI'] = 100 - (100 / (1 + rs))

print("✅ Technical indicators calculated manually.")

✅ Technical indicators calculated manually.


In [11]:
# --- Calculate daily returns ---
returns = data.pct_change()

# --- Create Lagged Features ---
features = returns.copy().shift(1)
lagged_indicators = indicators.shift(1)

# --- Create the Regression Target Variable ---
target = returns['TSLA'].rename('Target_Return')

# --- Combine all features into a final DataFrame ---
final_df = features.join(target, how='inner')
final_df = final_df.join(lagged_indicators, how='inner')

# --- NEW: Clean non-finite values (NaN, inf, -inf) that can cause errors ---
final_df.replace([np.inf, -np.inf], np.nan, inplace=True)
final_df.dropna(inplace=True)

print("✅ Final DataFrame created and cleaned.")

✅ Final DataFrame created and cleaned.


In [12]:
# --- Ensure the index is a DatetimeIndex before proceeding ---
final_df.index = pd.to_datetime(final_df.index)

# --- A more robust way to isolate the data by position ---
# The last row contains our known outcome (for Sep 19th)
last_day_data = final_df.iloc[-1]
# The second-to-last row contains the features we use for prediction (from Sep 18th)
prediction_input_day = final_df.iloc[-2]

# --- Isolate the data for our single-day backtest ---
X_predict = prediction_input_day.drop('Target_Return').to_frame().T
actual_result = last_day_data['Target_Return']

# --- Correctly define the training data (everything EXCEPT the last two days) ---
training_data = final_df.iloc[:-2]
X_train = training_data.drop(columns='Target_Return')
y_train = training_data['Target_Return']

# --- Train the scikit-learn regression model ---
model = RandomForestRegressor(
    n_estimators=100,
    min_samples_split=10,
    random_state=42,
    verbose=1,
    n_jobs=-1
)
print(f"--- Starting Model Training on data up to {training_data.index[-1].date()} ---")
model.fit(X_train, y_train)
print("--- Model Training Complete ---")

# --- Make the prediction ---
predicted_result = model.predict(X_predict)[0]

# --- Display Final Result ---
print("\n=========================================================")
print(f"  Return Prediction Backtest (for {last_day_data.name.date()})  ")
print("=========================================================")
print(f"Predicted Return: {predicted_result * 100: .4f}%")
print(f"Actual Return:    {actual_result * 100: .4f}%")
print("=========================================================")

--- Starting Model Training on data up to 2025-09-17 ---


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.9min


--- Model Training Complete ---

  Return Prediction Backtest (for 2025-09-19)  
Predicted Return:  1.3174%
Actual Return:     2.2118%


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.3min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished
