<a href="https://colab.research.google.com/github/gtyellow/stockbot/blob/main/StockTradingBot2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#!pip install yfinance sec-api vaderSentiment scikit-learn xgboost tensorflow newsapi-python


In [19]:
!pip install finnhub-python

Collecting finnhub-python
  Downloading finnhub_python-2.4.20-py3-none-any.whl.metadata (9.0 kB)
Downloading finnhub_python-2.4.20-py3-none-any.whl (11 kB)
Installing collected packages: finnhub-python
Successfully installed finnhub-python-2.4.20


In [34]:
import yfinance as yf
from sec_api import QueryApi
import finnhub
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [22]:


# SEC API initialization
sec_api = QueryApi(api_key='680a77dac860076b7302bbf2789c1b66787a98d94189fb328954d42fd5806439')
finnhub_client = finnhub.Client(api_key="crku8ghr01qhc7mjec30crku8ghr01qhc7mjec3g")

# Stock symbols
stocks = ['W', 'PTON', 'SHC', 'CVNA', 'ARWR', 'EVA', 'CHGG', 'CALX', 'LIFW', 'ELF',
          'HP', 'FRO', 'CEIX', 'SMCI', 'ANET', 'LSCC', 'TTD', 'FSLR',
          'PSTG', 'GNRC', 'NUS', 'TDC', 'IOT']

In [23]:

# Fetch historical price data
def get_stock_data(symbols, start="2023-01-01", end="2023-12-31"):
    all_data = {}
    for symbol in symbols:
        df = yf.download(symbol, start=start, end=end)
        all_data[symbol] = df
    return all_data

In [24]:

# Fetch 10-K, 10-Q filings from SEC
def fetch_financial_statements(stock):
    query = {
        "query": { "query_string": { "query": f"ticker:{stock} AND formType:(10-K OR 10-Q)" } },
        "from": "2023-01-01",
        "to": "2023-12-31",
        "sort": [{"filedAt": {"order": "desc"}}]
    }
    filings = sec_api.get_filings(query)
    return filings

In [27]:



# News sentiment analysis
def get_news_sentiment(symbol, start_date, end_date):
    analyzer = SentimentIntensityAnalyzer()
    try:
        news = finnhub_client.company_news(symbol, _from=start_date, to=end_date)
    except Exception as e:
        print(f"Error fetching news for {symbol}: {e}")
        return 0

    sentiment_scores = []

    for article in news:
        text = article.get('headline', '') + ' ' + article.get('summary', '')
        if text:
            sentiment = analyzer.polarity_scores(text)
            sentiment_scores.append(sentiment['compound'])

    if sentiment_scores:
        avg_sentiment = sum(sentiment_scores) / len(sentiment_scores)
    else:
        avg_sentiment = 0

    return avg_sentiment


In [29]:
def extract_financial_metrics(filings):
    # This is a placeholder function. Extracting financial metrics from filings requires parsing the documents.
    # For simplicity, we'll return dummy data. In a real scenario, you would parse the filings to extract actual data.
    metrics = {
        'revenue': np.random.uniform(100, 1000),  # Placeholder for revenue
        'net_income': np.random.uniform(-100, 500),  # Placeholder for net income
    }
    return metrics

In [30]:
def prepare_training_data(stock_data, financial_data, news_sentiment):
    all_features = []
    all_targets = []

    for stock, data in stock_data.items():
        prices = data['Close'].pct_change().fillna(0)  # Daily percentage change
        target = np.sign(prices.values[1:])  # Skip the first NaN value

        # Financial metrics
        financial_metrics = extract_financial_metrics(financial_data[stock])

        # Repeat financial metrics to match the length of the price data
        revenue = np.full(len(target), financial_metrics['revenue'])
        net_income = np.full(len(target), financial_metrics['net_income'])

        # News sentiment
        sentiment = np.full(len(target), news_sentiment.get(stock, 0))

        # Features: [Revenue, Net Income, Sentiment]
        features = np.column_stack((revenue, net_income, sentiment))

        all_features.append(features)
        all_targets.append(target)

    X = np.vstack(all_features)
    y = np.hstack(all_targets)

    return X, y

In [32]:
# Scale features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)


In [31]:
# Fetch data
stock_data = get_stock_data(stocks)
financial_data = {stock: fetch_financial_statements(stock) for stock in stocks}
news_sentiment = {stock: get_news_sentiment(stock, "2023-01-01", "2023-12-31") for stock in stocks}

# Prepare training data
X, y = prepare_training_data(stock_data, financial_data, news_sentiment)

# Remove any potential NaN or infinite values
X = np.nan_to_num(X)
y = np.nan_to_num(y)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [36]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize the XGBoost classifier
model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, use_label_encoder=False, eval_metric='logloss')

# Map y labels to [0, 1, 2]
y_train_mapped = np.where(y_train == -1, 0, y_train + 1)
y_test_mapped = np.where(y_test == -1, 0, y_test + 1)

# Now y_train_mapped and y_test_mapped will be in the range [0, 1, 2]


In [37]:

# Train the model
model.fit(X_train, y_train_mapped)

# Evaluate the model using remapped labels
y_pred_mapped = model.predict(X_test)

# Reverse the mapping to [-1, 0, 1] for evaluation
y_pred = np.where(y_pred_mapped == 0, -1, y_pred_mapped - 1)

# Check accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy * 100:.2f}%")

Model accuracy: 52.53%


Parameters: { "use_label_encoder" } are not used.



In [38]:
# Define backtest stocks
backtest_stocks = ['ZS', 'SNOW', 'CRWD', 'MDB', 'OKTA']  # Example stocks

# Fetch backtest stock data
backtest_stock_data = get_stock_data(backtest_stocks)
backtest_financial_data = {stock: fetch_financial_statements(stock) for stock in backtest_stocks}
backtest_news_sentiment = {stock: get_news_sentiment(stock, "2023-01-01", "2023-12-31") for stock in backtest_stocks}

# Prepare backtest data
X_backtest, y_backtest = prepare_training_data(backtest_stock_data, backtest_financial_data, backtest_news_sentiment)
X_backtest_scaled = scaler.transform(X_backtest)

# Predict and evaluate
y_backtest_pred = model.predict(X_backtest_scaled)
backtest_accuracy = accuracy_score(y_backtest, y_backtest_pred)

print(f"Backtest accuracy: {backtest_accuracy * 100:.2f}%")


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Backtest accuracy: 0.08%
