In [1]:
import yfinance as yf
import pandas as pd

In [3]:
from google.colab import drive

# Mount Google Drive into the Colab runtime's filesystem.
# NOTE(review): no later visible cell reads from this mount point — confirm
# the mount is still needed.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


In [5]:
# Install necessary libraries
!pip install yfinance pandas numpy scikit-learn tensorflow matplotlib



In [6]:
import nltk

# Fetch the lexicon resource used by the VADER SentimentIntensityAnalyzer.
# The call is the cell's last expression, so its return value is displayed.
VADER_LEXICON_ID = 'vader_lexicon'
nltk.download(VADER_LEXICON_ID)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [7]:
# --- Step 1: Data Acquisition ---
# Symbol and date window passed to yf.download in the next cell.
ticker = "AAPL"
start_date, end_date = "2020-01-01", "2023-01-01"

In [8]:
# 1. Get historical stock data
# auto_adjust is passed explicitly so the price columns mean the same thing
# across yfinance versions.
# NOTE(review): the warning fragment captured in this cell's output looks like
# yfinance's notice that the auto_adjust default changed to True — confirm that
# adjusted prices are what the model should be trained on.
df_stock = yf.download(ticker, start=start_date, end=end_date, auto_adjust=True)

# yfinance reports a failed download by returning an empty frame rather than
# raising, so fail here with a clear message instead of deep in preprocessing.
if df_stock.empty:
    raise RuntimeError(
        f"No price data returned for {ticker!r} between {start_date} and {end_date}"
    )

  df_stock = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed


In [9]:
# 2. Get news sentiment data (conceptual, for demonstration)
def get_sentiment_data(news_headlines=None):
    """Score news headlines with VADER and return one sentiment value per date.

    Parameters
    ----------
    news_headlines : list of dict, optional
        Records with a 'date' (anything ``pd.to_datetime`` accepts) and a
        'headline' string. When omitted, a small static demonstration list is
        used; in a real project these would come from a news API.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date' (datetime64) and 'Sentiment' (VADER compound score),
        sorted by date with exactly one row per date. Several headlines on the
        same date are averaged, so merging on 'Date' never duplicates rows.
    """
    if news_headlines is None:
        # We use a static list to illustrate the concept.
        news_headlines = [
            {'date': '2020-01-02', 'headline': 'Apple stock soars after positive earnings report.'},
            {'date': '2020-01-03', 'headline': 'Apple faces new lawsuit over privacy policies.'},
            {'date': '2020-01-06', 'headline': 'Analyst predicts record sales for new iPhone.'},
            {'date': '2020-01-07', 'headline': 'Negative outlook for Apple due to supply chain disruption.'},
            {'date': '2021-02-15', 'headline': 'Apple announces innovative new product, stock gains.'},
            {'date': '2021-03-20', 'headline': 'Concerns over chip shortages may impact Apple production.'},
            {'date': '2022-05-10', 'headline': "Apple's CEO delivers a confident speech on future growth."},
            {'date': '2022-08-25', 'headline': 'Global economic slowdown could hurt Apple sales.'},
        ]

    sid = SentimentIntensityAnalyzer()

    # Keep only the compound score from each VADER result
    sentiment_scores = [
        {'Date': news['date'], 'Sentiment': sid.polarity_scores(news['headline'])['compound']}
        for news in news_headlines
    ]

    # Explicit columns keep the frame well-formed even when there are no headlines
    df_sentiment = pd.DataFrame(sentiment_scores, columns=['Date', 'Sentiment'])
    df_sentiment['Date'] = pd.to_datetime(df_sentiment['Date'])

    # Collapse multiple headlines per day into a single (mean) score
    df_sentiment = df_sentiment.groupby('Date', as_index=False)['Sentiment'].mean()

    return df_sentiment

In [10]:
# Score the headlines and key the result by date for the merge that follows.
df_sentiment = get_sentiment_data().set_index('Date')


In [None]:
# Merge the dataframes
# Move the date index into a column. Guarded so that re-running this cell does
# not reset the index a second time.
if 'Date' not in df_stock.columns:
    df_stock = df_stock.reset_index()

# Flatten multi-level columns, e.g. ('Close', 'AAPL') -> 'Close_AAPL' and
# ('Date', '') -> 'Date'. Only applied to a real MultiIndex: joining a plain
# string column name would split it into characters.
if isinstance(df_stock.columns, pd.MultiIndex):
    df_stock.columns = ['_'.join(col).strip('_') for col in df_stock.columns.values]

# An unnamed index comes out of reset_index() as 'index' (or 'level_0')
df_stock = df_stock.rename(columns={'index': 'Date', 'level_0': 'Date'})

# Later cells expect '<Field>_<ticker>' names (e.g. 'Close_AAPL'); add the
# suffix when the download returned single-level columns without it.
ticker_suffix = f'_{ticker}'
df_stock = df_stock.rename(
    columns=lambda name: name
    if name == 'Date' or str(name).endswith(ticker_suffix)
    else f'{name}{ticker_suffix}'
)
df_stock['Date'] = pd.to_datetime(df_stock['Date'])

# Left join keeps every stock row; days without a headline get NaN sentiment.
# NOTE(review): headlines dated on days with no stock row (e.g. 2021-03-20 is a
# Saturday) presumably never match and are silently dropped — consider rolling
# them forward to the next trading day.
df = pd.merge(df_stock, df_sentiment, on='Date', how='left')

In [None]:
# Days with no matching headline have no sentiment after the left merge;
# treat them as neutral.
NEUTRAL_SENTIMENT = 0
df['Sentiment'] = df['Sentiment'].fillna(NEUTRAL_SENTIMENT)

In [None]:
# --- Step 2: Data Preprocessing and Feature Engineering ---
def add_indicators(df, price_col='Close_AAPL', sma_window=50, rsi_window=14):
    """Add a simple moving average and an RSI column to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``price_col``. It is modified in place; pass a copy
        if the original must stay untouched.
    price_col : str, default 'Close_AAPL'
        Name of the price column the indicators are computed from.
    sma_window : int, default 50
        Window of the simple moving average. The column is named
        ``f'SMA_{sma_window}'`` ('SMA_50' with the default).
    rsi_window : int, default 14
        Number of price changes averaged for the RSI.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two indicator columns added. Rows
        without a full window of history hold NaN.
    """
    close = df[price_col]

    # Simple Moving Average (SMA)
    df[f'SMA_{sma_window}'] = close.rolling(window=sma_window).mean()

    # Relative Strength Index (RSI) from simple rolling means of gains/losses
    delta = close.diff()
    # clip() keeps the undefined first delta as NaN, so the first RSI value is
    # only produced once `rsi_window` real price changes are available
    # (replacing it with 0 would bias the first window).
    gain = delta.clip(lower=0).rolling(window=rsi_window).mean()
    loss = (-delta).clip(lower=0).rolling(window=rsi_window).mean()
    # A window with no losses gives rs = inf, which maps to RSI = 100
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))
    return df

# Compute the indicators on a copy so `df` itself is not modified, then discard
# every row that still contains a missing value (e.g. the rolling-window warm-up).
df_processed = add_indicators(df.copy()).dropna()

# Select features and the target variable.
# 'Close_AAPL' stays first: create_sequences takes column 0 as the target and
# the inverse transform further down writes into column 0.
features = ['Close_AAPL', 'Volume_AAPL', 'SMA_50', 'RSI', 'Sentiment']
data = df_processed.loc[:, features].to_numpy()

# Scale every feature to the range 0 to 1.
# NOTE(review): the scaler is fitted on all rows, including those that later
# end up in the test split.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit(data).transform(data)

# Create sequences for the LSTM model
def create_sequences(data, sequence_length, target_col=0):
    """Slice a 2-D feature matrix into overlapping windows for sequence models.

    Sample ``i`` consists of rows ``i .. i + sequence_length - 1`` (all
    columns) as input and the value of ``target_col`` in row
    ``i + sequence_length`` as the target.

    Parameters
    ----------
    data : array-like of shape (n_rows, n_features)
        Feature matrix in time order.
    sequence_length : int
        Number of consecutive rows per input window; must be at least 1.
    target_col : int, default 0
        Column index of the value to predict.

    Returns
    -------
    X : numpy.ndarray of shape (n_samples, sequence_length, n_features)
    y : numpy.ndarray of shape (n_samples,)
        ``n_samples`` is ``max(n_rows - sequence_length, 0)``. When there are
        too few rows the arrays are empty but keep their full shape.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

    data = np.asarray(data)
    n_samples = max(len(data) - sequence_length, 0)

    # Row indices of every window at once: window i covers rows i .. i+sequence_length-1
    window_rows = np.arange(n_samples)[:, None] + np.arange(sequence_length)[None, :]
    X = data[window_rows]
    # The target is the row right after each window
    y = data[sequence_length:, target_col]
    return X, y

from tensorflow.keras.layers import Input
from tensorflow.keras.utils import set_random_seed

sequence_length = 60
train_fraction = 0.8

# Split data into training and testing sets (chronological: the first 80% of
# the sequences train the model, the remainder is held out)
train_size = int((len(data) - sequence_length) * train_fraction)
if train_size < 1:
    raise ValueError(
        f"Need more than {sequence_length} rows after preprocessing to build "
        f"training sequences, got {len(data)}"
    )

# Training sequences touch rows [0, train_size + sequence_length): the inputs of
# the last training sample end one row before its target row.
n_train_rows = train_size + sequence_length

# Fit the scaler on the training rows only, so that the min/max of the held-out
# period does not leak into training. Test values may therefore fall slightly
# outside the 0-1 range.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(data[:n_train_rows])
scaled_data = scaler.transform(data)

X, y = create_sequences(scaled_data, sequence_length)
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

# --- Step 3: Model Building and Training ---
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable from run to run
set_random_seed(42)

model = Sequential()
model.add(Input(shape=(X_train.shape[1], X_train.shape[2]))) # Using Input layer as suggested by warning
model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=1))

model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=25, batch_size=32, verbose=1)


def _to_price(scaled_target):
    """Convert scaled target-column values back to price units.

    The scaler was fitted on all feature columns, so the values are placed in
    column 0 of a zero-filled matrix of the right width before inverting.
    """
    padded = np.zeros((len(scaled_target), len(features)))
    padded[:, 0] = scaled_target
    return scaler.inverse_transform(padded)[:, 0]


# Make predictions and inverse transform them
predictions = model.predict(X_test)
y_test_actual = _to_price(y_test)
predictions_actual = _to_price(predictions.flatten())

# --- Step 4: Visualization ---
# Plot against the actual trading dates of the test targets; the frame's index
# holds integer row labels, not dates.
test_dates = df_processed['Date'].iloc[n_train_rows:]

fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('Stock Price Prediction with External Data')
ax.set_xlabel('Date')
ax.set_ylabel('Stock Price (USD)')
ax.plot(test_dates, y_test_actual, label='Actual Price', color='blue')
ax.plot(test_dates, predictions_actual, label='Predicted Price', color='red')
ax.legend()
plt.show()