In [1]:
pip install numpy yfinance requests feedparser beautifulsoup4 nltk vaderSentiment streamlit joblib matplotlib

Note: you may need to restart the kernel to use updated packages.


In [20]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt

In [3]:
!pip install scikit-learn



## Download/ Fetch Price

In [21]:
import yfinance as yf
import pandas as pd

def download_price(ticker, start='2018-01-01', end=None):
    df = yf.download(ticker, start=start, end=end, progress=False, auto_adjust=False)
    df.index = pd.to_datetime(df.index).tz_localize(None)
    return df[['Open','High','Low','Close','Adj Close','Volume']]


In [None]:
# test
df = download_price("AAPL", start="2020-01-01")
print(df.head())


In [None]:
df.head()

In [7]:
df.to_csv(r"E:\DEV\Data_science\DataScience\Assignments\stock-sentiment\data\AAPL_price.csv")

In [8]:
!pip install python-dateutil



## Google News RSS (no API key)

In [9]:
# save as rss_fetch.p
import feedparser
import pandas as pd
from dateutil import parser as dateparser
import time

def fetch_google_news_rss(query, max_items=200, country='US', lang='en'):
    q = query.replace(' ', '+')
    rss_url = f"https://news.google.com/rss/search?q={q}&hl={lang}&gl={country}&ceid={country}:{lang}"
    feed = feedparser.parse(rss_url)
    items = []
    for entry in feed.entries[:max_items]:
        published = entry.get('published') or entry.get('updated') or entry.get('pubDate')
        published_iso = dateparser.parse(published).isoformat() if published else None
        items.append({
            "publishedAt": published_iso,
            "date": pd.to_datetime(published_iso, errors='coerce').date() if published_iso else None,
            "source": entry.get('source', {}).get('title') or entry.get('source'),
            "title": entry.get('title'),
            "summary": entry.get('summary'),
            "url": entry.get('link')
        })
    df = pd.DataFrame(items)
    return df


if __name__ == "__main__":
    df = fetch_google_news_rss('AAPL OR Apple stock', max_items=200)
    df.to_csv(r"E:\DEV\Data_science\DataScience\Assignments\stock-sentiment\data\AAPL_news_rss.csv", index=False)
    print(df.shape)


(100, 6)


## Clean Headlines

In [10]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hardik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Download NLTK stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hardik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# ========== 2. Load CSV files ==========
price = pd.read_csv(r"E:\DEV\Data_science\DataScience\Assignments\stock-sentiment\data\AAPL_price.csv",skiprows=2)
news = pd.read_csv(r"E:\DEV\Data_science\DataScience\Assignments\stock-sentiment\data\AAPL_news_rss.csv")

In [13]:
print("Price columns:", price.columns)
print("News columns:", news.columns)

Price columns: Index(['Date', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6'],
      dtype='object')
News columns: Index(['publishedAt', 'date', 'source', 'title', 'summary', 'url'], dtype='object')


In [14]:
# Ensure datetime
price['Date'] = pd.to_datetime(price['Date'])
news['publishedAt'] = pd.to_datetime(news['publishedAt'])
news['date'] = news['publishedAt'].dt.date
price['date'] = price['Date'].dt.date

  news['publishedAt'] = pd.to_datetime(news['publishedAt'])


In [15]:
print(price.head())
print(price.index)


        Date  Unnamed: 1  Unnamed: 2  Unnamed: 3  Unnamed: 4  Unnamed: 5  \
0 2020-01-02   74.059998   75.150002   73.797501   75.087502   72.538513   
1 2020-01-03   74.287498   75.144997   74.125000   74.357498   71.833290   
2 2020-01-06   73.447502   74.989998   73.187500   74.949997   72.405678   
3 2020-01-07   74.959999   75.224998   74.370003   74.597504   72.065155   
4 2020-01-08   74.290001   76.110001   74.290001   75.797501   73.224434   

   Unnamed: 6        date  
0   135480400  2020-01-02  
1   146322800  2020-01-03  
2   118387200  2020-01-06  
3   108872000  2020-01-07  
4   132079200  2020-01-08  
RangeIndex(start=0, stop=1433, step=1)


In [16]:
# Clean column names
price.columns = price.columns.str.strip()
news.columns = news.columns.str.strip()

In [17]:
price = price.merge(daily_sent, left_index=True, right_index=True, how='left')
price[['sent_mean','sent_count']] = price[['sent_mean','sent_count']].fillna(0)

price['sent_next'] = price['sent_mean'].shift(1)   # use yesterday's aggregated news to predict today

NameError: name 'daily_sent' is not defined

## clean news headlines

In [None]:
# ========== 3. Clean news headlines ==========
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = [w for w in text.split() if w not in stop_words]
    return " ".join(tokens)

news['clean'] = news['title'].apply(clean_text)

In [None]:
# ========== 4. Sentiment analysis ==========
an = SentimentIntensityAnalyzer()
news['compound'] = news['clean'].apply(lambda t: an.polarity_scores(t)['compound'])

In [None]:
# Daily aggregation
daily_sent = news.groupby('date').agg(
    sent_mean=('compound','mean'),
    sent_count=('compound','count')
).reset_index()

In [None]:
# ========== 5. Merge with stock prices ==========
df = pd.merge(price, daily_sent, on='date', how='left')

In [None]:
# Fill missing sentiment with 0
df['sent_mean'] = df['sent_mean'].fillna(0)
df['sent_count'] = df['sent_count'].fillna(0)

In [None]:
# ========== 6. Feature Engineering ==========
df['sent_next'] = df['sent_mean'].shift(1)  # lag feature
df['Return'] = df['Close'].pct_change()

In [None]:
print(df.columns.tolist())
print(df.head())

In [None]:
df = df.rename(columns={
    "Unnamed: 1": "Open",
    "Unnamed: 2": "High",
    "Unnamed: 3": "Low",
    "Unnamed: 4": "Close",
    "Unnamed: 5": "Adj Close",
    "Unnamed: 6": "Volume"
})


In [None]:
print(df.columns)

In [None]:
# Target: 1 if price goes up tomorrow, else 0
df['target'] = (df['Close'].shift(-1) > df['Close']).astype(int)


In [None]:
an = SentimentIntensityAnalyzer()
news['compound'] = news['clean'].apply(lambda t: an.polarity_scores(t)['compound'])

In [None]:
# Drop NaNs
df = df.dropna()

In [None]:
# ========== 7. Prepare ML data ==========
features = ['sent_mean', 'sent_count', 'sent_next', 'Return', 'Volume']
X = df[features]
y = df['target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

In [None]:
# ========== 8. Train model ==========
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Predictions
y_pred = model.predict(X_test)

In [None]:
# ========== 9. Evaluate ==========
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# ========== 10. Plot actual vs predicted ==========
plt.figure(figsize=(10,5))
plt.plot(df['Date'][-len(y_test):], y_test.values, label='Actual')
plt.plot(df['Date'][-len(y_test):], y_pred, label='Predicted', alpha=0.7)
plt.legend()
plt.title("Stock Movement Prediction")
plt.show()

In [19]:
# Save model
import joblib
import joblib

features = ["return", "sentiment_mean", "sentiment_count", "sentiment_next"]  # adjust to your actual names
joblib.dump((model, features), "model.pkl")

print("✅ Model saved as model.pkl")

NameError: name 'model' is not defined

In [None]:
pip install joblib