In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from textblob import TextBlob  # type: ignore
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/gabie/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [19]:
import textblob # type: ignore


In [11]:
# Load the news dataset
news_data = pd.read_csv("../data/raw_analyst_ratings.csv")
print(news_data.head())
print(news_data.info())




   Unnamed: 0                                           headline  \
0           0            Stocks That Hit 52-Week Highs On Friday   
1           1         Stocks That Hit 52-Week Highs On Wednesday   
2           2                      71 Biggest Movers From Friday   
3           3       46 Stocks Moving In Friday's Mid-Day Session   
4           4  B of A Securities Maintains Neutral on Agilent...   

                                                 url          publisher  \
0  https://www.benzinga.com/news/20/06/16190091/s...  Benzinga Insights   
1  https://www.benzinga.com/news/20/06/16170189/s...  Benzinga Insights   
2  https://www.benzinga.com/news/20/05/16103463/7...         Lisa Levin   
3  https://www.benzinga.com/news/20/05/16095921/4...         Lisa Levin   
4  https://www.benzinga.com/news/20/05/16095304/b...         Vick Meyer   

                        date stock  
0  2020-06-05 10:30:54-04:00     A  
1  2020-06-03 10:45:20-04:00     A  
2  2020-05-26 04:30:07-04:00 

In [None]:
# Example: Analyze headline length
news_data['headline_length'] = news_data['headline'].str.len()
sns.histplot(news_data['headline_length'], bins=30, kde=True)
plt.show()


In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd

# Initialize the sentiment analyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Calculate sentiment scores
news_data['sentiment'] = news_data['headline'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Categorize sentiment as positive, neutral, or negative
news_data['sentiment_category'] = news_data['sentiment'].apply(
    lambda score: 'positive' if score > 0.05 else ('negative' if score < -0.05 else 'neutral')
)

# Display sentiment distribution
print(news_data['sentiment_category'].value_counts())


In [None]:

sns.countplot(data=news_data, x='sentiment_category', palette='viridis')
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Number of Headlines')
plt.show()


In [None]:
#  Extract Common Keywords or Phrases

# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=20)

# Fit and transform headlines
tfidf_matrix = tfidf.fit_transform(news_data['headline'])
top_keywords = tfidf.get_feature_names_out()

print("Top Keywords:", top_keywords)


In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Initialize LDA model
lda = LatentDirichletAllocation(n_components=5, random_state=42)  # 5 topics
lda.fit(tfidf_matrix)

# Display top words for each topic
for idx, topic in enumerate(lda.components_):
    print(f"Topic {idx + 1}:")
    print([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-10:]])


In [None]:
# save the result 
news_data.to_csv("../data/processed_sentiment_topic_modeling.csv", index=False)


In [5]:
# topic modeling

import os


yfinance_data_dir = '../data/yfinance_data/'
csv_files = [f for f in os.listdir(yfinance_data_dir) if f.endswith('.csv')]
stock_data = {file: pd.read_csv(os.path.join(yfinance_data_dir, file)) for file in csv_files}
print(stock_data['AAPL_historical_data.csv'].head())  # Checking the data for AAPL


         Date      Open      High       Low     Close  Adj Close     Volume  \
0  1980-12-12  0.128348  0.128906  0.128348  0.128348   0.098943  469033600   
1  1980-12-15  0.122210  0.122210  0.121652  0.121652   0.093781  175884800   
2  1980-12-16  0.113281  0.113281  0.112723  0.112723   0.086898  105728000   
3  1980-12-17  0.115513  0.116071  0.115513  0.115513   0.089049   86441600   
4  1980-12-18  0.118862  0.119420  0.118862  0.118862   0.091630   73449600   

   Dividends  Stock Splits  
0        0.0           0.0  
1        0.0           0.0  
2        0.0           0.0  
3        0.0           0.0  
4        0.0           0.0  


In [12]:
news_data['date'] = pd.to_datetime(news_data['date'], errors='coerce')
print(news_data['date'].isnull().sum())  # Check for invalid entries


1351341


In [None]:
print(news_data['date'].dtypes)
print(news_data['date'].head())


In [None]:
news_data['publication_date'] = news_data['date'].dt.date
daily_publication_counts = news_data['publication_date'].value_counts().sort_index()

import matplotlib.pyplot as plt

plt.figure(dpi= 200, figsize=(12, 6))
daily_publication_counts.plot(kind='line', title="Daily Publication Trends")
plt.xlabel("Date")
plt.ylabel("Number of Articles")
plt.grid()
plt.show()


In [None]:
news_data.head()

In [None]:
publisher_counts = news_data['publisher'].value_counts()
print(publisher_counts.head(10))  # Display the top 10 publishers


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

top_publishers = publisher_counts.head(10)
plt.figure(figsize=(12, 6))
sns.barplot(x=top_publishers.index, y=top_publishers.values, palette="viridis")
plt.xticks(rotation=45)
plt.title("Top 10 Publishers by Article Count")
plt.xlabel("Publisher")
plt.ylabel("Number of Articles")
plt.show()


In [None]:
for publisher in top_publishers.index:
    print(f"\nHeadlines for {publisher}:")
    print(news_data[news_data['publisher'] == publisher]['headline'].head(5))  # Show 5 sample headlines


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Analyze content for a specific publisher
selected_publisher = top_publishers.index[0]  # Example: the most frequent publisher
publisher_headlines = news_data[news_data['publisher'] == selected_publisher]['headline']

tfidf = TfidfVectorizer(stop_words='english', max_features=20)
tfidf_matrix = tfidf.fit_transform(publisher_headlines)
print(f"Top Keywords for {selected_publisher}: {tfidf.get_feature_names_out()}")


In [None]:
news_data['publisher_domain'] = news_data['publisher'].apply(
    lambda x: x.split('@')[-1] if '@' in x else None
)
domain_counts = news_data['publisher_domain'].value_counts()
print(domain_counts.head(10))  # Display the top 10 domains



In [None]:
top_domains = domain_counts.head(10)
plt.figure(figsize=(12, 6))
sns.barplot(x=top_domains.index, y=top_domains.values, palette="coolwarm")
plt.xticks(rotation=45)
plt.title("Top 10 Email Domains by Article Count")
plt.xlabel("Domain")
plt.ylabel("Number of Articles")
plt.show()


In [4]:
import os
import pandas as pd

# Path to the yfinance_data folder
data_folder = "../data/yfinance_data"

# List all CSV files in the folder
csv_files = [f for f in os.listdir(data_folder) if f.endswith('.csv')]

# Load each file into a dictionary
stock_data = {
    file.split('_')[0]: pd.read_csv(os.path.join(data_folder, file)) for file in csv_files
}

# Display the first few rows of a loaded DataFrame (e.g., AAPL)
print(stock_data['AAPL'].head())


         Date      Open      High       Low     Close  Adj Close     Volume  \
0  1980-12-12  0.128348  0.128906  0.128348  0.128348   0.098943  469033600   
1  1980-12-15  0.122210  0.122210  0.121652  0.121652   0.093781  175884800   
2  1980-12-16  0.113281  0.113281  0.112723  0.112723   0.086898  105728000   
3  1980-12-17  0.115513  0.116071  0.115513  0.115513   0.089049   86441600   
4  1980-12-18  0.118862  0.119420  0.118862  0.118862   0.091630   73449600   

   Dividends  Stock Splits  
0        0.0           0.0  
1        0.0           0.0  
2        0.0           0.0  
3        0.0           0.0  
4        0.0           0.0  


In [7]:
for symbol, df in stock_data.items():
    print(f"{symbol} columns: {df.columns}")


TSLA_historical_data.csv columns: Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Dividends', 'Stock Splits'],
      dtype='object')
AAPL_historical_data.csv columns: Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Dividends', 'Stock Splits'],
      dtype='object')
MSFT_historical_data.csv columns: Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Dividends', 'Stock Splits'],
      dtype='object')
GOOG_historical_data.csv columns: Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Dividends', 'Stock Splits'],
      dtype='object')
NVDA_historical_data.csv columns: Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Dividends', 'Stock Splits'],
      dtype='object')
AMZN_historical_data.csv columns: Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Dividends', 'Stock Splits'],
      dtype='object')
META_historical_data.c

In [None]:
for symbol, df in stock_data.items():
    df.columns = df.columns.str.strip()  # Strip whitespace from column names
    df = df[['Date', 'Open', 'High', 'Low', 'Close', 'Volume']]  # Keep only relevant columns
    stock_data[symbol] = df
