In [32]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as stats
plt.style.use('ggplot')


ValueError: Key backend: 'module://matplotlib_inline.backend_inline' is not a valid value for backend; supported values are ['gtk3agg', 'gtk3cairo', 'gtk4agg', 'gtk4cairo', 'macosx', 'nbagg', 'notebook', 'qtagg', 'qtcairo', 'qt5agg', 'qt5cairo', 'tkagg', 'tkcairo', 'webagg', 'wx', 'wxagg', 'wxcairo', 'agg', 'cairo', 'pdf', 'pgf', 'ps', 'svg', 'template']

In [19]:
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt', download_dir='C:\\nltk_data')
nltk.download('stopwords', download_dir='C:\\nltk_data')
nltk.download('punkt_tab')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\est_lul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
try:
    # Load the parquet file
    news_df = pd.read_parquet('../../data/raw_analyst_ratings.parquet')
except FileNotFoundError:
    print("File not found. Please ensure the path is correct and the file exists.")
    exit(1)
except Exception as e:
    print(f"An error occurred while reading the parquet file: {e}")
    exit(1)

In [6]:
# Convert date column to datetime format
news_df['date'] = pd.to_datetime(news_df['date'], format='mixed', utc=True)

# Compute headline text length metrics (character and word count)
news_df['headline_length_chars'] = news_df['headline'].apply(len)
news_df['headline_length_words'] = news_df['headline'].apply(lambda x: len(x.split()))

# Summary Statistics for Headline Length
headline_stats = {
    "Mean Length (Words)": np.mean(news_df['headline_length_words']),
    "Median Length (Words)": np.median(news_df['headline_length_words']),
    "Standard Deviation": np.std(news_df['headline_length_words']),
    "Max Length": np.max(news_df['headline_length_words']),
    "Min Length": np.min(news_df['headline_length_words']),
    "Skewness": stats.skew(news_df['headline_length_words']),
    "Kurtosis": stats.kurtosis(news_df['headline_length_words']),
}
print("Headline Length Statistics:\n", headline_stats)

news_df.sample(10)

Headline Length Statistics:
 {'Mean Length (Words)': np.float64(11.416705984674504), 'Median Length (Words)': np.float64(10.0), 'Standard Deviation': np.float64(6.352995020898485), 'Max Length': np.int64(77), 'Min Length': np.int64(1), 'Skewness': np.float64(2.1671569878139527), 'Kurtosis': np.float64(7.132395151646767)}


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,headline_length_chars,headline_length_words
957975,962820,Traders Attributing Strength In Orexigen To A ...,https://www.benzinga.com/movers/17/01/8864191/...,Paul Quintaro,2017-01-04 00:00:00+00:00,OREX,123,20
618254,621448,HNI Corp. Sees Q2 Adj. EPS $0.54-$0.59 vs $0.4...,https://www.benzinga.com/news/16/04/7870130/hn...,Paul Quintaro,2016-04-21 00:00:00+00:00,HNI,93,17
1366076,1372510,Weight Watchers Guides FY 2011 EPS $3.75-4.00 ...,https://www.benzinga.com/news/guidance/11/05/1...,Benzinga Staff,2011-05-06 00:00:00+00:00,WTW,75,11
1390361,1396843,Groupon Could Target Online-Review Company Yel...,https://www.benzinga.com/m-a/19/09/14418038/gr...,Charles Gross,2019-09-11 00:00:00+00:00,YELP,121,17
297645,299350,"Earnings Scheduled For February 18, 2016",https://www.benzinga.com/news/earnings/16/02/6...,Monica Gerson,2016-02-18 00:00:00+00:00,CRMT,40,6
324780,326563,Feltl and Company Initiates Coverage on Cypres...,https://www.benzinga.com/analyst-ratings/price...,Juan Lopez,2012-08-28 00:00:00+00:00,CY,89,14
387779,389865,63 Biggest Movers From Yesterday,https://www.benzinga.com/news/19/11/14855313/6...,Lisa Levin,2019-11-21 00:00:00+00:00,DVAX,32,5
1082380,1087766,Mid-Morning Market Update: Markets Mixed; Deer...,https://www.benzinga.com/news/earnings/14/11/5...,Garrett Cook,2014-11-26 00:00:00+00:00,RIG,80,11
89640,90422,Mid-Day Market Update: Crude Oil Up Over 4%; E...,https://www.benzinga.com/news/earnings/16/12/8...,Lisa Levin,2016-12-01 00:00:00+00:00,ARDM,80,14
94085,94881,"Social Media Outlook for Friday May 11 (NVDA, ...",https://www.benzinga.com/news/earnings/12/05/2...,Social Market Analytics,2012-05-11 00:00:00+00:00,ARNA,62,11


In [7]:
# Temporal Analysis of News Publications (Yearly and Monthly Trends)
news_df['year'] = news_df['date'].dt.year
news_df['month'] = news_df['date'].dt.month

yearly_counts = news_df.groupby('year')['headline'].count()
monthly_counts = news_df.groupby('month')['headline'].count()

# Time-of-Day Analysis (Are certain times more news-heavy?)
news_df['hour'] = news_df['date'].dt.hour
hourly_counts = news_df.groupby('hour')['headline'].count()

print("\nYearly News Article Counts:\n", yearly_counts)
print("\nMonthly News Article Counts:\n", monthly_counts)


print("\nHourly Distribution of News:\n", hourly_counts)


Yearly News Article Counts:
 year
2009     11489
2010     81319
2011    131322
2012    122649
2013    121529
2014    134859
2015    135295
2016    141892
2017    124456
2018    146924
2019    150380
2020    105214
Name: headline, dtype: int64

Monthly News Article Counts:
 month
1     121545
2     122836
3     121949
4     121813
5     130340
6     106598
7     110764
8     124041
9      96089
10    124800
11    121430
12    105123
Name: headline, dtype: int64

Hourly Distribution of News:
 hour
0     1351472
1          82
2          48
3          27
4          67
5          14
6          57
7          93
8        1469
9        1829
10       2476
11       5033
12       5527
13       5965
14       7669
15       5701
16       5732
17       2710
18       2075
19       1612
20       3939
21       2800
22        704
23        227
Name: headline, dtype: int64


In [8]:
publisher_counts = news_df['publisher'].groupby(news_df['publisher']).count().sort_values(ascending=False)
print("\nTop Publishers by News Count:\n", publisher_counts.head(10))


Top Publishers by News Count:
 publisher
Paul Quintaro        228373
Lisa Levin           186979
Benzinga Newsdesk    150484
Charles Gross         96732
Monica Gerson         82380
Eddie Staley          57254
Hal Lindon            49047
ETF Professor         28489
Juan Lopez            28438
Benzinga Staff        28114
Name: publisher, dtype: int64


In [15]:
stop_words = set(stopwords.words('english'))

# Tokenize headlines and remove stop words
all_words = [word.lower() for headline in news_df['headline'] for word in word_tokenize(headline) 
             if word.isalpha() and word not in stop_words]

word_freq = Counter(all_words)

# Show top 10 most common words
print("\nTop 10 Most Common Words in Headlines:\n", word_freq.most_common(10))


Top 10 Most Common Words in Headlines:
 [('stocks', 161702), ('for', 154728), ('vs', 140965), ('in', 130298), ('eps', 128801), ('to', 124595), ('the', 122317), ('est', 122289), ('shares', 114140), ('reports', 108688)]


In [21]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(2,2), max_features=20)
tfidf_matrix = vectorizer.fit_transform(news_df['headline'])

# Display top phrases with highest TF-IDF scores
print("\nTop Keywords from Financial News:\n", vectorizer.get_feature_names_out())


Top Keywords from Financial News:
 ['52 week' 'adj eps' 'benzinga upgrades' 'companies trading'
 'earnings scheduled' 'initiates coverage' 'market session'
 'market update' 'mid day' 'pre market' 'price target' 'q1 eps' 'q2 eps'
 'q3 eps' 'raises pt' 'stocks hit' 'stocks moving' 'trading higher'
 'trading lower' 'vs est']


In [20]:
# Convert text data to numerical format
vectorized_data = vectorizer.fit_transform(news_df['headline'])

# Apply LDA model
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(vectorized_data)

# Show top words per topic
for idx, topic in enumerate(lda_model.components_):
    print(f"\nTopic {idx+1}: ", [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])

KeyboardInterrupt: 

In [None]:
# Count articles published per day
daily_news_count = news_df.groupby('date').size().reset_index(name='num_articles')

# Plot publication frequency over time
plt.figure(figsize=(12,6))
plt.plot(daily_news_count['date'], daily_news_count['num_articles'], label='Articles per Day')
plt.xlabel('Date')
plt.ylabel('Number of Articles')
plt.title('Daily Financial News Publication Trend')
plt.legend()
plt.show()