In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from textblob import TextBlob  # type: ignore
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer


In [None]:
# Load the raw analyst-ratings news dataset and take a first look at it.
news_data = pd.read_csv("../data/raw_analyst_ratings.csv")
print(news_data.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
news_data.info()




In [None]:
# Character count of every headline, then the distribution of those counts
# (30 bins with a KDE overlay).
headline_lengths = news_data['headline'].str.len()
news_data['headline_length'] = headline_lengths
sns.histplot(data=news_data, x='headline_length', bins=30, kde=True)
plt.show()


In [None]:

# No earlier cell creates 'sentiment_category', so unless the CSV already
# provides it, derive it here once from VADER compound scores. The guard also
# avoids re-scoring every headline when the cell is re-run.
if 'sentiment_category' not in news_data.columns:
    vader = SentimentIntensityAnalyzer()

    def _vader_label(headline):
        """Label one headline 'positive', 'negative' or 'neutral'.

        Uses the compound score with the +/-0.05 cut-offs that are the
        usual convention for VADER.
        """
        compound = vader.polarity_scores(str(headline))['compound']
        if compound >= 0.05:
            return 'positive'
        if compound <= -0.05:
            return 'negative'
        return 'neutral'

    # Missing headlines are scored as empty text (compound 0.0 -> 'neutral').
    news_data['sentiment_category'] = news_data['headline'].fillna('').map(_vader_label)

sns.countplot(
    data=news_data,
    x='sentiment_category',
    order=['negative', 'neutral', 'positive'],  # fixed, readable bar order
    palette='viridis',
)
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Number of Headlines')
plt.show()


In [None]:
# Extract common keywords from the headlines

# TF-IDF vectorizer: English stop words removed, vocabulary capped at 20 terms
tfidf = TfidfVectorizer(max_features=20, stop_words='english')

# Learn the vocabulary and vectorize all headlines in one pass
headline_corpus = news_data['headline']
tfidf_matrix = tfidf.fit_transform(headline_corpus)
top_keywords = tfidf.get_feature_names_out()

print("Top Keywords:", top_keywords)


In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 5-topic LDA model (fixed seed) on the TF-IDF matrix
lda = LatentDirichletAllocation(random_state=42, n_components=5)  # 5 topics
lda.fit(tfidf_matrix)

# For each topic, print its 10 highest-weighted terms, lowest weight first
vocabulary = tfidf.get_feature_names_out()
for topic_number, term_weights in enumerate(lda.components_, start=1):
    print(f"Topic {topic_number}:")
    top_term_ids = term_weights.argsort()[-10:]
    print([vocabulary[term_id] for term_id in top_term_ids])


In [None]:
# Write the news table, with the columns added so far, to disk (no index column)
processed_news_path = "../data/processed_sentiment_topic_modeling.csv"
news_data.to_csv(processed_news_path, index=False)


In [None]:
# Load every CSV in the yfinance folder into a dict keyed by file name

import os


yfinance_data_dir = '../data/yfinance_data/'
csv_files = [name for name in os.listdir(yfinance_data_dir) if name.endswith('.csv')]
stock_data = {}
for file in csv_files:
    stock_data[file] = pd.read_csv(os.path.join(yfinance_data_dir, file))
print(stock_data['AAPL_historical_data.csv'].head())  # Checking the data for AAPL


In [None]:
# Parse the publication timestamps; values that cannot be parsed become NaT.
# NOTE(review): if the raw strings mix formats or UTC offsets, coercion may
# silently discard many rows -- check the count printed below.
parsed_dates = pd.to_datetime(news_data['date'], errors='coerce')
news_data['date'] = parsed_dates
print(parsed_dates.isna().sum())  # Check for invalid entries


In [None]:
# Confirm the dtype of the parsed column and preview its first values
date_column = news_data['date']
print(date_column.dtype)
print(date_column.head())


In [None]:
import matplotlib.pyplot as plt

# Articles published per calendar day, ordered chronologically
news_data['publication_date'] = news_data['date'].dt.date
daily_publication_counts = news_data['publication_date'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(12, 6), dpi=200)
daily_publication_counts.plot(kind='line', ax=ax, title="Daily Publication Trends")
ax.set_xlabel("Date")
ax.set_ylabel("Number of Articles")
ax.grid()
plt.show()


In [None]:
# Preview the first five rows, including the columns added above
news_data.head(n=5)

In [None]:
# Article count per publisher, most prolific first
publisher_counts = news_data['publisher'].value_counts()
print(publisher_counts.iloc[:10])  # Display the top 10 publishers


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the ten publishers with the most articles
top_publishers = publisher_counts.head(10)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_publishers.index, y=top_publishers.values, palette="viridis", ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title("Top 10 Publishers by Article Count")
ax.set_xlabel("Publisher")
ax.set_ylabel("Number of Articles")
plt.show()


In [None]:
# Print five sample headlines for each of the top publishers
for publisher in top_publishers.index:
    from_publisher = news_data['publisher'] == publisher
    sample_headlines = news_data.loc[from_publisher, 'headline'].head(5)
    print(f"\nHeadlines for {publisher}:")
    print(sample_headlines)  # Show 5 sample headlines


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Keyword profile of one publisher (here: the one with the most articles).
# Note that this rebinds `tfidf` and `tfidf_matrix`.
selected_publisher = top_publishers.index[0]  # Example: the most frequent publisher
from_selected = news_data['publisher'] == selected_publisher
publisher_headlines = news_data.loc[from_selected, 'headline']

tfidf = TfidfVectorizer(max_features=20, stop_words='english')
tfidf_matrix = tfidf.fit_transform(publisher_headlines)
publisher_keywords = tfidf.get_feature_names_out()
print(f"Top Keywords for {selected_publisher}: {publisher_keywords}")


In [None]:
# For publishers written like e-mail addresses, keep the text after the last
# '@'. The pandas string accessors skip missing publishers instead of raising
# (a plain `'@' in x` fails on NaN), and names without an '@' get no domain.
publisher_names = news_data['publisher']
has_at_sign = publisher_names.str.contains('@', regex=False, na=False)
news_data['publisher_domain'] = (
    publisher_names.where(has_at_sign).str.rsplit('@', n=1).str[-1]
)
domain_counts = news_data['publisher_domain'].value_counts()
print(domain_counts.head(10))  # Display the top 10 domains



In [None]:
# Bar chart of the ten most frequent publisher e-mail domains
top_domains = domain_counts.head(10)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_domains.index, y=top_domains.values, palette="coolwarm", ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title("Top 10 Email Domains by Article Count")
ax.set_xlabel("Domain")
ax.set_ylabel("Number of Articles")
plt.show()


In [None]:
import os
import pandas as pd

# Folder holding the yfinance price CSVs
data_folder = "../data/yfinance_data"

# Names of all CSV files in that folder
csv_files = [name for name in os.listdir(data_folder) if name.endswith('.csv')]

# Read each file, keyed by the part of its name before the first underscore
stock_data = {}
for file in csv_files:
    ticker = file.partition('_')[0]
    stock_data[ticker] = pd.read_csv(os.path.join(data_folder, file))

# Preview the frame stored under the 'AAPL' key
print(stock_data['AAPL'].head())


In [None]:
import pandas as pd
import os

# Folder holding the yfinance price CSVs
data_folder = "../data/yfinance_data"

# Per-ticker frames, keyed by the symbol taken from the file name
data = {}

# Only files named "<SYMBOL>_historical_data.csv" are loaded
price_file_suffix = "_historical_data.csv"
for file in os.listdir(data_folder):
    if not file.endswith(price_file_suffix):
        continue
    stock_symbol = file.split("_")[0]
    # Tag every row with its symbol so frames can be told apart after stacking
    data[stock_symbol] = pd.read_csv(os.path.join(data_folder, file)).assign(
        Ticker=stock_symbol
    )

# Stack all tickers into one frame with a fresh 0..n-1 index
stock_data = pd.concat(data.values(), ignore_index=True)
print(stock_data.head())


In [None]:
import talib

# Add SMA, RSI and MACD columns to every per-ticker frame.
# The frames stored in `data` are modified in place.
indicators = []
for stock_symbol, df in data.items():
    # Coerce 'Close' to numeric; entries that cannot be converted become NaN
    df['Close'] = pd.to_numeric(df['Close'], errors='coerce')
    closing_prices = df['Close']

    # 50-period simple moving average
    df['SMA_50'] = talib.SMA(closing_prices, timeperiod=50)

    # 14-period relative strength index
    df['RSI_14'] = talib.RSI(closing_prices, timeperiod=14)

    # MACD (12/26) with its 9-period signal line and histogram
    macd_line, signal_line, histogram = talib.MACD(
        closing_prices, fastperiod=12, slowperiod=26, signalperiod=9
    )
    df['MACD'] = macd_line
    df['Signal_Line'] = signal_line
    df['Histogram'] = histogram

    indicators.append(df)

# Stack the enriched frames into one table
stock_data_indicators = pd.concat(indicators, ignore_index=True)
print(stock_data_indicators.head())


In [None]:
# Pick one ticker's frame and make sure 'Date' is datetime-typed
aapl_data = data['AAPL']  # Replace 'AAPL' with the desired stock symbol
aapl_data['Date'] = pd.to_datetime(aapl_data['Date'])

# Closing price and its 50-day SMA against the date
fig, ax = plt.subplots(figsize=(12, 6))
price_series = (
    ('Close', 'Close Price', 'blue'),
    ('SMA_50', '50-Day SMA', 'red'),
)
for column, caption, colour in price_series:
    ax.plot(aapl_data['Date'], aapl_data[column], label=caption, color=colour)
ax.set_title('AAPL Closing Price and 50-Day SMA')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
ax.grid()
plt.show()


In [None]:
# RSI over time with the 70 / 30 reference levels
aapl_data = data['AAPL']  # Replace 'AAPL' with the desired stock symbol
aapl_data['Date'] = pd.to_datetime(aapl_data['Date'])

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(aapl_data['Date'], aapl_data['RSI_14'], label='RSI (14)', color='green')
reference_levels = (
    (70, 'red', 'Overbought (70)'),
    (30, 'blue', 'Oversold (30)'),
)
for level, colour, caption in reference_levels:
    ax.axhline(level, color=colour, linestyle='--', label=caption)
ax.set_title('AAPL RSI Indicator')
ax.set_xlabel('Date')
ax.set_ylabel('RSI')
ax.legend()
ax.grid()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import talib

# Pick one ticker's frame and make sure 'Date' is datetime-typed
aapl_data = data['AAPL']  # Replace 'AAPL' with the desired stock symbol
aapl_data['Date'] = pd.to_datetime(aapl_data['Date'])

# MACD (12/26), 9-period signal line and histogram from the closing price
macd_line, signal_line, histogram = talib.MACD(
    aapl_data['Close'], fastperiod=12, slowperiod=26, signalperiod=9
)
aapl_data['MACD'] = macd_line
aapl_data['Signal_Line'] = signal_line
aapl_data['Histogram'] = histogram

fig, ax = plt.subplots(figsize=(14, 7))

# MACD and signal line as curves
dates = aapl_data['Date']
ax.plot(dates, aapl_data['MACD'], label='MACD', color='blue')
ax.plot(dates, aapl_data['Signal_Line'], label='Signal Line', color='red')

# Histogram as semi-transparent bars
ax.bar(dates, aapl_data['Histogram'], label='Histogram', color='gray', alpha=0.5)

# Titles, legend and grid
ax.set_title('MACD Indicator for AAPL')
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.legend()
ax.grid()
plt.show()


In [None]:
# Column names of one per-ticker price frame (e.g., AAPL)
aapl_columns = data['AAPL'].columns
print(aapl_columns)

# Column names of the news dataset
news_columns = news_data.columns
print(news_columns)


In [None]:
import pandas as pd

# Stack all per-ticker frames into one table (fresh 0..n-1 index) and make
# its 'Date' column datetime-typed
combined_stock_data = pd.concat(data.values(), ignore_index=True).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Quick look at the key columns
print(combined_stock_data.loc[:, ['Date', 'Ticker', 'Close']].head())


In [None]:
# Make sure the news 'date' column is datetime-typed
news_dates = pd.to_datetime(news_data['date'])
news_data['date'] = news_dates

# Quick look at the date next to its headline
print(news_data.loc[:, ['date', 'headline']].head())


In [None]:
# Keep only the calendar date of each news timestamp. Going through
# pd.to_datetime first (exactly as the stock side below does) keeps this cell
# re-runnable: once the column holds plain datetime.date objects, a bare
# `.dt` access on it raises.
news_data['date'] = pd.to_datetime(news_data['date']).dt.date

# Ensure both date columns hold the same type (datetime.date)
combined_stock_data['Date'] = pd.to_datetime(combined_stock_data['Date']).dt.date

# Inner join on the calendar date.
# NOTE(review): the date is the only join key, so each headline is paired with
# every ticker's row for that date -- confirm this is the intended alignment.
aligned_data = pd.merge(
    combined_stock_data,
    news_data,
    left_on='Date',
    right_on='date',
    how='inner'
)

# Check the merged data
print(aligned_data[['Date', 'Close', 'headline']].head())


In [None]:
# Requires combined_stock_data and news_data from the cells above

# Bring both date columns to datetime so the join keys share a dtype
combined_stock_data['Date'] = pd.to_datetime(combined_stock_data['Date'])
news_data['date'] = pd.to_datetime(news_data['date'])

# Inner join: stock 'Date' against news 'date'
merged_data = combined_stock_data.merge(
    news_data, how='inner', left_on='Date', right_on='date'
)

# Check the merged data
print(merged_data.head())


In [None]:
from textblob import TextBlob

def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Score every headline of the merged frame
merged_data['sentiment'] = merged_data['headline'].map(get_sentiment)

# Headlines next to their polarity scores
print(merged_data.loc[:, ['headline', 'sentiment']].head())


In [None]:
# Daily stock returns (percentage change in closing price).
# merged_data holds one row per (price row, headline) pair with all tickers
# stacked, so a pct_change() over its rows would give 0 between rows of the
# same day and meaningless values where one ticker ends and the next begins.
# Returns are therefore computed on the price history itself, one series per
# ticker in date order, and then attached to the merged rows.
price_history = (
    combined_stock_data[['Ticker', 'Date', 'Close']]
    .assign(Date=lambda px: pd.to_datetime(px['Date']))
    .drop_duplicates(subset=['Ticker', 'Date'])  # one price per ticker and day
    .sort_values(['Ticker', 'Date'])
    .assign(daily_return=lambda px: px.groupby('Ticker')['Close'].pct_change() * 100)
)

# Drop any earlier 'daily_return' first so the cell can be re-run; the left
# join keeps every merged row in its existing order.
merged_data = merged_data.drop(columns=['daily_return'], errors='ignore').merge(
    price_history[['Ticker', 'Date', 'daily_return']],
    on=['Ticker', 'Date'],
    how='left',
)
print(merged_data[['Date', 'Close', 'daily_return']].head())


In [None]:
# Aggregate sentiments
# Average sentiment of all headlines that share a date (one row per date)
daily_sentiment = merged_data.groupby('Date')['sentiment'].mean().reset_index()

# Attach the per-date mean to every row (used for the correlation later).
# transform() writes the column directly, so the cell can be re-run; a
# groupby + merge round trip would collide with the existing 'sentiment_avg'
# column on a second run.
merged_data['sentiment_avg'] = merged_data.groupby('Date')['sentiment'].transform('mean')

print(merged_data[['Date', 'sentiment', 'sentiment_avg']].head())


In [None]:
# Calculate correlation
# Pearson correlation coefficient between average sentiment and daily returns.
# NOTE(review): merged_data has one row per headline, so days with many
# headlines weigh more in this coefficient -- confirm that is intended.
correlation_matrix = merged_data[['sentiment_avg', 'daily_return']].corr()
correlation = correlation_matrix.loc['sentiment_avg', 'daily_return']
print(f"Pearson Correlation: {correlation}")
# A positive value means higher average sentiment goes along with higher returns; a negative value means the opposite.

In [None]:
import matplotlib.pyplot as plt

# Scatter of average sentiment (x) against daily stock return (y)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(merged_data['sentiment_avg'], merged_data['daily_return'], alpha=0.5)
ax.set_title('Sentiment vs Daily Stock Returns')
ax.set_xlabel('Average Sentiment')
ax.set_ylabel('Daily Stock Return (%)')
ax.grid(True)
plt.show()
