In [1]:
# Step 1: Setting up your environment (pip installs)
!pip install pandas matplotlib seaborn textblob nltk scikit-learn plotly
!pip install talib
# Step 2: Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
import talib as ta
import plotly.express as px
import os

# Step 3: Read each of the seven CSV files into its own pandas DataFrame
data_path = "path_to_your_csv_files_directory"
file_names = [
    "AAPL_historical_data.csv",
    "AMZN_historical_data.csv",
    "GOOG_historical_data.csv",
    "META_historical_data.csv",
    "MSFT_historical_data.csv",
    "NVDA_historical_data.csv",
    "TSLA_historical_data.csv",
]

# Resolve every file name against data_path, then build a
# {file name -> DataFrame} mapping in the same order as file_names.
csv_paths = [os.path.join(data_path, file_name) for file_name in file_names]
dfs = dict(zip(file_names, map(pd.read_csv, csv_paths)))

# Step 4: Exploratory Data Analysis (EDA)
# Runs the same analysis on every DataFrame in `dfs` and prints the results.
# The columns added in place here (headline_length, date, day_of_week,
# sentiment, publication_day) stay on each frame for later steps.
# NOTE(review): assumes every frame has 'headline', 'publisher' and 'date'
# columns — confirm against the CSVs being loaded.
for name, df in dfs.items():
    print(f"Exploring {name}...")

    # Headline length statistics.
    # .str.len() leaves NaN for a missing headline, whereas apply(len) raises
    # TypeError on the first NaN; describe() then simply skips those rows.
    df['headline_length'] = df['headline'].str.len()
    print(f"{name} - Headline Length Stats:\n", df['headline_length'].describe(), "\n")

    # Count articles per publisher
    print(f"{name} - Articles per Publisher:\n", df['publisher'].value_counts(), "\n")

    # Analyze publication trends over time
    df['date'] = pd.to_datetime(df['date'])
    df['day_of_week'] = df['date'].dt.day_name()
    print(f"{name} - Articles per Day of Week:\n", df['day_of_week'].value_counts(), "\n")

    # Sentiment Analysis: TextBlob polarity per headline.
    # Non-string values (e.g. NaN for a missing headline) get NaN sentiment
    # instead of crashing TextBlob.
    df['sentiment'] = df['headline'].apply(
        lambda text: TextBlob(text).sentiment.polarity if isinstance(text, str) else float('nan')
    )
    print(f"{name} - Sentiment Stats:\n", df['sentiment'].describe(), "\n")

    # Keyword & Topic Extraction: the 10 most frequent non-stop-word terms.
    # Only the fitted vocabulary is used, so fit() is enough; missing headlines
    # are dropped because CountVectorizer cannot tokenize NaN.
    vectorizer = CountVectorizer(max_features=10, stop_words='english')
    vectorizer.fit(df['headline'].dropna())
    print(f"{name} - Common Keywords:\n", vectorizer.get_feature_names_out(), "\n")

    # Time Series Analysis: number of articles per calendar day, in date order
    df['publication_day'] = df['date'].dt.date
    publication_freq = df['publication_day'].value_counts().sort_index()
    print(f"{name} - Publication Frequency Over Time:\n", publication_freq, "\n")

    # Publisher Analysis
    unique_publishers = df['publisher'].nunique()
    print(f"{name} - Number of Unique Publishers: {unique_publishers}\n")

    # Plot the daily publication counts; labelled so the figure stands alone
    plt.figure(figsize=(10, 6))
    sns.lineplot(x=publication_freq.index, y=publication_freq.values)
    plt.title(f'{name} - Publication Frequency Over Time')
    plt.xlabel('Publication Day')
    plt.ylabel('Number of Articles')
    plt.show()

# Step 5: Load Stock Price Data (replace with actual stock data file)
stock_df = pd.read_csv('path_to_stock_data.csv')

# Parse the date column into real datetimes and put the rows in chronological
# order. Without this, matplotlib treats the raw CSV strings as categories
# (one tick per row), and the rolling indicators below depend on row order.
# NOTE(review): assumes the CSV has 'date' and 'Close' columns — confirm.
stock_df['date'] = pd.to_datetime(stock_df['date'])
stock_df = stock_df.sort_values('date').reset_index(drop=True)

# Step 6: Calculate Technical Indicators with TA-Lib
# TA-Lib only accepts double-precision input, so cast once up front; this also
# covers CSVs whose Close column happened to be parsed as integers.
close_prices = stock_df['Close'].astype('float64')
stock_df['SMA'] = ta.SMA(close_prices, timeperiod=20)
stock_df['RSI'] = ta.RSI(close_prices, timeperiod=14)
# ta.MACD returns three series: the MACD line, its signal line and the histogram.
stock_df['MACD'], stock_df['MACD_signal'], stock_df['MACD_hist'] = ta.MACD(
    close_prices, fastperiod=12, slowperiod=26, signalperiod=9
)

# Step 7: Visualize Stock Data with Indicators
# Closing price with its 20-period simple moving average
plt.figure(figsize=(14, 7))
plt.plot(stock_df['date'], stock_df['Close'], label='Close Price')
plt.plot(stock_df['date'], stock_df['SMA'], label='20-Day SMA')
plt.title('Stock Price and SMA')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.show()

# 14-period RSI on its own axis
plt.figure(figsize=(14, 7))
plt.plot(stock_df['date'], stock_df['RSI'], label='RSI')
plt.title('RSI over time')
plt.xlabel('Date')
plt.ylabel('RSI')
plt.legend()
plt.show()

# Step 8: Correlation Analysis between News Sentiment and Stock Movements
# Example with the first file, you can merge or process all as needed.
# `dfs` is keyed by the names in `file_names`, so look the frame up by one of
# those names (the previous "file1.csv" key does not exist and raised KeyError).
df = dfs[file_names[0]]

# Average the per-headline sentiment into one score per calendar day, so a day
# with many articles does not repeat the same return row many times in the merge.
daily_sentiment = (
    df.assign(date=pd.to_datetime(df['date']).dt.date)
    .groupby('date', as_index=False)['sentiment']
    .mean()
)

# Daily stock return = percentage change of the closing price from the previous
# row. Computed here because no earlier step creates a 'daily_return' column.
# NOTE(review): pct_change works row-to-row, so stock_df is assumed to be in
# chronological order — confirm.
if 'daily_return' not in stock_df.columns:
    stock_df['daily_return'] = stock_df['Close'].pct_change()
daily_returns = stock_df.assign(date=pd.to_datetime(stock_df['date']).dt.date)[['date', 'daily_return']]

# Merge news sentiment with stock data (only days present in both)
merged_df = pd.merge(daily_sentiment, daily_returns, on='date', how='inner')

# Calculate Pearson Correlation (Series.corr defaults to Pearson and skips NaN pairs,
# e.g. the first row, which has no previous close to compute a return from)
correlation = merged_df['sentiment'].corr(merged_df['daily_return'])
print(f"Pearson correlation between Sentiment and Stock Returns: {correlation}")

# Plot the correlation
plt.figure(figsize=(10, 6))
sns.scatterplot(x=merged_df['sentiment'], y=merged_df['daily_return'])
plt.title('Correlation between Sentiment and Stock Returns')
plt.xlabel('Sentiment Score')
plt.ylabel('Daily Return')
plt.show()


Note: you may need to restart the kernel to use updated packages.


In [7]:
!pip install python-docx 
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import RGBColor

# Build the Word report: a centered title followed by a fixed sequence of
# headings and paragraphs, then save it to disk.
# The text of every section is defined first; the document is assembled from
# the ordered `report_sections` list at the end.

intro_text = (
    "The goal of this project is to analyze a large corpus of financial news data to uncover correlations "
    "between news sentiment and stock market movements. This project was designed to refine skills in Data Engineering (DE), "
    "Financial Analytics (FA), and Machine Learning Engineering (MLE), which are crucial for the demanding environment at Nova Financial Solutions. "
    "By leveraging advanced data analysis techniques, the project aims to enhance predictive analytics capabilities, significantly boosting "
    "financial forecasting accuracy and operational efficiency."
)

# Task 1: repository / workflow setup
task1_text = (
    "Objective: Establish a structured and organized project environment.\n\n"
    "Steps:\n"
    "1. Repository Setup: A GitHub repository was created with the recommended folder structure, which includes directories for scripts, notebooks, tests, and source code.\n"
    "2. Branching: A branch named 'task-1' was created for all developments related to the initial task. Work was committed regularly with descriptive messages to maintain a clear version history.\n"
    "3. CI/CD Integration: A simple CI/CD pipeline was set up using GitHub Actions to automate testing and deployment, ensuring code quality and consistency."
)

# Task 2: exploratory data analysis
task2_text = (
    "Objective: Gain an understanding of the data, uncover trends, and extract actionable insights.\n\n"
    "Steps:\n"
    "1. Descriptive Statistics:\n"
    "   - Analyzed headline lengths to understand the distribution of content size.\n"
    "   - Identified the most active publishers by counting the number of articles each published.\n"
    "   - Examined publication dates to spot trends, including spikes during specific events or on particular days.\n"
    "2. Text Analysis (Sentiment Analysis & Topic Modeling):\n"
    "   - Performed sentiment analysis on headlines using TextBlob, categorizing them into positive, negative, or neutral sentiments.\n"
    "   - Extracted common keywords and phrases using CountVectorizer, which provided insight into frequently discussed topics and key events.\n"
    "3. Time Series Analysis:\n"
    "   - Analyzed the publication frequency over time, identifying any patterns or spikes that could correspond to significant market events.\n"
    "   - Investigated the timing of publications to determine if there were preferred times for releasing news that might influence trading strategies.\n"
    "4. Publisher Analysis:\n"
    "   - Counted unique publishers and analyzed the type of news reported by the most frequent contributors, which may indicate biases or specific market focuses."
)

# Task 3: technical indicators
task3_text = (
    "Objective: Analyze stock data and calculate technical indicators to understand market trends.\n\n"
    "Steps:\n"
    "1. Data Preparation:\n"
    "   - Loaded stock price data, ensuring it contained essential columns like Open, High, Low, Close, and Volume.\n"
    "2. Technical Indicator Calculation:\n"
    "   - Calculated key indicators such as Simple Moving Average (SMA), Relative Strength Index (RSI), and Moving Average Convergence Divergence (MACD) using TA-Lib.\n"
    "3. Visualization:\n"
    "   - Plotted the calculated indicators alongside stock prices to visually inspect their correlation and potential impact on market movements."
)

# Task 4: sentiment / return correlation
task4_text = (
    "Objective: Establish the relationship between news sentiment and stock price movements.\n\n"
    "Steps:\n"
    "1. Data Alignment:\n"
    "   - Aligned news data with stock price data by normalizing and matching dates, ensuring each news article corresponded to the correct trading day.\n"
    "2. Sentiment Analysis:\n"
    "   - Used sentiment scores calculated in Task 2 to quantify the emotional tone of the news.\n"
    "3. Stock Movement Calculation:\n"
    "   - Computed daily stock returns based on closing prices to represent stock movements.\n"
    "4. Correlation Analysis:\n"
    "   - Calculated the Pearson correlation coefficient between the average daily sentiment scores and the daily stock returns.\n"
    "   - Visualized the correlation to assess the strength and direction of the relationship, providing insight into how news sentiment might predict stock movements."
)

results_text = (
    "Headline Lengths: The analysis revealed a wide range of headline lengths, suggesting variability in the depth of content.\n\n"
    "Publisher Activity: Certain publishers were significantly more active, which could indicate their influence on market sentiment.\n\n"
    "Sentiment Trends: The sentiment analysis showed a mix of positive, negative, and neutral sentiments, with specific keywords like 'approval' and 'earnings' frequently appearing in positive contexts.\n\n"
    "Technical Indicators: The SMA, RSI, and MACD provided valuable insights into potential buy/sell signals, which, when aligned with news sentiment, could enhance trading strategies.\n\n"
    "Correlation Analysis: The Pearson correlation coefficient indicated a moderate correlation between news sentiment and stock returns, suggesting that sentiment could be a valuable predictor of market movements."
)

conclusion_text = (
    "This project has demonstrated that there is a measurable correlation between financial news sentiment and stock price movements. By integrating sentiment analysis with technical indicators, Nova Financial Solutions can enhance its predictive analytics capabilities. The following recommendations are proposed based on the findings:\n\n"
    "1. Leverage News Sentiment: Incorporate news sentiment analysis into trading algorithms to improve the timing and accuracy of buy/sell decisions.\n"
    "2. Focus on Key Publishers: Pay particular attention to news from the most active publishers, as their frequent reports may have a more substantial impact on market sentiment.\n"
    "3. Enhance Real-Time Analysis: Implement real-time sentiment analysis tools to react quickly to breaking news, potentially capitalizing on short-term market movements."
)

# (heading text, heading level, paragraph text or None) in document order.
# A None paragraph means the heading is written on its own.
report_sections = [
    ('Introduction', 2, intro_text),
    ('Project Planning and Task Breakdown', 2, None),
    ('Task 1: Git and GitHub Setup', 3, task1_text),
    ('Task 2: Exploratory Data Analysis (EDA)', 3, task2_text),
    ('Task 3: Quantitative Analysis Using TA-Lib', 3, task3_text),
    ('Task 4: Correlation Between News Sentiment and Stock Movements', 3, task4_text),
    ('Results and Insights', 2, results_text),
    ('Conclusion and Recommendations', 2, conclusion_text),
]

# Start an empty document and add the centered level-1 title
doc = Document()
title = doc.add_heading('Financial News Sentiment Analysis and Stock Movement Correlation', level=1)
title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

# Emit each section: heading first, then its paragraph when it has one
for section_heading, section_level, section_body in report_sections:
    doc.add_heading(section_heading, level=section_level)
    if section_body is not None:
        doc.add_paragraph(section_body)

# Write the finished report next to the notebook
doc.save('Financial_News_Sentiment_Analysis_Report.docx')


ERROR: Could not find a version that satisfies the requirement exceptions (from versions: none)
ERROR: No matching distribution found for exceptions

ModuleNotFoundError: No module named 'exceptions'