In [4]:
import nltk
from textblob import TextBlob
import pandas as pd
import matplotlib.pyplot as plt
import os
os.chdir('../scripts/')
from data_loader import read_csv_file

In [20]:
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [51]:
def normailze_date(news_df, stock_df):
    """
    Normalize dates in both datasets to ensure alignment.
    """
    news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce')
    news_df['date'].fillna(news_df['date'].min(), inplace=True)
    news_df['date'] = news_df['date'].dt.date
    stock_df['Date'] = pd.to_datetime(stock_df['Date']).dt.date
    return news_df, stock_df


In [37]:
def analyze_sentiment(headline):
    analysis = TextBlob(headline)
    return analysis.sentiment.polarity

In [38]:
def calculate_daily_returns(df):
    df['Daily Returns'] = df['Close'].pct_change()
    return df

In [39]:
def aggrgate_daily_sentiment(df):
    daily_sentiment = df.groupby('date')['Sentiment'].mean().reset_index()
    return daily_sentiment


In [40]:
def calculate_correlation(stock_df, daily_sentiment):
    merged_df = pd.merge(stock_df,daily_sentiment,left_on='Date', right_on='date')
    correlation = merged_df['Daily Returns'].corr(merged_df['Sentiment'])
    return correlation

In [41]:
def visualize_correlation(stock_df, daily_sentiment): 
    """ Visualize the correlation between daily sentiment scores and stock returns. """ 
    merged_df = pd.merge(stock_df, daily_sentiment, left_on='Date', right_on='date')
    plt.figure(figsize=(14,7)) 
    plt.scatter(merged_df['Sentiment'], merged_df['Daily Returns']) 
    plt.title('Correlation between Daily Sentiment Scores and Stock Returns') 
    plt.xlabel('Daily Sentiment Scores') 
    plt.ylabel('Daily Returns')
    plt.show()

In [18]:
def main(news_file,stock_file):
    news_df = read_csv_file(news_file)
    stock_df = read_csv_file(stock_file)
    news_df, stock_df = normailze_date(news_df,stock_df)
    news_df['Sentiment'] = news_df['headline'].apply(analyze_sentiment)
    stock_df = calculate_daily_returns(stock_df)
    daily_sentiment = aggrgate_daily_sentiment(news_df)
    correlation = calculate_correlation(stock_df,daily_sentiment)
    print(f"Pearson correlation coefficient: {correlation}")
    visualize_correlation(stock_df,daily_sentiment)

In [26]:
news_file = '../data/raw_analyst_ratings.csv'
stock_file =  '../data/TSLA_historical_data.csv'


In [42]:
news_df = read_csv_file(news_file)
stock_df = read_csv_file(stock_file)

In [52]:
news_df, stock_df = normailze_date(news_df,stock_df)
news_df



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  news_df['date'].fillna(news_df['date'].min(), inplace=True)


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2009-02-14,A
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2009-02-14,A
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2009-02-14,A
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2009-02-14,A
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2009-02-14,A
...,...,...,...,...,...,...
1407323,1413844,Top Narrow Based Indexes For August 29,https://www.benzinga.com/news/11/08/1888782/to...,Monica Gerson,2011-08-29,ZX
1407324,1413845,Recap: Wednesday's Top Percentage Gainers and ...,https://www.benzinga.com/news/earnings/11/06/1...,Benjamin Lee,2011-06-22,ZX
1407325,1413846,UPDATE: Oppenheimer Color on China Zenix Auto ...,https://www.benzinga.com/analyst-ratings/analy...,BenzingaStaffL,2011-06-21,ZX
1407326,1413847,Oppenheimer Initiates China Zenix At Outperfor...,https://www.benzinga.com/analyst-ratings/price...,Joe Young,2011-06-21,ZX


In [54]:
news_df['Sentiment'] = news_df['headline'].apply(analyze_sentiment)

In [55]:
stock_df = calculate_daily_returns(stock_df)
daily_sentiment = aggrgate_daily_sentiment(news_df)

In [56]:
correlation = calculate_correlation(stock_df,daily_sentiment)


In [58]:
print(f"Pearson correlation coefficient: {correlation}")
stock_df

Pearson correlation coefficient: 0.05501737019119298


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits,Daily Returns
0,2010-06-29,1.266667,1.666667,1.169333,1.592667,1.592667,281494500,0.0,0.0,
1,2010-06-30,1.719333,2.028000,1.553333,1.588667,1.588667,257806500,0.0,0.0,-0.002511
2,2010-07-01,1.666667,1.728000,1.351333,1.464000,1.464000,123282000,0.0,0.0,-0.078473
3,2010-07-02,1.533333,1.540000,1.247333,1.280000,1.280000,77097000,0.0,0.0,-0.125683
4,2010-07-06,1.333333,1.333333,1.055333,1.074000,1.074000,103003500,0.0,0.0,-0.160937
...,...,...,...,...,...,...,...,...,...,...
3540,2024-07-24,225.419998,225.990005,214.710007,215.990005,215.990005,167942900,0.0,0.0,-0.123346
3541,2024-07-25,216.800003,226.000000,216.229996,220.250000,220.250000,100636500,0.0,0.0,0.019723
3542,2024-07-26,221.190002,222.279999,215.330002,219.800003,219.800003,94604100,0.0,0.0,-0.002043
3543,2024-07-29,224.899994,234.270004,224.699997,232.100006,232.100006,129201800,0.0,0.0,0.055960


In [59]:
daily_sentiment

Unnamed: 0,date,Sentiment
0,2009-02-14,0.038008
1,2009-04-27,0.000000
2,2009-04-29,0.000000
3,2009-05-22,0.000000
4,2009-05-27,0.234091
...,...,...
3941,2020-05-30,0.044190
3942,2020-05-31,-0.001471
3943,2020-06-01,0.064714
3944,2020-06-02,0.009731
