In [174]:
import pandas as pd
import numpy as np
from textblob import TextBlob

Load the dataset into a pandas DataFrame

In [175]:
news_data = pd.read_csv('../data/raw_analyst_ratings.csv')
tsla_data = pd.read_csv('../data/TSLA_historical_data.csv')

check the data  

In [176]:
print(news_data)

         Unnamed: 0                                           headline  \
0                 0            Stocks That Hit 52-Week Highs On Friday   
1                 1         Stocks That Hit 52-Week Highs On Wednesday   
2                 2                      71 Biggest Movers From Friday   
3                 3       46 Stocks Moving In Friday's Mid-Day Session   
4                 4  B of A Securities Maintains Neutral on Agilent...   
...             ...                                                ...   
1407323     1413844             Top Narrow Based Indexes For August 29   
1407324     1413845  Recap: Wednesday's Top Percentage Gainers and ...   
1407325     1413846  UPDATE: Oppenheimer Color on China Zenix Auto ...   
1407326     1413847  Oppenheimer Initiates China Zenix At Outperfor...   
1407327     1413848  China Zenix Auto International Opens For Tradi...   

                                                       url          publisher  \
0        https://www.benzinga.

In [177]:
print(tsla_data)

            Date        Open        High         Low       Close   Adj Close  \
0     2010-06-29    1.266667    1.666667    1.169333    1.592667    1.592667   
1     2010-06-30    1.719333    2.028000    1.553333    1.588667    1.588667   
2     2010-07-01    1.666667    1.728000    1.351333    1.464000    1.464000   
3     2010-07-02    1.533333    1.540000    1.247333    1.280000    1.280000   
4     2010-07-06    1.333333    1.333333    1.055333    1.074000    1.074000   
...          ...         ...         ...         ...         ...         ...   
3540  2024-07-24  225.419998  225.990005  214.710007  215.990005  215.990005   
3541  2024-07-25  216.800003  226.000000  216.229996  220.250000  220.250000   
3542  2024-07-26  221.190002  222.279999  215.330002  219.800003  219.800003   
3543  2024-07-29  224.899994  234.270004  224.699997  232.100006  232.100006   
3544  2024-07-30  232.250000  232.410004  220.000000  222.619995  222.619995   

         Volume  Dividends  Stock Split

Convert the data to datetime 

In [178]:
news_data['date'] = pd.to_datetime(news_data['date'], format='ISO8601')
tsla_data['Date'] = pd.to_datetime(tsla_data['Date'],utc=True)


In [179]:
print(news_data['date'])

0         2020-06-05 10:30:54-04:00
1         2020-06-03 10:45:20-04:00
2         2020-05-26 04:30:07-04:00
3         2020-05-22 12:45:06-04:00
4         2020-05-22 11:38:59-04:00
                     ...           
1407323   2011-08-29 00:00:00-04:00
1407324   2011-06-22 00:00:00-04:00
1407325   2011-06-21 00:00:00-04:00
1407326   2011-06-21 00:00:00-04:00
1407327   2011-05-12 00:00:00-04:00
Name: date, Length: 1407328, dtype: datetime64[ns, UTC-04:00]


Sentiment Analysis

In [180]:
news_data['sentiment'] = news_data['headline'].apply(lambda x: TextBlob(x).sentiment.polarity)


In [181]:
print(news_data['sentiment'])

0          0.00
1          0.00
2          0.00
3          0.00
4          0.00
           ... 
1407323    0.15
1407324    0.15
1407325    0.00
1407326    0.00
1407327    0.00
Name: sentiment, Length: 1407328, dtype: float64


Calculate Daily Stock Returns

In [182]:
# Compute daily percentage change in closing prices to represent stock movements

tsla_data['Daily_Returns'] = tsla_data['Close'].pct_change() * 100

Aggregate sentiment scores if multiple articles appear on the same day

In [183]:
daily_sentiment = news_data.groupby('date')['sentiment'].mean().reset_index()

Merge the datasets on the date

In [184]:
# Ensure both datasets have the same dates
merged_data = pd.concat([daily_sentiment, tsla_data], axis=1)

Print merged dataset

In [185]:
print("Merged dataset:")
print(merged_data)

Merged dataset:
                           date  sentiment                      Date  \
0     2009-02-14 00:00:00-04:00   0.000000 2010-06-29 00:00:00+00:00   
1     2009-04-27 00:00:00-04:00   0.000000 2010-06-30 00:00:00+00:00   
2     2009-04-29 00:00:00-04:00   0.000000 2010-07-01 00:00:00+00:00   
3     2009-05-22 00:00:00-04:00   0.000000 2010-07-02 00:00:00+00:00   
4     2009-05-27 00:00:00-04:00   0.234091 2010-07-06 00:00:00+00:00   
...                         ...        ...                       ...   
39952 2020-06-11 16:49:41-04:00   0.000000                       NaT   
39953 2020-06-11 16:51:33-04:00   0.000000                       NaT   
39954 2020-06-11 17:01:39-04:00  -0.085185                       NaT   
39955 2020-06-11 17:11:20-04:00   0.000000                       NaT   
39956 2020-06-11 17:12:35-04:00   0.000000                       NaT   

           Open      High       Low     Close  Adj Close       Volume  \
0      1.266667  1.666667  1.169333  1.592667 

Check for missing values

In [186]:
missing_sentiment = merged_data['sentiment'].isnull().sum()
missing_returns = merged_data['Daily_Returns'].isnull().sum()
print("Missing values in sentiment:", missing_sentiment)
print("Missing values in Daily_Returns:", missing_returns)

# Check for infinite values
infinite_sentiment = np.isinf(merged_data['sentiment']).sum()
infinite_returns = np.isinf(merged_data['Daily_Returns']).sum()
print("Infinite values in sentiment:", infinite_sentiment)
print("Infinite values in Daily_Returns:", infinite_returns)

Missing values in sentiment: 0
Missing values in Daily_Returns: 36413
Infinite values in sentiment: 0
Infinite values in Daily_Returns: 0


In [187]:

print("Merged dataset:")
print(merged_data) 

Merged dataset:
                           date  sentiment                      Date  \
0     2009-02-14 00:00:00-04:00   0.000000 2010-06-29 00:00:00+00:00   
1     2009-04-27 00:00:00-04:00   0.000000 2010-06-30 00:00:00+00:00   
2     2009-04-29 00:00:00-04:00   0.000000 2010-07-01 00:00:00+00:00   
3     2009-05-22 00:00:00-04:00   0.000000 2010-07-02 00:00:00+00:00   
4     2009-05-27 00:00:00-04:00   0.234091 2010-07-06 00:00:00+00:00   
...                         ...        ...                       ...   
39952 2020-06-11 16:49:41-04:00   0.000000                       NaT   
39953 2020-06-11 16:51:33-04:00   0.000000                       NaT   
39954 2020-06-11 17:01:39-04:00  -0.085185                       NaT   
39955 2020-06-11 17:11:20-04:00   0.000000                       NaT   
39956 2020-06-11 17:12:35-04:00   0.000000                       NaT   

           Open      High       Low     Close  Adj Close       Volume  \
0      1.266667  1.666667  1.169333  1.592667 

In [188]:
print(len(merged_data['sentiment']))
print(len(merged_data['Daily_Returns']))

39957
39957


In [189]:
from scipy.stats import pearsonr

# Check for NaN or infinite values in the arrays
nan_mask = np.isnan(merged_data['sentiment']) | np.isnan(merged_data['Daily_Returns'])
inf_mask = np.isinf(merged_data['sentiment']) | np.isinf(merged_data['Daily_Returns'])

# Combine NaN and infinite masks
invalid_mask = nan_mask | inf_mask

In [190]:
# Remove invalid values from both arrays
clean_sentiment = merged_data['sentiment'][~invalid_mask]
clean_returns = merged_data['Daily_Returns'][~invalid_mask]

Correlation Analysis

 Calculate correlation between sentiment and daily stock returns

In [196]:

correlation = merged_data['sentiment'].corr(merged_data['Daily_Returns'])

print(f"Correlation between sentiment and daily stock returns: {correlation}")

Correlation between sentiment and daily stock returns: 0.04234996262396389
