In [1]:
%pip install yfinance
import yfinance as yf

Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels
Note: you may need to restart the kernel to use updated packages.


In [2]:
import yfinance as yf
import matplotlib.pyplot as plt
import datetime as dt
import pandas as pd
import numpy as np

## Read Data

In [3]:
# df = pd.read_csv('No_Holds.csv')
df = pd.read_csv('2_No_Neutrals.csv')
df.head()

Unnamed: 0,date,company_Name,ticker,broker,analytst,rating_before,rating_after,price_target_before,price_target_after,after
0,12/2/2019,Intel Corp,INTC,SUSQUEHANNA,CHRISTOPHER ROLLAND,Buy,buy,,,1
1,12/3/2019,Alphabet Cl A,GOOGL,CITI,JASON BAZINET,,buy,72.5,75.0,1
2,12/3/2019,Meta Platforms Inc,META,CITI,JASON BAZINET,BUY,buy,,240.0,1
3,12/4/2019,Apple Inc,AAPL,MAXIM GROUP,NEHAL CHOKSHI,SELL,negative,,47.5,-1
4,12/4/2019,Meta Platforms Inc,META,HSBC,NICOLAS COTE-COLISSON,,reduce,,178.0,-1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4766 entries, 0 to 4765
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   date                 4766 non-null   object
 1   company_Name         4766 non-null   object
 2   ticker               4766 non-null   object
 3   broker               4133 non-null   object
 4   analytst             4766 non-null   object
 5   rating_before        3770 non-null   object
 6   rating_after         4766 non-null   object
 7   price_target_before  3403 non-null   object
 8   price_target_after   4734 non-null   object
 9   after                4766 non-null   int64 
dtypes: int64(1), object(9)
memory usage: 372.5+ KB


In [5]:
df["date"] = pd.to_datetime(df["date"])

## PART 1 calculate a daily rating (aka analyst daily sentiment) for each stock & date

Identify stock dates that appear more than once to calculate a daily sentiment date by averaging 'after' (the sentiment rating for a particular date and stock ) and then drop duplicate rows

In [6]:
reduced_df = df.copy()
reduced_df = reduced_df[['date', 'ticker', 'after']]

# Identify dates that appear more than once
duplicate_dates = reduced_df[reduced_df.duplicated(subset=['date'], keep=False)]['date'].unique()

# Filter the DataFrame to show all rows for these duplicate dates
duplicate_date_rows = reduced_df[reduced_df['date'].isin(duplicate_dates)].sort_values(by='date')

# Calculate the mean of 'after' for each unique 'date' and 'ticker' combination
mean_after_per_day_ticker = duplicate_date_rows.groupby(['date', 'ticker'])['after'].mean().reset_index()

# Rename the 'after' column to 'daily_rating'
mean_after_per_day_ticker = mean_after_per_day_ticker.rename(columns={'after': 'avg_daily_rating'})

# Merge this new column back into the duplicate_date_rows DataFrame
# This will add the 'daily_rating' mean to all rows matching the 'date' and 'ticker' combination
duplicate_date_rows = pd.merge(duplicate_date_rows, mean_after_per_day_ticker, on=['date', 'ticker'], how='left')

rating = duplicate_date_rows.drop_duplicates(subset=['date', 'ticker'])

# Using avg_daily_rating create a daily_rating column
# Define the conditions for daily_rating
conditions = [
    (rating['avg_daily_rating'] > 0),
    (rating['avg_daily_rating'] < 0),
    (rating['avg_daily_rating'] == 0)
]

# Define the choices corresponding to the conditions
choices = [1, -1, 0]

# Create the new 'daily_rating' column using np.select
rating['daily_rating'] = np.select(conditions, choices, default=0)

# remove neutral rating observations where daily_rating = 0
rating = rating[rating['daily_rating'] != 0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating['daily_rating'] = np.select(conditions, choices, default=0)


In [7]:
# Select only the relevant columns from 'rating' to merge: 'date', 'ticker', 'avg_daily_rating', 'daily_rating'
rating_to_merge = rating[['date', 'ticker', 'daily_rating']]

# Perform a left merge with reduced_df on 'date' and 'ticker'
# This adds the sentiment columns to reduced_df where dates and tickers match
merged_df = pd.merge(reduced_df, rating_to_merge, on=['date', 'ticker'], how='left')

# Fill null values in 'daily_rating' with values from 'after'
merged_df['daily_rating'] = merged_df['daily_rating'].fillna(merged_df['after'])

# drop column 'after'
merged_df = merged_df.drop(columns=['after'])

# drop
daily_rating_df = merged_df.drop_duplicates(subset=['date', 'ticker'])

In [8]:
daily_rating_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2426 entries, 0 to 4765
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          2426 non-null   datetime64[ns]
 1   ticker        2426 non-null   object        
 2   daily_rating  2426 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 75.8+ KB


## Part 2 Get Closing Stock Price Data

## 1. Get closing stock price and 5 day forward closing stock price from Yahoo Finance for 8 Stocks

API info: https://ranaroussi.github.io/yfinance/reference/index.html



In [9]:
# Stocks of interest
tickers = ["NVDA", "META", "TSLA", "GOOGL", "MSFT", "AAPL", "AMZN", "INTC"]

# access yfinance data
multi_stock_data = yf.download(tickers, start="2019-12-02")

[*********************100%***********************]  8 of 8 completed


In [10]:
# Get the closing stock_df price 5 days in the future.
for ticker in tickers:
    # Create a new column for the closing price 5 days in the future.
    # shift(-5) moves the data 5 steps 'up', meaning the value at date X
    # will be the closing price of date X+5.
    multi_stock_data[('Close_5d', ticker)] = multi_stock_data[('Close', ticker)].shift(-5)

# Save closing price and closing price 5 days into the future
multi_stock_data = multi_stock_data[['Close', 'Close_5d']]

# Stack the 'Ticker' level from the columns to create a new index level.
# The result will have a MultiIndex for rows (Date, Ticker) and the price metrics as columns.
stacked_df = multi_stock_data.stack(level='Ticker')

# Reset the index to convert the MultiIndex (Date, Ticker) into regular columns,
# creating a long-format DataFrame.
stock_df = stacked_df.reset_index()

##2. Calculate 5 day close return

Using 'close' Price and close price 5 days in the future calculate the 5 day return of close price.

$$\frac{\text{(close_future_5d} - \text{close})}{\text{close}}$$

In [11]:
# calc 5 day rate of return and add column to stock_df df
stock_df['return_5d'] = (stock_df['Close_5d'] - stock_df['Close']) / stock_df['Close']
stock_df

Price,Date,Ticker,Close,Close_5d,return_5d
0,2019-12-02,AAPL,63.736347,64.402313,0.010449
1,2019-12-02,AMZN,89.080002,87.475502,-0.018012
2,2019-12-02,GOOGL,63.959732,66.645943,0.041998
3,2019-12-02,INTC,50.861416,49.864647,-0.019598
4,2019-12-02,META,198.314713,199.943329,0.008212
...,...,...,...,...,...
12275,2026-01-09,INTC,45.549999,,
12276,2026-01-09,META,653.059998,,
12277,2026-01-09,MSFT,479.279999,,
12278,2026-01-09,NVDA,184.860001,,


## Part 3 Merge Sentiment Data (daily rating) and Stock Data

In [12]:
# convert column names to lowercase
stock_df.columns = stock_df.columns.str.lower()
daily_rating_df.columns = daily_rating_df.columns.str.lower()

# Convert 'date' column in daily_rating_df to datetime objects
daily_rating_df['date'] = pd.to_datetime(daily_rating_df['date'])

# Merge the two dataframes on 'Date' and 'ticker'
combined_df = pd.merge(daily_rating_df, stock_df, on=['date', 'ticker'], how='inner')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  daily_rating_df['date'] = pd.to_datetime(daily_rating_df['date'])


In [13]:
combined_df

Unnamed: 0,date,ticker,daily_rating,close,close_5d,return_5d
0,2019-12-02,INTC,1.0,50.861416,49.864647,-0.019598
1,2019-12-03,GOOGL,1.0,64.251541,66.640984,0.037189
2,2019-12-03,META,1.0,197.440826,199.476562,0.010311
3,2019-12-04,AAPL,-1.0,63.152462,65.331215,0.034500
4,2019-12-04,META,-1.0,197.331573,200.856949,0.017865
...,...,...,...,...,...,...
2421,2025-09-08,NVDA,1.0,168.291138,177.740097,0.056147
2422,2025-09-09,AAPL,1.0,234.123047,237.919357,0.016215
2423,2025-09-10,AAPL,1.0,226.570358,238.758560,0.053794
2424,2025-09-11,NVDA,1.0,177.160141,176.230194,-0.005249


In [14]:
# post AI Indicator
# pre AI = 0, post AI = 1
combined_df["post_ai"] = (combined_df["date"] >= "2022-11-30").astype(int)

In [15]:
# combined_df.to_csv('daily_ratings_and_stock_prices.csv')

In [17]:
# separate Pre_AI and Post_AI data into 2 dfs
pre_df = combined_df[combined_df['post_ai'] == 0]
post_df = combined_df[combined_df['post_ai'] == 1]

NameError: name 'combined_df' is not defined

In [18]:
pre_df

NameError: name 'pre_df' is not defined

In [19]:
post_df

NameError: name 'post_df' is not defined

## T-Test 1: Analyst Rating (Sentiment) Effect Pre-AI

In [None]:
from scipy import stats
## Pre AI Hypothesis test

# Separate groups based on daily_rating
group_positive_pre = pre_df[pre_df['daily_rating'] == 1]['return_5d']
group_negative_pre = pre_df[pre_df['daily_rating'] == -1]['return_5d']

# Perform independent t-test
t_statistic, p_value = stats.ttest_ind(group_positive_pre, group_negative_pre, equal_var=False)

print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

if p_value < 0.05:
    print("There is a statistically significant difference between the means of the two groups (daily_rating 1 and daily_rating -1).")
else:
    print("There is no statistically significant difference between the means of the two groups (daily_rating 1 and daily_rating -1).")

T-statistic: -2.062656712737574
P-value: 0.041629022668791406
There is a statistically significant difference between the means of the two groups (daily_rating 1 and daily_rating -1).


### Sample Statistics for Pre-AI Groups

In [None]:
print("Pre-AI Period (daily_rating = 1) Statistics:")
display(group_positive_pre.describe())

print("\nPre-AI Period (daily_rating = -1) Statistics:")
display(group_negative_pre.describe())

Pre-AI Period (daily_rating = 1) Statistics:


Unnamed: 0,return_5d
count,1202.0
mean,0.007646
std,0.064508
min,-0.337354
25%,-0.02644
50%,0.009628
75%,0.041412
max,0.397799



Pre-AI Period (daily_rating = -1) Statistics:


Unnamed: 0,return_5d
count,99.0
mean,0.029491
std,0.10374
min,-0.182378
25%,-0.026665
50%,0.006107
75%,0.052174
max,0.564756


## T-Test2: Analyst Rating (Sentiment) Effect Post AI

In [None]:
## Post AI Hypothesis test

# Separate groups based on daily_rating
group_positive_post = post_df[post_df['daily_rating'] == 1]['return_5d']
group_negative_post = post_df[post_df['daily_rating'] == -1]['return_5d']

# Perform independent t-test
t_statistic, p_value = stats.ttest_ind(group_positive_post, group_negative_post, equal_var=False)

print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

if p_value < 0.05:
    print("There is a statistically significant difference between the means of the two groups (daily_rating 1 and daily_rating -1).")
else:
    print("There is no statistically significant difference between the means of the two groups (daily_rating 1 and daily_rating -1).")

T-statistic: 0.107439553797923
P-value: 0.9148383559984173
There is no statistically significant difference between the means of the two groups (daily_rating 1 and daily_rating -1).


In [None]:
print("Post-AI Period (daily_rating = 1) Statistics:")
display(group_positive_post.describe())

print("\nPost-AI Period (daily_rating = -1) Statistics:")
display(group_negative_post.describe())

Post-AI Period (daily_rating = 1) Statistics:


Unnamed: 0,return_5d
count,1073.0
mean,0.008139
std,0.05934
min,-0.272036
25%,-0.024454
50%,0.004993
75%,0.039263
max,0.366068



Post-AI Period (daily_rating = -1) Statistics:


Unnamed: 0,return_5d
count,52.0
mean,0.006983
std,0.076504
min,-0.173583
25%,-0.032167
50%,-0.00186
75%,0.057153
max,0.250024


### Sample Statistics for Post-AI Groups

## T-Test3: Difference-in-Differences (DiD)
## Q: Did the Analyst Rating (Sentiment) Effect Change post vs pre AI?




### Regression is the correct T-Test for DiD

Want to run regression (daily_rating x ai_interaction)

In [None]:
# run regression (daily_rating x AI interaction) using combined_df

# Add post_ai interaction column (daily_rating*post_ai)
combined_df['post_ai_interaction'] = (combined_df['daily_rating'] * combined_df['post_ai']).astype(int)

In [None]:
combined_df

Unnamed: 0,date,ticker,daily_rating,close,close_5d,return_5d,post_ai,post_ai_interaction
0,2019-12-02,INTC,1.0,50.861420,49.864651,-0.019598,0,0
1,2019-12-03,GOOGL,1.0,64.251534,66.640976,0.037189,0,0
2,2019-12-03,META,1.0,197.440811,199.476593,0.010311,0,0
3,2019-12-04,AAPL,-1.0,63.152473,65.331207,0.034500,0,0
4,2019-12-04,META,-1.0,197.331573,200.856949,0.017865,0,0
...,...,...,...,...,...,...,...,...
2421,2025-09-08,NVDA,1.0,168.291138,177.740097,0.056147,1,1
2422,2025-09-09,AAPL,1.0,234.123047,237.919357,0.016215,1,1
2423,2025-09-10,AAPL,1.0,226.570358,238.758560,0.053794,1,1
2424,2025-09-11,NVDA,1.0,177.160141,176.230194,-0.005249,1,1


*   Y:'return_5d'
*   X1:'daily_rating'
*   X2:'post_ai'
*   X3:'post_ai_interaction'


In [None]:
# Columns
# Y (dependent variable): 'return_5d' = 5 trading day forward return for 8 Stocks
# X1: 'daily_rating' = daily sentiment for the day where 1:net postive sentiment, 0:net negative sentiment
# X2: 'post_ai' = 1 if date > 2022-11-30
# X3: 'post_ai_interaction' = daily_rating * post_ai

#We want to test the change in daily_rating effect or the 'post_ai_interaction'

# Hypothesis test is essentially a T-Test on the interaction coefficient: 'post_ai_interaction'

# To test the Hypothesis we need to run a OLS regression

# H0: coef('post_ai_interaction') = 0

# If H0 rejected, analyst rating matters differently after AI
# If H0 not rejected, we cannot can neither confirm nor deny that analyst matters differently after AI


In [None]:
import statsmodels.api as sm

# run regression (sentiment x AI interaction)

X = combined_df[['daily_rating', 'post_ai', 'post_ai_interaction']]
y = combined_df['return_5d']

# Add a constant term to the independent variables
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              return_5d   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     3.557
Date:                Fri, 09 Jan 2026   Prob (F-statistic):             0.0138
Time:                        19:51:21   Log-Likelihood:                 3203.6
No. Observations:                2426   AIC:                            -6399.
Df Residuals:                    2422   BIC:                            -6376.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   0.0186    

In [None]:
# extract OLS results
# The Null hypothesis is:
# H0: coef('post_ai_interaction') = 0

ols_coeffs = model.params
ols_pvalues = model.pvalues

daily_rating_ols_coef = ols_coeffs['daily_rating']
daily_rating_ols_pvalue = ols_pvalues['daily_rating']

post_ai_ols_coef = ols_coeffs['post_ai']
post_ai_ols_pvalue = ols_pvalues['post_ai']

post_ai_interaction_ols_coef = ols_coeffs['post_ai_interaction']
post_ai_interaction_ols_pvalue = ols_pvalues['post_ai_interaction']

print(f"Daily Rating Coefficient: {daily_rating_ols_coef:.4f} (p-value: {daily_rating_ols_pvalue:.4f})")
print(f"Post AI Coefficient: {post_ai_ols_coef:.4f} (p-value: {post_ai_ols_pvalue:.4f})")
print(f"Post AI Interaction Coefficient: {post_ai_interaction_ols_coef:.4f} (p-value: {post_ai_interaction_ols_pvalue:.4f})")

Daily Rating Coefficient: -0.0109 (p-value: 0.0012)
Post AI Coefficient: -0.0110 (p-value: 0.0536)
Post AI Interaction Coefficient: 0.0115 (p-value: 0.0438)


### Summary Table

In [None]:
results_df = pd.DataFrame({
    'Test': ['pre-AI daily analyst rating', 'post-AI daily analyst rating', 'change in post-ai vs pre-ai daily analyst rating'],
    'Comparison': ['+ vs -', '+ vs -', 'DiD'],
    'p-value': [0.0416, 0.9148, 0.0438],
    't-stat': [-2.0626, 0.1074, 2.017],
    'Conclusion': ['Significant', 'Not Significant', 'Significant']
})

results_df


Unnamed: 0,Test,Comparison,p-value,t-stat,Conclusion
0,pre-AI daily analyst rating,+ vs -,0.0416,-2.0626,Significant
1,post-AI daily analyst rating,+ vs -,0.9148,0.1074,Not Significant
2,change in post-ai vs pre-ai daily analyst rating,DiD,0.0438,2.017,Significant


## With AI (pre vs post)

*   pre-AI daily rating effect: how much analyst sentiment mattered before AI
*   post-AI daily rating effect: how much analyst sentiment mattered after AI
*   Change in daily rating effect: whether analyst opinions became more or less informative after AI



