In [160]:
import pandas as pd
import numpy as np 
import ast
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [161]:
# read datasets & create dataframes

comdata = pd.read_csv('final_data.csv', index_col= 'Unnamed: 0')
ret_data = pd.read_csv('return_daily.csv', index_col='Unnamed: 0')

In [162]:
market_ret = ret_data[ret_data.index == 'mean']
# market_ret

In [163]:
stock_ret = ret_data.drop('mean', axis=0)
# stock_ret

In [164]:
# save small sample of data for testing
stock_ret.sample(100).to_csv("Data/test_stock_ret.csv")

In [165]:
def parse_isin_list(isin_str):
    # Convert string representation of list into actual list
    try:
        isin_list = ast.literal_eval(isin_str)
        # Replace 'None' and actual None with np.nan
        return [np.nan if (item is None or item == 'None') else item for item in isin_list]
    except:
        return []  # Return empty list if parsing fails

In [166]:
comdata['bidders_isin'] = comdata['bidders_isin'].apply(parse_isin_list)

In [167]:
df_exploded = comdata.explode('bidders_isin')
df_exploded['is_bidder'] = df_exploded['bidders_isin'].notna().astype(int)

In [168]:
df_exploded['targets_isin'] = df_exploded['targets_isin'].apply(parse_isin_list)

In [169]:
df_exploded = df_exploded.explode('targets_isin')
df_exploded['is_target'] = df_exploded['targets_isin'].notna().astype(int)

In [170]:
df_exploded.columns

Index(['number', 'date_completion', 'bidders_name', 'date_completion_assumed',
       'date_last_status_update', 'status', 'bidders_industry', 'id',
       'comments', 'targets_industry', 'targets_isin', 'date_rumor',
       'date_postponed', 'date_announcement', 'bidders_isin',
       'date_completion_expected', 'date_withdrawn', 'targets_country', 'type',
       'targets_name', 'bidders_country', 'sentences', 'cleaned_sentences',
       'stemmed_sentences', 'lemmatized_sentences', 'pos_tagged_sentences',
       'dependency_parsed_sentences', 'sentiment', 'vader_sentiment',
       'named_entities', 'topic', 'emotion', 'is_bidder', 'is_target'],
      dtype='object')

In [171]:
senti_cols = ['status', 'bidders_industry', 'bidders_isin', 'targets_industry', 
              'targets_isin', 'date_rumor', 'date_announcement', 'targets_country',
              'bidders_country', 'sentiment', 'vader_sentiment', 'named_entities', 
              'topic', 'emotion', 'is_bidder', 'is_target']

In [172]:
senti_df = df_exploded[senti_cols]

In [173]:
senti_df['ISIN'] = senti_df['bidders_isin'].str.cat(senti_df['targets_isin'], sep='', na_rep='')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  senti_df['ISIN'] = senti_df['bidders_isin'].str.cat(senti_df['targets_isin'], sep='', na_rep='')


In [174]:
senti_df.head()
senti_df.sample(100).to_csv("Data/test_senti.csv")

In [175]:
senti_df

Unnamed: 0,status,bidders_industry,bidders_isin,targets_industry,targets_isin,date_rumor,date_announcement,targets_country,bidders_country,sentiment,vader_sentiment,named_entities,topic,emotion,is_bidder,is_target,ISIN
0,Completed,['6499'],,['2910'],INE451A01017,2014-07-31,2014-07-31,['IN'],['IN'],0.9118,positive,"[('20140801', 'DATE'), ('310714', 'CARDINAL'),...",3,"[('positive', 0.25)]",0,1,INE451A01017
1,Completed,['2931'],CNE100004C11,['2561'],,2022-04-27,2022-04-27,['CN'],['CN'],0.3612,positive,"[('20220427', 'DATE'), ('automotive electronic...",4,"[('positive', 0.5)]",1,0,CNE100004C11
2,Completed Assumed,['2651'],CNE100000643,['4669'],,2019-09-16,2019-09-16,['CN'],['CN'],0.2960,positive,"[(""'20190916"", 'DATE'), ('beijing', 'GPE'), ('...",1,"[('positive', 0.45)]",1,0,CNE100000643
3,Completed,['2932'],KYG693691092,['2932'],,2015-06-02,2015-06-02,['SG'],['KY'],-0.2960,negative,"[(""'20150602"", 'DATE'), ('5 million', 'CARDINA...",4,"[('positive', 1.0)]",1,0,KYG693691092
4,Withdrawn,['4519'],,['2931'],AU000000SIX0,2019-04-23,2019-04-23,['AU'],['CN'],0.9531,positive,"[('sprintex ltd', 'ORG'), ('11939765', 'DATE')...",3,"[('positive', 0.296875)]",0,1,AU000000SIX0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3390,Completed Assumed,['2930'],CNE1000022D8,['2932'],,2016-03-23,2016-03-23,['CN'],['CN'],0.1779,positive,"[('20160323', 'DATE'), ('changzhou', 'GPE'), (...",1,"[('positive', 0.5263157894736842)]",1,0,CNE1000022D8
3391,Completed,['2219'],CNE100002F77,['2899'],,2017-11-20,2017-11-20,['CN'],['CN'],0.0000,neutral,"[(""'20171120"", 'DATE'), ('51 per cent', 'MONEY...",1,"[('positive', 0.5555555555555556)]",1,0,CNE100002F77
3392,Completed,['2910'],KR7067170001,['2521'],,2011-09-30,2011-09-30,['KR'],['KR'],0.5267,positive,"[(""'20110930"", 'DATE'), ('501 million', 'CARDI...",3,"[('positive', 0.36363636363636365)]",1,0,KR7067170001
3393,Completed Assumed,['2910'],CNE000000ZT3,[],,2011-10-18,2011-10-18,[],['CN'],0.8225,positive,"[(""'20111018"", 'DATE'), ('dongfeng automobile ...",2,"[('positive', 0.4666666666666667)]",1,0,CNE000000ZT3


In [176]:
# Ensure the ISIN column in test_senti is standardized
senti_df['ISIN'] = senti_df['ISIN'].astype(str).str.strip().str.upper()

# Reset index of test_stock_ret to make ISIN a column
stock_ret.reset_index(inplace=True)
stock_ret.rename(columns={'index': 'ISIN'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  senti_df['ISIN'] = senti_df['ISIN'].astype(str).str.strip().str.upper()


In [177]:
stock_ret

Unnamed: 0,ISIN,2010-01-05,2010-01-06,2010-01-07,2010-01-08,2010-01-11,2010-01-12,2010-01-13,2010-01-14,2010-01-15,...,2023-12-15,2023-12-18,2023-12-19,2023-12-20,2023-12-21,2023-12-22,2023-12-26,2023-12-27,2023-12-28,2023-12-29
0,DE0005190003,0.009048,0.014069,0.009148,-0.011331,-0.014364,-0.034109,0.012520,0.009195,-0.006912,...,0.000974,-0.011877,-0.004532,-0.011580,-0.005407,0.010369,,,0.003789,0.001391
1,DE0007100000,-0.005603,-0.003329,-0.009155,0.000259,0.008556,-0.031749,0.008330,0.019979,-0.012972,...,0.017350,-0.012093,-0.009887,-0.010303,0.001602,0.007675,,,-0.011136,0.007079
2,DE0007664005,0.015633,0.001348,0.008022,0.014982,-0.010064,-0.024413,0.007301,-0.012360,-0.008695,...,-0.015091,-0.006855,-0.001218,0.003659,-0.024706,-0.000831,,,-0.015520,-0.000426
3,US88262P1021,0.056468,-0.025180,-0.008330,-0.025210,-0.002762,-0.005882,0.015308,-0.040771,-0.004639,...,-0.004190,0.003218,0.015602,-0.008406,0.002683,-0.001372,0.014055,-0.000903,-0.023769,-0.016063
4,DE0005439004,0.068941,0.006240,0.138334,0.022166,-0.063386,-0.043002,0.043728,-0.018061,-0.019836,...,0.010582,-0.008377,-0.000528,0.004754,-0.002366,0.005534,,,0.008179,0.007590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,AU0000224040,0.009296,-0.009211,0.016268,-0.008494,0.011862,-0.004559,0.000327,0.006540,-0.027940,...,0.007559,0.005359,0.010661,0.012658,-0.010417,0.013684,,,-0.012618,0.009585
214,CA98474P5013,,,,,,,,,,...,-0.024096,0.012346,0.012195,0.024096,0.000000,0.000000,,,0.023256,-0.011364
215,JP3955800002,-0.010000,0.008418,-0.050083,-0.010545,-0.001776,0.017794,-0.005245,-0.001757,0.007042,...,-0.009804,0.000000,-0.019802,0.030303,-0.009804,0.009901,,,0.000000,0.009804
216,CA98936C8584,,,,,,,,,,...,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000,,,0.000000,0.000000


In [178]:
# search for ISIN DE0007856023 in stock_ret
stock_returns = stock_ret[stock_ret['ISIN'] == 'DE0007856023'].iloc[:, 1:]

In [179]:
# search for isin DE0007856023 in senti_df
senti_df[senti_df['ISIN'] == 'DE0007856023'].iloc[:, 17:]

889
1109
1110
1129
1976
2001
2050
2054
2448
2693
3211


In [180]:
senti_df

Unnamed: 0,status,bidders_industry,bidders_isin,targets_industry,targets_isin,date_rumor,date_announcement,targets_country,bidders_country,sentiment,vader_sentiment,named_entities,topic,emotion,is_bidder,is_target,ISIN
0,Completed,['6499'],,['2910'],INE451A01017,2014-07-31,2014-07-31,['IN'],['IN'],0.9118,positive,"[('20140801', 'DATE'), ('310714', 'CARDINAL'),...",3,"[('positive', 0.25)]",0,1,INE451A01017
1,Completed,['2931'],CNE100004C11,['2561'],,2022-04-27,2022-04-27,['CN'],['CN'],0.3612,positive,"[('20220427', 'DATE'), ('automotive electronic...",4,"[('positive', 0.5)]",1,0,CNE100004C11
2,Completed Assumed,['2651'],CNE100000643,['4669'],,2019-09-16,2019-09-16,['CN'],['CN'],0.2960,positive,"[(""'20190916"", 'DATE'), ('beijing', 'GPE'), ('...",1,"[('positive', 0.45)]",1,0,CNE100000643
3,Completed,['2932'],KYG693691092,['2932'],,2015-06-02,2015-06-02,['SG'],['KY'],-0.2960,negative,"[(""'20150602"", 'DATE'), ('5 million', 'CARDINA...",4,"[('positive', 1.0)]",1,0,KYG693691092
4,Withdrawn,['4519'],,['2931'],AU000000SIX0,2019-04-23,2019-04-23,['AU'],['CN'],0.9531,positive,"[('sprintex ltd', 'ORG'), ('11939765', 'DATE')...",3,"[('positive', 0.296875)]",0,1,AU000000SIX0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3390,Completed Assumed,['2930'],CNE1000022D8,['2932'],,2016-03-23,2016-03-23,['CN'],['CN'],0.1779,positive,"[('20160323', 'DATE'), ('changzhou', 'GPE'), (...",1,"[('positive', 0.5263157894736842)]",1,0,CNE1000022D8
3391,Completed,['2219'],CNE100002F77,['2899'],,2017-11-20,2017-11-20,['CN'],['CN'],0.0000,neutral,"[(""'20171120"", 'DATE'), ('51 per cent', 'MONEY...",1,"[('positive', 0.5555555555555556)]",1,0,CNE100002F77
3392,Completed,['2910'],KR7067170001,['2521'],,2011-09-30,2011-09-30,['KR'],['KR'],0.5267,positive,"[(""'20110930"", 'DATE'), ('501 million', 'CARDI...",3,"[('positive', 0.36363636363636365)]",1,0,KR7067170001
3393,Completed Assumed,['2910'],CNE000000ZT3,[],,2011-10-18,2011-10-18,[],['CN'],0.8225,positive,"[(""'20111018"", 'DATE'), ('dongfeng automobile ...",2,"[('positive', 0.4666666666666667)]",1,0,CNE000000ZT3


In [181]:
# calculate CARs
event_window = [-2, -1, 0, 1, 2]
baseline_window = range(-30, -3)

# Create a function to calculate CARs for each ISIN
def calculate_cars(isin, event_date):
    try:
        # Filter stock return data for the specific ISIN
        stock_returns = stock_ret[stock_ret['ISIN'] == isin].iloc[:, 1:]

        # Convert event_date to column format
        event_idx = stock_returns.columns.get_loc(event_date)

        # Event window returns
        event_indices = [event_idx + offset for offset in event_window]
        event_returns = stock_returns.iloc[:, event_indices].values.flatten()

        # Baseline mean return
        baseline_indices = [event_idx + offset for offset in baseline_window]
        baseline_returns = stock_returns.iloc[:, baseline_indices].values.flatten()
        expected_return = np.nanmean(baseline_returns)

        # Calculate abnormal returns and CARs
        abnormal_returns = event_returns - expected_return
        cars = np.nansum(abnormal_returns)
        return cars
    except Exception as e:
        return np.nan  # Return NaN if there is an issue


senti_df['CARs'] = senti_df.apply(
lambda row: calculate_cars(row['ISIN'], row['date_rumor']), axis=1
)

  expected_return = np.nanmean(baseline_returns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  senti_df['CARs'] = senti_df.apply(


In [182]:
senti_df

Unnamed: 0,status,bidders_industry,bidders_isin,targets_industry,targets_isin,date_rumor,date_announcement,targets_country,bidders_country,sentiment,vader_sentiment,named_entities,topic,emotion,is_bidder,is_target,ISIN,CARs
0,Completed,['6499'],,['2910'],INE451A01017,2014-07-31,2014-07-31,['IN'],['IN'],0.9118,positive,"[('20140801', 'DATE'), ('310714', 'CARDINAL'),...",3,"[('positive', 0.25)]",0,1,INE451A01017,0.0
1,Completed,['2931'],CNE100004C11,['2561'],,2022-04-27,2022-04-27,['CN'],['CN'],0.3612,positive,"[('20220427', 'DATE'), ('automotive electronic...",4,"[('positive', 0.5)]",1,0,CNE100004C11,0.0
2,Completed Assumed,['2651'],CNE100000643,['4669'],,2019-09-16,2019-09-16,['CN'],['CN'],0.2960,positive,"[(""'20190916"", 'DATE'), ('beijing', 'GPE'), ('...",1,"[('positive', 0.45)]",1,0,CNE100000643,0.0
3,Completed,['2932'],KYG693691092,['2932'],,2015-06-02,2015-06-02,['SG'],['KY'],-0.2960,negative,"[(""'20150602"", 'DATE'), ('5 million', 'CARDINA...",4,"[('positive', 1.0)]",1,0,KYG693691092,0.0
4,Withdrawn,['4519'],,['2931'],AU000000SIX0,2019-04-23,2019-04-23,['AU'],['CN'],0.9531,positive,"[('sprintex ltd', 'ORG'), ('11939765', 'DATE')...",3,"[('positive', 0.296875)]",0,1,AU000000SIX0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3390,Completed Assumed,['2930'],CNE1000022D8,['2932'],,2016-03-23,2016-03-23,['CN'],['CN'],0.1779,positive,"[('20160323', 'DATE'), ('changzhou', 'GPE'), (...",1,"[('positive', 0.5263157894736842)]",1,0,CNE1000022D8,0.0
3391,Completed,['2219'],CNE100002F77,['2899'],,2017-11-20,2017-11-20,['CN'],['CN'],0.0000,neutral,"[(""'20171120"", 'DATE'), ('51 per cent', 'MONEY...",1,"[('positive', 0.5555555555555556)]",1,0,CNE100002F77,0.0
3392,Completed,['2910'],KR7067170001,['2521'],,2011-09-30,2011-09-30,['KR'],['KR'],0.5267,positive,"[(""'20110930"", 'DATE'), ('501 million', 'CARDI...",3,"[('positive', 0.36363636363636365)]",1,0,KR7067170001,0.0
3393,Completed Assumed,['2910'],CNE000000ZT3,[],,2011-10-18,2011-10-18,[],['CN'],0.8225,positive,"[(""'20111018"", 'DATE'), ('dongfeng automobile ...",2,"[('positive', 0.4666666666666667)]",1,0,CNE000000ZT3,0.0


In [183]:
# Merge on ISIN
merged_data = pd.merge(senti_df, stock_ret, on='ISIN', how='inner')

# check if nan values are present
print(merged_data['CARs'].isnull().sum())

# drop nan values
merged_data = merged_data.dropna(subset=['CARs'])

# check if nan values are present
print(merged_data['CARs'].isnull().sum())

20
0


In [200]:
# Define dependent variable (CAR bzw. CAAR) and independent variables (Sentiment, Market Returns, ...)

X = merged_data[['sentiment']]  # adjust for actual data
y = merged_data['CARs'] # bzw. CAAR

X = X.dropna()
y = y[X.index]

In [194]:
# split data into training and test datasets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Linear Regression


In [195]:
# fit regression

model = LinearRegression()
model.fit(X_train, y_train)

In [196]:
# print coeff and intercept

print(f"Intercept: {model.intercept_}")
print(f"Sentiment Coefficient: {model.coef_[0]}")
print(f"Market Return Coefficient: {model.coef_[1]}")

Intercept: -0.00060412727192234
Sentiment Coefficient: 0.021244003957141003


IndexError: index 1 is out of bounds for axis 0 with size 1

In [197]:
# Optionally, evaluate the model on the test set

y_pred = model.predict(X_test)
print(f"Predicted CAR: {y_pred}")
print(f"Actual CAR: {y_test.values}")

Predicted CAR: [ 0.01706239  0.01427305 -0.01845334  0.00706921  0.00988829  0.00706921
 -0.00060413  0.00835022  0.00420339  0.02063563  0.01534162 -0.00060413
  0.00793384 -0.00170032  0.01058509  0.01341054 -0.00060413 -0.00060413
  0.01564754  0.01619138  0.01876616  0.00750683 -0.01333778  0.00615571
 -0.00689235  0.01956281  0.01954581 -0.00060413  0.00988829 -0.00385658
  0.00157763 -0.00060413  0.00615571  0.01903808 -0.00170032  0.01876616
 -0.00060413  0.01447062  0.01363998  0.01856221  0.01407123  0.01363998
 -0.00736397  0.01706239  0.00750683  0.00750683  0.01686907  0.00369353
  0.0154967  -0.00689235  0.01749364 -0.00060413 -0.00060413 -0.00689235
  0.00706921  0.01292618  0.0191528  -0.00689235  0.00952289  0.00420339
  0.00470687  0.00988829  0.01534162  0.01909756 -0.01179334  0.00706921
 -0.00689235 -0.00689235 -0.00689235 -0.00060413  0.01317473 -0.00689235
  0.02063563  0.0124057  -0.00689235  0.0056841   0.01154532 -0.00689235
  0.01058509  0.01407123 -0.00689235

In [198]:
# print R-squared value to check how well the model fits the data

print(f"R-squared: {model.score(X_test, y_test)}")

R-squared: -0.040117426614711515


In [201]:
from sklearn.metrics import r2_score, mean_squared_error

# Initialize and fit the model
model = LinearRegression()
model.fit(X, y)

# Make predictions and evaluate
y_pred = model.predict(X)
print("R-squared:", r2_score(y, y_pred))
print("Mean Squared Error:", mean_squared_error(y, y_pred))

# Step 7: Interpret results
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_
})
print(coefficients)

R-squared: 0.01311527402008994
Mean Squared Error: 0.004127120981394329
     Feature  Coefficient
0  sentiment     0.016423


Ridge Regression

In [202]:
ridge_regressor = Ridge(alpha=1.0)
ridge_regressor.fit(X_train, y_train)

In [203]:
print(f"Ridge Intercept: {ridge_regressor.intercept_}")
print(f"Ridge Sentiment Coefficient: {ridge_regressor.coef_[0]}")
print(f"Ridge Market Return Coefficient: {ridge_regressor.coef_[1]}")

Ridge Intercept: -0.000514563989950389
Ridge Sentiment Coefficient: 0.020957946410175177


IndexError: index 1 is out of bounds for axis 0 with size 1

In [204]:
print(ridge_regressor.summary())

AttributeError: 'Ridge' object has no attribute 'summary'

Lasso Regression

In [205]:
lasso_regressor = Lasso(alpha=0.1)
lasso_regressor.fit(X_train, y_train)

In [206]:
print(f"Lasso Intercept: {lasso_regressor.intercept_}")
print(f"Lasso Sentiment Coefficient: {lasso_regressor.coef_[0]}")
print(f"Lasso Market Return Coefficient: {lasso_regressor.coef_[1]}")

Lasso Intercept: 0.00604727115055651
Lasso Sentiment Coefficient: 0.0


IndexError: index 1 is out of bounds for axis 0 with size 1

In [207]:
print(lasso_regressor.summary())

AttributeError: 'Lasso' object has no attribute 'summary'

Decision Tree

In [208]:
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train, y_train)

In [209]:
print(f"Decision Tree Feature Importances: {dt_regressor.feature_importances_}")

Decision Tree Feature Importances: [1.]


Random Forest

In [210]:
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

In [211]:
print(f"Random Forest Feature Importances: {rf_regressor.feature_importances_}")

Random Forest Feature Importances: [1.]
