# Load packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load preprocessed news data

In [None]:
# Location of the preprocessed news CSV, relative to this notebook.
path = "../Database"
usdataset = "/news_df.csv"
totalpath = f"{path}{usdataset}"
df = pd.read_csv(totalpath)
# Parse the release dates into datetime64 values.
df = df.assign(release_date=pd.to_datetime(df['release_date']))

# FinBERT

In [None]:
# Create two all-NaN columns in df: sentiment_title and sentiment_body.
# They are object-typed so that a single cell can hold an array of scores.
df['sentiment_title'] = pd.Series(np.nan, index=df.index, dtype=object)
df['sentiment_body'] = pd.Series(np.nan, index=df.index, dtype=object)
# df.dtypes

In [None]:
#import FinBERT
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# Both the tokenizer and the classifier are loaded from the same checkpoint.
finbert_checkpoint = "ProsusAI/finbert"

tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint)

I ran this once for the news headlines and once for the news bodies. The CSV files sentimentheadlines_postFINBERT and sentimentbodies_postFINBERT contain the results.

In [None]:
# Score every article body with FinBERT and store the three rounded class
# probabilities in df['sentiment_body'].
# list_index collects the row labels of articles that raised a RuntimeError
# (e.g. inputs the model cannot process) and were therefore skipped.
list_index = []
for i in range(len(df)):
    if i % 1000 == 0:
        print("Processed", i, "values")

    text = df.at[i, 'content']

    try:
        encoded_input = tokenizer(text, return_tensors='pt')
        # Inference only: no_grad avoids building the autograd graph.
        with torch.no_grad():
            output = model(**encoded_input)
        puts = F.softmax(output.logits, dim=1).numpy()

        # NOTE(review): assumes the logits are ordered
        # (positive, negative, neutral) -- confirm against model.config.id2label.
        positive_score = puts[0][0]
        negative_score = puts[0][1]
        neutral_score = puts[0][2]

        sentiment_score = (positive_score, negative_score, neutral_score)
        puts = np.round(sentiment_score, 3)

        # Write through df.at instead of df["sentiment_body"][i] = ...: chained
        # assignment may write to a temporary copy and is a no-op under
        # pandas copy-on-write.
        df.at[i, 'sentiment_body'] = puts
    except RuntimeError:
        print(f"Skipping article {i} due to RuntimeError")
        # remember the index of every article that could not be scored
        list_index.append(i)

At this point the sentiment scores obtained from FinBERT are saved. The saved results are loaded again in the following section.

# Load sentiment data scores -correlation analysis

In [None]:
# Directory holding the CSV files that the following cells read via path + "/<name>.csv".
path = "../Database"

In [None]:
path_price = "../Database/price.csv"
# Only the date and the closing price are kept.
price = pd.read_csv(path_price).loc[:, ['Date', 'Close']]
price['Date'] = pd.to_datetime(price['Date'])
# Extra column: fractional change of Close versus the previous row (first row is NaN).
price['% Daily diff'] = price['Close'].pct_change()


In [None]:
# Ticker proportions table
totalpath = f"{path}/ticker_prop.csv"
ticker_prop = pd.read_csv(totalpath)

In [None]:
# FinBERT scores of the news bodies
totalpath = f"{path}/sentimentbodies_postFINBERT.csv"
bodies = pd.read_csv(totalpath)
bodies['release_date'] = pd.to_datetime(bodies['release_date'])
# Zero-filled placeholder columns for the single-score formulas computed later.
bodies = bodies.assign(
    **dict.fromkeys(['diff', 'rmax', 'max', 'sigmoid', 'weights', 'abs_max', 'discrete_max'], 0)
)
bodies

In [None]:
# FinBERT scores of the news headlines
totalpath = f"{path}/sentimentheadlines_postFINBERT.csv"
headlines = pd.read_csv(totalpath)
headlines['release_date'] = pd.to_datetime(headlines['release_date'])
# Zero-filled placeholder columns for the single-score formulas computed later.
headlines = headlines.assign(
    **dict.fromkeys(['diff', 'rmax', 'max', 'sigmoid', 'weights', 'abs_max', 'discrete_max'], 0)
)
headlines

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# from scipy import stats
# colors = ['#228A83', '#002060', '#7A5F3F']
# columns = ['positive', 'negative', 'neutral']
# #print kde of of positive, negative and neutral sentiment in dataframe sentiment
# for i in columns:
#     sns.kdeplot(headlines[i], shade=True, color=colors[columns.index(i)])

# #add a legend
# plt.legend(labels=['positive', 'negative', 'neutral'])
# #add title x axis 'probability score'
# plt.xlabel('probability score')
# plt.show()


In [None]:
# Stack the headline rows and the body rows into one frame, ordered by date.
# 'ticker' is selected as well because `merged` is later grouped by
# ['ticker', 'release_date']; without it that group-by raises a KeyError.
sentiment_columns = ['ticker', 'release_date', 'positive', 'negative', 'neutral',
                     'diff', 'rmax', 'max', 'sigmoid', 'weights', 'abs_max', 'discrete_max']
bodies1 = bodies[sentiment_columns]
sentiment1 = headlines[sentiment_columns]
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
merged = (
    pd.concat([sentiment1, bodies1], ignore_index=True)
    .sort_values(by=['release_date'])
    .reset_index(drop=True)
)
merged

#  Converting 3 polarity sentiment scores to one score

6 sentiment formulas are outlined below

## Headlines

In [None]:
# The formulas below use column arithmetic instead of row-wise apply: same
# values, much faster, and well-defined on an empty frame.

#diff: positive minus negative probability
headlines['diff'] = headlines['positive'] - headlines['negative']

#weights
pos_weight = 0.5
neg_weight = 0.5
neu_weight = 0

# NOTE(review): positive and negative carry the same sign here, so this is the
# mean of the two probabilities rather than a polarity score -- confirm intended.
headlines['weights'] = (
    pos_weight * headlines['positive']
    + neg_weight * headlines['negative']
    + neu_weight * headlines['neutral']
)

#score ratio: positive / negative, defined as 0 where negative is exactly 0
headlines['sratio'] = (headlines['positive'] / headlines['negative']).where(headlines['negative'] != 0, 0)

#max: larger of the positive and negative probabilities
headlines['max'] = headlines[['positive', 'negative']].max(axis=1)

#sigmoid
def sigmoid(x):
    """Logistic function: 1 / (1 + e^(-x))."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def sigmoid_official(pos_score, neg_score, neu_score):
    """Logistic of (pos_score - neg_score); neu_score is accepted but unused."""
    return sigmoid(pos_score - neg_score)
# One logistic score per headline row.
headlines['sigmoid'] = headlines.apply(
    lambda article: sigmoid_official(article['positive'], article['negative'], article['neutral']),
    axis='columns',
)

def discrete_max(pos_score, neg_score, neu_score):
    """Return 1 if positive is strictly the largest score, -1 if negative is, else 0."""
    if pos_score > neg_score and pos_score > neu_score:
        return 1
    if neg_score > pos_score and neg_score > neu_score:
        return -1
    # neutral is largest, or the top score is tied
    return 0

# One label in {-1, 0, 1} per headline row.
headlines['discrete_max'] = headlines.apply(
    lambda article: discrete_max(article['positive'], article['negative'], article['neutral']),
    axis='columns',
)


#abs_max
def abs_max(pos_score, neg_score, neu_score):
    """Signed value of the dominant score.

    Returns pos_score when it equals the largest of the three scores,
    -neg_score when the negative score is the largest, and 0 otherwise.
    """
    top = max(pos_score, neg_score, neu_score)
    if top == pos_score:
        return pos_score
    if top == neg_score:
        return -neg_score
    return 0

# One signed dominant score per headline row.
headlines['abs_max'] = headlines.apply(
    lambda article: abs_max(article['positive'], article['negative'], article['neutral']),
    axis='columns',
)


In [None]:
# Drop the raw class probabilities and the 'Sentiment title' column.
headlines = headlines.drop(columns=['positive', 'negative', 'neutral', 'Sentiment title'])

In [None]:
# Keep a copy of the per-article headline scores.
# `sentiment` is not defined anywhere in this notebook; the headline scores
# live in `headlines`. .copy() makes this an independent snapshot, not an alias.
sentiment_copy = headlines.copy()

## Bodies

In [None]:
# The formulas below use column arithmetic instead of row-wise apply: same
# values, much faster, and well-defined on an empty frame.

#diff: positive minus negative probability
bodies['diff'] = bodies['positive'] - bodies['negative']

#weights
pos_weight = 0.5
neg_weight = 0.5
neu_weight = 0

# NOTE(review): positive and negative carry the same sign here, so this is the
# mean of the two probabilities rather than a polarity score -- confirm intended.
bodies['weights'] = (
    pos_weight * bodies['positive']
    + neg_weight * bodies['negative']
    + neu_weight * bodies['neutral']
)

#score ratio: positive / negative, defined as 0 where negative is exactly 0
bodies['sratio'] = (bodies['positive'] / bodies['negative']).where(bodies['negative'] != 0, 0)

#max: larger of the positive and negative probabilities
bodies['max'] = bodies[['positive', 'negative']].max(axis=1)

#sigmoid
def sigmoid(x):
    """Logistic function: 1 / (1 + e^(-x))."""
    exp_term = np.exp(-x)
    return 1 / (1 + exp_term)
def sigmoid_official(pos_score, neg_score, neu_score):
    """Logistic of the positive-minus-negative gap; neu_score is accepted but unused."""
    gap = pos_score - neg_score
    return sigmoid(gap)
# One logistic score per body row.
bodies['sigmoid'] = bodies.apply(
    lambda article: sigmoid_official(article['positive'], article['negative'], article['neutral']),
    axis='columns',
)

def discrete_max(pos_score, neg_score, neu_score):
    """Map three scores to 1 (positive strictly largest), -1 (negative strictly largest) or 0."""
    positive_wins = pos_score > neg_score and pos_score > neu_score
    if positive_wins:
        return 1
    negative_wins = neg_score > pos_score and neg_score > neu_score
    return -1 if negative_wins else 0

# One label in {-1, 0, 1} per body row.
bodies['discrete_max'] = bodies.apply(
    lambda article: discrete_max(article['positive'], article['negative'], article['neutral']),
    axis='columns',
)


#abs_max
def abs_max(pos_score, neg_score, neu_score):
    """Return pos_score if it is the maximum, -neg_score if negative is the maximum, else 0."""
    largest = max(pos_score, neg_score, neu_score)
    if largest == pos_score:
        return pos_score
    return -neg_score if largest == neg_score else 0

# One signed dominant score per body row.
bodies['abs_max'] = bodies.apply(
    lambda article: abs_max(article['positive'], article['negative'], article['neutral']),
    axis='columns',
)


In [None]:
# Drop the raw class probabilities, the stored score array and the article text.
bodies = bodies.drop(columns=['positive', 'negative', 'neutral', 'sentiment_body', 'content'])

In [None]:
# Independent snapshot of the per-article body scores. A plain assignment would
# only alias the same frame and would reflect any later in-place change.
bodies_copy = bodies.copy()

## Headlines + bodies

In [None]:
# The formulas below use column arithmetic instead of row-wise apply: same
# values, much faster, and well-defined on an empty frame.

#diff: positive minus negative probability
merged['diff'] = merged['positive'] - merged['negative']

#weights
pos_weight = 0.5
neg_weight = 0.5
neu_weight = 0

# NOTE(review): positive and negative carry the same sign here, so this is the
# mean of the two probabilities rather than a polarity score -- confirm intended.
merged['weights'] = (
    pos_weight * merged['positive']
    + neg_weight * merged['negative']
    + neu_weight * merged['neutral']
)

#score ratio: positive / negative, defined as 0 where negative is exactly 0
merged['sratio'] = (merged['positive'] / merged['negative']).where(merged['negative'] != 0, 0)

#max: larger of the positive and negative probabilities
merged['max'] = merged[['positive', 'negative']].max(axis=1)

def sigmoid(pos_score, neg_score=0, neu_score=0):
    """Logistic of (pos_score - neg_score).

    This definition replaces the one-argument ``sigmoid(x)`` defined in earlier
    cells, which ``sigmoid_official`` still calls. The defaults keep a
    one-argument call working: ``sigmoid(x)`` returns 1 / (1 + e^(-x)).

    neu_score is accepted for a uniform signature but is not used.
    """
    x = pos_score - neg_score
    return 1 / (1 + np.exp(-x))
# One logistic score per merged row.
merged['sigmoid'] = merged.apply(
    lambda article: sigmoid(article['positive'], article['negative'], article['neutral']),
    axis='columns',
)

def discrete_max(pos_score, neg_score, neu_score):
    """Discrete polarity: 1 when positive strictly dominates, -1 when negative does, otherwise 0."""
    if pos_score > neg_score and pos_score > neu_score:
        return 1
    if neg_score > pos_score and neg_score > neu_score:
        return -1
    return 0

# One label in {-1, 0, 1} per merged row.
merged['discrete_max'] = merged.apply(
    lambda article: discrete_max(article['positive'], article['negative'], article['neutral']),
    axis='columns',
)


#abs_max
def abs_max(pos_score, neg_score, neu_score):
    """Signed dominant score: pos_score, -neg_score, or 0 when neutral is the maximum."""
    peak = max(pos_score, neg_score, neu_score)
    if peak == pos_score:
        return pos_score
    if peak == neg_score:
        return -neg_score
    return 0

# One signed dominant score per merged row.
merged['abs_max'] = merged.apply(
    lambda article: abs_max(article['positive'], article['negative'], article['neutral']),
    axis='columns',
)


In [None]:
# Drop the raw class probabilities now that the single-score columns exist.
merged = merged.drop(columns=['positive', 'negative', 'neutral'])
merged

In [None]:
# Independent snapshot of the per-article merged scores. A plain assignment
# would only alias the same frame and would reflect any later in-place change.
merged_copy = merged.copy()

# Groupby

In [None]:
# Two-stage daily aggregation: first average the scores per (ticker, release
# date), then average those per-ticker means over all tickers of each date.
# numeric_only=True keeps this working on pandas >= 2.0, where mean() raises on
# non-numeric columns instead of silently dropping them.
headlines = headlines.groupby(['ticker', 'release_date']).mean(numeric_only=True)
headlines = headlines.groupby(['release_date']).mean(numeric_only=True)

bodies = bodies.groupby(['ticker', 'release_date']).mean(numeric_only=True)
bodies = bodies.groupby(['release_date']).mean(numeric_only=True)

# Requires `merged` to carry a 'ticker' column.
merged = merged.groupby(['ticker', 'release_date']).mean(numeric_only=True)
merged = merged.groupby(['release_date']).mean(numeric_only=True)

In [None]:
# Daily % change and closing price as plain NumPy vectors.
price_dif_array, price_array = (
    price[column].to_numpy() for column in ('% Daily diff', 'Close')
)

In [None]:
# One NumPy vector per score column of the daily headline frame.
# The variables keep their *_sentiment suffix, but the data is read from
# `headlines`: no frame named `sentiment` is defined in this notebook.
diff_array_sentiment = headlines['diff'].to_numpy()
sratio_array_sentiment = headlines['sratio'].to_numpy()
sigmoid_array_sentiment = headlines['sigmoid'].to_numpy()
weights_array_sentiment = headlines['weights'].to_numpy()
max_array_sentiment = headlines['max'].to_numpy()
abs_max_array_sentiment = headlines['abs_max'].to_numpy()
discrete_max_array_sentiment = headlines['discrete_max'].to_numpy()

In [None]:
# One NumPy vector per score column of the daily bodies frame.
(
    diff_array_bodies,
    sratio_array_bodies,
    sigmoid_array_bodies,
    weights_array_bodies,
    max_array_bodies,
    abs_max_array_bodies,
    discrete_max_array_bodies,
) = (
    bodies[column].to_numpy()
    for column in ('diff', 'sratio', 'sigmoid', 'weights', 'max', 'abs_max', 'discrete_max')
)

In [None]:
# One NumPy vector per score column of the daily merged frame.
(
    diff_array_merged,
    sratio_array_merged,
    sigmoid_array_merged,
    weights_array_merged,
    max_array_merged,
    abs_max_array_merged,
    discrete_max_array_merged,
) = (
    merged[column].to_numpy()
    for column in ('diff', 'sratio', 'sigmoid', 'weights', 'max', 'abs_max', 'discrete_max')
)

# Analysis

In [None]:
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr

Compare two vectors(!) of length 755
Scatterplot with linear regression
Moving average

In [None]:
# Pad both price vectors with one zero in front and two zeros at the end, so
# that windows shifted by -1, 0, +1 and +2 positions can be sliced from them.
price_dif_array = np.pad(price_dif_array, (1, 2), mode='constant', constant_values=0)
price_array = np.pad(price_array, (1, 2), mode='constant', constant_values=0)
price_array.shape

In [None]:
# Four windows over each padded vector, all as long as the unpadded series:
#   before   -> shifted one position later, with a leading padding zero
#   intraday -> the unshifted series
#   1day     -> shifted one position earlier, ending in one padding zero
#   2days    -> shifted two positions earlier, ending in two padding zeros
#
# The price_diff_* windows are cut from the padded % daily change and the
# close_diff_* windows from the padded Close prices; no `close_array` exists in
# this notebook.
# pct_change() leaves NaN in the first row, which would turn every Pearson
# correlation into NaN, so NaNs are replaced by 0 like the padding values.
pct_change_padded = np.where(np.isnan(price_dif_array), 0.0, price_dif_array)

price_diff_before = pct_change_padded[:-3]
price_diff_intraday = pct_change_padded[1:-2]
price_diff_1day = pct_change_padded[2:-1]
price_diff_2days = pct_change_padded[3:]

close_diff_before = price_array[:-3]
close_diff_intraday = price_array[1:-2]
close_diff_1day = price_array[2:-1]
close_diff_2days = price_array[3:]



In [None]:
# Price vector that every sentiment series is correlated against.
# NOTE(review): this rebinds `price`, which earlier cells use for the price
# DataFrame, so those cells cannot be re-run afterwards without reloading it.
price = price_diff_before  #variable to change
pearson_table = pd.DataFrame(
    columns=['Differencing', 'Score ratio', 'Sigmoid', 'Weights', 'Adapted maximum', 'Discrete maximum']
)
# Each row holds the Pearson r of the six score series of one text source.
pearson_table.loc['Headlines'] = [
    pearsonr(scores, price)[0]
    for scores in (diff_array_sentiment, sratio_array_sentiment, sigmoid_array_sentiment,
                   weights_array_sentiment, abs_max_array_sentiment, discrete_max_array_sentiment)
]
pearson_table.loc['Bodies'] = [
    pearsonr(scores, price)[0]
    for scores in (diff_array_bodies, sratio_array_bodies, sigmoid_array_bodies,
                   weights_array_bodies, abs_max_array_bodies, discrete_max_array_bodies)
]
pearson_table.loc['Headlines and bodies'] = [
    pearsonr(scores, price)[0]
    for scores in (diff_array_merged, sratio_array_merged, sigmoid_array_merged,
                   weights_array_merged, abs_max_array_merged, discrete_max_array_merged)
]
pearson_table


In [None]:
# LINEAR REGRESSION of the price series on the sentiment series
# NOTE(review): `sentiment_data` and `prix` are not assigned anywhere in the
# visible notebook; presumably they are equal-length 1-D arrays -- confirm.
X = np.array([sentiment_data]).T
y = np.array([prix]).T
reg = LinearRegression().fit(X, y)

# Pearson correlation between the two series, shown on the plot below.
corr, _ = pearsonr(prix, sentiment_data) # CORRELATION

# Print regression coefficients
print("Regression Coefficients:")
print(f"Intercept: {reg.intercept_[0]:.3f}")
print(f"Sentiment: {reg.coef_[0][0]:.3f}")

# Print R-squared value
r2 = reg.score(X, y)
print(f"R-squared: {r2:.3f}")

# Visualize the regression results
plt.scatter(X, y, c = '#228A83')
plt.plot(X, reg.predict(X), color="#FFC000")
plt.xlabel("Intraday news bodies & headlines sentiment scores", color = '#2E5651')
plt.ylabel("Daily % price differences", color = '#2E5651')
plt.title("Differencing method", fontsize=10, color = '#2E5651')

# Annotate the plot with the computed statistics. The correlation comes from
# `corr` rather than a hard-coded number, so it always matches the data.
# The text positions are in data coordinates.
plt.text(-0.55, 0.06, f"Pearson correlation: {corr:.3f}", fontsize=9, color = '#2E5651')
plt.text(-0.55, 0.055, f"Intercept: {reg.intercept_[0]:.3f}", fontsize=9, color = '#2E5651')
plt.text(-0.55, 0.05, f"Slope: {reg.coef_[0][0]:.3f}", fontsize=9, color = '#2E5651')
plt.text(-0.55, 0.045, f"R-squared: {r2:.3f}", fontsize=9, color = '#2E5651')


plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise each series (zero mean, unit variance) with its own scaler and
# flatten the (n, 1) result back to a 1-D vector.
# NOTE(review): `sentiment_data`, `prix` and `close_array` are not assigned
# anywhere in the visible notebook -- confirm where they come from.
scaler = StandardScaler()
sentiment_data_std = scaler.fit_transform(sentiment_data.reshape(-1, 1)).flatten()

scaler1 = StandardScaler()
prix_std = scaler1.fit_transform(prix.reshape(-1, 1)).flatten()

scaler2 = StandardScaler()
price_data_std = scaler2.fit_transform(close_array.reshape(-1, 1)).flatten()


In [None]:
#moving average

x = 30 #best for the moment
# Rolling means over x observations of the standardised series.
sentiment_rolling = pd.Series(sentiment_data_std).rolling(x).mean()
price_rolling = pd.Series(prix_std).rolling(x).mean()
adj_close = pd.Series(price_data_std).rolling(x).mean()
#show correlation between sentiment rolling and price rolling
# corr, _ = pearsonr(sentiment_rolling,price_rolling)
# print('Pearsons correlation: %.3f' % corr)

# Smoothed sentiment against smoothed price level on one axis.
plt.plot(sentiment_rolling, color='#228A83', label="Sentiment scores")
plt.plot(adj_close, color='#002060', label="Adj Close price")
# plt.plot(adj_close, label="Price", color = '#C00000')

plt.title("Price-sentiment intraday moving average", fontsize=10, fontweight='bold', color='#2E5651')
plt.xlabel("Time", color='#2E5651')
plt.ylabel("Moving Average", color='#2E5651')
plt.legend()
plt.show()

In [None]:
# Save the best sentiment time series (intraday sentiment with ticker proportion).
np.save(file='sentiment_data.npy', arr=sentiment_data)