in this notebook we will create one dataset from datasets:
* https://www.kaggle.com/datasets/sidarcidiacono/news-sentiment-analysis-for-stock-data-by-company
* https://www.kaggle.com/datasets/sbhatti/financial-sentiment-analysis/data
* https://www.kaggle.com/datasets/ankurzing/aspect-based-sentiment-analysis-for-financial-news

In [35]:
import kagglehub
import numpy as np
import pandas as pd 
import os
import nltk

In [36]:
# download data
path1 = kagglehub.dataset_download("sidarcidiacono/news-sentiment-analysis-for-stock-data-by-company")
path2 = kagglehub.dataset_download("sbhatti/financial-sentiment-analysis")
path3 = kagglehub.dataset_download("ankurzing/aspect-based-sentiment-analysis-for-financial-news")

# get csv
# dataset 1
csv1 = os.path.join(path1,"djia_news copy.csv", "djia_news copy.csv")
csv2 = os.path.join(path1,"nasdaq.csv", "nasdaq.csv")

# dataset 2
csv3 = os.path.join(path2, "data.csv")

#dataset 3 
csv4 = os.path.join(path3, "SEntFiN-v1.1.csv")

In [37]:
# make dataframe
df1 = pd.read_csv(csv1)
df2 = pd.read_csv(csv2)
df3 = pd.read_csv(csv3)
df4 = pd.read_csv(csv4)

let's create on big df

In [38]:
df1

Unnamed: 0,Label,Ticker,Headline
0,0,MMM,Employer who stole nearly $3M in wages from 15...
1,1,MMM,Huge new Facebook data leak exposed intimate d...
2,0,MMM,A campaign has accelerated to turn a disused r...
3,1,MMM,Google launches global human trafficking helpl...
4,1,MMM,Over 3m Saudi Women Don’t Have ID Cards; Saudi...
...,...,...,...
2376,0,WMT,Walmart dumps e-cigarettes: Largest store in U...
2377,0,WMT,Walmart makes a $16 billion bet on India's boo...
2378,0,WMT,Walmart raises minimum age to buy tobacco to 2...
2379,0,WMT,Walmart Took Over Chile In Only Three Years An...


In [39]:
df1 = df1.drop(columns=['Ticker'])
df1.columns = ['Sentiment', "Headline"]
df1

Unnamed: 0,Sentiment,Headline
0,0,Employer who stole nearly $3M in wages from 15...
1,1,Huge new Facebook data leak exposed intimate d...
2,0,A campaign has accelerated to turn a disused r...
3,1,Google launches global human trafficking helpl...
4,1,Over 3m Saudi Women Don’t Have ID Cards; Saudi...
...,...,...
2376,0,Walmart dumps e-cigarettes: Largest store in U...
2377,0,Walmart makes a $16 billion bet on India's boo...
2378,0,Walmart raises minimum age to buy tobacco to 2...
2379,0,Walmart Took Over Chile In Only Three Years An...


In [40]:
df2 = df2.drop(columns=['Ticker'])
df2.columns = ['Sentiment', "Headline"]
df2

Unnamed: 0,Sentiment,Headline
0,0,@TotesTravel : Airline shares tumble as New Yo...
1,1,@TotesTravel : American United call off Hong K...
2,0,@TotesTravel : U.S. airline stocks hit highest...
3,1,@TotesTravel : American Airlines reaches deal ...
4,1,@TotesTravel : US airlines Treasury Department...
...,...,...
13176,1,Bitcoin Tops $1000 Again as Zynga Accepts Virt...
13177,1,Zynga Accepts Bitcoin For Microtransactions
13178,1,Zumiez (ZUMZ) unusual put activity into earnin...
13179,1,Zumiez Is Going Bankrupt


In [41]:
map_dict = {'positive': 2,
            'neutral': 1,
            'negative': 0}

df3['Sentiment'] = df3['Sentiment'].map(map_dict)
df3.columns = ['Headline', "Sentiment"]
df3

Unnamed: 0,Headline,Sentiment
0,The GeoSolutions technology will leverage Bene...,2
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",0
2,"For the last quarter of 2010 , Componenta 's n...",2
3,According to the Finnish-Russian Chamber of Co...,1
4,The Swedish buyout firm has sold its remaining...,1
...,...,...
5837,RISING costs have forced packaging producer Hu...,0
5838,Nordic Walking was first used as a summer trai...,1
5839,"According shipping company Viking Line , the E...",1
5840,"In the building and home improvement trade , s...",1


In [42]:
import json
df4 = df4.drop(columns=['S No.', 'Words'])
df4.columns = ['Headline', 'Sentiment']

def fun_map(j):
    data = json.loads(j)
    sentiment = [map_dict[value] for key, value in data.items()]    
    return sum(sentiment) / len(sentiment)

df4['Sentiment'] = df4['Sentiment'].map(fun_map)

df4

Unnamed: 0,Headline,Sentiment
0,SpiceJet to issue 6.4 crore warrants to promoters,1.0
1,MMTC Q2 net loss at Rs 10.4 crore,1.0
2,"Mid-cap funds can deliver more, stay put: Experts",2.0
3,Mid caps now turn into market darlings,2.0
4,"Market seeing patience, if not conviction: Pra...",1.0
...,...,...
10748,"Negative on Chambal, Advanta: Mitesh Thacker",0.0
10749,"Small, Mid-cap stocks may emerge outperformers",2.0
10750,Rupee slips against US dollar,0.5
10751,Rupee weak against US dollar,0.5


In [43]:
df = pd.concat([df1, df2, df3, df4], axis=0, ignore_index=True)
df['Sentiment'] = df['Sentiment'].map(lambda x: x / 2)
df

Unnamed: 0,Sentiment,Headline
0,0.00,Employer who stole nearly $3M in wages from 15...
1,0.50,Huge new Facebook data leak exposed intimate d...
2,0.00,A campaign has accelerated to turn a disused r...
3,0.50,Google launches global human trafficking helpl...
4,0.50,Over 3m Saudi Women Don’t Have ID Cards; Saudi...
...,...,...
32152,0.00,"Negative on Chambal, Advanta: Mitesh Thacker"
32153,1.00,"Small, Mid-cap stocks may emerge outperformers"
32154,0.25,Rupee slips against US dollar
32155,0.25,Rupee weak against US dollar


now we have to clean dataset

In [44]:
df.iloc[2]['Headline']

'A campaign has accelerated to turn a disused railway line in Yorkshire into England’s longest cycle tunnel – instead of using £3m of public money to close it for ever. Campaigners say they could enhance West Yorkshire’s health and economy by converting an old railway line.'

In [45]:
import sys
import os

def punctuation_removal(text: str) -> str: 
    return "".join([u for u in text if u == ' ' or u == "'" or ord('0') <= ord(u) <= ord('9') or ord('a') <= ord(u) <= ord('z') or ord('A') <= ord(u) <= ord('Z')])

df['Headline'] = df['Headline'].map(punctuation_removal)
df['Headline']

0        Employer who stole nearly 3M in wages from 157...
1        Huge new Facebook data leak exposed intimate d...
2        A campaign has accelerated to turn a disused r...
3        Google launches global human trafficking helpl...
4        Over 3m Saudi Women Dont Have ID Cards Saudi G...
                               ...                        
32152           Negative on Chambal Advanta Mitesh Thacker
32153         Small Midcap stocks may emerge outperformers
32154                        Rupee slips against US dollar
32155                         Rupee weak against US dollar
32156                   Australia shares flat energy drags
Name: Headline, Length: 32157, dtype: object

In [46]:
df['Headline'].iloc[3] # we have to remove &amp;

'Google launches global human trafficking helpline amp data network  Commits 3M 2M to build an international helpline network fueled by data Human trafficking enslaves 21M people with 25M forced into labor Most are ages 1824 amp 43 are forced into the sex trade'

In [47]:
df['Headline'] = df['Headline'].map(lambda s: s.replace('amp', '').lower())
df['Headline'].iloc[3] 

'google launches global human trafficking helpline  data network  commits 3m 2m to build an international helpline network fueled by data human trafficking enslaves 21m people with 25m forced into labor most are ages 1824  43 are forced into the sex trade'

In [48]:
df

Unnamed: 0,Sentiment,Headline
0,0.00,employer who stole nearly 3m in wages from 157...
1,0.50,huge new facebook data leak exposed intimate d...
2,0.00,a caign has accelerated to turn a disused rail...
3,0.50,google launches global human trafficking helpl...
4,0.50,over 3m saudi women dont have id cards saudi g...
...,...,...
32152,0.00,negative on chambal advanta mitesh thacker
32153,1.00,small midcap stocks may emerge outperformers
32154,0.25,rupee slips against us dollar
32155,0.25,rupee weak against us dollar


now we have to analyse dataset

In [49]:
words = {}
for column in df['Headline']:
    for word in column.split(): 
        words[word] = words.get(word, 0) + 1

sorted_words = sorted(words.items(), key=lambda x: x[1], reverse=True)
sorted_words

[('the', 16003),
 ('to', 13799),
 ('of', 11258),
 ('in', 11254),
 ('and', 7667),
 ('a', 7373),
 ('on', 5918),
 ('for', 5616),
 ('as', 3092),
 ('is', 3025),
 ('from', 2713),
 ('with', 2626),
 ('by', 2612),
 ('at', 2453),
 ('that', 1964),
 ('has', 1930),
 ('it', 1776),
 ('its', 1748),
 ('us', 1737),
 ('will', 1723),
 ('be', 1619),
 ('rs', 1587),
 ('up', 1547),
 ('after', 1514),
 ('net', 1452),
 ('new', 1433),
 ('over', 1388),
 ('company', 1338),
 ('eur', 1284),
 ('an', 1200),
 ('are', 1180),
 ('profit', 1165),
 ('have', 1128),
 ('first', 1111),
 ('was', 1077),
 ('bank', 1052),
 ('market', 1028),
 ('stocks', 1008),
 ('million', 977),
 ('said', 973),
 ('inc', 956),
 ("'s", 948),
 ('crore', 941),
 ('shares', 898),
 ('says', 856),
 ('down', 837),
 ('group', 827),
 ('india', 811),
 ('more', 804),
 ('china', 798),
 ('not', 764),
 ('mn', 763),
 ('sales', 708),
 ('global', 706),
 ('their', 706),
 ('this', 703),
 ('may', 697),
 ('oil', 696),
 ('news', 684),
 ('year', 675),
 ('than', 659),
 ('per'

In [50]:
len(sorted_words) 

37478

In [51]:
words_sum = sum([b for a, b in sorted_words])
words_sum

507767

In [52]:
nums = [50, 100, 250, 500, 1000, 2500, 5000, 10000, 25000]
for num in nums:
    ans = sum([b for a, b in sorted_words[:num]]) / words_sum * 100
    print(f"top{num}: ", ans)
    

top50:  27.576427771005207
top100:  33.009037609769834
top250:  41.99938160613037
top500:  50.86013860688071
top1000:  61.078408009973074
top2500:  74.65313815194764
top5000:  83.78882440174333
top10000:  91.06657187253208
top25000:  97.54257366075385


we can improve this by using lemmatization

In [53]:
import nltk
nltk.download('wordnet')

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def stem(text: str) -> str:
    tokens = text.split()
    return ' '.join([stemmer.stem(token) for token in tokens])

stem("i love reading books")

[nltk_data] Downloading package wordnet to /home/kuba/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'i love read book'

In [54]:
df['Headline'] = df['Headline'].map(stem)
df

Unnamed: 0,Sentiment,Headline
0,0.00,employ who stole nearli 3m in wage from 157 wo...
1,0.50,huge new facebook data leak expos intim detail...
2,0.00,a caign ha acceler to turn a disus railway lin...
3,0.50,googl launch global human traffick helplin dat...
4,0.50,over 3m saudi women dont have id card saudi gr...
...,...,...
32152,0.00,neg on chambal advanta mitesh thacker
32153,1.00,small midcap stock may emerg outperform
32154,0.25,rupe slip against us dollar
32155,0.25,rupe weak against us dollar


In [55]:
words = {}
for column in df['Headline']:
    for word in column.split(): 
        words[word] = words.get(word, 0) + 1

sorted_words = sorted(words.items(), key=lambda x: x[1], reverse=True)
words_sum = sum([b for a, b in sorted_words])

nums = [50, 100, 250, 500, 1000, 2500, 5000, 10000, 25000]
for num in nums:
    ans = sum([b for a, b in sorted_words[:num]]) / words_sum * 100
    print(f"top{num}: ", ans)

top50:  28.540255668446356
top100:  34.67436836186676
top250:  45.53958803939602
top500:  55.99674653925915
top1000:  67.72279411619897
top2500:  81.66580340983167
top5000:  89.35397534696033
top10000:  94.5597094730457
top25000:  99.03439963605355


In [56]:
df['Headline']

0        employ who stole nearli 3m in wage from 157 wo...
1        huge new facebook data leak expos intim detail...
2        a caign ha acceler to turn a disus railway lin...
3        googl launch global human traffick helplin dat...
4        over 3m saudi women dont have id card saudi gr...
                               ...                        
32152                neg on chambal advanta mitesh thacker
32153              small midcap stock may emerg outperform
32154                          rupe slip against us dollar
32155                          rupe weak against us dollar
32156                     australia share flat energi drag
Name: Headline, Length: 32157, dtype: object

now our dataset is ready

In [57]:
path = os.path.join('..', 'dataset.csv')
df.to_csv(path, index=False)