In [54]:
import os
import pandas as pd
from tqdm import tqdm

INPUTS_PATH = os.path.join(os.getcwd(), 'data', 'inputs')

DS_NAME = 'news'
NEWS_VENUE = 'yahoo-finance'

In [55]:
target_path = os.path.join(INPUTS_PATH, DS_NAME, NEWS_VENUE)
years = os.listdir(target_path)

# get all the years available - assume the user has downloaded the raw jsons and parsed them using "load_and_break_news_data.py"
years_data = []
for year in tqdm(years, total=len(years), desc=f'Loading Sentiment Data for {NEWS_VENUE} Venue...'):
    files = os.listdir(os.path.join(target_path, year))

    moths_data = []
    for file in files:
        data = pd.read_csv(os.path.join(target_path, year, file))

        # select necessary columns for the analysis
        selected_cols = ['date_publish', 'title', 'description', 'maintext',
                        'mentioned_companies', 'named_entities', 'sentiment', 'emotion']
        selected_data = data[selected_cols]

        # go over all the rows of the news dataset and select the needed columns for the analysis
        tmp_final_data = []
        for idx, row in selected_data.iterrows():
            companies = row['mentioned_companies']
            companies = companies.replace('[', '').replace(']', '').replace("'", "").replace(' ', '').split(',')
            tmp_data = []
            for company in companies:
                tmp_data.append(pd.DataFrame(
                    [
                        {
                            'date_publish': row['date_publish'],
                            'company': company,
                            'dummy': 1,
                            'negative_sentiment': float(row['sentiment'].split(', ')[0].split(": ")[-1].replace('}', '')),
                            'neutral_sentiment': float(row['sentiment'].split(', ')[1].split(": ")[-1].replace('}', '')),
                            'positive_sentiment': float(row['sentiment'].split(', ')[2].split(": ")[-1].replace('}', '')),
                            'negative_emotion': float(row['emotion'].split(', ')[0].split(": ")[-1].replace('}', '')),
                            'neutral_emotion': float(row['emotion'].split(', ')[1].split(": ")[-1].replace('}', '')),
                            'positive_emotion': float(row['emotion'].split(', ')[2].split(": ")[-1].replace('}', '')) 
                        }
                    ]
                ))
            tmp_data = pd.concat(tmp_data, axis=0)
            tmp_final_data.append(tmp_data)
        tmp_final_data = pd.concat(tmp_final_data, axis=0)
        years_data.append(tmp_final_data)
sentiment_data = pd.concat(years_data, axis=0)
sentiment_data['venue'] = NEWS_VENUE

Loading Sentiment Data for yahoo-finance Venue...: 100%|██████████| 3/3 [00:09<00:00,  3.09s/it]


In [58]:
sentiment_data

Unnamed: 0,date_publish,company,dummy,negative_sentiment,neutral_sentiment,positive_sentiment,negative_emotion,neutral_emotion,positive_emotion,venue
0,2022-06-13 12:41:37,AMZN,1,0.000315,0.000738,0.998947,0.931345,0.038961,0.010387,yahoo-finance
0,2022-06-13 12:41:37,AAPL,1,0.000315,0.000738,0.998947,0.931345,0.038961,0.010387,yahoo-finance
0,2022-06-01 07:33:29,AMZN,1,0.000186,0.000452,0.999363,0.517492,0.372164,0.084493,yahoo-finance
0,2022-06-14 13:40:00,AMZN,1,0.000176,0.000393,0.999431,0.805267,0.122374,0.048255,yahoo-finance
0,2022-06-07 12:39:08,AMZN,1,0.000548,0.000263,0.999189,0.880739,0.059622,0.043610,yahoo-finance
...,...,...,...,...,...,...,...,...,...,...
0,2021-08-19 12:29:54,NKE,1,0.011196,0.001315,0.987489,0.903121,0.041338,0.031304,yahoo-finance
0,2021-08-19 16:12:30,AMZN,1,0.008613,0.983593,0.007794,0.620192,0.268949,0.049556,yahoo-finance
0,2021-08-19 16:12:30,FB,1,0.008613,0.983593,0.007794,0.620192,0.268949,0.049556,yahoo-finance
0,2021-08-04 23:13:00,GOOGL,1,0.000455,0.838351,0.161193,0.797497,0.121184,0.040590,yahoo-finance


In [59]:
sentiment_data.columns

Index(['date_publish', 'company', 'dummy', 'negative_sentiment',
       'neutral_sentiment', 'positive_sentiment', 'negative_emotion',
       'neutral_emotion', 'positive_emotion', 'venue'],
      dtype='object')