In [30]:
import libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

import yfinance as yf
import os

In [31]:
# Parameters
# Date span applied to the article filter and passed to the price download.
start_date = "2018-01-01"
end_date = "2024-10-15"

# Sub-folder names; each one is joined onto both root_in and root_out.
folders = ["stocks_to_company_names", "stocks_to_keywords_broad", "stocks_to_keywords_specific"]

# Root directories: CSVs are read from root_in and written under root_out.
root_in, root_out = "filter", "processing"

In [32]:
def moving_avg(series, windows, periods):
    """Smooth ``series`` with two successive time-based rolling means.

    Parameters
    ----------
    series : pandas.Series
        Values to smooth. The windows are day offsets (``'<n>D'``), which
        pandas only accepts on a datetime-like index.
    windows : sequence of two ints
        Window lengths in days: ``windows[0]`` for the first pass,
        ``windows[1]`` for the second pass.
    periods : sequence of two ints
        ``min_periods`` for the first and second pass respectively.

    Returns
    -------
    pandas.Series
        The twice-averaged series with NaN entries dropped, so it can be
        shorter than ``series``.
    """
    first_pass = series.rolling(window=f'{windows[0]}D', min_periods=periods[0]).mean()
    second_pass = first_pass.rolling(window=f'{windows[1]}D', min_periods=periods[1]).mean()
    return second_pass.dropna()

In [33]:
# For every article CSV under root_in/<folder>, build a daily table over the
# full calendar range (article count, summed Goldstein, summed Tone, closing
# price), append three stacked rolling averages per column, and write the
# result under root_out/<folder> using the same file name.

# Rolling-average settings handed to moving_avg as (windows, periods).
# They never change between files, so they are defined once.
windows = [90, 30]
periods = [30, 1]

for folder in folders:

    folder_path_in = root_in + '/' + folder
    folder_path_out = root_out + '/' + folder
    input_source = [file for file in os.listdir(folder_path_in) if file.endswith('.csv')]

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(folder_path_out, exist_ok=True)

    print(folder)

    for file in input_source:

        # The name is parsed as '<stock_code>_<company_name>.<ext>'.
        name_parts = file.split('_')
        stock_code = name_parts[0]
        company_name = name_parts[1].split('.')[0]
        print(f'Running: {stock_code}, {company_name.capitalize()}, {folder_path_out}')

        df = pd.read_csv(os.path.join(folder_path_in, file))  # article data
        # NOTE(review): yfinance appears to treat `end` as exclusive, while the
        # article filter below includes end_date — confirm this is intended.
        stock_data = yf.download(stock_code, start=start_date, end=end_date, progress=False)

        # Prepare columns needed
        df = (
            df.rename(columns={'AvgTone': 'Tone', 'GoldsteinScale': 'Goldstein'})
              .drop(columns=['NumArticles', 'NumMentions'])
        )

        full_date_range = pd.date_range(start_date, end_date)
        df["Date"] = pd.to_datetime(df["Date"])
        # sort_values returns a new frame; the result has to be kept.
        df = df.sort_values("Date")

        df = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]

        # Per-day aggregates of the article rows.
        count_per_day = df.groupby('Date').size().reset_index(name='Count')
        goldstein_per_day = df.groupby('Date')['Goldstein'].sum().reset_index(name='Goldstein')
        tone_per_day = df.groupby('Date')['Tone'].sum().reset_index(name='Tone')

        # Left-join everything onto the full calendar so days without
        # articles (or without a price) still get a row.
        df = pd.DataFrame({'Date': full_date_range}).merge(count_per_day, on='Date', how='left')
        df = df.merge(goldstein_per_day, on='Date', how='left')
        df = df.merge(tone_per_day, on='Date', how='left')
        # NOTE(review): assumes stock_data['Close'] merges in as a single
        # 'Close' column keyed by 'Date' — confirm for the installed yfinance.
        df = df.merge(stock_data['Close'], on='Date', how='left')

        # raw: days without articles count as zero. Assign the result back
        # instead of calling fillna(inplace=True) on a column selection, which
        # is deprecated and has no effect under pandas Copy-on-Write.
        article_cols = ['Count', 'Goldstein', 'Tone']
        df[article_cols] = df[article_cols].fillna(0)

        df = df.set_index('Date')

        # Iterate over a snapshot of the columns because new ones are added
        # to df inside the loop.
        for col in list(df.columns):
            prev = col
            # Each stage applies moving_avg to the previous stage's output.
            # NOTE(review): the DIFF1/DIFF2 columns are produced by averaging
            # again; no differencing (.diff()) is applied despite the names —
            # confirm whether that is intended.
            for suffix in ('_RA', '_DIFF1_RA', '_DIFF2_RA'):
                col_name_new = (col + suffix).upper()
                df[col_name_new] = moving_avg(df[prev], windows, periods)
                prev = col_name_new

        df.to_csv(os.path.join(folder_path_out, file))
    print()
        


stocks_to_company_names
Running: AAPL, Apple, processing/stocks_to_company_names
Running: ABBV, Abbvie, processing/stocks_to_company_names
Running: ADM, Archer-daniels, processing/stocks_to_company_names
Running: BP, Bp, processing/stocks_to_company_names
Running: COP, Conocophillips, processing/stocks_to_company_names
Running: CTVA, Corteva, processing/stocks_to_company_names
Running: CVX, Chevron, processing/stocks_to_company_names
Running: DE, Deere, processing/stocks_to_company_names
Running: DOW, Dow-jones, processing/stocks_to_company_names
Running: FCX, Freeport-mcmoran, processing/stocks_to_company_names
Running: JNJ, Johnson-and-johnson, processing/stocks_to_company_names
Running: JPM, Jpmorgan-chase, processing/stocks_to_company_names
Running: KO, Coca-cola, processing/stocks_to_company_names
Running: MA, Mastercard, processing/stocks_to_company_names
Running: MSFT, Microsoft, processing/stocks_to_company_names
Running: NEM, Newmont-corporation, processing/stocks_to_company_n