sentiment_results/mda/top30

In [1]:
import pandas as pd
import glob
import os

# ---------------------------------------------------------------------------
# Build one daily sentiment time series per ticker from the per-filing MD&A
# sentiment result CSVs: each calendar day carries the scores of the most
# recent filing dated on or before that day.
# ---------------------------------------------------------------------------

# Sentiment columns to extract from every results file
sentiment_columns = [
    'polarity_detection', 'emotion_detection', 'intent_analysis', 'subjectivity_objectivity',
    'gemini_overall_sentiment', 'gemini_emotional_sentiment', 'gemini_contextual_sentiment',
    'fine_grained_sentiment_avg', 'aspect_based_sentiment_avg', 'topic_sentiment_analysis_avg',
    'contextual_sentiment_avg', 'aspect_financial_performance_revenue_growth',
    'aspect_financial_performance_earnings', 'aspect_financial_performance_profit_margins',
    'aspect_financial_performance_debt_levels', 'aspect_financial_performance_cash_flow',
    'aspect_management_and_leadership', 'aspect_product_service_performance',
    'aspect_future_outlook', 'aspect_legal_and_risk', 'topic_financial_performance',
    'topic_management_and_leadership', 'topic_product_service_performance',
    'topic_industry_and_market_factors', 'topic_future_outlook', 'topic_legal_and_risk',
    'vader_compound', 'vader_pos', 'vader_neg', 'vader_neu',
    'finbert_positive', 'finbert_negative', 'finbert_neutral', 'finbert_sentiment'
]

# Columns to keep: Ticker, FiledAt, and sentiment columns
columns_to_keep = ['Ticker', 'FiledAt'] + sentiment_columns

# Tickers to process; rows for any other ticker are discarded
specific_tickers = [
    "AAPL", "MSFT", "NVDA", "GOOGL", "AMZN", "META", "BRK.B", "AVGO", "TSLA", "LLY",
    "WMT", "JPM", "V", "MA", "XOM", "COST", "UNH", "HD", "PG", "JNJ",
    "ABBV", "CRM", "BAC", "ORCL", "MRK", "CVX", "WFC", "KO", "CSCO", "ACN"
]

# Input / output locations and the calendar window of the generated series
INPUT_PATTERN = 'sentiment_results/mda/mda_sentiment_*_results_v3.csv'
OUTPUT_DIR = 'sentiment_results/mda/top30'
DATE_START = '2023-01-01'
DATE_END = '2024-12-31'


def load_filing_frames(paths, columns):
    """Read every CSV in ``paths`` and return the frames restricted to ``columns``.

    Columns missing from a file are added as all-NaN columns (``reindex``),
    so every returned frame has exactly ``columns`` in that order.  A file
    that cannot be read is reported on stdout and skipped (best effort), so
    one bad file does not abort the whole run.

    Parameters
    ----------
    paths : iterable of str
        CSV file paths to read.
    columns : list of str
        Column names every returned frame must have.

    Returns
    -------
    list of pandas.DataFrame
        One frame per successfully read file; empty if nothing could be read.
    """
    frames = []
    for path in paths:
        try:
            frames.append(pd.read_csv(path).reindex(columns=columns))
        except Exception as e:
            print(f"Error processing {path}: {e}")
    return frames


def build_daily_series(filings, daily_df, value_columns):
    """Expand one ticker's filings into a per-day series.

    Each row of ``daily_df`` receives the ``value_columns`` of the latest
    filing whose ``FiledAt`` is on or before that ``Date`` (backward as-of
    merge).  Days earlier than the first filing get NaN.

    Parameters
    ----------
    filings : pandas.DataFrame
        Rows for a single ticker; must contain ``FiledAt`` (datetime) and
        every name in ``value_columns``.
    daily_df : pandas.DataFrame
        Frame with a datetime ``Date`` column listing the days to fill.
    value_columns : list of str
        Columns to carry over from ``filings``.

    Returns
    -------
    pandas.DataFrame
        Columns ``['Date'] + value_columns``, one row per row of ``daily_df``.
    """
    # merge_asof requires both sides to be sorted on their merge keys.
    right = filings[['FiledAt'] + value_columns].sort_values('FiledAt')
    left = daily_df.sort_values('Date')
    merged = pd.merge_asof(
        left,
        right,
        left_on='Date',
        right_on='FiledAt',
        direction='backward'
    )
    return merged[['Date'] + value_columns]


# Sorted so that the row order of master_df (and the order tickers are
# processed in) does not depend on filesystem listing order.
files = sorted(glob.glob(INPUT_PATTERN))
dfs = load_filing_frames(files, columns_to_keep)

if not dfs:
    # Do not call exit() here: inside a Jupyter kernel it requests a kernel
    # shutdown instead of merely stopping this cell.
    print("No data to process.")
else:
    master_df = pd.concat(dfs, ignore_index=True)

    # Unparseable filing timestamps become NaT and are dropped
    master_df['FiledAt'] = pd.to_datetime(master_df['FiledAt'], errors='coerce')
    master_df = master_df.dropna(subset=['FiledAt'])

    # Keep only the requested tickers
    master_df = master_df[master_df['Ticker'].isin(specific_tickers)]

    # One row per calendar day in the target window
    daily_dates = pd.date_range(start=DATE_START, end=DATE_END, freq='D')
    daily_df = pd.DataFrame({'Date': daily_dates})

    # Every ticker listed here has at least one row, so no per-ticker
    # emptiness check is needed inside the loop.
    tickers = master_df['Ticker'].unique()

    # Surface requested tickers that produced no output instead of
    # skipping them silently.
    found = set(tickers)
    missing_tickers = [t for t in specific_tickers if t not in found]
    if missing_tickers:
        print(f"No valid data for requested tickers: {', '.join(missing_tickers)}")

    # Create the directory the CSVs are actually written to.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for ticker in tickers:
        result_df = build_daily_series(
            master_df[master_df['Ticker'] == ticker],
            daily_df,
            sentiment_columns,
        )
        out_path = f'{OUTPUT_DIR}/{ticker}_2023_24.csv'
        result_df.to_csv(out_path, index=False)
        print(f"Saved time series data for {ticker} to {out_path}")

Saved time series data for ABBV to sentiment_results/mda/top30/ABBV_2023_24.csv
Saved time series data for ACN to sentiment_results/mda/top30/ACN_2023_24.csv
Saved time series data for GOOGL to sentiment_results/mda/top30/GOOGL_2023_24.csv
Saved time series data for AMZN to sentiment_results/mda/top30/AMZN_2023_24.csv
Saved time series data for AAPL to sentiment_results/mda/top30/AAPL_2023_24.csv
Saved time series data for BAC to sentiment_results/mda/top30/BAC_2023_24.csv
Saved time series data for BRK.B to sentiment_results/mda/top30/BRK.B_2023_24.csv
Saved time series data for AVGO to sentiment_results/mda/top30/AVGO_2023_24.csv
Saved time series data for CVX to sentiment_results/mda/top30/CVX_2023_24.csv
Saved time series data for CSCO to sentiment_results/mda/top30/CSCO_2023_24.csv
Saved time series data for KO to sentiment_results/mda/top30/KO_2023_24.csv
Saved time series data for COST to sentiment_results/mda/top30/COST_2023_24.csv
Saved time series data for XOM to sentiment_re

In [2]:
# Spot-check one generated file: read the NVDA daily series back from disk
# and preview its first rows.
nvda_csv_path = 'sentiment_results/mda/top30/NVDA_2023_24.csv'
df = pd.read_csv(nvda_csv_path)
df.head()

Unnamed: 0,Date,polarity_detection,emotion_detection,intent_analysis,subjectivity_objectivity,gemini_overall_sentiment,gemini_emotional_sentiment,gemini_contextual_sentiment,fine_grained_sentiment_avg,aspect_based_sentiment_avg,...,topic_future_outlook,topic_legal_and_risk,vader_compound,vader_pos,vader_neg,vader_neu,finbert_positive,finbert_negative,finbert_neutral,finbert_sentiment
0,2023-01-01,-0.3,-0.4,0.1,-0.2,-0.3,-0.4,-0.2,-0.25,-0.23,...,-0.4,-0.4,0.9998,0.086,0.043,0.87,0.022159,0.041328,0.936512,-0.019169
1,2023-01-02,-0.3,-0.4,0.1,-0.2,-0.3,-0.4,-0.2,-0.25,-0.23,...,-0.4,-0.4,0.9998,0.086,0.043,0.87,0.022159,0.041328,0.936512,-0.019169
2,2023-01-03,-0.3,-0.4,0.1,-0.2,-0.3,-0.4,-0.2,-0.25,-0.23,...,-0.4,-0.4,0.9998,0.086,0.043,0.87,0.022159,0.041328,0.936512,-0.019169
3,2023-01-04,-0.3,-0.4,0.1,-0.2,-0.3,-0.4,-0.2,-0.25,-0.23,...,-0.4,-0.4,0.9998,0.086,0.043,0.87,0.022159,0.041328,0.936512,-0.019169
4,2023-01-05,-0.3,-0.4,0.1,-0.2,-0.3,-0.4,-0.2,-0.25,-0.23,...,-0.4,-0.4,0.9998,0.086,0.043,0.87,0.022159,0.041328,0.936512,-0.019169


In [3]:
# Preview the last rows of the frame loaded in the previous cell, to check
# the end of the date range.
df.tail()

Unnamed: 0,Date,polarity_detection,emotion_detection,intent_analysis,subjectivity_objectivity,gemini_overall_sentiment,gemini_emotional_sentiment,gemini_contextual_sentiment,fine_grained_sentiment_avg,aspect_based_sentiment_avg,...,topic_future_outlook,topic_legal_and_risk,vader_compound,vader_pos,vader_neg,vader_neu,finbert_positive,finbert_negative,finbert_neutral,finbert_sentiment
726,2024-12-27,0.4,0.2,0.3,0.2,0.4,0.2,0.3,0.3,0.41,...,0.4,-0.4,0.9999,0.104,0.034,0.862,0.022343,0.039776,0.937881,-0.017433
727,2024-12-28,0.4,0.2,0.3,0.2,0.4,0.2,0.3,0.3,0.41,...,0.4,-0.4,0.9999,0.104,0.034,0.862,0.022343,0.039776,0.937881,-0.017433
728,2024-12-29,0.4,0.2,0.3,0.2,0.4,0.2,0.3,0.3,0.41,...,0.4,-0.4,0.9999,0.104,0.034,0.862,0.022343,0.039776,0.937881,-0.017433
729,2024-12-30,0.4,0.2,0.3,0.2,0.4,0.2,0.3,0.3,0.41,...,0.4,-0.4,0.9999,0.104,0.034,0.862,0.022343,0.039776,0.937881,-0.017433
730,2024-12-31,0.4,0.2,0.3,0.2,0.4,0.2,0.3,0.3,0.41,...,0.4,-0.4,0.9999,0.104,0.034,0.862,0.022343,0.039776,0.937881,-0.017433
