In [1]:
######## SENTIMENT TABLE AS DATAFRAME ################
from google.cloud import bigquery

# Initialize BigQuery client
client = bigquery.Client()

# Define your table path
table_path = "sentiment-lewagon.sentiment_db.SENTIMENT"

# Load the table into a DataFrame
df = client.list_rows(table_path).to_dataframe()

df = df.drop_duplicates()

In [2]:
import pandas as pd

# Example: Assuming df is already your DataFrame
# and it has these three columns:
# 'count_positive_chunks', 'count_negative_chunks', 'count_neutral_chunks'

def determine_sentiment(row):
    max_val = max(row['count_positive_chunks'], row['count_negative_chunks'], row['count_neutral_chunks'])
    if row['count_positive_chunks'] == max_val:
        return 'positive'
    elif row['count_negative_chunks'] == max_val:
        return 'negative'
    else:
        return 'neutral'

# Apply the function row-wise
df['overall_sentiment'] = df.apply(determine_sentiment, axis=1)

In [3]:
########### ADD NEW COLUMNS ###################
import numpy as np
total = df[['count_positive_chunks', 'count_negative_chunks', 'count_neutral_chunks']].sum(axis=1)
total = total.replace(0, np.nan)  # Avoid division by zero
df['count_pos_over_total_count'] = df['count_positive_chunks'] / total
df['count_neg_over_total_count'] = df['count_negative_chunks'] / total
df['count_neut_over_total_count'] = df['count_neutral_chunks'] / total
df['net_sentiment'] = ((df['count_positive_chunks'] -df['count_negative_chunks']) / total)
df['total_chunks_analysed'] = df['count_positive_chunks'] + df['count_negative_chunks'] + df['count_neutral_chunks']

In [4]:
col_order = ['cik', 'filename', 'bucket_filepath', 'total_chunks_analysed', 'count_positive_chunks',
       'count_negative_chunks', 'count_neutral_chunks', 'max_positive_score',
       'max_negative_score', 'max_neutral_score', 'sum_positive',
       'sum_negative', 'sum_neutral', 'avg_positive', 'avg_negative',
       'avg_neutral', 'overall_sentiment', 'count_pos_over_total_count',
       'count_neg_over_total_count', 'count_neut_over_total_count',
       'net_sentiment' ]
df = df[col_order]

In [5]:
### get core_v3 from BG
core_v3 = "sentiment-lewagon.sentiment_db.META"
# Load the table in DataFrame
c = client.list_rows(core_v3).to_dataframe()

In [6]:
### only selects columns we would want
d = c[['bucket_file_path', 'conformed_period_of_report', 'quarter', 'year', 'ticker']]

In [7]:
### create new column for quarter-year combined
d['quarter_year'] = d['quarter'] + '-' + d['year'].astype(str).str[-2:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['quarter_year'] = d['quarter'] + '-' + d['year'].astype(str).str[-2:]


In [8]:
df.rename(columns={'bucket_filepath': 'bucket_file_path'}, inplace=True)

In [9]:
sentiment_clean_v2 = pd.merge(d, df, on="bucket_file_path", how = "left")

In [10]:
sentiment_clean_v2 = sentiment_clean_v2.drop_duplicates()

In [11]:
sentiment_clean_v2

Unnamed: 0,bucket_file_path,conformed_period_of_report,quarter,year,ticker,quarter_year,cik,filename,total_chunks_analysed,count_positive_chunks,...,sum_negative,sum_neutral,avg_positive,avg_negative,avg_neutral,overall_sentiment,count_pos_over_total_count,count_neg_over_total_count,count_neut_over_total_count,net_sentiment
0,clean_data_2020q1/20200221_10-K_edgar_data_180...,2019-12-31,Q4,2019,ABT,Q4-19,1800,20200221_10-K_edgar_data_1800_0001104659-20-02...,33,8,...,4.293786,20.050535,0.262293,0.130115,0.607592,neutral,0.242424,0.090909,0.666667,0.151515
1,clean_data_2021q1/20210219_10-K_edgar_data_180...,2020-12-31,Q4,2020,ABT,Q4-20,1800,20210219_10-K_edgar_data_1800_0001104659-21-02...,28,1,...,7.187339,17.199859,0.129029,0.256691,0.614281,neutral,0.035714,0.285714,0.678571,-0.25
2,clean_data_2022q1/20220218_10-K_edgar_data_180...,2021-12-31,Q4,2021,ABT,Q4-21,1800,20220218_10-K_edgar_data_1800_0001104659-22-02...,24,6,...,2.186888,15.622400,0.257946,0.091120,0.650933,neutral,0.25,0.0,0.75,0.25
3,clean_data_2023q1/20230217_10-K_edgar_data_180...,2022-12-31,Q4,2022,ABT,Q4-22,1800,20230217_10-K_edgar_data_1800_0001628280-23-00...,322,36,...,79.067672,195.315415,0.147879,0.245552,0.606570,neutral,0.111801,0.245342,0.642857,-0.13354
4,clean_data_2020q1/20200204_10-K_edgar_data_248...,2019-12-28,Q4,2019,AMD,Q4-19,2488,20200204_10-K_edgar_data_2488_0000002488-20-00...,73,13,...,12.578708,46.198399,0.194834,0.172311,0.632855,neutral,0.178082,0.136986,0.684932,0.041096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11711,clean_data_2024q2/20240424_10-Q_edgar_data_199...,2024-03-31,Q1,2024,BG,Q1-24,1996862,20240424_10-Q_edgar_data_1996862_0001996862-24...,98,10,...,39.986106,44.453362,0.138373,0.408021,0.453606,neutral,0.102041,0.438776,0.459184,-0.336735
11712,clean_data_2024q3/20240801_10-Q_edgar_data_199...,2024-06-30,Q2,2024,BG,Q2-24,1996862,20240801_10-Q_edgar_data_1996862_0001996862-24...,124,13,...,60.640358,46.779108,0.133714,0.489035,0.377251,negative,0.104839,0.491935,0.403226,-0.387097
11713,clean_data_2024q4/20241030_10-Q_edgar_data_199...,2024-09-30,Q3,2024,BG,Q3-24,1996862,20241030_10-Q_edgar_data_1996862_0001996862-24...,136,17,...,62.012728,54.733902,0.141569,0.455976,0.402455,negative,0.125,0.448529,0.426471,-0.323529
11714,clean_data_2024q4/20241108_10-Q_edgar_data_200...,2024-09-30,Q3,2024,SW,Q3-24,2005951,20241108_10-Q_edgar_data_2005951_0002005951-24...,1,0,...,0.024545,0.945819,0.029636,0.024545,0.945819,neutral,0.0,0.0,1.0,0.0


In [12]:
from google.cloud import bigquery

def upload_sentiment_clean_to_bq_(df):
    """
    Uploads parsed MDA data from EDGAR filings to BigQuery.
    Expects columns: cik, filename, management_discussion
    """
    # Rename columns to lowercase
    df.columns = df.columns.str.lower()


    try:
        # Keep only the required columns

        BQ_PROJECT_ID = 'sentiment-lewagon'
        BQ_DATASET_ID = 'sentiment_db'
        BQ_TABLE_ID = 'SENTIMENT_TRAIN_ALL'
        table_ref = f"{BQ_PROJECT_ID}.{BQ_DATASET_ID}.{BQ_TABLE_ID}"

        client = bigquery.Client()

        job_config = bigquery.LoadJobConfig(
            write_disposition="WRITE_APPEND"
        )

        job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
        job.result()

        print(f"✅ Uploaded {job.output_rows} rows to {table_ref}")

    except Exception as e:
        print(f"❌ Failed to upload DataFrame to BigQuery: {e}")

In [13]:
upload_sentiment_clean_to_bq_(sentiment_clean_v2)

✅ Uploaded 11716 rows to sentiment-lewagon.sentiment_db.SENTIMENT_TRAIN_ALL


In [46]:
(sentiment_clean_v2.total_chunks_analysed > 0).sum()

np.int64(10534)