In [None]:
import nbimporter
import pandas as pd
import json
import os
import re
%load_ext autoreload
%autoreload 2

In [None]:
from step_2_ticker_converter import ticker_converter
from step_3_cloud_clean_data import cloud_clean_data
from step_1_historical_index import historical_data, get_quarter

In [None]:
from google.cloud import storage
import re
import pandas as pd
from io import BytesIO

def cloud_clean_data_gcs(bucket_name, folder_prefix, header_bytes=5000):
    """
    Build an index of the 10-Q / 10-K text filings stored under a GCS folder.

    Only blobs whose file name ends in ".txt" and contains "10-Q" or "10-K"
    are indexed. The CIK and filing type are parsed from the file name; the
    report period is read from the first bytes of the file itself.

    Args:
        bucket_name: Name of the GCS bucket to scan.
        folder_prefix: Blob-name prefix to list, e.g. "2019q1/".
        header_bytes: Number of leading bytes downloaded from each file when
            looking for the "CONFORMED PERIOD OF REPORT" line.

    Returns:
        pandas.DataFrame with one row per matching blob and the columns
        cik, filing_type, filename, conformed_period_of_report (an 8-digit
        string, or None when it is not found or the header cannot be read)
        and bucket_file_path ("gs://<bucket>/<blob name>"). The columns are
        present even when no blob matches, so callers can always select them.
    """
    columns = [
        'cik',
        'filing_type',
        'filename',
        'conformed_period_of_report',
        'bucket_file_path',
    ]

    client = storage.Client()
    bucket = client.bucket(bucket_name)

    rows = []
    for blob in bucket.list_blobs(prefix=folder_prefix):
        filename = blob.name.split("/")[-1]
        if not filename.endswith(".txt"):
            continue

        # A single search both filters out other filing types and captures the type.
        type_match = re.search(r'10-[QK]', filename)
        if type_match is None:
            continue

        cik_match = re.search(r'edgar_data_(\d+)_', filename)

        # Only the header is needed, so fetch a byte range instead of the whole filing.
        period = None
        try:
            header = blob.download_as_bytes(start=0, end=header_bytes - 1).decode('utf-8', errors='ignore')
        except Exception as e:
            # Best effort: keep the row without a period, but make the failure visible.
            print(f"Could not read header of {blob.name}: {e}")
        else:
            period_match = re.search(r'CONFORMED PERIOD OF REPORT:\s*(\d{8})', header)
            if period_match:
                period = period_match.group(1)

        rows.append({
            'cik': cik_match.group(1) if cik_match else None,
            'filing_type': type_match.group(0),
            'filename': filename,
            'conformed_period_of_report': period,
            'bucket_file_path': f"gs://{bucket_name}/{blob.name}"
        })

    # Passing the column list keeps the schema stable for an empty folder.
    return pd.DataFrame(rows, columns=columns)


The script below did not work for Q4; that data has to be added manually.

In [None]:
bucket_name = "sentiment_chloe-curtis"
# One folder prefix per calendar quarter, "2019q1" through "2024q4".
quarters = [f"{year}q{q}" for year in range(2019, 2025) for q in range(1, 5)]
dataframes = []
failed_quarters = []

for quarter in quarters:
    try:
        print(f"Processing {quarter}...")
        # Named quarter_df rather than df so it cannot be mistaken for the
        # index frame that a later cell also calls df.
        quarter_df = cloud_clean_data_gcs(bucket_name, f"{quarter}/")
    except Exception as e:
        # Keep going so one bad quarter does not discard the others, but
        # remember it so the gap is reported once the loop has finished.
        print(f"Failed to process {quarter}: {e}")
        failed_quarters.append(quarter)
    else:
        dataframes.append(quarter_df)

if failed_quarters:
    print(f"Quarters that could not be processed: {failed_quarters}")

if not dataframes:
    # pd.concat raises "No objects to concatenate" on an empty list, which
    # would hide the per-quarter errors printed above.
    raise RuntimeError("No quarter could be processed; see the errors printed above.")

cloud_combined = pd.concat(dataframes, ignore_index=True)

In [None]:
########## CLEANS FUNCTION 3 ##################
# Parse the 8-digit period strings (YYYYMMDD). Rows without a period become NaT.
# An already-converted datetime column passes through unchanged on a re-run.
combined_period = pd.to_datetime(cloud_combined['conformed_period_of_report'], format='%Y%m%d')
cloud_combined['conformed_period_of_report'] = combined_period

# Nullable integers keep the year integral when some periods are missing;
# a plain .dt.year would switch the whole column to float in that case.
cloud_combined['year'] = combined_period.dt.year.astype('Int64')

# Create the 'quarter' column in the format 'Q1', 'Q2', etc.
# Going through Int64 avoids labels such as 'Q1.0' / 'Qnan', which appear when
# a single missing period turns .dt.quarter into floats. Rows without a period
# get a missing quarter instead of a bogus label.
combined_quarter = combined_period.dt.quarter.astype('Int64')
cloud_combined['quarter'] = ('Q' + combined_quarter.astype(str)).where(combined_period.notna())

# Pad the CIK to 10 characters, leaving a missing CIK missing rather than
# turning it into the string '000000None'.
combined_cik = cloud_combined['cik']
cloud_combined['cik'] = combined_cik.astype(str).str.zfill(10).where(combined_cik.notna())

# Swap the bucket URI prefix for 'clean_data_', e.g.
# 'gs://sentiment_chloe-curtis/2019q1/x.txt' -> 'clean_data_2019q1/x.txt'.
# NOTE(review): presumably this mirrors the 'clean_data_<quarter>/' folder
# naming used for 2024 Q4 - confirm.
cloud_combined['bucket_file_path'] = cloud_combined['bucket_file_path'].str.replace(
    'gs://sentiment_chloe-curtis/', 'clean_data_', regex=False
)

The script below loads the 2024 Q4 filings, cleans them, and adds the derived columns.

In [None]:
df_2024q4_clean = cloud_clean_data_gcs("sentiment_chloe-curtis", "clean_data_2024q4/")

# Parse the 8-digit period strings (YYYYMMDD). Rows without a period become NaT.
period_2024q4 = pd.to_datetime(df_2024q4_clean['conformed_period_of_report'], format='%Y%m%d')
df_2024q4_clean['conformed_period_of_report'] = period_2024q4

# Create the 'year' column. Nullable integers keep it integral when some
# periods are missing; a plain .dt.year would switch the column to float.
df_2024q4_clean['year'] = period_2024q4.dt.year.astype('Int64')

# Create the 'quarter' column in the format 'Q1', 'Q2', etc.
# Going through Int64 avoids labels such as 'Q4.0' / 'Qnan', which appear when
# a single missing period turns .dt.quarter into floats and which would never
# match the 'Q4'-style keys used in the later join.
quarter_2024q4 = period_2024q4.dt.quarter.astype('Int64')
df_2024q4_clean['quarter'] = ('Q' + quarter_2024q4.astype(str)).where(period_2024q4.notna())

# Ensure 'cik' is a 10-character string with leading zeros; a missing CIK
# stays missing rather than becoming the string '000000None'.
cik_2024q4 = df_2024q4_clean['cik']
df_2024q4_clean['cik'] = cik_2024q4.astype(str).str.zfill(10).where(cik_2024q4.notna())

# Modify 'bucket_file_path': drop the bucket URI so only the blob name remains.
df_2024q4_clean['bucket_file_path'] = df_2024q4_clean['bucket_file_path'].str.replace(
    'gs://sentiment_chloe-curtis/', '', regex=False
)

In [None]:
####### STEP 1
# Load the historical index with the helper imported from step_1_historical_index.
# The path is relative, so it is resolved against the kernel's working directory.
# NOTE(review): later cells join this frame on "ticker" and split its "quarter"
# column on "-", so both columns are expected in the result - confirm against the helper.
file_path = "historical_data.txt"
historical_index = historical_data(file_path)

In [None]:
###### STEP 2
# Load the ticker lookup with the helper imported from step_2_ticker_converter.
# The path is relative, so it is resolved against the kernel's working directory.
# NOTE(review): the next cell joins this frame on "ticker" and relies on it to
# supply the company name and CIK - confirm against the helper.
ticker_path = "ticker_converter.json"
ticker_conversion = ticker_converter(ticker_path)

In [None]:
########## JOIN STEP 1 & 2 ###################
# Attach the ticker-converter columns to every row of the historical index.
base_index = pd.merge(historical_index, ticker_conversion, on="ticker", how="left")

# The ticker converter does not cover every ticker (no company name / CIK),
# so the left join leaves rows with missing values. Split the result into
# complete rows and the keys of the incomplete ones.
has_missing = base_index.isnull().any(axis=1)
matched_rows = base_index[~has_missing]
unmatched_keys = base_index.loc[has_missing, ['quarter', 'ticker']]

# The CSV holds the CIK and company name for those tickers. Read the CIK as
# text: as a number, one unmatched ticker in the merge below would turn the
# column into floats and produce values such as '00320193.0' when padded.
missing_company = pd.read_csv('missing_companies.csv', dtype={'cik': str})

# Merge back to get the CIK and company name for the incomplete rows.
backfilled_rows = pd.merge(unmatched_keys, missing_company, on="ticker", how="left")

# Pad the CIK with zeros to 10 characters; tickers that are still unknown keep
# a missing CIK instead of the string '0000000nan'.
backfilled_rows['cik'] = backfilled_rows['cik'].str.zfill(10)

# Stack the complete rows and the back-filled rows.
df = pd.concat([matched_rows, backfilled_rows], axis=0, ignore_index=True)

# Split the 'quarter' column into its quarter and 2-digit year parts
# (presumably values look like 'Q1-19' - confirm against the step 1 helper).
# n=1 guarantees at most two parts even if a value has an extra '-'.
df[['quarter', 'year']] = df['quarter'].str.split('-', n=1, expand=True)

# Convert 2-digit year to 4-digit: values below 50 are 20xx, the rest 19xx.
df['year'] = df['year'].apply(lambda yy: '20' + yy if int(yy) < 50 else '19' + yy)
df['year'] = df['year'].astype(int)

In [None]:
########## JOIN STEP 3 WITH 1 & 2
# Keep only the index rows that have a filing for the same company, quarter
# and year; the inner join drops everything without a match on all three keys.
# NOTE(review): only the 2024 Q4 filings are joined here; cloud_combined is
# not used in this join - confirm that is intended.
core_v3 = df.merge(df_2024q4_clean, how="inner", on=['cik', 'quarter', 'year'])

In [None]:
from google.cloud import bigquery

def upload_core_v3_to_bq_(
    df,
    project_id='sentiment-lewagon',
    dataset_id='sentiment_db',
    table_id='core_v3',
    write_disposition="WRITE_APPEND",
):
    """
    Upload a DataFrame to a BigQuery table with lowercase column names.

    The caller's DataFrame is left untouched; the columns are lowercased on
    a renamed copy. Failures during the upload are reported with a printed
    message instead of being raised.

    Args:
        df: DataFrame to upload (the joined core_v3 frame in this notebook).
        project_id: BigQuery project that owns the table.
        dataset_id: BigQuery dataset that contains the table.
        table_id: Name of the destination table.
        write_disposition: Load-job write disposition. With the default
            "WRITE_APPEND", every call adds the rows again, so re-running
            the upload duplicates data.

    Returns:
        None. An empty DataFrame is skipped without contacting BigQuery.
    """
    if df.empty:
        print("Nothing to upload: the DataFrame has no rows.")
        return

    # rename() returns a new frame, so the caller's column names are not changed.
    upload_df = df.rename(columns=str.lower)
    table_ref = f"{project_id}.{dataset_id}.{table_id}"

    try:
        client = bigquery.Client()

        job_config = bigquery.LoadJobConfig(
            write_disposition=write_disposition
        )

        job = client.load_table_from_dataframe(upload_df, table_ref, job_config=job_config)
        job.result()  # Block until the load job has finished.

        print(f"✅ Uploaded {job.output_rows} rows to {table_ref}")

    except Exception as e:
        print(f"❌ Failed to upload DataFrame to BigQuery: {e}")

In [None]:
# Upload the joined frame to BigQuery. The load job uses WRITE_APPEND, so
# running this cell again uploads the same rows a second time.
upload_core_v3_to_bq_(core_v3)