### 1. Setup and Import

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Notebook display settings: show every column and allow wide (200-char) output.
pd.options.display.max_columns = None
pd.options.display.width = 200

### 2. Load Raw Data

In [3]:
# ROOT is resolved three directory levels above the current working directory,
# so this cell depends on where the notebook kernel was started.
ROOT = Path.cwd().parent.parent.parent

# Locations of the raw annual and quarterly fundamentals files.
annual_path = ROOT / "data/raw/fundamentals/fmp/fundamentals_annual.parquet"
quarter_path = ROOT / "data/raw/fundamentals/fmp/fundamentals_quarter.parquet"

# Read both parquet files (annual first, then quarterly).
dfA, dfQ = (pd.read_parquet(src) for src in (annual_path, quarter_path))

# Preview the first rows of each frame.
(dfA.head(), dfQ.head())

(         date symbol       revenue   grossProfit  operatingIncome     netIncome    eps   totalAssets  totalStockholdersEquity  totalLiabilities     totalDebt  operatingCashFlow  freeCashFlow  \
 0  2017-01-29   NVDA  6.910000e+09  4.063000e+09     1.934000e+09  1.666000e+09  0.077  9.841000e+09             5.762000e+09      4.079000e+09  2.816000e+09       1.672000e+09  1.496000e+09   
 1  2018-01-28   NVDA  9.714000e+09  5.822000e+09     3.210000e+09  3.047000e+09  0.130  1.124100e+10             7.471000e+09      3.770000e+09  2.000000e+09       3.502000e+09  2.909000e+09   
 2  2019-01-27   NVDA  1.171600e+10  7.171000e+09     3.804000e+09  4.141000e+09  0.170  1.329200e+10             9.342000e+09      3.950000e+09  1.988000e+09       3.743000e+09  3.143000e+09   
 3  2020-01-26   NVDA  1.091800e+10  6.768000e+09     2.846000e+09  2.796000e+09  0.120  1.731500e+10             1.220400e+10      5.111000e+09  2.643000e+09       4.761000e+09  4.272000e+09   
 4  2021-01-31   NVDA  1.

### 3. Explore Raw Data

In [4]:
# Row/column counts of each raw frame.
for label, frame in (("Annual shape:", dfA), ("Quarter shape:", dfQ)):
    print(label, frame.shape)

# Distinct years present; this reads the 'year' column as it exists in the
# loaded frames at this point in the notebook.
for label, frame in (("Annual years:", dfA), ("Quarter years:", dfQ)):
    print(label, frame['year'].unique())

# Frequency of each period code in the annual and quarterly frames.
(dfA['period'].value_counts(), dfQ['period'].value_counts())


Annual shape: (4144, 24)
Quarter shape: (5089, 24)
Annual years: [2017 2018 2019 2020 2021 2022 2023 2024 2025]
Quarter years: [2023 2024 2025 2022]


(period
 A    4144
 Name: count, dtype: int64,
 period
 Q    5089
 Name: count, dtype: int64)

In [5]:
# Number of distinct tickers in each frame.
for label, frame in (("Annual symbol count:", dfA), ("Quarter symbol count:", dfQ)):
    print(label, frame['symbol'].nunique())

Annual symbol count: 503
Quarter symbol count: 503


### 4. Normalize Year Column

In [6]:
# Recompute 'year' from the 'date' column on both frames (in place).
# errors='coerce' turns unparseable dates into NaT instead of raising.
for frame in (dfA, dfQ):
    frame['year'] = pd.to_datetime(frame['date'], errors='coerce').dt.year

### 5. Filter Data to Years >= 2017

In [7]:
# Keep only rows whose year is 2017 or later (rows with a missing year are dropped too,
# because a comparison against NaN is False).
min_year = 2017
dfA = dfA.loc[dfA['year'].ge(min_year)]
dfQ = dfQ.loc[dfQ['year'].ge(min_year)]

# Shapes after filtering.
(dfA.shape, dfQ.shape)


((4144, 24), (5089, 24))

### 6. Clean Quarterly: Pick Latest Quarter per Symbol-Year

In [8]:
# Keep exactly one quarterly row per (symbol, year): the row with the latest `date`.
#
# NOTE: GroupBy.last() skips nulls column by column, so it can stitch together
# values from *different* quarters whenever the most recent quarter has a missing
# field. Selecting a whole row with drop_duplicates(keep='last') avoids mixing
# figures from different reporting periods.
key_cols = ['symbol', 'year']

dfQ_latest = (
    dfQ.dropna(subset=key_cols)  # groupby ignored rows with a null key; keep doing so
       # Rows without a date sort first so they are never preferred over a dated row.
       .sort_values(key_cols + ['date'], na_position='first')
       .drop_duplicates(subset=key_cols, keep='last')
       .reset_index(drop=True)
)

# Preserve the previous column layout: group keys first, then the remaining columns.
dfQ_latest = dfQ_latest[key_cols + [c for c in dfQ_latest.columns if c not in key_cols]]

dfQ_latest.head()

Unnamed: 0,symbol,year,date,revenue,grossProfit,operatingIncome,netIncome,eps,totalAssets,totalStockholdersEquity,totalLiabilities,totalDebt,operatingCashFlow,freeCashFlow,capitalExpenditure,numberOfShares,enterpriseValue,stockPrice,marketCapitalization,minusCashAndCashEquivalents,addTotalDebt,sharesOutstanding,marketCap,period
0,A,2023,2023-10-31,1688000000.0,915000000.0,408000000.0,475000000.0,1.63,10763000000.0,5845000000.0,4918000000.0,2735000000.0,516000000.0,432000000.0,-84000000.0,292000000.0,31329040000.0,103.37,30184040000.0,1590000000.0,2735000000.0,292000000.0,30184040000.0,Q
1,A,2024,2024-10-31,1701000000.0,916000000.0,408000000.0,351000000.0,1.23,11846000000.0,5898000000.0,5948000000.0,3390000000.0,481000000.0,388000000.0,-93000000.0,290000000.0,39850900000.0,130.31,37789900000.0,1329000000.0,3390000000.0,290000000.0,37789900000.0,Q
2,A,2025,2025-07-31,1738000000.0,914000000.0,360000000.0,336000000.0,1.18,12226000000.0,6370000000.0,5856000000.0,3409000000.0,362000000.0,573000000.0,211000000.0,285000000.0,34594850000.0,114.81,32720850000.0,1535000000.0,3409000000.0,285000000.0,32720850000.0,Q
3,AAPL,2023,2023-12-30,119575000000.0,54855000000.0,40373000000.0,33916000000.0,2.19,353514000000.0,74100000000.0,279414000000.0,108040000000.0,39895000000.0,37503000000.0,-2392000000.0,15509760000.0,3053375000000.0,192.53,2986095000000.0,40760000000.0,108040000000.0,15509760000.0,2986095000000.0,Q
4,AAPL,2024,2024-12-28,124300000000.0,58275000000.0,42832000000.0,36330000000.0,2.41,344085000000.0,66758000000.0,277327000000.0,96799000000.0,29935000000.0,26995000000.0,-2940000000.0,15081720000.0,3921238000000.0,255.59,3854738000000.0,30299000000.0,96799000000.0,15081720000.0,3854738000000.0,Q


### 7. Clean Annual: Pick Final Annual Filing per Symbol-Year

In [9]:
# Keep exactly one annual row per (symbol, year): the row with the latest `date`.
#
# NOTE: GroupBy.last() skips nulls column by column, so when a symbol has more
# than one annual row in a year it can combine values from different filings.
# Selecting a whole row with drop_duplicates(keep='last') keeps each output row
# internally consistent.
key_cols = ['symbol', 'year']

dfA_latest = (
    dfA.dropna(subset=key_cols)  # groupby ignored rows with a null key; keep doing so
       # Rows without a date sort first so they are never preferred over a dated row.
       .sort_values(key_cols + ['date'], na_position='first')
       .drop_duplicates(subset=key_cols, keep='last')
       .reset_index(drop=True)
)

# Preserve the previous column layout: group keys first, then the remaining columns.
dfA_latest = dfA_latest[key_cols + [c for c in dfA_latest.columns if c not in key_cols]]

dfA_latest.head()


Unnamed: 0,symbol,year,date,revenue,grossProfit,operatingIncome,netIncome,eps,totalAssets,totalStockholdersEquity,totalLiabilities,totalDebt,operatingCashFlow,freeCashFlow,capitalExpenditure,numberOfShares,enterpriseValue,stockPrice,marketCapitalization,minusCashAndCashEquivalents,addTotalDebt,sharesOutstanding,marketCap,period
0,A,2017,2017-10-31,4472000000.0,2409000000.0,841000000.0,684000000.0,2.12,8426000000.0,4826000000.0,3591000000.0,2011000000.0,889000000.0,713000000.0,-176000000.0,322000000.0,21238660000.0,68.03,21905660000.0,2678000000.0,2011000000.0,322000000.0,21905660000.0,A
1,A,2018,2018-10-31,4914000000.0,2687000000.0,904000000.0,316000000.0,0.98,8541000000.0,4567000000.0,3970000000.0,1799000000.0,1087000000.0,910000000.0,-177000000.0,321000000.0,20349590000.0,64.79,20797590000.0,2247000000.0,1799000000.0,321000000.0,20797590000.0,A
2,A,2019,2019-10-31,5163000000.0,2805000000.0,941000000.0,1071000000.0,3.41,9452000000.0,4748000000.0,4704000000.0,2407000000.0,1021000000.0,865000000.0,-156000000.0,314000000.0,24810500000.0,75.75,23785500000.0,1382000000.0,2407000000.0,314000000.0,23785500000.0,A
3,A,2020,2020-10-31,5339000000.0,2837000000.0,846000000.0,719000000.0,2.33,9627000000.0,4873000000.0,4754000000.0,2359000000.0,921000000.0,802000000.0,-119000000.0,309000000.0,32463810000.0,102.09,31545810000.0,1441000000.0,2359000000.0,309000000.0,31545810000.0,A
4,A,2021,2021-10-31,6319000000.0,3407000000.0,1347000000.0,1210000000.0,3.98,10705000000.0,5389000000.0,5316000000.0,2729000000.0,1485000000.0,1296000000.0,-189000000.0,304000000.0,48991240000.0,157.06,47746240000.0,1484000000.0,2729000000.0,304000000.0,47746240000.0,A


### 8. Merge Annual and Quarterly Data (Annual Priority)

In [10]:
# Stack the per-year annual and quarterly picks, then keep a single row per
# (symbol, year), preferring the annual row whenever both exist.
dfC = pd.concat([dfA_latest, dfQ_latest], ignore_index=True)

key_cols = ['symbol', 'year']

# NOTE: GroupBy.first() skips nulls column by column, so a missing annual field
# would be silently filled with the quarterly figure while the row is still
# labelled 'A'. drop_duplicates(keep='first') selects the whole preferred row
# and leaves genuinely missing annual values as NaN.
df_clean = (
    dfC.sort_values(key_cols + ['period'])  # 'A' sorts before 'Q'
       .drop_duplicates(subset=key_cols, keep='first')
       .reset_index(drop=True)
)

# Preserve the previous column layout: group keys first, then the remaining columns.
df_clean = df_clean[key_cols + [c for c in df_clean.columns if c not in key_cols]]

df_clean.head(20), df_clean.shape

(   symbol  year        date       revenue   grossProfit  operatingIncome     netIncome   eps   totalAssets  totalStockholdersEquity  totalLiabilities     totalDebt  operatingCashFlow  freeCashFlow  \
 0       A  2017  2017-10-31  4.472000e+09  2.409000e+09     8.410000e+08  6.840000e+08  2.12  8.426000e+09             4.826000e+09      3.591000e+09  2.011000e+09       8.890000e+08  7.130000e+08   
 1       A  2018  2018-10-31  4.914000e+09  2.687000e+09     9.040000e+08  3.160000e+08  0.98  8.541000e+09             4.567000e+09      3.970000e+09  1.799000e+09       1.087000e+09  9.100000e+08   
 2       A  2019  2019-10-31  5.163000e+09  2.805000e+09     9.410000e+08  1.071000e+09  3.41  9.452000e+09             4.748000e+09      4.704000e+09  2.407000e+09       1.021000e+09  8.650000e+08   
 3       A  2020  2020-10-31  5.339000e+09  2.837000e+09     8.460000e+08  7.190000e+08  2.33  9.627000e+09             4.873000e+09      4.754000e+09  2.359000e+09       9.210000e+08  8.020000e+0

### 9. Transform wide format to long format

In [12]:
# Work on an independent copy so df_clean stays untouched by the reshaping below.
df_wide = df_clean.copy(deep=True)

# List the available wide-format columns.
list(df_wide.columns)

['symbol',
 'year',
 'date',
 'revenue',
 'grossProfit',
 'operatingIncome',
 'netIncome',
 'eps',
 'totalAssets',
 'totalStockholdersEquity',
 'totalLiabilities',
 'totalDebt',
 'operatingCashFlow',
 'freeCashFlow',
 'capitalExpenditure',
 'numberOfShares',
 'enterpriseValue',
 'stockPrice',
 'marketCapitalization',
 'minusCashAndCashEquivalents',
 'addTotalDebt',
 'sharesOutstanding',
 'marketCap',
 'period']

In [17]:
# Source column name -> snake_case metric name used in the long table.
# Only the columns listed here are melted into the long format.
metric_map = {
    "revenue": "revenue",
    "grossProfit": "gross_profit",
    "operatingIncome": "operating_income",
    "netIncome": "net_income",
    "eps": "eps",
    "totalAssets": "total_assets",
    "totalStockholdersEquity": "total_stockholders_equity",
    "totalLiabilities": "total_liabilities",
    "totalDebt": "total_debt",
    "operatingCashFlow": "operating_cash_flow",
    "freeCashFlow": "free_cash_flow",
    "capitalExpenditure": "capital_expenditure",
    "numberOfShares": "number_of_shares",
    "enterpriseValue": "enterprise_value",
    "marketCapitalization": "market_capitalization",
    "sharesOutstanding": "shares_outstanding",
}

value_cols = [*metric_map]

# ---------------------------------------------
# Add period_end / period_type to a copy of the wide frame
# ---------------------------------------------

period_labels = {"A": "annual", "Q": "quarterly"}

df_long = df_wide.assign(
    period_end=pd.to_datetime(df_wide["date"]).dt.date,
    period_type=df_wide["period"].map(period_labels),
)

# A period code other than A/Q maps to NaN; treat that as a hard error.
if not df_long["period_type"].notna().all():
    raise ValueError("Some rows have invalid period value (not A/Q).")

# ---------------------------------------------
# Reshape: one row per (symbol, period_end, period_type, metric)
# ---------------------------------------------

df_long = (
    pd.melt(
        df_long,
        id_vars=["symbol", "period_end", "period_type"],
        value_vars=value_cols,
        var_name="metric_raw",
        value_name="value",
    )
    # Translate the raw column name into its snake_case metric name.
    .assign(metric=lambda melted: melted["metric_raw"].map(metric_map))
    # Drop observations that have no value.
    .dropna(subset=["value"])
)

print("Long format rows:", len(df_long))
df_long.head(10)

Long format rows: 72001


Unnamed: 0,symbol,period_end,period_type,metric_raw,value,metric
0,A,2017-10-31,annual,revenue,4472000000.0,revenue
1,A,2018-10-31,annual,revenue,4914000000.0,revenue
2,A,2019-10-31,annual,revenue,5163000000.0,revenue
3,A,2020-10-31,annual,revenue,5339000000.0,revenue
4,A,2021-10-31,annual,revenue,6319000000.0,revenue
5,A,2022-10-31,annual,revenue,6848000000.0,revenue
6,A,2023-10-31,annual,revenue,6833000000.0,revenue
7,A,2024-10-31,annual,revenue,6510000000.0,revenue
8,A,2025-07-31,quarterly,revenue,1738000000.0,revenue
9,AAPL,2017-09-30,annual,revenue,229234000000.0,revenue


In [18]:
# ---------------------------------------------
# Final tidy dataframe: keep only the published columns, in this order
# ---------------------------------------------

out_columns = ["symbol", "period_end", "period_type", "metric", "value"]
df_out = df_long.loc[:, out_columns]

# ---------------------------------------------
# Write the result to parquet (creating the target directory if needed)
# ---------------------------------------------

OUTPUT_DIR = ROOT / "data/curated/fundamentals/"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

output_path = OUTPUT_DIR / "fundamentals_clean.parquet"
df_out.to_parquet(output_path, index=False)
print("Saved cleaned fundamentals to:", output_path)

Saved cleaned fundamentals to: /home/clsx6609/ds5110/data/curated/fundamentals/fundamentals_clean.parquet
