# Data Cleaning, Validation & Macroeconomic Integration

## Objective
This notebook cleans and validates loan-level origination and monthly performance data
and integrates macroeconomic and capital markets indicators for downstream risk modeling.

## Key Components
- Loan lifecycle validation (consecutive loan age, valid term structure)
- Removal of anomalous loan statuses (e.g., RA)
- Feature sanitization and missing-value handling
- Integration of macro indicators (GDP, CPI, Unemployment, Recession Index, 10Y–2Y Treasury yield spread, Freddie Mac House Price Index)
- Integration of capital markets data (SIFMA issuance, outstanding, trading volumes)

## Output
- Clean loan-level origination and performance datasets
- Macro-enriched inputs for feature engineering and ML modeling


In [2]:
# Import Necessary Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import sklearn
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import (confusion_matrix, 
                             roc_auc_score, 
                             average_precision_score)
from sklearn.model_selection import cross_val_score
import warnings
import time
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=UserWarning, module="distributed.utils_perf")
from multiprocessing import Pool
import dask.dataframe as dd
from dask.distributed import Client
from dask.diagnostics import ProgressBar
from collections import Counter
import os
import re


# Load the sampled origination file and the sampled monthly performance file.
df_orig = pd.read_csv(
    r"C:\dwaraka\github projects\Risk Analytics\Freddie_mac_raw_data\Standard_Quaterly\sampled_origination_2021_2025Q2.csv"
)
df_monthly = pd.read_csv(
    r"C:\dwaraka\github projects\Risk Analytics\Freddie_mac_raw_data\Standard_Quaterly\sampled_monthly_perf_2021_2025Q2.csv"
)

# Loan IDs that have at least one monthly row with Zero Balance Code 2, 3 or 9
# (the set the rest of the notebook refers to as "charged off").
charged_loans_indices = set(
    df_monthly['Loan Sequence Number'][df_monthly['Zero Balance Code'].isin([2.0, 3.0, 9.0])]
)
# Loan IDs that have at least one monthly row with Zero Balance Code 1
# (the set the rest of the notebook refers to as "paid off").
paid_loans_indices = set(
    df_monthly['Loan Sequence Number'][df_monthly['Zero Balance Code'].eq(1.0)]
)


def missing_rate(df):
    """Return the percentage of missing values in each column of ``df``.

    The result is a one-column DataFrame indexed by the column labels of
    ``df``. Its single column is labelled ``0`` and holds the share of null
    entries on a 0-100 scale.
    """
    null_counts = df.isna().sum()
    percentages = null_counts / df.shape[0] * 100
    return pd.DataFrame(percentages)

# Per-column missing percentage of the raw origination data; the single value
# column is relabelled 'Column Name'.
orig_missing_rate = missing_rate(df_orig).rename(columns={0: 'Column Name'})
orig_missing_rate

# Keep loans with a 360-month original term whose credit score is not 9999.
df_orig = df_orig[
    (df_orig['Original Loan Term'] == 360) & (df_orig['Credit Score'] != 9999)
]

# Indicator column: 1 when the DTI ratio is anything other than 999, else 0.
df_orig['Valid DTI Ratio'] = df_orig['Original Debt-to-Income (DTI) Ratio'].ne(999).astype(int)

# Origination rows for the loan IDs collected from the monthly file.
df_charged_off = df_orig.loc[df_orig['Loan Sequence Number'].isin(charged_loans_indices)]
df_paid_off = df_orig.loc[df_orig['Loan Sequence Number'].isin(paid_loans_indices)]

# Report how many rows in each group carry the DTI-invalid flag.
print(f"Charged off loans with invalid DTI Ratio: {df_charged_off['Valid DTI Ratio'].eq(0).sum()}")
print(f"Paid off loans with invalid DTI Ratio: {df_paid_off['Valid DTI Ratio'].eq(0).sum()}")

# Recode flag columns. Each result is assigned back to the column rather than
# calling ``replace(..., inplace=True)`` on ``df_orig[...]``: df_orig is a
# filtered subset at this point, and an in-place edit of a selected column is
# not guaranteed to reach the parent frame (it never does under pandas
# Copy-on-Write).
df_orig['Super Conforming Flag'] = df_orig['Super Conforming Flag'].replace({np.nan: '0', 'Y': '1'})
df_orig['HARP Indicator'] = df_orig['HARP Indicator'].replace({np.nan: '0', 'Y': '1'})
df_orig['First Time Homebuyer Flag'] = df_orig['First Time Homebuyer Flag'].replace({'9': 'Unknown'})
df_orig['Program Indicator'] = df_orig['Program Indicator'].replace({np.nan: 'Unknown'})

# Mortgage insurance: treat 999 as missing, then impute every missing value
# with the median of the remaining values.
df_orig['Mortgage Insurance Percentage (MI %)'] = (
    df_orig['Mortgage Insurance Percentage (MI %)'].replace(999, np.nan)
)
df_orig['Mortgage Insurance Percentage (MI %)'] = df_orig['Mortgage Insurance Percentage (MI %)'].fillna(
    df_orig['Mortgage Insurance Percentage (MI %)'].median()
)

# Columns removed from the origination data.
columns_to_drop = [
    'Metropolitan Statistical Area (MSA) Or Metropolitan Division',
    'Prepayment Penalty Mortgage (PPM) Flag',
    'Original Combined Loan-to-Value (CLTV)',
    'Amortization Type (Formerly Product Type)',
    'Interest Only (I/O) Indicator',
    'Pre-HARP Loan Sequence Number',
    'Seller Name',
    'Servicer Name',
]
df_orig = df_orig.drop(columns=columns_to_drop)

if 'Original Debt-to-Income (DTI) Ratio' in df_orig.columns:
    # Impute rows flagged as invalid (Valid DTI Ratio == 0) with the mean of
    # the VALID rows. The mean was previously taken over the invalid rows
    # themselves, which only wrote the 999 sentinel back onto those rows.
    # If there are no valid rows the mean is NaN and invalid rows become NaN.
    mean_dti_ratio = df_orig.loc[
        df_orig['Valid DTI Ratio'] == 1, 'Original Debt-to-Income (DTI) Ratio'
    ].mean()
    # ``mask`` returns a new Series (upcast to float when a replacement is
    # made), so no float is written into an integer column in place.
    df_orig['Original Debt-to-Income (DTI) Ratio'] = df_orig['Original Debt-to-Income (DTI) Ratio'].mask(
        df_orig['Valid DTI Ratio'] == 0, mean_dti_ratio
    )



# Columns carried forward from the origination data, in output order.
feature_to_keep = [
    # --- loan attributes ---
    'Loan Sequence Number',
    'First Payment Date',
    'Original Loan Term',
    'Original UPB',
    'Mortgage Insurance Percentage (MI %)',
    'Original Loan-to-Value (LTV)',
    'Original Interest Rate',
    'Channel',
    'Loan Purpose',
    'Super Conforming Flag',

    # --- borrower attributes ---
    'Credit Score',
    'First Time Homebuyer Flag',
    'Original Debt-to-Income (DTI) Ratio',
    'Number of Borrowers',

    # --- property attributes ---
    'Number of Units',
    'Occupancy Status',
    'Property State',
    'Property Type',
    'Property Valuation Method',

    # --- missing-value indicator created earlier in the notebook ---
    'Valid DTI Ratio',
]

# Restrict to the listed columns and renumber the rows from zero.
df_orig = df_orig[feature_to_keep].reset_index(drop=True)

# Per-column missing percentage of the raw monthly performance data.
month_missing_rate = missing_rate(df_monthly).rename(columns={0: 'Column Name'})
month_missing_rate.head()

# Row and loan counts before any monthly filtering.
initial_row_count = df_monthly.shape[0]
initial_loan_count = df_monthly['Loan Sequence Number'].nunique()

# Every loan that shows delinquency status 'RA' in at least one month.
ra_loans = df_monthly.loc[
    df_monthly['Current Loan Delinquency Status'] == 'RA', 'Loan Sequence Number'
].unique()

# Size of the removal: all rows of those loans, not only the 'RA' rows.
rows_to_remove = df_monthly[df_monthly['Loan Sequence Number'].isin(ra_loans)].shape[0]
loans_to_remove = len(ra_loans)

# Drop the complete history of each 'RA' loan. The explicit copy makes the
# result an independent frame, so the column assignments below are guaranteed
# to modify df_monthly itself rather than a temporary.
df_monthly = df_monthly[~df_monthly['Loan Sequence Number'].isin(ra_loans)].copy()

# Row and loan counts after filtering.
final_row_count = df_monthly.shape[0]
final_loan_count = df_monthly['Loan Sequence Number'].nunique()

# Disaster flag: missing -> 0, 'Y' -> 1. The result is assigned back instead of
# using ``replace(..., inplace=True)`` on a column selected from a filtered
# frame, which is not guaranteed to update the frame.
df_monthly['Delinquency Due to Disaster'] = df_monthly['Delinquency Due to Disaster'].replace(
    {np.nan: 0, 'Y': 1}
)

# Delinquency status: map the string '0' to integer 0, then cast the whole
# column to int (all 'RA' loans were removed above).
df_monthly['Current Loan Delinquency Status'] = (
    df_monthly['Current Loan Delinquency Status'].replace({'0': 0}).astype(int)
)

# Columns removed from the monthly data.
columns_to_drop_monthly = ['Estimated Loan-to-Value (ELTV)', 'Defect Settlement Date']
df_monthly = df_monthly.drop(columns=columns_to_drop_monthly)

# Output the results
print(f"Removed {rows_to_remove} rows associated with {loans_to_remove} loans due to 'RA' status.")
print(f"Initial rows: {initial_row_count}, Remaining rows: {final_row_count}")
print(f"Initial unique loans: {initial_loan_count}, Remaining unique loans: {final_loan_count}")

missing_rate(df_monthly).rename(columns={0: 'Column Name'})



# Distinct loan IDs that remain in each dataframe after the cleaning steps above.
loan_seq_nums_orig = df_orig['Loan Sequence Number'].unique()
loan_seq_nums_monthly = df_monthly['Loan Sequence Number'].unique()

# Loan IDs present in both the origination and the monthly data.
common_loan_seq_nums = set(loan_seq_nums_orig).intersection(loan_seq_nums_monthly)

# Restrict both dataframes to the shared loan IDs.
df_orig = df_orig.loc[df_orig['Loan Sequence Number'].isin(common_loan_seq_nums)]
df_monthly = df_monthly.loc[df_monthly['Loan Sequence Number'].isin(common_loan_seq_nums)]




def filter_and_stats_loans(df_orig, df_monthly):
    """Keep only loans with a complete, gap-free monthly history.

    A loan is kept when all of the following hold in ``df_monthly``:
    it has a row with 'Loan Age' equal to 0, it has at least one non-null
    'Zero Balance Code', and its 'Loan Age' values, read in row order, are
    exactly 0, 1, 2, ... (the comparison is order-sensitive).

    Prints summary counts and returns ``(df_orig_filtered,
    df_monthly_filtered)`` restricted to the kept loans. The reported
    "invalid" count is the number of zero-age loans that were not kept.
    """
    loan_ids = df_monthly['Loan Sequence Number']

    # Loans that have an age-0 record.
    loans_with_zero_age = set(loan_ids[df_monthly['Loan Age'] == 0].unique())
    print(f"Loans with zero age: {len(loans_with_zero_age)}")

    # Loans with a Zero Balance Code recorded in at least one month.
    loans_with_ending = set(loan_ids[df_monthly['Zero Balance Code'].notna()].unique())
    print(f"Loans with a recorded ending: {len(loans_with_ending)}")

    # Candidates must satisfy both conditions.
    candidates = loans_with_zero_age & loans_with_ending
    df_monthly_filtered = df_monthly[loan_ids.isin(candidates)]

    # Among the candidates, keep those whose ages run 0..n-1 without a gap.
    gap_free = {
        loan
        for loan, ages in df_monthly_filtered.groupby('Loan Sequence Number')['Loan Age']
        if ages.tolist() == list(range(len(ages)))
    }

    valid_loans = candidates & gap_free
    invalid_loans = loans_with_zero_age - valid_loans

    # Restrict both dataframes to the surviving loans.
    df_orig_filtered = df_orig[df_orig['Loan Sequence Number'].isin(valid_loans)]
    df_monthly_filtered = df_monthly_filtered[
        df_monthly_filtered['Loan Sequence Number'].isin(valid_loans)
    ]

    print(f"Total valid loans: {len(valid_loans)}")
    print(f"Total invalid loans: {len(invalid_loans)}")

    return df_orig_filtered, df_monthly_filtered

# Run the lifecycle validation on the cleaned origination / monthly data.
df_orig_valid, df_monthly_valid = filter_and_stats_loans(df_orig, df_monthly)

# First export of the validated data.
df_orig_valid.to_csv(
    r"C:\dwaraka\github projects\Risk Analytics\Freddie_mac_raw_data\Standard_Quaterly\orig_valid.csv",
    index=False,
)
df_monthly_valid.to_csv(
    r"C:\dwaraka\github projects\Risk Analytics\Freddie_mac_raw_data\Standard_Quaterly\monthly_valid.csv",
    index=False,
)

# Spot-check: pick one validated loan (fixed seed) and look up its origination row.
rand_loan_idx = df_orig_valid.sample(1, random_state=42)['Loan Sequence Number'].iloc[0]
df_orig_valid[df_orig_valid['Loan Sequence Number'] == rand_loan_idx]

# Second export of the same two dataframes under different file names.
orig_csv_path = r"C:\dwaraka\github projects\Risk Analytics\Freddie_mac_raw_data\Standard_Quaterly\df_orig.csv"
monthly_csv_path = r"C:\dwaraka\github projects\Risk Analytics\Freddie_mac_raw_data\Standard_Quaterly\df_monthly.csv"

df_orig_valid.to_csv(orig_csv_path, index=False)
df_monthly_valid.to_csv(monthly_csv_path, index=False)


Charged off loans with invalid DTI Ratio: 0
Paid off loans with invalid DTI Ratio: 0
Removed 10769 rows associated with 370 loans due to 'RA' status.
Initial rows: 46658, Remaining rows: 35889
Initial unique loans: 1974, Remaining unique loans: 1604
Loans with zero age: 1270
Loans with a recorded ending: 1325
Total valid loans: 1263
Total invalid loans: 7


In [6]:
 # Load Excel
df = pd.read_excel(
    r"C:\dwaraka\github projects\Risk Analytics\Economic Data\SIFMA\2025Q2\US-Fixed-Income-Securities-Statistics-SIFMA.xlsx",
    sheet_name='Outstanding',
)

# The row at position 6 supplies the column labels; data starts on the row after it.
df.columns = df.iloc[6]
df = df.iloc[7:].copy()

# Normalise the labels: cast to string and trim surrounding whitespace.
df.columns = df.columns.astype(str).str.strip()

# Manually deduplicate columns
def dedup_columns(cols):
    """Return ``cols`` as a list in which every label is unique.

    The first occurrence of a label is kept unchanged; each later occurrence
    receives a numeric suffix (``name_1``, ``name_2``, ...). A suffixed
    candidate that is already taken - for example because the input itself
    contains ``name_1`` - is skipped and the next number is tried, so the
    returned labels never collide.
    """
    next_suffix = {}   # per original label: last suffix number handed out
    taken = set()      # every label already emitted
    new_cols = []
    for col in cols:
        candidate = col
        while candidate in taken:
            next_suffix[col] = next_suffix.get(col, 0) + 1
            candidate = f"{col}_{next_suffix[col]}"
        taken.add(candidate)
        new_cols.append(candidate)
    return new_cols

# Make repeated header labels distinct, then call the first column 'Year'.
df.columns = dedup_columns(df.columns)
df = df.rename(columns={df.columns[0]: 'Year'})

# Retain only rows whose 'Year' value, as text, is exactly four digits.
df = df.loc[df['Year'].astype(str).str.match(r'^\d{4}$')]

# Labels of interest; each is matched as a prefix of a sheet column.
base_cols = ['Year', 'UST', 'MBS', 'Corporates', 'Munis', 'Agency', 'ABS', 'CP', 'Total']

# For every label take the first column (in sheet order) that starts with it;
# labels without any matching column are skipped.
main_cols = [
    matches[0]
    for matches in ([c for c in df.columns if c.startswith(col)] for col in base_cols)
    if matches
]

df = df[main_cols].copy()

# Everything except 'Year' becomes numeric; unparseable cells turn into NaN.
for col in df.columns:
    if col == 'Year':
        continue
    df[col] = pd.to_numeric(df[col], errors='coerce')

df = df.reset_index(drop=True)

# Final preview
print(df.head())


# Market-size workbook (first sheet, default header handling).
df_sifma_market = pd.read_excel(
    r"C:\dwaraka\github projects\Risk Analytics\Economic Data\SIFMA\2025Q2\Market_size.xlsx"
)

df_sifma_market

# 'Issuance' sheet of the SIFMA fixed-income statistics workbook.
df_sifma_issu = pd.read_excel(
    r"C:\dwaraka\github projects\Risk Analytics\Economic Data\SIFMA\2025Q2\US-Fixed-Income-Securities-Statistics-SIFMA.xlsx",
    sheet_name='Issuance',
)

# The row at position 6 supplies the column labels; data starts on the row after it.
df_sifma_issu.columns = df_sifma_issu.iloc[6]
df_sifma_issu = df_sifma_issu.iloc[7:].copy()

# Normalise the labels: cast to string and trim surrounding whitespace.
df_sifma_issu.columns = df_sifma_issu.columns.astype(str).str.strip()

# Deduplicate column names
def dedup_columns(cols):
    """Return ``cols`` as a list in which every label is unique.

    The first occurrence of a label is kept unchanged; each later occurrence
    receives a numeric suffix (``name_1``, ``name_2``, ...). A suffixed
    candidate that is already taken - for example because the input itself
    contains ``name_1`` - is skipped and the next number is tried, so the
    returned labels never collide.
    """
    next_suffix = {}   # per original label: last suffix number handed out
    taken = set()      # every label already emitted
    new_cols = []
    for col in cols:
        candidate = col
        while candidate in taken:
            next_suffix[col] = next_suffix.get(col, 0) + 1
            candidate = f"{col}_{next_suffix[col]}"
        taken.add(candidate)
        new_cols.append(candidate)
    return new_cols

# Make repeated header labels distinct, then call the first column 'Year'.
df_sifma_issu.columns = dedup_columns(df_sifma_issu.columns)
df_sifma_issu = df_sifma_issu.rename(columns={df_sifma_issu.columns[0]: 'Year'})

# Retain only rows whose 'Year' value, as text, is exactly four digits.
df_sifma_issu = df_sifma_issu.loc[df_sifma_issu['Year'].astype(str).str.match(r'^\d{4}$')]

# Labels of interest; each is matched as a prefix of a sheet column.
base_cols = ['Year', 'UST', 'MBS', 'Corporates', 'Munis', 'Agency', 'ABS', 'Total']

# For every label take the first column (in sheet order) that starts with it;
# labels without any matching column are skipped.
main_cols = [
    matches[0]
    for matches in ([c for c in df_sifma_issu.columns if c.startswith(col)] for col in base_cols)
    if matches
]

df_sifma_issu = df_sifma_issu[main_cols].copy()

# Everything except 'Year' becomes numeric; unparseable cells turn into NaN.
for col in df_sifma_issu.columns:
    if col == 'Year':
        continue
    df_sifma_issu[col] = pd.to_numeric(df_sifma_issu[col], errors='coerce')

# Short sheet labels -> descriptive column names.
rename_dict = {
    "Corporates": "Corporate Securities",
    "MBS": "Mortgage-Related Securities",
    "ABS": "Asset-Backed Securities",
    "Agency": "Federal Agency Securities",
    "UST": "Treasury Securities",
    "Munis": "Municipal Securities",
    "Total": "Issuance (Total)",
}
df_sifma_issu = df_sifma_issu.rename(columns=rename_dict)

df_sifma_issu = df_sifma_issu.reset_index(drop=True)

# Preview
print(df_sifma_issu.head())



# 'Trading Volume' sheet of the SIFMA fixed-income statistics workbook.
df_sifma_trade = pd.read_excel(
    r"C:\dwaraka\github projects\Risk Analytics\Economic Data\SIFMA\2025Q2\US-Fixed-Income-Securities-Statistics-SIFMA.xlsx",
    sheet_name='Trading Volume',
)

# The row at position 6 supplies the column labels; data starts on the row after it.
df_sifma_trade.columns = df_sifma_trade.iloc[6]
df_sifma_trade = df_sifma_trade.iloc[7:].copy()

# Normalise the labels: cast to string and trim surrounding whitespace.
df_sifma_trade.columns = df_sifma_trade.columns.astype(str).str.strip()

# Deduplicate column names
def dedup_columns(cols):
    """Return ``cols`` as a list in which every label is unique.

    The first occurrence of a label is kept unchanged; each later occurrence
    receives a numeric suffix (``name_1``, ``name_2``, ...). A suffixed
    candidate that is already taken - for example because the input itself
    contains ``name_1`` - is skipped and the next number is tried, so the
    returned labels never collide.
    """
    next_suffix = {}   # per original label: last suffix number handed out
    taken = set()      # every label already emitted
    new_cols = []
    for col in cols:
        candidate = col
        while candidate in taken:
            next_suffix[col] = next_suffix.get(col, 0) + 1
            candidate = f"{col}_{next_suffix[col]}"
        taken.add(candidate)
        new_cols.append(candidate)
    return new_cols

# Make repeated header labels distinct, then call the first column 'Year'.
df_sifma_trade.columns = dedup_columns(df_sifma_trade.columns)
df_sifma_trade.rename(columns={df_sifma_trade.columns[0]: 'Year'}, inplace=True)

# Keep only rows where Year is a 4-digit number
df_sifma_trade = df_sifma_trade[df_sifma_trade['Year'].astype(str).str.match(r'^\d{4}$')]

# Define expected base columns
base_cols = ['Year', 'UST', 'Agency MBS', 'Non-Agency MBS', 'Corporates', 'Munis', 'Agency', 'ABS', 'Total']

# Resolve each base label to one sheet column. An exact label match wins;
# only when there is none do we fall back to the first not-yet-selected column
# that starts with the label. A pure prefix search resolved 'Agency' to
# 'Agency MBS', which selected that column twice and dropped the real
# 'Agency' column.
main_cols = []
for col in base_cols:
    if col in df_sifma_trade.columns:
        chosen = col
    else:
        chosen = next(
            (c for c in df_sifma_trade.columns if c.startswith(col) and c not in main_cols),
            None,
        )
    if chosen is not None and chosen not in main_cols:
        main_cols.append(chosen)

df_sifma_trade = df_sifma_trade[main_cols].copy()

# Convert every non-'Year' column to numeric; unparseable cells become NaN.
# main_cols holds no repeats, so each selection is expected to be a Series;
# the ndim check is kept purely as a defensive guard.
for col in df_sifma_trade.columns:
    if col != 'Year' and df_sifma_trade[col].ndim == 1:
        df_sifma_trade[col] = pd.to_numeric(df_sifma_trade[col], errors='coerce')

# Reset index
df_sifma_trade = df_sifma_trade.reset_index(drop=True)

# Final preview
print(df_sifma_trade.head())
print(df_sifma_trade.shape)




# Macro series, one CSV per indicator, each loaded with default parsing.
# The bare names after each load only display the frame when they are the
# last expression of a notebook cell.

# GDP
df_gdp = pd.read_csv(
    r"C:\dwaraka\github projects\Risk Analytics\Economic Data\Macro\GDP.csv"
)
df_gdp

# GDP-based recession index
df_recession = pd.read_csv(
    r"C:\dwaraka\github projects\Risk Analytics\Economic Data\Macro\GDP-based Recession index.csv"
)
df_recession

# CPI
df_cpi = pd.read_csv(
    r"C:\dwaraka\github projects\Risk Analytics\Economic Data\Macro\cpi.csv"
)
df_cpi

# T10Y2Y series (stored under the name df_YC)
df_YC = pd.read_csv(
    r"C:\dwaraka\github projects\Risk Analytics\Economic Data\Macro\T10Y2Y.csv"
)
df_YC

# House price master file
df_housing_price = pd.read_csv(
    r"C:\dwaraka\github projects\Risk Analytics\Economic Data\Macro\fmhpi_master_file.csv"
)
df_housing_price

# UNRATE series
df_unemployment = pd.read_csv(
    r"C:\dwaraka\github projects\Risk Analytics\Economic Data\Macro\UNRATE.csv"
)
df_unemployment

   Year        UST          MBS  Corporates     Munis       Agency        ABS  \
0  2014  12504.782  8841.996778    7465.750  3865.665  2028.749936  1349.3802   
1  2015  13191.555  8894.813341    7704.250  3892.677  1995.396249  1376.5868   
2  2016  13908.241  9023.211929    7956.764  3947.421  1971.692165  1391.7603   
3  2017  14468.781  9304.524737    8305.107  3968.518  1934.670761  1457.9101   
4  2018  15607.967  9732.329779    8505.424  3926.704  1842.577463  1615.6286   

           CP         Total  
0  930.385883  36986.709797  
1  941.492441  37996.770831  
2  884.870210  39083.960604  
3  965.932706  40405.444304  
4  995.971356  42226.602198  
   Year  Treasury Securities  Mortgage-Related Securities  \
0  2015          2122.517098                  1800.703618   
1  2016          2169.443414                  2044.186751   
2  2017          2224.339460                  2003.380375   
3  2018          2684.657405                  1872.952098   
4  2019          2935.479517

Unnamed: 0,observation_date,UNRATE
0,1948-01-01,3.4
1,1948-02-01,3.8
2,1948-03-01,4.0
3,1948-04-01,3.9
4,1948-05-01,3.5
...,...,...
925,2025-02-01,4.1
926,2025-03-01,4.2
927,2025-04-01,4.2
928,2025-05-01,4.2


In [8]:
# ===== 0. Settings =====
target_area_type = 'A'  # 'A'=Statewide, 'B'=MSA, etc.

# ===== 1. File paths =====
data_file   = r"C:\dwaraka\github projects\Risk Analytics\Economic Data\LAUS\la.data.3.AllStatesS"
series_file = r"C:\dwaraka\github projects\Risk Analytics\Economic Data\LAUS\la.series"
area_file   = r"C:\dwaraka\github projects\Risk Analytics\Economic Data\LAUS\la.area"
measure_file= r"C:\dwaraka\github projects\Risk Analytics\Economic Data\LAUS\la.measure"
srd_file    = r"C:\dwaraka\github projects\Risk Analytics\Economic Data\LAUS\la.state_region_division"

# ===== 2. Load main statewide data =====
# Tab-separated with a header row; everything is read as text first so that
# identifiers keep their exact form.
df_data = pd.read_csv(data_file, sep='\t', dtype=str)

# Trim whitespace from the column names and from every cell.
df_data.columns = df_data.columns.str.strip()
for c in df_data.columns:
    df_data[c] = df_data[c].astype(str).str.strip()

# Keep only true monthly rows, M01..M12. The earlier pattern r'^M\d{2}$' also
# accepted M13 (the annual-average period code in BLS files) and any other
# two-digit code, which would have produced "month" values outside 1-12.
df_data = df_data[df_data['period'].str.match(r'^M(0[1-9]|1[0-2])$')].copy()

# Typed columns: month number from the period code, integer year, numeric
# value (cells that cannot be parsed become NaN).
df_data['month'] = df_data['period'].str[1:].astype(int)
df_data['year']  = df_data['year'].astype(int)
df_data['value'] = pd.to_numeric(df_data['value'], errors='coerce')

print(f"Data file loaded: {df_data['series_id'].nunique()} unique series IDs")

# ===== 3. Load metadata files =====
# All four lookup tables are tab-separated and read as text.
df_series  = pd.read_csv(series_file, sep='\t', dtype=str)
df_area    = pd.read_csv(area_file, sep='\t', dtype=str)
df_measure = pd.read_csv(measure_file, sep='\t', dtype=str)
# Column names are supplied explicitly for this file.
# NOTE(review): if the file carries its own header line, that line is read as
# a data row here - confirm against the file.
df_srd     = pd.read_csv(srd_file, sep='\t', dtype=str, names=['srd_code','state_name'])

# Trim whitespace from column names and cell values in place. The loop
# variable is deliberately NOT called ``df``: that name holds the SIFMA
# 'Outstanding' table built earlier in the notebook, and reusing it here
# silently replaced that table with df_srd.
for meta in [df_series, df_area, df_measure, df_srd]:
    meta.columns = meta.columns.str.strip()
    for c in meta.columns:
        meta[c] = meta[c].astype(str).str.strip()

# ===== 4. Filter series by target area type =====
# Area codes whose type equals the configured target, then the series defined on them.
target_area_codes = df_area.loc[df_area['area_type_code'] == target_area_type, 'area_code']
df_series_filtered = df_series.loc[df_series['area_code'].isin(target_area_codes)].copy()

# Debug: how many series IDs appear in both the data file and the filtered metadata.
overlap_series = set(df_data['series_id']).intersection(df_series_filtered['series_id'])
print(f"Matching series IDs between data & series metadata: {len(overlap_series)}")

# ===== 5. Merge steps =====
# Observations joined to their series metadata; the inner join drops any
# observation whose series is not in the filtered set.
series_keys = df_series_filtered[['series_id', 'area_code', 'measure_code', 'srd_code']]
merge1 = pd.merge(df_data, series_keys, on='series_id', how='inner')
print(f"Rows after merging data + filtered series: {merge1.shape[0]}")

# Left joins attach descriptive text without removing rows.
merge2 = pd.merge(
    merge1,
    df_area[['area_code', 'area_text', 'area_type_code']],
    on='area_code',
    how='left',
)
merge3 = pd.merge(merge2, df_measure, on='measure_code', how='left')
merge4 = pd.merge(merge3, df_srd, on='srd_code', how='left')  # state/region text

# area_name is area_text, except where that is null or empty, in which case
# the state_name from the srd table is used instead.
merge4['area_name'] = merge4['area_text'].mask(
    merge4['area_text'].isna() | merge4['area_text'].eq(''),
    merge4['state_name'],
)

print(f"Final merged rows: {merge4.shape[0]}")

# ===== 6. Pivot to wide format =====
# One row per area / year / month, one column per measure_text. pivot_table's
# default aggregation (mean) applies if a cell receives several values.
out = pd.pivot_table(
    merge4,
    index=['area_code', 'area_name', 'year', 'month'],
    columns='measure_text',
    values='value',
).reset_index()

# Friendlier labels for the measures; keys that are not present as columns are
# simply ignored by rename.
rename_map = {
    'employment': 'Employed',
    'unemployment': 'Unemployed',
    'unemployment rate': '% Unemployed',
    'labor force': 'Labor Force',
    'population': 'Population',
}
out = out.rename(columns=rename_map)

# ===== 7. Save =====
output_path = r"C:\dwaraka\github projects\Risk Analytics\Economic Data\LAUS\laus_filtered_monthly_cleaned.csv"
out.to_csv(output_path, index=False)
print(f"✅ Saved to {output_path}")

Data file loaded: 310 unique series IDs
Matching series IDs between data & series metadata: 310
Rows after merging data + filtered series: 184140
Final merged rows: 184140
✅ Saved to C:\dwaraka\github projects\Risk Analytics\Economic Data\LAUS\laus_filtered_monthly_cleaned.csv
