# In [None]:
import pandas as pd

# Locations of the two raw CSV exports (GDP and personal income).
file1_path = "GDP_data/_initial-loads_GDP_GDP_2022-2024_1 (1).csv"
file2_path = "GDP_data/_initial-loads_GDP_personal_income_2022_2024.csv"

# Both files are read with header=None, so their header rows stay in the frame as
# ordinary rows and are addressed by position further down.
gdp_df, income_df = (
    pd.read_csv(csv_path, header=None) for csv_path in (file1_path, file2_path)
)

# Names of the 50 U.S. states; used as a whitelist when filtering the ``state`` column.
us_states = {
    "Alabama", "Alaska", "Arizona", "Arkansas",
    "California", "Colorado", "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky",
    "Louisiana",
    "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana",
    "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota",
    "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina", "South Dakota",
    "Tennessee", "Texas",
    "Utah",
    "Vermont", "Virginia",
    "Washington", "West Virginia", "Wisconsin", "Wyoming",
}


def _parse_year(value):
    """Return *value* as a canonical year string such as ``"2022"``, or ``None`` if it is not a year.

    Accepts digit strings (``"2022"``) as well as numeric cells (``2022`` or
    ``2022.0``), which is what pandas produces when a column contains only numbers.
    """
    if pd.isna(value):
        return None
    text = str(value).strip()
    if text.isdecimal():
        return str(int(text))
    try:
        number = float(text)
    except ValueError:
        return None
    # Only whole, non-negative numbers count as years ("nan"/"inf" fail is_integer()).
    if number.is_integer() and number >= 0:
        return str(int(number))
    return None


def extract_years(df, base_year_row, supplement_year_row, quarter_row):
    """Build one period label per column from the header rows of *df*.

    Args:
        df: Frame read without a header, so header rows are ordinary rows.
        base_year_row: Position of the row holding the primary year labels.
            Gaps in this row are forward-filled from the nearest label to the left.
        supplement_year_row: Position of a fallback row that is consulted when
            the (forward-filled) base cell is not a year.
        quarter_row: Position of the row holding quarter labels (e.g. ``"Q1"``).

    Returns:
        A list with one entry per column: the year as a string, with the
        lower-cased quarter appended when present (``"2022q1"``), or ``None``
        when no year can be determined for that column.
    """
    # Series.ffill() replaces fillna(method='ffill'), which is deprecated in pandas >= 2.1.
    base_years = df.iloc[base_year_row].ffill()
    supplement_years = df.iloc[supplement_year_row]
    quarters = df.iloc[quarter_row]

    formatted_years = []
    last_valid_year = None

    for base, sup, qtr in zip(base_years, supplement_years, quarters):
        # Normalise the quarter to text so a non-string cell cannot break .lower().
        quarter = "" if pd.isna(qtr) else str(qtr).strip()

        year = _parse_year(base) or _parse_year(sup)
        if year is not None:
            last_valid_year = year
        elif quarter:
            # A quarter without its own year inherits the most recent year seen
            # (which may still be None if no year has appeared yet).
            year = last_valid_year

        if year is not None and quarter:
            year += quarter.lower()

        formatted_years.append(year)

    return formatted_years


# The two files place their header rows at different positions, so each call
# names the base-year, supplement-year and quarter rows explicitly.
gdp_years = extract_years(gdp_df, base_year_row=3, supplement_year_row=4, quarter_row=5)
income_years = extract_years(income_df, base_year_row=2, supplement_year_row=3, quarter_row=4)
print(gdp_years)

import pandas as pd


def filter_valid_columns(df, category_row, unit_row, valid_unit="Millions of", force_income_category=False):
    """Select the columns whose unit label contains *valid_unit*.

    Args:
        df: Frame read without a header, so header rows are ordinary rows.
        category_row: Position of the row holding category labels.
        unit_row: Position of the row holding unit labels.
        valid_unit: Substring a unit label must contain for its column to be kept.
        force_income_category: When true, ignore the category row and label
            every selected column ``"personal_income_gdp"``.

    Returns:
        A ``(valid_columns, selected_categories)`` pair. ``valid_columns`` is a
        list of column positions. ``selected_categories`` has one entry per
        selected column, in the same order: a pandas Series of forward-filled
        category labels, or a plain list when *force_income_category* is set.
    """
    # Series.ffill() replaces fillna(method='ffill'), which is deprecated in pandas >= 2.1.
    # Labels are forward-filled so a blank cell takes the nearest label to its left.
    categories = df.iloc[category_row].ffill()
    units = df.iloc[unit_row].ffill()

    valid_columns = [
        position
        for position, unit in enumerate(units)
        if isinstance(unit, str) and valid_unit in unit
    ]

    if force_income_category:
        return valid_columns, ["personal_income_gdp"] * len(valid_columns)

    # valid_columns holds positions, so select positionally; plain [] indexing
    # is label-based and only works when column labels equal their positions.
    return valid_columns, categories.iloc[valid_columns]


# GDP file: category labels are in row 1 and unit labels in row 2.
gdp_valid_columns, gdp_categories = filter_valid_columns(gdp_df, category_row=1, unit_row=2)
print(gdp_valid_columns, gdp_categories)

# Income file: labels sit one row higher, and every selected column is given
# the single fixed category instead of the label found in the file.
income_valid_columns, income_categories = filter_valid_columns(
    income_df,
    category_row=0,
    unit_row=1,
    force_income_category=True,
)
print(income_valid_columns, income_categories)

# Column 0 holds the area names, starting at the first data row of each file.
# NaN cells are deliberately kept (no dropna): extract_values_safe maps the i-th
# entry to data row ``start_row + i`` and skips non-string entries itself, so
# dropping them here would shift every later state onto the wrong row.
gdp_states = gdp_df.iloc[6:, 0].tolist()
income_states = income_df.iloc[5:, 0].tolist()

def extract_values_safe(df, valid_columns, states, years, categories, start_row):
    """Flatten the selected cells of *df* into ``[year, state, category, value]`` records.

    Args:
        df: Frame to read cell values from.
        valid_columns: Column positions to read.
        states: Row labels; entry ``i`` corresponds to row ``start_row + i``.
            Entries that are not strings are skipped.
        years: Period label per column position; columns with a missing or
            empty label are skipped.
        categories: Category per entry of *valid_columns* (paired with ``zip``).
            Non-string categories are recorded as ``"Unknown"``.
        start_row: Position of the row matching the first entry of *states*.

    Returns:
        A list of ``[year, state, category, value]`` lists, one per non-missing
        cell, ordered by state and then by column.
    """
    total_rows, total_cols = df.shape
    known_years = len(years)
    records = []

    for offset, raw_state in enumerate(states):
        if not isinstance(raw_state, str):
            continue

        row_pos = start_row + offset
        if row_pos >= total_rows:
            # Ran past the end of the frame; no later state can have a row either.
            break

        state_name = raw_state.strip()

        for col_pos, raw_category in zip(valid_columns, categories):
            if col_pos >= total_cols:
                continue

            cell = df.iloc[row_pos, col_pos]
            period = years[col_pos] if col_pos < known_years else None

            # Skip empty cells and columns that have no usable period label.
            if pd.isna(cell) or not period:
                continue

            if isinstance(raw_category, str):
                label = str(raw_category).strip()
            else:
                label = "Unknown"
            records.append([period, state_name, label, cell])

    return records


def filter_us_states(df):
    """Return the rows of *df* whose ``state`` value is one of the names in ``us_states``."""
    is_us_state = df['state'].isin(us_states)
    return df[is_us_state]


# Flatten each file into [year, state, category, value] records; the data rows
# start at row 6 in the GDP file and row 5 in the income file.
gdp_data_safe = extract_values_safe(
    gdp_df,
    valid_columns=gdp_valid_columns,
    states=gdp_states,
    years=gdp_years,
    categories=gdp_categories,
    start_row=6,
)
income_data_safe = extract_values_safe(
    income_df,
    valid_columns=income_valid_columns,
    states=income_states,
    years=income_years,
    categories=income_categories,
    start_row=5,
)

# GDP records first, then income records.
combined_data_safe = [*gdp_data_safe, *income_data_safe]

final_df_safe = pd.DataFrame(combined_data_safe, columns=["year", "state", "category", "value"])
output_file_safe = "GDP_data/processed_GDP_income_data.csv"

# Keep only rows for the 50 states, then write the result without the index column.
df_cleaned = filter_us_states(final_df_safe)
df_cleaned.to_csv(output_file_safe, index=False)
