# California Climate Investments Analysis
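This notebook fits a hierarchical (blockwise) OLS regression of cost per ton of GHG reductions on program funding, agency, county, and multi-county project indicators.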

## 1. Hierarchical Regression

In [2]:
# Hierarchical Regression Analysis
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# Load the CCI program records. low_memory=False makes pandas parse the file
# in a single pass so each column gets one consistently inferred dtype.
df = pd.read_csv('cci_programs_data_reduced.csv', low_memory=False)

# Drop rows whose reporting cycle name contains "Semi" or "Mid-Year"; rows with
# a missing cycle name are kept (na=False). The explicit .copy() makes the
# filtered result an independent frame, so the column assignments that follow
# do not raise SettingWithCopyWarning.
excluded_cycle = df['Reporting Cycle Name'].str.contains('Semi|Mid-Year', na=False)
df = df[~excluded_cycle].copy()

# Coerce the funding and GHG columns to numeric; values that cannot be parsed
# become NaN (errors='coerce') instead of raising.
for numeric_col in (
    'Total Program GGRFFunding',
    'Total Project GHGReductions',
    'Total GGRFDisadvantaged Community Funding',
):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

program_funding = df['Total Program GGRFFunding']
ghg_reductions = df['Total Project GHGReductions']

# Program funding divided by project GHG reductions. Rows with zero reductions
# would divide by zero, so they are masked to NaN.
df['cost_per_ton'] = (program_funding / ghg_reductions).where(ghg_reductions != 0)

# Disadvantaged-community funding as a fraction of program funding. Zero
# funding is masked to NaN as well, so this ratio never holds +/-inf and is
# treated the same way as cost_per_ton.
df['share_DAC'] = (
    df['Total GGRFDisadvantaged Community Funding'] / program_funding
).where(program_funding != 0)

# Counties that define the "South" region indicator.
south_counties = [
    "Los Angeles",
    "Orange",
    "San Diego",
    "Riverside",
    "San Bernardino",
    "Imperial",
    "Ventura",
]

# Number of distinct counties recorded under each project ID, looked up for
# every row through its own project ID.
project_counties = (
    df.groupby('Project ID Number')['County']
    .nunique()
)
counties_per_project = df['Project ID Number'].map(project_counties)

df['n_partners'] = counties_per_project
# 1 when the project spans more than one county, else 0 (a missing count
# compares False and therefore yields 0).
df['multi_county'] = counties_per_project.gt(1).astype(int)
# log(1 + funding), so that zero funding maps to 0.
df['log_funding'] = np.log1p(df['Total Program GGRFFunding'])
# 1 when the row's county is in south_counties, else 0.
df['Region_South'] = df['County'].isin(south_counties).astype(int)

# The model formulas refer to the agency column as C(Agency_Name); a column
# name containing a space cannot be used there unquoted, so give it an
# identifier-safe name. 'County' is already a valid identifier and is left
# untouched.
df = df.rename(columns={'Agency Name': 'Agency_Name'})


# Hierarchical (blockwise) OLS on cost_per_ton.
#
# The formula interface drops rows with missing values separately for each
# formula, so steps that use different variables could otherwise be fitted on
# different samples and their fit statistics would not be comparable. Restrict
# every step to the rows that are complete on all variables used in any step.
model_columns = [
    'cost_per_ton',
    'log_funding',
    'Agency_Name',
    'County',
    'n_partners',
    'multi_county',
    'Region_South',
]
model_df = df.dropna(subset=model_columns)

# Step 1: funding size plus agency and county fixed effects.
base_formula = 'cost_per_ton ~ log_funding + C(Agency_Name) + C(County)'
model1 = smf.ols(base_formula, data=model_df).fit()

# Step 2: add the county-count measures.
model2 = smf.ols(base_formula + ' + n_partners + multi_county', data=model_df).fit()

# Step 3: n_partners, Region_South, and their interaction.
# NOTE(review): Region_South is derived from County, so its main effect is a
# linear combination of the C(County) dummies; only the interaction term adds
# separate information. Confirm this specification is intended.
# NOTE(review): this step omits multi_county, so model2 is not nested in it.
model3 = smf.ols(base_formula + ' + n_partners*Region_South', data=model_df).fit()

print(model3.summary())


                            OLS Regression Results                            
Dep. Variable:           cost_per_ton   R-squared:                       0.034
Model:                            OLS   Adj. R-squared:                  0.033
Method:                 Least Squares   F-statistic:                     18.79
Date:                Thu, 10 Apr 2025   Prob (F-statistic):               0.00
Time:                        11:07:08   Log-Likelihood:            -1.2269e+06
No. Observations:              113346   AIC:                         2.454e+06
Df Residuals:                  113130   BIC:                         2.456e+06
Df Model:                         215                                         
Covariance Type:            nonrobust                                         
                                                                                                                                                                                                                  