In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# Demo pipeline: read the household and region tables, derive household
# composition counts, simulate log-linear expenditures for three foods, and
# fit one OLS demand regression per food.

# Read the two input tables (paths are relative to the working directory).
household_data = pd.read_csv("Guatemala - Household Characteristics.csv")
region_features = pd.read_csv("Guatemala - Region Features.csv")

# Join on the shared (i, t, m) keys; pandas defaults to an inner join, so
# rows without a match on either side are dropped.
merged = pd.merge(household_data, region_features, on=["i", "t", "m"])

# Household size = row-wise total over every household_data column from the
# fourth onward.
# NOTE(review): presumably the first three columns are the i/t/m keys and the
# rest are age-sex head counts -- confirm against the CSV header.
age_columns = household_data.columns[3:]
merged["hh_size"] = merged[age_columns].sum(axis=1)

# Coarser composition counts, each a row-wise sum of the listed columns.
composition_groups = {
    "children": [
        "Males 00-03", "Females 00-03",
        "Males 04-08", "Females 04-08",
        "Males 09-13", "Females 09-13",
    ],
    "adults": [
        "Males 19-30", "Females 19-30",
        "Males 31-50", "Females 31-50",
    ],
    "elderly": ["Males 51-99", "Females 51-99"],
}
for group_name, member_columns in composition_groups.items():
    merged[group_name] = merged[member_columns].sum(axis=1)

# Simulated expenditures: x = exp(systematic part + normal noise with
# standard deviation 0.3). The systematic part is identical for every food,
# so it is computed once; only the noise draw differs between items.
# The global seed is fixed so the draws (one per item, in list order) repeat.
np.random.seed(42)
food_items = ["Rice", "Beans", "Cooking Oil"]
systematic_log_x = (
    2
    + 0.1 * merged["hh_size"]
    + 0.2 * merged["children"]
    + 0.3 * merged["adults"]
    + 0.15 * merged["elderly"]
    - 0.1 * merged["Rural"]
)
for item in food_items:
    noise = np.random.normal(0, 0.3, size=len(merged))
    merged[f"exp_{item}"] = np.exp(systematic_log_x + noise)

# Wide -> long: one row per (household, food). "j" holds the food name with
# the "exp_" marker stripped, "x" the expenditure and "log_x" its log.
expenditures_long = merged.melt(
    id_vars=["i", "t", "m", "hh_size", "children", "adults", "elderly", "Rural"],
    value_vars=[f"exp_{item}" for item in food_items],
    var_name="j",
    value_name="x",
).assign(
    j=lambda long_df: long_df["j"].str.replace("exp_", ""),
    log_x=lambda long_df: np.log(long_df["x"]),
)

# One OLS fit per food on that food's rows; fitted results are keyed by name.
demand_formula = "log_x ~ hh_size + children + adults + elderly + Rural"
results = {}
for item in food_items:
    is_item = expenditures_long["j"].eq(item)
    df_item = expenditures_long.loc[is_item]
    model = smf.ols(demand_formula, data=df_item).fit()
    results[item] = model

# Report every regression under a banner naming the food.
for item, model in results.items():
    print(f"\n=== Demand for {item} ===", model.summary(), sep="\n")



=== Demand for Rice ===
                            OLS Regression Results                            
Dep. Variable:                  log_x   R-squared:                       0.861
Model:                            OLS   Adj. R-squared:                  0.861
Method:                 Least Squares   F-statistic:                     9034.
Date:                Tue, 01 Apr 2025   Prob (F-statistic):               0.00
Time:                        11:48:47   Log-Likelihood:                -1589.6
No. Observations:                7276   AIC:                             3191.
Df Residuals:                    7270   BIC:                             3233.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.0089      

In [7]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# Demo pipeline: for every food listed in the price table, simulate
# log-linear expenditures, fit an OLS demand regression on them, and collect
# the fitted expenditure (in levels) for each household.

# Read the three input tables (paths are relative to the working directory).
household_data = pd.read_csv("Guatemala - Household Characteristics.csv")
region_features = pd.read_csv("Guatemala - Region Features.csv")
food_prices = pd.read_csv("Guatemala - Food Prices (2000).csv")

# Join on the shared (i, t, m) keys; pandas defaults to an inner join, so
# rows without a match on either side are dropped.
merged = pd.merge(household_data, region_features, on=["i", "t", "m"])

# Household size = row-wise total over every household_data column from the
# fourth onward.
# NOTE(review): presumably the first three columns are the i/t/m keys and the
# rest are age-sex head counts -- confirm against the CSV header.
age_columns = household_data.columns[3:]
merged["hh_size"] = merged[age_columns].sum(axis=1)

# Coarser composition counts, each a row-wise sum of the listed columns.
composition_groups = {
    "children": [
        "Males 00-03", "Females 00-03",
        "Males 04-08", "Females 04-08",
        "Males 09-13", "Females 09-13",
    ],
    "adults": [
        "Males 19-30", "Females 19-30",
        "Males 31-50", "Females 31-50",
    ],
    "elderly": ["Males 51-99", "Females 51-99"],
}
for group_name, member_columns in composition_groups.items():
    merged[group_name] = merged[member_columns].sum(axis=1)

# Only the food names are taken from the price table (distinct values of "j",
# in order of first appearance).
food_items = food_prices["j"].unique()

# The systematic part of log expenditure is the same for every food, so it is
# computed once; each item adds its own normal noise (standard deviation 0.3).
# The global seed is fixed so the draws (one per item, in loop order) repeat.
np.random.seed(42)
systematic_log_x = (
    2
    + 0.1 * merged["hh_size"]
    + 0.2 * merged["children"]
    + 0.3 * merged["adults"]
    + 0.15 * merged["elderly"]
    - 0.1 * merged["Rural"]
)
demand_formula = "log_x ~ hh_size + children + adults + elderly + Rural"
predicted_data = []

for item in food_items:
    # The working columns on `merged` are overwritten on every pass, so after
    # the loop they hold the values of the last food only.
    merged["log_x"] = systematic_log_x + np.random.normal(0, 0.3, size=len(merged))
    merged["x"] = np.exp(merged["log_x"])

    model = smf.ols(demand_formula, data=merged).fit()

    # Back-transform the fitted log values to levels.
    # NOTE(review): exp(fitted log x) is not the conditional mean of x when
    # the residual variance is non-zero (retransformation bias) -- confirm
    # that this is the quantity wanted downstream.
    merged["predicted_exp"] = np.exp(model.predict(merged))

    # Snapshot this food's predictions, tagged with the food name.
    item_predictions = merged[["i", "t", "m", "predicted_exp"]].assign(j=item)
    predicted_data.append(item_predictions)

# Stack the per-food snapshots, switch to descriptive column names, and
# replace the repeated per-food row labels with a fresh 0..n-1 index.
readable_names = {
    "i": "household_id",
    "t": "year",
    "m": "region",
    "j": "food",
    "predicted_exp": "predicted_expenditure",
}
predicted_df = (
    pd.concat(predicted_data)
    .rename(columns=readable_names)
    .reset_index(drop=True)
)

# Preview the first ten rows.
print(predicted_df.head(10))


   household_id  year         region  predicted_expenditure         food
0             1  2000  Metropolitana              32.875419  Cooking Oil
1             2  2000  Metropolitana              17.965821  Cooking Oil
2             3  2000  Metropolitana              56.346371  Cooking Oil
3             4  2000  Metropolitana               9.496091  Cooking Oil
4             5  2000  Metropolitana              22.133193  Cooking Oil
5             6  2000  Metropolitana              28.486259  Cooking Oil
6             7  2000  Metropolitana              15.732438  Cooking Oil
7             8  2000  Metropolitana              17.965821  Cooking Oil
8             9  2000  Metropolitana              12.095389  Cooking Oil
9            10  2000  Metropolitana              35.908229  Cooking Oil


In [8]:
# Bare last expression: the notebook renders predicted_df as this cell's output.
predicted_df

Unnamed: 0,household_id,year,region,predicted_expenditure,food
0,1,2000,Metropolitana,32.875419,Cooking Oil
1,2,2000,Metropolitana,17.965821,Cooking Oil
2,3,2000,Metropolitana,56.346371,Cooking Oil
3,4,2000,Metropolitana,9.496091,Cooking Oil
4,5,2000,Metropolitana,22.133193,Cooking Oil
...,...,...,...,...,...
698491,7272,2000,Suroriente,8.560873,Leafcutter Ants and Other Insect
698492,7273,2000,Suroriente,36.237892,Leafcutter Ants and Other Insect
698493,7274,2000,Suroriente,54.565260,Leafcutter Ants and Other Insect
698494,7275,2000,Suroriente,44.425512,Leafcutter Ants and Other Insect
