In [151]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [153]:
# Exercise 11-1
# 
# Suppose one of your co-workers is expecting a baby and you are 
# participating in an office pool to predict the date of birth. 
# Assuning that bets are placed during the 30th week of pregnancy,
# what variables could you use to make the best prediction? 
#
# Variables for Prediction:
# 1. Estimated Due Date 
# 2. Mother’s Age
# 3. Number of Previous Pregnancies 
# 4. Mother’s Health Conditions
# 5. Baby’s Sex 
# 6. Smoking or Alcohol Use






In [155]:
# Exercise 11-3

# Import libraries
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import nsfg

# Load NSFG dataset
resp = nsfg.ReadFemResp()

# Check available columns
print([col for col in resp.columns if 'numbabes' in col.lower()])  # Ensure numbabes is present

# Select relevant columns
columns_needed = ['age_r', 'race', 'educat', 'numbabes']
columns_available = [col for col in resp.columns if col in columns_needed]

# Drop missing values
poisson_df = resp[columns_available].dropna()

# Rename columns for clarity
poisson_df.rename(columns={'age_r': 'age', 'educat': 'education', 'numbabes': 'num_children'}, inplace=True)

# Encode categorical variables
poisson_df['race'] = poisson_df['race'].map({1: 'white', 2: 'black', 3: 'hispanic', 4: 'other'})
poisson_df['education'] = poisson_df['education'].map({1: 'no_hs', 2: 'high_school', 3: 'some_college', 4: 'college'})

# Convert categorical variables 
poisson_df = pd.get_dummies(poisson_df, columns=['race', 'education'], drop_first=True)

# Print final column names to verify correct encoding
print("Final available columns:", poisson_df.columns)

# Build formula based on detected variables
race_dummies = [col for col in poisson_df.columns if 'race' in col.lower()]
education_dummies = [col for col in poisson_df.columns if 'education' in col.lower()]

formula = 'num_children ~ age'
for col in race_dummies + education_dummies:
    formula += f' + {col}'

# Poisson regression model
poisson_model = smf.poisson(formula, data=poisson_df).fit()

# Display model summary
print(poisson_model.summary())

# Make prediction for a 35-year-old Black woman with a college degree
new_data = pd.DataFrame({
    'age': [35],
    'race_hispanic': [0],  # Not Hispanic
    'race_white': [0],     # Not White (i.e., Black)
    'education_high_school': [0],
    'education_some_college': [0],
    'education_college': [1]  # College Graduate
})

# Ensure all necessary columns exist in new_data
for col in poisson_df.columns:
    if col not in new_data.columns and col != 'num_children':  
        new_data[col] = 0  # Set missing categorical variables to 0

# Make the prediction
predicted_children = poisson_model.predict(new_data)

print("Predicted number of children:", predicted_children.iloc[0])

['numbabes']
Final available columns: Index(['age', 'num_children', 'race_hispanic', 'race_white'], dtype='object')
Optimization terminated successfully.
         Current function value: 1.449767
         Iterations 6
                          Poisson Regression Results                          
Dep. Variable:           num_children   No. Observations:                 7643
Model:                        Poisson   Df Residuals:                     7639
Method:                           MLE   Df Model:                            3
Date:                Sat, 08 Feb 2025   Pseudo R-squ.:                  0.1050
Time:                        17:55:25   Log-Likelihood:                -11081.
converged:                       True   LL-Null:                       -12380.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                            coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------

In [156]:
# Exercise 11-4

import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import nsfg

# Load NSFG dataset
resp = nsfg.ReadFemResp()

# Identify marital status column
marital_col = [col for col in resp.columns if 'rmarital' in col.lower()]

# Identify education column
education_col = [col for col in resp.columns if 'educat' in col.lower()]
race_col = [col for col in resp.columns if 'race' in col.lower()]

# Print detected columns
print("Detected education columns:", education_col)
print("Detected race columns:", race_col)

# Check unique values in `educat`
print("Unique values in 'educat':", resp['educat'].unique())

# Find the value counts to understand the education categories
print(resp[['educat']].value_counts())

# Manually define education level categories based on actual values
education_map = {
    9: 'no_hs', 10: 'no_hs', 11: 'no_hs',  # Less than high school
    12: 'high_school',  # High school graduate
    13: 'some_college', 14: 'some_college', 15: 'some_college',  # Some college
    16: 'college', 17: 'college', 18: 'college', 19: 'college'  # College graduate
}

# Extract relevant columns
columns_needed = ['age_r'] + marital_col + ['educat', 'race']
columns_available = [col for col in resp.columns if col in columns_needed]

# Select only the available columns
mlogit_df = resp[columns_available].dropna()

# Rename columns for clarity
mlogit_df.rename(columns={'age_r': 'age', 'educat': 'education', marital_col[0]: 'marital_status'}, inplace=True)

# Convert marital_status to categorical and encode numerically
mlogit_df['marital_status'] = mlogit_df['marital_status'].astype('category')
mlogit_df['marital_status'] = mlogit_df['marital_status'].cat.codes  # Converts categories to numeric values

# Apply the correct education mapping
mlogit_df['education'] = mlogit_df['education'].map(education_map)

# Ensure mapping worked
print("Unique values in mapped 'education':", mlogit_df['education'].unique())

# Map race categories correctly
mlogit_df['race'] = mlogit_df['race'].map({
    1: 'white', 
    2: 'black', 
    3: 'hispanic'
})

# Create dummy variables
mlogit_df = pd.get_dummies(mlogit_df, columns=['race', 'education'], drop_first=True)

# Print available columns after encoding
print("Final available columns after encoding:", mlogit_df.columns)

# Identify if race or education dummies are still missing
race_dummies = [col for col in mlogit_df.columns if 'race' in col.lower()]
education_dummies = [col for col in mlogit_df.columns if 'education' in col.lower()]

if len(race_dummies) == 0 or len(education_dummies) == 0:
    raise ValueError("Error: Race or education variables are still missing! Please check column names.")

# Construct the formula dynamically
formula = 'marital_status ~ age'
for col in race_dummies + education_dummies:
    formula += f' + {col}'

# Fit Multinomial Logistic Regression Model
mlogit_model = smf.mnlogit(formula, data=mlogit_df).fit()

# Display model summary
print(mlogit_model.summary())


# Define the input values for prediction
new_data = pd.DataFrame({
    'age': [25],
    'race_hispanic': [0],  # 0 = Not Hispanic
    'race_white': [1],  # 1 = White
    'education_high_school': [1],  # 1 = High School Graduate
    'education_no_hs': [0],  # Not in "No High School" category
    'education_some_college': [0]  # Not in "Some College" category
})

# Predict probabilities for each marital status category
predicted_probs = mlogit_model.predict(new_data)

# Print results
print("Predicted probabilities for marital status categories:")
print(predicted_probs)


Detected education columns: ['educat', 'educat_i']
Detected race columns: ['rscreenrace', 'numrace', 'fl_rrace', 'chosrace', 'racehx1', 'racehx2', 'racehx3', 'racehx4', 'racehx6', 'racehx7', 'racehx11', 'racehx16', 'racehx17', 'racehx21', 'cprace1', 'cprace2', 'racecx1', 'racecx2', 'p1yrace1', 'p1yrace2', 'p1yrace3', 'p1yraceb', 'p1yrace6', 'p1yrace7', 'p1yraceb2', 'p1yrace11', 'p1yrace12', 'p1yraceb3', 'p1yrace16', 'p1yrace21', 'p1yrace22', 'p1yraceb5', 'p1yrace26', 'p1yrace31', 'p1yrace36', 'p1yrace41', 'p1yrace46', 'p1yrace51', 'p1yrace56', 'p1yrace61', 'p1yrace66', 'race', 'hisprace', 'race_i', 'hisprace_i']
Unique values in 'educat': [10 13 12  9 11 16 14 15 17 19 18]
educat
12        1792
9          993
16         920
14         835
13         805
11         553
10         542
15         435
17         303
18         235
19         230
Name: count, dtype: int64
Unique values in mapped 'education': ['no_hs' 'some_college' 'high_school' 'college']
Final available columns after enco