# Assignment 2: Binary and multinominal outcomes

## Load and Inspect Data

In [1]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np

# The Fishing dataset is in the 'mlogit' R package
dataset_link = "https://vincentarelbundock.github.io/Rdatasets/csv/Ecdat/Fishing.csv"
raw_df = pd.read_csv(dataset_link)

raw_df

Unnamed: 0,rownames,mode,price,catch,pbeach,ppier,pboat,pcharter,cbeach,cpier,cboat,ccharter,income
0,1,charter,182.930,0.5391,157.930,157.930,157.930,182.930,0.0678,0.0503,0.2601,0.5391,7083.33170
1,2,charter,34.534,0.4671,15.114,15.114,10.534,34.534,0.1049,0.0451,0.1574,0.4671,1249.99980
2,3,boat,24.334,0.2413,161.874,161.874,24.334,59.334,0.5333,0.4522,0.2413,1.0266,3749.99990
3,4,pier,15.134,0.0789,15.134,15.134,55.930,84.930,0.0678,0.0789,0.1643,0.5391,2083.33320
4,5,boat,41.514,0.1082,106.930,106.930,41.514,71.014,0.0678,0.0503,0.1082,0.3240,4583.33200
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1177,1178,pier,150.236,0.4522,150.236,150.236,250.746,285.746,0.5333,0.4522,0.2413,1.0266,3749.99990
1178,1179,beach,235.436,0.5333,235.436,235.436,392.946,427.946,0.5333,0.4522,0.2413,1.0266,6250.00130
1179,1180,pier,65.036,0.4522,65.036,65.036,108.546,143.546,0.5333,0.4522,0.2413,1.0266,1249.99980
1180,1181,beach,36.636,0.5333,36.636,36.636,61.146,96.146,0.5333,0.4522,0.1665,0.3975,416.66668


(a) Generate a dummy variable taking 1 if the individual chose to fish on a charter, and
0 otherwise. Run a logit regression using the charter dummy as dependent variable
and prices of the beach, boat, charter, and the catch rate of beach, pier, boat and
charter and also income as independent variables.

In [2]:
X_raw, y = raw_df.assign(
    charter_dummy = (raw_df['mode'] == 'charter').astype(int)
).pipe(
    lambda df: (
        df[['pbeach', 'pcharter', 'cbeach', 'cpier', 'cboat', 'ccharter', 'income']],
        df['charter_dummy']
    )
)
print("Value Counts for y:\n{}".format(y.value_counts()))

print("Descriptive Statistics for X:")
display(X_raw.describe())

X = X_raw.pipe(lambda df: (df - df.mean()) / df.std())


Value Counts for y:
charter_dummy
0    730
1    452
Name: count, dtype: int64
Descriptive Statistics for X:


Unnamed: 0,pbeach,pcharter,cbeach,cpier,cboat,ccharter,income
count,1182.0,1182.0,1182.0,1182.0,1182.0,1182.0,1182.0
mean,103.422005,84.379244,0.241011,0.162224,0.171215,0.629368,4099.337054
std,103.641042,63.54465,0.190752,0.16039,0.209789,0.706114,2461.96406
min,1.29,27.29,0.0678,0.0014,0.0002,0.0021,416.66668
25%,26.6565,42.896,0.0678,0.0503,0.0233,0.0219,2083.3332
50%,74.628,61.607,0.2537,0.0789,0.0897,0.4216,3749.9999
75%,144.144,102.774,0.5333,0.1498,0.2413,1.0266,5416.6667
max,843.186,691.11,0.5333,0.4522,0.7369,2.3101,12499.998


Standardization is crucial because as we can see from the above descriptive stats table, the
standard deviation of `income` is ~4k and that of `cboat` is 0.2.

Hence, without standardization, the resulting odds ratios (OR) will not make any sense.
With the non-standardized data, an increase of $\$ 1$ at a standard deviation of 4 thousand
is negligible.

Because of multicolinearity issues, we shall also exclude `pboat` from the set of regressors.
Otherwise, the model seems to zone in on the difference between `pcharter` and `pboat`.

In [3]:
result = sm.Logit(y, sm.add_constant(X)).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.628581
         Iterations 5


0,1,2,3
Dep. Variable:,charter_dummy,No. Observations:,1182.0
Model:,Logit,Df Residuals:,1174.0
Method:,MLE,Df Model:,7.0
Date:,"Mon, 26 May 2025",Pseudo R-squ.:,0.05509
Time:,11:51:24,Log-Likelihood:,-742.98
converged:,True,LL-Null:,-786.3
Covariance Type:,nonrobust,LLR p-value:,6.07e-16

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.5238,0.063,-8.307,0.000,-0.647,-0.400
pbeach,0.5208,0.091,5.748,0.000,0.343,0.698
pcharter,-0.2099,0.080,-2.632,0.008,-0.366,-0.054
cbeach,0.0395,0.115,0.344,0.731,-0.186,0.265
cpier,-0.2113,0.108,-1.958,0.050,-0.423,0.000
cboat,-0.5441,0.181,-3.008,0.003,-0.899,-0.190
ccharter,0.6774,0.182,3.731,0.000,0.322,1.033
income,-0.3975,0.092,-4.314,0.000,-0.578,-0.217


(b) Compute MEM, AME, and OR and interpret your results.

In [4]:
# Marginal Effects at the Mean (MEM)
mem = result.get_margeff(at='mean').summary()
print("\nMarginal Effects at the Mean (MEM):\n", mem)



Marginal Effects at the Mean (MEM):
         Logit Marginal Effects       
Dep. Variable:          charter_dummy
Method:                          dydx
At:                              mean
                dy/dx    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
pbeach         0.1217      0.021      5.785      0.000       0.080       0.163
pcharter      -0.0490      0.019     -2.638      0.008      -0.085      -0.013
cbeach         0.0092      0.027      0.344      0.731      -0.043       0.062
cpier         -0.0494      0.025     -1.957      0.050      -0.099     6.4e-05
cboat         -0.1271      0.042     -3.009      0.003      -0.210      -0.044
ccharter       0.1582      0.042      3.733      0.000       0.075       0.241
income        -0.0929      0.021     -4.336      0.000      -0.135      -0.051


In [5]:
# Average Marginal Effects (AME)
ame = result.get_margeff(at='overall').summary()
print("\nAverage Marginal Effects (AME):\n", ame)



Average Marginal Effects (AME):
         Logit Marginal Effects       
Dep. Variable:          charter_dummy
Method:                          dydx
At:                           overall
                dy/dx    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
pbeach         0.1144      0.019      6.053      0.000       0.077       0.151
pcharter      -0.0461      0.017     -2.657      0.008      -0.080      -0.012
cbeach         0.0087      0.025      0.344      0.731      -0.041       0.058
cpier         -0.0464      0.024     -1.971      0.049      -0.093      -0.000
cboat         -0.1195      0.039     -3.053      0.002      -0.196      -0.043
ccharter       0.1487      0.039      3.817      0.000       0.072       0.225
income        -0.0873      0.020     -4.433      0.000      -0.126      -0.049


Marginal effects overall (i.e. averaged) and at mean levels are mostly similar for all predictors.

All predictors except `cbeach` are statistically significant at the $5 \%$ level.
This is interesting as it implies that fishers do not care about the price of the charter when deciding to charter or not to charter a boat for fishing.

Instead they only seem to care about the price of fishing at the beach, which increases the likelihood of chartering.
Still, the actual change in likelihood is rather small.

In [6]:
# Odds Ratios
or_ = np.exp(result.params)
print("\nOdds Ratios (OR):\n", or_)




Odds Ratios (OR):
 const       0.592294
pbeach      1.683379
pcharter    0.810657
cbeach      1.040283
cpier       0.809568
cboat       0.580356
ccharter    1.968680
income      0.672002
dtype: float64


For obvious reasons, the catch rate on charters increases the likelihood of chartering the most.
A $1$-standard deviation increase in the `ccharter` roughly doubles the $P(Charter)$.

On the other hand, a one std increase in `cboat` almost halves $P(Charter)$.

Note that all of these assume all else being equal, although there may be a strong correlation among the predictors which we didn't check.


### 2. 


In [7]:
import pyreadr

result = pyreadr.read_r('assignments/Assignment 2/holiday.rds')
holiday_df = result[None] # extract the pandas data frame for object df1
holiday_df

Unnamed: 0_level_0,holiday,salary,price.lake,price.coast,price.hiking,price.mtb,recov.lake,recov.coast,recov.hiking,recov.mtb
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,mtb,5388.887800,150.0335,134.2405,181.6195,365.860,0.064410,0.037725,0.273105,1.0782
2,mtb,1499.999867,14.3583,12.8469,12.1141,69.068,0.099655,0.033825,0.165270,0.9342
3,hiking,3166.666600,153.7803,137.5929,27.9841,118.668,0.506635,0.339150,0.253365,2.0532
4,coast,2055.555467,14.3773,12.8639,64.3195,169.860,0.064410,0.059175,0.172515,1.0782
5,hiking,3722.221333,101.5835,90.8905,47.7411,142.028,0.064410,0.037725,0.113610,0.6480
...,...,...,...,...,...,...,...,...,...,...
1178,coast,3166.666600,142.7242,127.7006,288.3579,571.492,0.506635,0.339150,0.253365,2.0532
1179,lake,4833.334200,223.6642,200.1206,451.8879,855.892,0.506635,0.339150,0.253365,2.0532
1180,coast,1499.999867,61.7842,55.2806,124.8279,287.092,0.506635,0.339150,0.253365,2.0532
1181,lake,944.444453,34.8042,31.1406,70.3179,192.292,0.506635,0.339150,0.174825,0.7950


(a) First, analyse how the holiday type varies with case-specific regressors by applying
a multinominal logit model. Calculate AME and OR to interpret the results.

In [8]:
# Case-specific regressors are only salary

reg = sm.MNLogit(
    endog=holiday_df["holiday"],
    exog=sm.add_constant(holiday_df[["salary"]])
)
results = reg.fit()
results.summary()


Optimization terminated successfully.
         Current function value: 1.249704
         Iterations 5


0,1,2,3
Dep. Variable:,holiday,No. Observations:,1182.0
Model:,MNLogit,Df Residuals:,1176.0
Method:,MLE,Df Model:,3.0
Date:,"Mon, 26 May 2025",Pseudo R-squ.:,0.01374
Time:,11:51:24,Log-Likelihood:,-1477.2
converged:,True,LL-Null:,-1497.7
Covariance Type:,nonrobust,LLR p-value:,6.093e-09

holiday=hiking,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.3105,0.222,-1.398,0.162,-0.746,0.125
salary,0.0004,6.55e-05,5.389,0.000,0.000,0.000
holiday=lake,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.9576,0.276,-3.470,0.001,-1.498,-0.417
salary,0.0002,7.99e-05,2.691,0.007,5.84e-05,0.000
holiday=mtb,coef,std err,z,P>|z|,[0.025,0.975]
const,0.4154,0.217,1.914,0.056,-0.010,0.841
salary,0.0002,6.6e-05,2.541,0.011,3.83e-05,0.000


In [9]:
print("Average Marginal Effects")
display(results.get_margeff(at='mean').summary())

print("Odds Ratios")
display(np.exp(results.params))


Average Marginal Effects


0,1
Dep. Variable:,holiday
Method:,dydx
At:,mean

holiday=coast,dy/dx,std err,z,P>|z|,[0.025,0.975]
salary,-3.099e-05,7.31e-06,-4.239,0.000,-4.53e-05,-1.67e-05
holiday=hiking,dy/dx,std err,z,P>|z|,[0.025,0.975]
salary,4.89e-05,8.54e-06,5.727,0.000,3.22e-05,6.56e-05
holiday=lake,dy/dx,std err,z,P>|z|,[0.025,0.975]
salary,1.124e-07,5.9e-06,0.019,0.985,-1.15e-05,1.17e-05
holiday=mtb,dy/dx,std err,z,P>|z|,[0.025,0.975]
salary,-1.802e-05,9.11e-06,-1.977,0.048,-3.59e-05,-1.59e-07


Odds Ratios


Unnamed: 0,0,1,2
const,0.733052,0.383831,1.514943
salary,1.000353,1.000215,1.000168


In [10]:
holiday_df.reset_index(drop=False)

Unnamed: 0,rownames,holiday,salary,price.lake,price.coast,price.hiking,price.mtb,recov.lake,recov.coast,recov.hiking,recov.mtb
0,1,mtb,5388.887800,150.0335,134.2405,181.6195,365.860,0.064410,0.037725,0.273105,1.0782
1,2,mtb,1499.999867,14.3583,12.8469,12.1141,69.068,0.099655,0.033825,0.165270,0.9342
2,3,hiking,3166.666600,153.7803,137.5929,27.9841,118.668,0.506635,0.339150,0.253365,2.0532
3,4,coast,2055.555467,14.3773,12.8639,64.3195,169.860,0.064410,0.059175,0.172515,1.0782
4,5,hiking,3722.221333,101.5835,90.8905,47.7411,142.028,0.064410,0.037725,0.113610,0.6480
...,...,...,...,...,...,...,...,...,...,...,...
1177,1178,coast,3166.666600,142.7242,127.7006,288.3579,571.492,0.506635,0.339150,0.253365,2.0532
1178,1179,lake,4833.334200,223.6642,200.1206,451.8879,855.892,0.506635,0.339150,0.253365,2.0532
1179,1180,coast,1499.999867,61.7842,55.2806,124.8279,287.092,0.506635,0.339150,0.253365,2.0532
1180,1181,lake,944.444453,34.8042,31.1406,70.3179,192.292,0.506635,0.339150,0.174825,0.7950


In [11]:
holiday_long = (
    pd.wide_to_long(
        holiday_df.reset_index(drop=False),
        stubnames=["price", "recov"],
        i=["rownames", "holiday", "salary"],
        j="location",
        sep=".",
        # Regex match any number of characters
        suffix=r".*",
    )
    .reset_index(drop=False)
    .sort_values(by=["rownames", "holiday", "salary", "location"])
    .assign(
        choice=lambda df: (df["location"] == df["holiday"]).astype(int)
    )
    # Add location dummies
    .pipe(
        lambda df: df.join(
            pd.get_dummies(df["location"]).astype(float)
        )
    )
    .drop(columns=["location", "holiday", "rownames"])
)

display(holiday_long)


Unnamed: 0,salary,price,recov,choice,coast,hiking,lake,mtb
1,5388.887800,134.2405,0.037725,0,1.0,0.0,0.0,0.0
2,5388.887800,181.6195,0.273105,0,0.0,1.0,0.0,0.0
0,5388.887800,150.0335,0.064410,0,0.0,0.0,1.0,0.0
3,5388.887800,365.8600,1.078200,1,0.0,0.0,0.0,1.0
37,2611.111067,24.0669,0.112350,0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
3991,4277.777800,565.7200,2.053200,0,0.0,0.0,0.0,1.0
3993,1499.999867,28.0296,0.112350,0,1.0,0.0,0.0,0.0
3994,1499.999867,6.3204,0.055755,1,0.0,1.0,0.0,0.0
3992,1499.999867,31.3272,0.241015,0,0.0,0.0,1.0,0.0


In [12]:
sm.Logit(
    endog=holiday_long["choice"], exog=holiday_long.drop(columns=["choice"])
).fit().summary()

Optimization terminated successfully.
         Current function value: 0.500248
         Iterations 6


0,1,2,3
Dep. Variable:,choice,No. Observations:,4728.0
Model:,Logit,Df Residuals:,4721.0
Method:,MLE,Df Model:,6.0
Date:,"Mon, 26 May 2025",Pseudo R-squ.:,0.1104
Time:,11:51:24,Log-Likelihood:,-2365.2
converged:,True,LL-Null:,-2658.7
Covariance Type:,nonrobust,LLR p-value:,1.4160000000000002e-123

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
salary,0.0001,2.34e-05,5.330,0.000,7.9e-05,0.000
price,-0.0069,0.001,-12.495,0.000,-0.008,-0.006
recov,0.1152,0.043,2.682,0.007,0.031,0.199
coast,-1.6481,0.111,-14.852,0.000,-1.866,-1.431
hiking,-0.6486,0.098,-6.623,0.000,-0.841,-0.457
lake,-1.9428,0.119,-16.307,0.000,-2.176,-1.709
mtb,0.0080,0.124,0.064,0.949,-0.235,0.251


Note that in this model, a constant is already implicitly included since we have $4$ dummy variables which always sum to one.