In [22]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss
from scipy import stats

import statsmodels.api as sm
import statsmodels.formula.api as smf

from mord import LogisticAT
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from statsmodels.miscmodels.ordinal_model import OrderedModel

In [3]:
clean_job_sat = pd.read_csv("clean_job_sat.csv")

In [4]:
# Linear regression with weights
linear_model = smf.wls("stfmjob_grouped ~ happy + inprdsc + health + hlthhmp + rlgdgr + \
    brncntr + gndr + agea + rshpsts + edulvlb + eduyrs + uempla + uempli + rtrd + hswrk + \
    emplrel + wrkctra + estsz + wkdcorga + wkhtot + tporgwk + uemp3m + hinctnta + atncrse + \
    trdawrk + jbprtfp + pfmfdjba + dcsfwrka + nacer2 + domicil + hincsrca + emprelp",
    data=clean_job_sat, weights=clean_job_sat["anweight"])

linear_results = linear_model.fit()
print(linear_results.summary())

                            WLS Regression Results                            
Dep. Variable:        stfmjob_grouped   R-squared:                       0.272
Model:                            WLS   Adj. R-squared:                  0.252
Method:                 Least Squares   F-statistic:                     13.81
Date:                Sun, 25 May 2025   Prob (F-statistic):          5.09e-127
Time:                        20:29:32   Log-Likelihood:                -3706.1
No. Observations:                2580   AIC:                             7550.
Df Residuals:                    2511   BIC:                             7954.
Df Model:                          68                                         
Covariance Type:            nonrobust                                         
                                                                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

In [9]:

# Prepare X and y for ordered logit model
X = clean_job_sat[[
    'happy', 'inprdsc', 'health', 'hlthhmp', 'rlgdgr', 'brncntr', 'gndr',
    'agea', 'rshpsts', 'edulvlb', 'eduyrs', 'uempla', 'uempli', 'rtrd', 'hswrk',
    'emplrel', 'wrkctra', 'estsz', 'wkdcorga', 'wkhtot', 'tporgwk', 'uemp3m',
    'hinctnta', 'atncrse', 'trdawrk', 'jbprtfp', 'pfmfdjba', 'dcsfwrka', 'nacer2',
    'domicil', 'hincsrca', 'emprelp'
]]
y = clean_job_sat['stfmjob_grouped'].astype(int)

# Fill NaNs in categorical columns with 'missing', numeric with mean
for col in cat_cols:
    X[col] = X[col].fillna('missing')
for col in X.select_dtypes(include=['float64', 'int64', 'bool']).columns:
    if X[col].isnull().any():
        X[col] = X[col].fillna(X[col].mean())

# Fit ordinal logistic model
model_ordlogit = make_pipeline(ct, LogisticAT(alpha=1.0))
model_ordlogit.fit(X, y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna('missing')
The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [12]:
# Ensure y_pred_proba returns probabilities (mord uses cumulative probs so we adjust)
def mcfadden_r2(model, X, y):
    # Convert labels to start at 0 for mord consistency
    y = np.asarray(y).astype(int) - 1
    ll_model = -log_loss(y, model.predict_proba(X), labels=np.unique(y), normalize=False)

    # Null model (predicting the most frequent class)
    y_mean_prob = np.bincount(y) / len(y)
    y_null_pred = np.tile(y_mean_prob, (len(y), 1))
    ll_null = -log_loss(y, y_null_pred, labels=np.unique(y), normalize=False)

    return 1 - (ll_model / ll_null)

In [15]:
r2_mcfadden = mcfadden_r2(model_ordlogit.named_steps['logisticat'], model_ordlogit.named_steps['columntransformer'].transform(X), y)
print("McFadden's R²:", round(r2_mcfadden, 4))

McFadden's R²: 0.099


In [19]:
# Select features and response
X_ord = clean_job_sat[[
    'happy', 'inprdsc', 'health', 'hlthhmp', 'rlgdgr', 'brncntr', 'gndr',
    'agea', 'rshpsts', 'edulvlb', 'eduyrs', 'uempla', 'uempli', 'rtrd', 'hswrk',
    'emplrel', 'wrkctra', 'estsz', 'wkdcorga', 'wkhtot', 'tporgwk', 'uemp3m',
    'hinctnta', 'atncrse', 'trdawrk', 'jbprtfp', 'pfmfdjba', 'dcsfwrka', 'nacer2',
    'domicil', 'hincsrca', 'emprelp'
]]
y_ord = clean_job_sat['stfmjob_grouped'].astype(int)

# Encode categorical variables
X_encoded = pd.get_dummies(X_ord, drop_first=True)

# Convert all boolean columns to int (0/1) for statsmodels compatibility
X_encoded = X_encoded.astype(float)

# Fit Ordered Probit model
oprobit_model = OrderedModel(
    endog=y_ord,
    exog=X_encoded,
    distr='probit'  # change to 'logit' for ordered logit
)

oprobit_result = oprobit_model.fit(method='bfgs')
print(oprobit_result.summary())

Optimization terminated successfully.
         Current function value: 1.187168
         Iterations: 263
         Function evaluations: 269
         Gradient evaluations: 269
                             OrderedModel Results                             
Dep. Variable:        stfmjob_grouped   Log-Likelihood:                -3186.4
Model:                   OrderedModel   AIC:                             6517.
Method:            Maximum Likelihood   BIC:                             6941.
Date:                Sun, 25 May 2025                                         
Time:                        20:37:59                                         
No. Observations:                2684                                         
Df Residuals:                    2612                                         
Df Model:                          72                                         
                                                               coef    std err          z      P>|z|      [0.025  

In [20]:
def likelihood_ratio_test(model_restricted, model_full):
    lr_stat = 2 * (model_full.llf - model_restricted.llf)
    df_diff = model_full.df_model - model_restricted.df_model
    p_value = stats.chi2.sf(lr_stat, df_diff)
    return lr_stat, df_diff, p_value

In [23]:
# Fit two nested models
restricted = OrderedModel(y_ord, X_encoded.drop(columns=['wkhtot']), distr='logit').fit(disp=False)
full = OrderedModel(y_ord, X_encoded, distr='logit').fit(disp=False)

# Perform LR test
lr_stat, df_diff, p_val = likelihood_ratio_test(restricted, full)
print(f"LR stat: {lr_stat:.3f}, df: {df_diff}, p-value: {p_val:.4f}")



LR stat: 0.066, df: 1, p-value: 0.7976


