In [None]:
# evaluate multinomial logistic regression model
from numpy import mean
from numpy import std
import os
import pandas as pd
import numpy as np 
import math
import json
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Fit one multinomial logit model per stage (kept in models_dict) and return each person's predicted stage sequence
def _read_stage_frame(dataset_name, stage_no):
    """Load ``<dataset>/<dataset>_stage_<stage_no>.csv`` indexed by ``person_id``.

    The ``'Unnamed: 0'`` column is dropped; missing values are left in place
    so each caller can decide when to call ``dropna``.
    """
    path = f"{dataset_name}/{dataset_name}_stage_{stage_no}.csv"
    return pd.read_csv(path, index_col='person_id').drop('Unnamed: 0', axis=1)


def _build_design_matrix(features, cat_levels, cat_vars, plain_vars, columns=None):
    """Return the standardized design matrix for one stage.

    Args:
        features: Predictor columns of one stage file.
        cat_levels: Mapping ``variable -> ordered levels``; the first level is
            the reference and is removed by ``drop_first=True``.
        cat_vars: Columns passed to ``pd.get_dummies``.
        plain_vars: Numeric / boolean columns used as they are.
        columns: Optional training-time column order. When given, the matrix
            is re-indexed to exactly these columns (missing dummies become 0,
            unknown ones are dropped) so that it matches the fitted model.

    Returns:
        A DataFrame with the same index as ``features``.

    NOTE(review): the scaler is fit on whatever matrix is passed in, which is
    how the original workflow standardized both training and prediction data;
    reusing the training scaler at prediction time may be preferable.
    """
    # Work on a copy: ``features`` is usually an ``iloc`` slice of the stage
    # frame and assigning into it would be a chained assignment.
    features = features.copy()
    for var, levels in cat_levels.items():
        features[var] = pd.Categorical(features[var], categories=levels)

    dummies = pd.get_dummies(features[cat_vars], drop_first=True)
    design = pd.concat([features[plain_vars], dummies], axis=1)
    if columns is not None:
        design = design.reindex(columns=columns, fill_value=0)

    scaled = StandardScaler().fit_transform(design)
    return pd.DataFrame(scaled, index=design.index, columns=design.columns)


def get_pred_group(dataset_name, grouping_criteria, group_name, ref_groups): #iv_refs = ['college', 3, 's_cluster_5']; dv_ref = 's_cluster_3'
    """Fit one multinomial logit per stage and chain their predictions.

    For every stage ``2..stage_n`` the file
    ``<dataset_name>/<dataset_name>_stage_<k>.csv`` is read and an
    ``sm.MNLogit`` model of ``dv_stage`` is fit on the standardized
    predictors. The models are then applied sequentially: stage 1 is the
    observed ``prev_stage`` of the stage-2 file, and each later stage is
    predicted using the *predicted* cluster of the previous stage as
    ``prev_stage``.

    Args:
        dataset_name: One of ``'nlsw'``, ``'nlsm'``, ``'nlsy'``, ``'psid'``.
        grouping_criteria: Currently unused; kept for interface compatibility.
        group_name: Currently unused; kept for interface compatibility.
        ref_groups: Mapping with the reference level for
            ``'education_level'``, ``'first_job_income_quantile'``,
            ``'prev_stage'`` and ``'dv_stage'``.

    Returns:
        A list with one dict per person:
        ``{"person_id": ..., "predicted_stage_seq": [{"stage_name": ...,
        "career_cluster": ...}, ...]}``. Stages without a prediction for that
        person are omitted.

    Raises:
        KeyError: Unknown ``dataset_name`` or a missing key in ``ref_groups``.
        ValueError: A reference level that is not a valid level of its
            variable. Both are raised before any file is read.
    """
    # Predictor groups
    num_var_list = ['num_of_event', 'event_weighted_score', 'event_entropy_score', 'birth_year']
    cat_var_list = ['education_level', 'first_job_income_quantile', 'prev_stage', 'gender', 'race']
    ref_var_list = ['education_level', 'first_job_income_quantile', 'prev_stage']
    bool_var_list = ['single_couple', 'couple_single', 'single_single',
                     'childbearing_no_yes', 'childbearing_yes_no', 'has_migration']
    if dataset_name == 'nlsy':
        # nlsy additionally carries the direction of migration
        bool_var_list += ['noncentral_central', 'central_noncentral']
    plain_var_list = num_var_list + bool_var_list

    # Per-dataset configuration
    stage_counts = {'nlsw': 7, 'nlsm': 3, 'nlsy': 8, 'psid': 10}
    cluster_counts = {'nlsw': 9, 'nlsm': 5, 'nlsy': 9, 'psid': 10}
    education_levels = {
        'nlsw': ['high-school', 'college'],
        'nlsm': ['high-school', 'college'],
        'nlsy': ['high-school-noncompletion', 'high-school', 'college'],
        'psid': ['high-school-noncompletion', 'high-school', 'college'],
    }
    if dataset_name not in stage_counts:
        raise KeyError(f"unknown dataset_name: {dataset_name!r}")
    n_stages = stage_counts[dataset_name]

    base_levels = {
        'education_level': education_levels[dataset_name],
        'first_job_income_quantile': [q + 1 for q in range(5)],
        'prev_stage': [f's_cluster_{k}' for k in range(cluster_counts[dataset_name])],
    }

    # Put the requested reference level first for each predictor, so that
    # ``drop_first=True`` drops exactly that level. Validated up front so a
    # bad reference fails before any model is fit.
    cat_dict = {}
    for var in ref_var_list:
        ref_level = ref_groups[var]
        if ref_level not in base_levels[var]:
            raise ValueError(
                f"reference {ref_level!r} is not a level of {var!r}: {base_levels[var]}"
            )
        cat_dict[var] = [ref_level] + [lv for lv in base_levels[var] if lv != ref_level]

    # Reference outcome: the 'a_' prefix makes it sort first, which makes it
    # the base category of the multinomial logit.
    ref_dv = ref_groups['dv_stage']
    ref_dv_alias = 'a_' + ref_dv

    # --- Fit one model per stage ---------------------------------------
    models_dict = {}
    class_labels = {}    # stage -> original outcome labels in model column order
    design_columns = {}  # stage -> training design-matrix columns
    for stage_no in range(2, n_stages + 1):
        stage_key = f'stage{stage_no}'
        stage_df = _read_stage_frame(dataset_name, stage_no).dropna()

        train_X_sd = _build_design_matrix(
            stage_df.iloc[:, :-1], cat_dict, cat_var_list, plain_var_list
        )
        train_y = stage_df['dv_stage'].replace({ref_dv: ref_dv_alias})

        mlogit_res = sm.MNLogit(train_y, train_X_sd).fit(method='bfgs')
        models_dict[stage_key] = mlogit_res
        design_columns[stage_key] = list(train_X_sd.columns)
        # The model's outcome columns follow the sorted distinct labels of
        # train_y; remember them (with the alias undone) so predicted column
        # indices can be translated back to real cluster names.
        class_labels[stage_key] = [
            ref_dv if label == ref_dv_alias else label
            for label in sorted(train_y.unique())
        ]

    # --- Chain predictions through the stages --------------------------
    # Stage 1 is the observed previous stage recorded in the stage-2 file.
    seed_df = _read_stage_frame(dataset_name, 2).dropna()
    pred_cluster_info = seed_df[['prev_stage']].rename(columns={'prev_stage': 'stage1'})

    for stage_no in range(2, n_stages + 1):
        stage_key = f'stage{stage_no}'
        stage_df = _read_stage_frame(dataset_name, stage_no)
        # Replace the observed previous stage with the predicted one (aligned
        # on person_id); people without a prediction become NaN and are dropped.
        stage_df['prev_stage'] = pred_cluster_info[f'stage{stage_no - 1}']
        stage_df = stage_df.dropna()

        pred_X_sd = _build_design_matrix(
            stage_df.iloc[:, :-1], cat_dict, cat_var_list, plain_var_list,
            columns=design_columns[stage_key],
        )
        probabilities = np.asarray(models_dict[stage_key].predict(pred_X_sd))
        labels = np.asarray(class_labels[stage_key], dtype=object)
        pred_cluster_info[stage_key] = pd.Series(
            labels[probabilities.argmax(1)], index=pred_X_sd.index
        )

    # --- Assemble the per-person sequences -----------------------------
    pred_seq_dict = []
    for pid, row in pred_cluster_info.iterrows():
        known = row.dropna()
        pred_seq_dict.append({
            "person_id": pid,
            "predicted_stage_seq": [
                {"stage_name": stage_name, "career_cluster": cluster}
                for stage_name, cluster in known.items()
            ],
        })

    return pred_seq_dict