In [90]:
import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from fairlearn.metrics import MetricFrame, equalized_odds_difference
from sklearn.metrics import roc_auc_score
import warnings
from sklearn.exceptions import ConvergenceWarning
from joblib import Parallel, delayed, Memory
import os

from jupyter_server.services.config import ConfigManager

cm = ConfigManager()
cm.update('notebook', {"ServerApp.iopub_msg_rate_limit": 100000})

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
%run helper.ipynb

In [40]:
# Directory holding the LARC extract. Defaults to the original location and can
# be overridden with the LARC_DATA_DIR environment variable so the notebook is
# runnable on machines where the data lives elsewhere.
LARC_DATA_DIR = os.environ.get("LARC_DATA_DIR", "/data0/larc")
LARC_FILE_PREFIX = "LARC Student_LARC_20230125_"


def _read_larc_table(table_name):
    """Read one LARC CSV table (e.g. ``"STDNT_INFO"``) from ``LARC_DATA_DIR``.

    ``low_memory=False`` makes pandas parse each column in one pass instead of
    chunk by chunk.
    """
    csv_path = os.path.join(LARC_DATA_DIR, f"{LARC_FILE_PREFIX}{table_name}.csv")
    return pd.read_csv(csv_path, low_memory=False)


term_info_df = _read_larc_table("STDNT_TERM_INFO")
student_info_df = _read_larc_table("STDNT_INFO")
# class_info_df = _read_larc_table("STDNT_TERM_CLASS_INFO")
transfer_info_df = _read_larc_table("STDNT_TERM_TRNSFR_INFO")

In [97]:
# Print First-Year and Transfer Make Up
# Build the feature table step by step; each add_*_col helper comes from
# outside this cell and receives the previous stage's frame.
relevant_student_df = get_student_df(student_info_df, include_transfer=True, include_nan=False, term_cut="FA 2022")
add_first_gen_df = add_first_gen_col(relevant_student_df)
add_age_df = add_age_col(add_first_gen_df)
add_transfer_credit_df = add_transfer_credits_col(add_age_df, transfer_info_df)
add_max_sat_df = add_max_sat_col(add_transfer_credit_df)
add_major_df = add_major_col(add_max_sat_df, term_info_df)
add_cum_gpa_df = add_cum_gpa_col(add_major_df, term_info_df)
add_credits_taken_df = add_credits_taken_col(add_cum_gpa_df, term_info_df)

# Flag rows with no reported family income. The income breakdown printed by
# this cell contains a blank label, i.e. missing values also arrive as
# empty/whitespace strings, which .isna() alone does not catch.
_income_des = add_credits_taken_df['EST_GROSS_FAM_INC_DES']
add_credits_taken_df['Missing_Income'] = (
    _income_des.isna() | _income_des.astype(str).str.strip().eq('')
).astype(int)

# Intersectional group labels used for the breakdowns below.
add_credits_taken_df['Sex/Race'] = (add_credits_taken_df['STDNT_SEX_SHORT_DES'] + '/' + add_credits_taken_df['STDNT_ETHNC_GRP_SHORT_DES']).astype(str)
add_credits_taken_df['FG/Income'] = (
    add_credits_taken_df['IS_FIRST_GEN'].astype(str).str.strip() + 
    '/' + 
    add_credits_taken_df['EST_GROSS_FAM_INC_DES'].astype(str).str.strip()
)

add_features_df = add_credits_taken_df.copy()

# Group sizes per demographic attribute (groupby drops NaN keys by default).
sex_group = add_features_df.groupby('STDNT_SEX_SHORT_DES').size()
print("---------------SEX----------------")
print(sex_group)
print("---------------RACE---------------")
race_group = add_features_df.groupby("STDNT_ETHNC_GRP_SHORT_DES").size()
print(race_group)

sex_race_group = add_features_df.groupby('Sex/Race').size()
print("---------------SEX/Race------------")
print(sex_race_group)


fg_group = add_features_df.groupby('IS_FIRST_GEN').size()
print("---------------FG-----------------")
print(fg_group)

income_group = add_features_df.groupby("EST_GROSS_FAM_INC_DES").size()
print("--------------Income--------------")
print(income_group)


fg_income_group = add_features_df.groupby("FG/Income").size()
print("---------------FG/Income------------")
print(fg_income_group)

# add_features_df = fill_income_col(add_features_df)
# add_features_df.to_csv('add_features.csv', index=False)

---------------SEX----------------
STDNT_SEX_SHORT_DES
Female    54656
Male      54738
dtype: int64
---------------RACE---------------
STDNT_ETHNC_GRP_SHORT_DES
2 or More      4418
Asian         22280
Black          4645
Hawaiian         50
Hispanic       6361
Native Amr      154
Not Indic      6369
White         65117
dtype: int64
---------------SEX/Race------------
Sex/Race
Female/2 or More      2370
Female/Asian         10254
Female/Black          2689
Female/Hawaiian         21
Female/Hispanic       3323
Female/Native Amr       79
Female/Not Indic      2866
Female/White         33054
Male/2 or More        2048
Male/Asian           12026
Male/Black            1956
Male/Hawaiian           29
Male/Hispanic         3038
Male/Native Amr         75
Male/Not Indic        3503
Male/White           32063
dtype: int64
---------------FG-----------------
IS_FIRST_GEN
0    101573
1      7821
dtype: int64
--------------Income--------------
EST_GROSS_FAM_INC_DES
                       20409
$100,

In [67]:
# reenroll_multiverse_col_names = ["Include_Transfer", "Include_Covid", "Include_Sex", "Include_Race",  "Train_Size", "Handle_Nan", 
#                                "Scaler", "Encoder", "Sampler", "Classifier", "N_Estimators", "Learning_Rate", "C", "AUC", "Equalized_Odds_Difference_Sex/Race",
#                                 "Equalized_Odds_Difference_FG/Income"]
# reenroll_multiverse_df = pd.DataFrame(columns=reenroll_multiverse_col_names)                            

In [102]:
# Build the full feature table from every student record (no cohort filter
# here) and persist it for the modelling cells.
add_first_gen_df = add_first_gen_col(student_info_df)
add_age_df = add_age_col(add_first_gen_df)
add_transfer_credit_df = add_transfer_credits_col(add_age_df, transfer_info_df)
add_max_sat_df = add_max_sat_col(add_transfer_credit_df)
add_major_df = add_major_col(add_max_sat_df, term_info_df)
add_cum_gpa_df = add_cum_gpa_col(add_major_df, term_info_df)
add_credits_taken_df = add_credits_taken_col(add_cum_gpa_df, term_info_df)

# Flag rows with no reported family income *before* the income column is
# filled. Blank/whitespace-only strings count as missing as well as NaN:
# .isna() alone leaves them unflagged.
_income_des = add_credits_taken_df['EST_GROSS_FAM_INC_DES']
add_credits_taken_df['Missing_Income'] = (
    _income_des.isna() | _income_des.astype(str).str.strip().eq('')
).astype(int)

# Intersectional group labels, later used as sensitive features.
add_credits_taken_df['Sex/Race'] = (add_credits_taken_df['STDNT_SEX_SHORT_DES'] + '/' + add_credits_taken_df['STDNT_ETHNC_GRP_SHORT_DES']).astype(str)
add_credits_taken_df['FG/Income'] = (
    add_credits_taken_df['IS_FIRST_GEN'].astype(str).str.strip() + 
    '/' + 
    add_credits_taken_df['EST_GROSS_FAM_INC_DES'].astype(str).str.strip()
)
add_features_df = add_credits_taken_df.copy()
# fill_income_col is an external helper; presumably it fills the missing
# income descriptions -- TODO confirm its strategy.
add_features_df = fill_income_col(add_features_df)
add_features_df.to_csv("add_features.csv", index=False)

In [103]:
# Reload the engineered feature table from add_features.csv so the modelling
# cells can start from the file instead of recomputing the features.
# NOTE(review): a CSV round trip re-infers dtypes, so columns may come back
# with a different dtype than they were written with -- confirm this is fine.
add_features_df = pd.read_csv("add_features.csv", low_memory=False)

In [105]:
# Column order of every multiverse result row (also the CSV header order).
reenroll_multiverse_col_names = [
    # configuration switches
    "Include_Transfer",
    "Include_Covid",
    "Include_Sex",
    "Include_Race",
    "Train_Size",
    "Handle_Nan",
    # pipeline components
    "Scaler",
    "Encoder",
    "Sampler",
    "Classifier",
    # classifier hyper-parameters (only the one matching the classifier is set)
    "N_Estimators",
    "Learning_Rate",
    "C",
    # metrics
    "AUC",
    "Equalized_Odds_Difference_Sex",
    "Equalized_Odds_Difference_Race",
    "Equalized_Odds_Difference_FG",
    "Equalized_Odds_Difference_Income",
    "Equalized_Odds_Difference_Sex/Race",
    "Equalized_Odds_Difference_FG/Income",
]

# Columns pulled from the feature table for modelling. The last two are the
# intersectional labels, which process_combination removes before fitting.
feature_columns = [
    'STDNT_INTL_IND',
    'STDNT_NTV_ENG_SPKR_IND',
    'FIRST_US_PRMNNT_RES_PSTL_5_CD',
    'PRNT_MAX_ED_LVL_DES',
    'FIRST_TERM_ATTND_SHORT_DES',
    'STARTING_AGE',
    'IS_FIRST_GEN',
    'TRANSFER_CREDITS',
    'MAX_SAT_SCR',
    'EST_GROSS_FAM_INC_DES',
    'Missing_Income',
    'FIRST_YR_CUM_GPA',
    'FIRST_YR_TAKEN_CREDITS',
    'HS_GPA',
    'HS_CALC_IND',
    'HS_CHEM_LAB_IND',
    'PGM_1_MAJOR_1_CIP_DES',
    'FIRST_UG_ENTRY_TYP_DES',
    'STDNT_SEX_SHORT_DES',
    'STDNT_ETHNC_GRP_SHORT_DES',
    'Sex/Race',
    'FG/Income',
]


def process_combination(include_transfer, include_covid, handle_nan, train_size, include_sex, include_race, encoder, scaler, sampler, classifier, classifier_params, random_state=None):
    """Fit and score one configuration of the re-enrollment multiverse.

    Reads the module-level ``add_features_df``, ``term_info_df``,
    ``feature_columns`` and ``reenroll_multiverse_col_names`` and appends one
    row to ``reenroll_partial_results.csv`` as a side effect.

    Parameters
    ----------
    include_transfer, include_covid : bool
        Forwarded to the external ``premature`` helper, which returns the
        ``(X, Y)`` pair used here.
    handle_nan : str
        ``'Drop'`` removes every row containing a NaN before splitting; any
        other value imputes with the most frequent value, fitted on the
        training split only.
    train_size : float
        Forwarded to ``train_test_split``.
    include_sex, include_race : bool
        Whether the sex / ethnicity columns stay among the model inputs. They
        are always used as sensitive features for the fairness metrics.
    encoder : transformer
        Applied to the object-dtype columns; other columns pass through.
    scaler, sampler : estimator or None
        ``None`` turns the corresponding pipeline step into a passthrough.
    classifier : estimator
        Must provide ``predict`` and ``predict_proba``. It is modified in
        place by ``set_params(**classifier_params)``.
    classifier_params : dict
        Hyper-parameters applied to ``classifier`` before fitting.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split`` only (samplers and classifiers
        keep their own settings). ``None`` keeps the unseeded behaviour.

    Returns
    -------
    dict
        Configuration labels, the classifier-specific hyper-parameter, the AUC
        and one equalized-odds difference per sensitive attribute.
    """
    # Load and preprocess data
    premature_X, premature_Y = premature(add_features_df, term_info_df, include_transfer=include_transfer, include_covid=include_covid)
    features_df = premature_X[feature_columns].reset_index(drop=True)
    premature_Y = premature_Y.reset_index(drop=True)

    if handle_nan == 'Drop':
        complete_rows = ~features_df.isnull().any(axis=1)
        X_train, X_test, Y_train, Y_test = train_test_split(
            features_df[complete_rows],
            premature_Y[complete_rows],
            train_size=train_size,
            random_state=random_state,
        )
    else:
        # Split first so the imputer only ever learns from the training rows.
        X_train, X_test, Y_train, Y_test = (
            part.reset_index(drop=True)
            for part in train_test_split(features_df, premature_Y, train_size=train_size, random_state=random_state)
        )

        # SimpleImputer returns an object array for a mixed-dtype frame, which
        # would turn every column into `object` and push numeric features
        # through the categorical encoder below. Restore the original dtypes
        # so both NaN strategies feed the same column types to the model.
        original_dtypes = features_df.dtypes.to_dict()
        imputer = SimpleImputer(strategy="most_frequent")
        X_train = pd.DataFrame(
            imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index
        ).astype(original_dtypes)
        X_test = pd.DataFrame(
            imputer.transform(X_test), columns=X_test.columns, index=X_test.index
        ).astype(original_dtypes)

    # Capture the sensitive attributes before any of them is dropped from the
    # model inputs. Insertion order defines the order of the metric columns.
    sensitive_test = {
        "Sex": X_test['STDNT_SEX_SHORT_DES'],
        "Race": X_test['STDNT_ETHNC_GRP_SHORT_DES'],
        "FG": X_test['IS_FIRST_GEN'],
        "Income": X_test['EST_GROSS_FAM_INC_DES'],
        "Sex/Race": X_test['Sex/Race'],
        "FG/Income": X_test['FG/Income'],
    }

    # The intersectional labels are never model inputs; sex and race only
    # when the configuration asks for them.
    excluded_columns = ['Sex/Race', 'FG/Income']
    if not include_sex:
        excluded_columns.append('STDNT_SEX_SHORT_DES')
    if not include_race:
        excluded_columns.append('STDNT_ETHNC_GRP_SHORT_DES')
    X_train = X_train.drop(columns=excluded_columns)
    X_test = X_test.drop(columns=excluded_columns)

    # Encode the object-dtype columns, leave everything else untouched.
    categorical_columns = X_train.select_dtypes(include=['object']).columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', encoder, categorical_columns)
        ],
        remainder='passthrough'
    )

    classifier.set_params(**classifier_params)
    pipe = Pipeline(steps=[
        ("Transformer", preprocessor),
        ("Scaler", scaler if scaler is not None else 'passthrough'),
        ("Sampler", sampler if sampler is not None else 'passthrough'),
        ("Classifier", classifier),
    ])

    # Fit and evaluate
    pipe.fit(X_train, Y_train)
    y_pred = pipe.predict(X_test)
    y_prob = pipe.predict_proba(X_test)[:, 1]

    # Metrics
    auc = roc_auc_score(Y_test, y_prob)
    fairness_metrics = {
        f"Equalized_Odds_Difference_{group_name}": equalized_odds_difference(
            Y_test, y_pred, sensitive_features=group_values
        )
        for group_name, group_values in sensitive_test.items()
    }

    verse_dict = {
        "Include_Transfer": 1 if include_transfer else 0,
        "Include_Covid": 1 if include_covid else 0,
        "Handle_Nan": handle_nan,
        "Train_Size": train_size,
        "Include_Sex": 1 if include_sex else 0,
        "Include_Race": 1 if include_race else 0,
        "Encoder": 'OneHotEncoder' if isinstance(encoder, OneHotEncoder) else 'OrdinalEncoder',
        "Scaler": 'StandardScaler' if scaler is not None else None,
        "Sampler": sampler.__class__.__name__ if sampler is not None else None,
        "Classifier": classifier.__class__.__name__,
    }

    # Record only the hyper-parameter that belongs to this classifier family.
    if isinstance(classifier, RandomForestClassifier):
        verse_dict['N_Estimators'] = classifier.n_estimators
    elif isinstance(classifier, GradientBoostingClassifier):
        verse_dict['Learning_Rate'] = classifier.learning_rate
    elif isinstance(classifier, LogisticRegression):
        verse_dict['C'] = classifier.C

    # Add the AUC and equalized odds differences after classifier parameters
    verse_dict["AUC"] = auc
    verse_dict.update(fairness_metrics)

    # Ensure that the persisted row has the full, consistently ordered columns
    result_df = pd.DataFrame([verse_dict], columns=reenroll_multiverse_col_names)

    # Save the result after processing each combination.
    # NOTE(review): when several workers start at once, more than one of them
    # can see the file as missing and write a header row; de-duplicate header
    # lines when reading the partial results back.
    save_header = not os.path.isfile('reenroll_partial_results.csv')
    with open('reenroll_partial_results.csv', 'a') as f:
        result_df.to_csv(f, header=save_header, index=False)
    return verse_dict

# Parallel execution
def _iter_combinations():
    """Yield the positional arguments of process_combination for every cell
    of the multiverse grid (2*2*2*2*2*2 switches x 2 encoders x 2 scalers x
    3 samplers x 9 classifier settings = 6912 runs).

    Estimator lists are rebuilt inside the loops, exactly as a nested
    comprehension would re-evaluate them.
    """
    for include_transfer in [True, False]:
        for include_covid in [True, False]:
            for handle_nan in ["Impute", "Drop"]:
                for train_size in [0.7, 0.8]:
                    for include_sex in [True, False]:
                        for include_race in [True, False]:
                            for encoder in [OneHotEncoder(handle_unknown='ignore'), OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)]:
                                for scaler in [StandardScaler(with_mean=False), None]:
                                    for sampler in [SMOTE(), NearMiss(), None]:
                                        for classifier, classifier_params in [
                                            (RandomForestClassifier(), {"n_estimators": 50}),
                                            (RandomForestClassifier(), {"n_estimators": 100}),
                                            (RandomForestClassifier(), {"n_estimators": 150}),
                                            (GradientBoostingClassifier(), {"learning_rate": 0.01}),
                                            (GradientBoostingClassifier(), {"learning_rate": 0.1}),
                                            (GradientBoostingClassifier(), {"learning_rate": 1}),
                                            (LogisticRegression(max_iter=3500, solver='saga'), {"C": 0.01}),
                                            (LogisticRegression(max_iter=3500, solver='saga'), {"C": 0.1}),
                                            (LogisticRegression(max_iter=3500, solver='saga'), {"C": 1}),
                                        ]:
                                            yield (
                                                include_transfer, include_covid, handle_nan, train_size,
                                                include_sex, include_race, encoder, scaler, sampler,
                                                classifier, classifier_params,
                                            )


# One task per grid cell, spread over all available cores.
results = Parallel(n_jobs=-1, backend="multiprocessing")(
    delayed(process_combination)(*combination_args)
    for combination_args in _iter_combinations()
)

# Convert results to DataFrame and save to CSV
reenroll_multiverse_df = pd.DataFrame(results)
reenroll_multiverse_df.to_csv("reenroll_multiverse_result.csv", index=False)

In [26]:
 for handle_nan in ["Impute"]:
     # Scratch cell: relies on features_df / premature_Y already existing in
     # the kernel; neither is assigned at notebook level in the visible cells.
     if handle_nan != 'Drop':
         # Most-frequent imputation fitted on the whole feature frame
         # (i.e. before any train/test split).
         imputer = SimpleImputer(strategy="most_frequent")
         X_after_nan = pd.DataFrame(
             imputer.fit_transform(features_df),
             columns=features_df.columns,
             index=features_df.index,
         )
         Y_after_nan = premature_Y
     else:
         # Keep only the rows without any missing value.
         complete_rows = features_df.notnull().all(axis=1)
         X_after_nan = features_df[complete_rows]
         Y_after_nan = premature_Y[complete_rows]

In [27]:
# Scratch cell: after the loops finish, only the last configuration
# (train_size=0.8, include_sex=True, include_race=True) is left in the
# *_final variables.
for train_size in [0.7, 0.8]:
    # Split, then give all four parts a fresh 0..n-1 index.
    X_train, X_test, Y_train, Y_test = (
        part.reset_index(drop=True)
        for part in train_test_split(X_after_nan, Y_after_nan, train_size=train_size)
    )
    # Sensitive attributes are taken before either column can be dropped.
    sex_test = X_test['STDNT_SEX_SHORT_DES']
    race_test = X_test['STDNT_ETHNC_GRP_SHORT_DES']

    for include_sex in [False, True]:
        if include_sex:
            X_train_after_sex, X_test_after_sex = X_train, X_test
        else:
            X_train_after_sex = X_train.drop(columns=['STDNT_SEX_SHORT_DES'])
            X_test_after_sex = X_test.drop(columns=['STDNT_SEX_SHORT_DES'])

        for include_race in [False, True]:
            if include_race:
                X_train_final, X_test_final = X_train_after_sex, X_test_after_sex
            else:
                X_train_final = X_train_after_sex.drop(columns=['STDNT_ETHNC_GRP_SHORT_DES'])
                X_test_final = X_test_after_sex.drop(columns=['STDNT_ETHNC_GRP_SHORT_DES'])

In [47]:
# Scratch cell: install one fixed configuration into an existing `pipe`.
# NOTE(review): `pipe` and `features_df` must already exist in the kernel;
# neither is created at notebook level in the visible cells.
encoder = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler(with_mean=False)
sampler = SMOTE()
classifier = GradientBoostingClassifier()

# Encode the object-dtype columns and leave the rest of the columns as they are.
categorical_columns = features_df.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer(
    transformers=[('cat', encoder, categorical_columns)],
    remainder='passthrough',
)

# Replace the pipeline steps one at a time, in pipeline order.
pipe.set_params(Transformer=preprocessor)
pipe.set_params(Scaler=scaler)
pipe.set_params(Sampler=sampler)
pipe.set_params(Classifier=classifier)

In [48]:
# Scratch cell: sweep the number of estimators of the classifier installed in
# `pipe` and append one result row per setting.
# NOTE(review): relies on kernel state from earlier cells (pipe, encoder,
# scaler, sampler, the *_final splits, include_transfer, include_covid and an
# existing reenroll_multiverse_df) -- it will not run on a fresh kernel as is.
for n in [50, 100, 150]:
    pipe.set_params(**{"Classifier__n_estimators": n})
    verse_dict = {
        "Include_Transfer": 1 if include_transfer else 0,
        "Include_Covid": 1 if include_covid else 0,
        "Handle_Nan": handle_nan,
        "Train_Size": train_size,
        "Include_Sex": 1 if include_sex else 0,
        "Include_Race": 1 if include_race else 0,
    }

    if isinstance(encoder, OneHotEncoder):
        verse_dict['Encoder'] = 'OneHotEncoder'
    elif isinstance(encoder, OrdinalEncoder):
        verse_dict['Encoder'] = 'OrdinalEncoder'

    if scaler:
        verse_dict['Scaler'] = 'StandardScaler'

    if isinstance(sampler, SMOTE):
        verse_dict['Sampler'] = 'SMOTE'
    elif isinstance(sampler, NearMiss):
        verse_dict['Sampler'] = 'NearMiss'

    verse_dict['Classifier'] = "GradientBoosting"
    # The swept value is n_estimators, so record it under N_Estimators
    # (it used to be stored in the Learning_Rate column by mistake).
    verse_dict['N_Estimators'] = n

    pipe.fit(X_train_final, Y_train)
    y_pred = pipe.predict(X_test_final)
    y_prob = pipe.predict_proba(X_test_final)[:, 1]

    verse_dict['AUC'] = roc_auc_score(y_true=Y_test, y_score=y_prob)
    verse_dict["Equalized_Odds_Difference_Sex"] = equalized_odds_difference(
        y_true=Y_test, y_pred=y_pred, sensitive_features=sex_test
    )
    verse_dict["Equalized_Odds_Difference_Race"] = equalized_odds_difference(
        y_true=Y_test, y_pred=y_pred, sensitive_features=race_test
    )

    # Append inside the loop on purpose: each fit is slow, and rows already
    # computed survive if the cell is interrupted.
    new_row_df = pd.DataFrame([verse_dict])
    reenroll_multiverse_df = pd.concat([reenroll_multiverse_df, new_row_df], ignore_index=True)
print(reenroll_multiverse_df)

KeyboardInterrupt: 

In [30]:
# Debug check: print the first column of X_test_final (built in the
# split/selection scratch cell) to inspect its values and dtype.
print(X_test_final.iloc[:,0])

0        0
1        0
2        0
3        0
4        0
        ..
21876    0
21877    1
21878    0
21879    0
21880    1
Name: STDNT_INTL_IND, Length: 21881, dtype: object


202489    222323
35598     124976
190300     10943
103542    249434
164735    118447
           ...  
51814      74052
34514     124757
191939       496
180844    345495
150054    213247
Name: STDNT_ID, Length: 45938, dtype: object


In [255]:
# Debug check: print the test-set targets equal to 0; the length of the
# printed Series shows how many such rows the test split contains.
print(Y_test[Y_test==0])

113578    0
17349     0
86978     0
200966    0
23233     0
         ..
81958     0
107334    0
46826     0
404045    0
4006      0
Name: IS_REENROLL, Length: 99, dtype: int64


In [94]:
# Columns inspected for completeness: the student id plus the model features.
# NOTE(review): FIRST_YR_CUM_GPA and FIRST_YR_TAKEN_CREDITS are read from
# add_major_df here, although the cells above appear to add them in the later
# add_cum_gpa_col / add_credits_taken_col steps -- confirm that add_major_df
# really carries them on a fresh kernel.
test_columns = [
    'STDNT_ID',
    'STDNT_INTL_IND',
    'STDNT_NTV_ENG_SPKR_IND',
    'FIRST_US_PRMNNT_RES_PSTL_5_CD',
    'PRNT_MAX_ED_LVL_DES',
    'FIRST_TERM_ATTND_SHORT_DES',
    'STARTING_AGE',
    'IS_FIRST_GEN',
    'TRANSFER_CREDITS',
    'MAX_SAT_SCR',
    'EST_GROSS_FAM_INC_DES',
    'FIRST_YR_CUM_GPA',
    'FIRST_YR_TAKEN_CREDITS',
    'HS_GPA',
    'HS_CALC_IND',
    'HS_CHEM_LAB_IND',
    'PGM_1_MAJOR_1_CIP_DES',
    'FIRST_UG_ENTRY_TYP_DES',
]
test = add_major_df[test_columns]

# Show only the rows with no missing value in any selected column.
print(test.dropna())

        STDNT_ID  STDNT_INTL_IND  STDNT_NTV_ENG_SPKR_IND  \
6         352207               0                       0   
7          44685               0                       1   
8         339010               0                       1   
9         301099               0                       0   
10         46601               0                       1   
...          ...             ...                     ...   
198560     86047               0                       0   
198562    383383               0                       1   
198564    395358               0                       1   
198571    290628               0                       1   
198572    105012               0                       1   

       FIRST_US_PRMNNT_RES_PSTL_5_CD     PRNT_MAX_ED_LVL_DES  \
6                              17110  Professional Doctorate   
7                              49085  Professional Doctorate   
8                              22207            Some College   
9                      

In [95]:
# Keep only the fully populated rows of `test` (rebinding the name).
test = test.dropna()
# generate_reenroll_target is an external helper; the output printed below is
# a Series named IS_REENROLL holding 0/1 values, one per remaining row.
target_df = generate_reenroll_target(test, term_info_df)
print(target_df)

6         1
7         1
8         1
9         0
10        1
         ..
198560    1
198562    1
198564    1
198571    1
198572    1
Name: IS_REENROLL, Length: 80155, dtype: int64


In [96]:
# Class balance of the target: entries equal to 1 versus entries equal to 0.
reenrolled_mask = target_df == 1
not_reenrolled_mask = target_df == 0
sd_id_1 = target_df[reenrolled_mask]
sd_id_0 = target_df[not_reenrolled_mask]

print("Number of Re-Enroll First-Year: ", len(sd_id_1))
print("Numer of Not Re-Enroll First-Year: ", len(sd_id_0))
# match = term_info_df[term_info_df['STDNT_ID'] == 367959]
# print(match)

Number of Re-Enroll First-Year:  79035
Numer of Not Re-Enroll First-Year:  1120


In [97]:
# income_df = fill_income_col(relevant_student_df, 'zip_mode')
# print(income_df[income_df['STDNT_ID'] == 250287][['EST_GROSS_FAM_INC_DES']])


In [98]:
# add_age_df = add_age_col(relevant_student_df)
# print(add_age_df[add_age_df['STARTING_AGE'] > 90][['FIRST_TERM_ATTND_BEGIN_YR_MO', 'STDNT_BIRTH_YR']])

In [99]:
# add_transfer_credits_df = add_transfer_credits_col(relevant_student_df, transfer_info_df)
# print(add_transfer_credits_df['TRANSFER_CREDITS'].unique())

In [100]:
# add_max_sat_df = add_max_sat_col(relevant_student_df)
# print(add_max_sat_df['MAX_SAT_SCR'].unique())

In [101]:
# #student_term_info_df
# print(term_info_df[term_info_df['PGM_1_MAJOR_1_CIP_DES'].isna()])
# print("####")

# # print(term_info_df[term_info_df['TERM_CD'] == 2420])

In [None]:
# print(term_info_df[term_info_df['STDNT_ID'] == 413831][['PGM_1_MAJOR_1_CIP_DES']])

In [None]:
# student_info_df

In [None]:
# print(len(student_info_df[(student_info_df['FIRST_UG_ENTRY_TYP_DES'] == 'First-Year') | (student_info_df['FIRST_UG_ENTRY_TYP_DES'] == 'First-Year Assumed')]))

In [None]:
# class_info_df

In [None]:
# transfer_info_df