In [1]:
import pandas as pd


csv_file = 'dataset\preprocessed\data_grade1.csv'  
df = pd.read_csv(csv_file)
df = df.drop(columns=['STUID'])
print(df)

csv_file2 = 'dataset\preprocessed\data_grade2.csv'  
df2 = pd.read_csv(csv_file2)
df2 = df2.drop(columns=['STUID'])

csv_file3 = 'dataset\preprocessed\data_grade3.csv'  
df3 = pd.read_csv(csv_file3)
df3 = df3.drop(columns=['STUID'])


      ST0  ST14  ST15  ST1_1  ST1_2  ST2_3  ST2_4  ST2_5  ST2_6  ST3  ...  \
0       1  48.0     0      2      4      4      3      4      0    1  ...   
1       1  45.0     1      2      4      1      1      3      3    2  ...   
2       1  48.0     4      1      2      0      8      1      2    2  ...   
3       0  44.0     2      2      5      4      1      3      0    2  ...   
4       0  50.0     0      2      3      0      4      3      4    2  ...   
...   ...   ...   ...    ...    ...    ...    ...    ...    ...  ...  ...   
1927    1  55.0     4      2      3      1      8      3      4    1  ...   
1928    1  35.0     4      2      3      3      8      4      5    0  ...   
1929    1  62.0     0      2      2      1      3      3      1    2  ...   
1930    1  43.0     0      2      3      2      3      4      8    3  ...   
1931    1  50.0     4      3      3      3      0      8      3    0  ...   

      DB7_5  DB7_6  DB7_7  DB7_8  KOR_S  ENG_S  MATH_S  KOR_HIGH  ENG_HIGH 

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, cohen_kappa_score, confusion_matrix

def model_predict(model, df, subject):
    
    # 과목별 학업성취도 (0,1) 변수를 예측 변수 y로 지정 
    y = df[subject]
    # 과목별 원점수 및 학업성취도(0,1) 변수 제거 후 나머지 변수 모두 예측을 위한 변수 x값으로 지정
    X = df.drop(columns=['KOR_HIGH', 'ENG_HIGH', 'MATH_HIGH', 'KOR_S', 'ENG_S', 'MATH_S'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    accuracy = accuracy_score(y_test, y_pred)
    sensitivity = tp / (tp + fn)  
    specificity = tn / (tn + fp)  
    kappa = cohen_kappa_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)

    return accuracy, sensitivity, specificity, kappa, auc

In [30]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

svm_model = SVC(probability=True, random_state=42)
dc_model = DecisionTreeClassifier(random_state=42)
nn_model = MLPClassifier(random_state=42)
knn_model = KNeighborsClassifier()
rf_model = RandomForestClassifier(random_state=42)
ada_model = AdaBoostClassifier(random_state=42)
xgb_model = XGBClassifier(random_state= 42)
grade_list = [df, df2, df3]
subject_list = ['KOR_HIGH', 'ENG_HIGH', 'MATH_HIGH']

acc_sum = 0
sen_sum = 0
spe_sum = 0
kap_sum = 0
auc_sum = 0

# 1,2,3 학년별로
for grade in grade_list:
    # 과목별로
    for subject in subject_list:
        # 5가지 점수 계산, model_predict()에 첫번째 변수에 모델명 수정시 해당 모델로 계산 진행됨
        accuracy, sensitivity, specificity, kappa, auc = model_predict(rf_model, grade, subject)
        print(f"Subject: {subject}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Sensitvity: {sensitivity:.4f}")
        print(f"Specificity: {specificity:.4f}")
        print(f"Kappa: {kappa:.4f}")
        print(f"AUC: {auc:.4f} \n")
        acc_sum += accuracy
        sen_sum += sensitivity
        spe_sum += specificity
        kap_sum += kappa
        auc_sum += auc

print(f"avg_Accuracy: {acc_sum/9:.4f}")
print(f"avg_Sensitvity: {sen_sum/9:.4f}")
print(f"avg_Specificity: {spe_sum/9:.4f}")
print(f"avg_Kappa: {kap_sum/9:.4f}")
print(f"avg_AUC: {auc_sum/9:.4f} \n")

Subject: KOR_HIGH
Accuracy: 0.6563
Sensitvity: 0.7418
Specificity: 0.5517
Kappa: 0.2971
AUC: 0.7068 

Subject: ENG_HIGH
Accuracy: 0.7829
Sensitvity: 0.7887
Specificity: 0.7772
Kappa: 0.5659
AUC: 0.8563 

Subject: MATH_HIGH
Accuracy: 0.8165
Sensitvity: 0.8597
Specificity: 0.7590
Kappa: 0.6230
AUC: 0.8836 

Subject: KOR_HIGH
Accuracy: 0.7062
Sensitvity: 0.7860
Specificity: 0.6071
Kappa: 0.3981
AUC: 0.7638 

Subject: ENG_HIGH
Accuracy: 0.8337
Sensitvity: 0.8093
Specificity: 0.8621
Kappa: 0.6675
AUC: 0.8974 

Subject: MATH_HIGH
Accuracy: 0.8109
Sensitvity: 0.7798
Specificity: 0.8416
Kappa: 0.6217
AUC: 0.8749 

Subject: KOR_HIGH
Accuracy: 0.7173
Sensitvity: 0.7736
Specificity: 0.6603
Kappa: 0.4342
AUC: 0.8036 

Subject: ENG_HIGH
Accuracy: 0.8195
Sensitvity: 0.7731
Specificity: 0.8683
Kappa: 0.6397
AUC: 0.8966 

Subject: MATH_HIGH
Accuracy: 0.8100
Sensitvity: 0.6935
Specificity: 0.9144
Kappa: 0.6146
AUC: 0.8753 

avg_Accuracy: 0.7726
avg_Sensitvity: 0.7784
avg_Specificity: 0.7602
avg_Kappa: 

Feature selection

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm


def feature_importance(df, subject, least, most):

    y = df[subject]
    X = df.drop(columns=['KOR_HIGH', 'ENG_HIGH', 'MATH_HIGH', 'KOR_S', 'ENG_S', 'MATH_S'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    full_score = accuracy_score(y_test, y_pred)


    best = 0
    importances = model.feature_importances_
    for i in range (least, most):
        top_features = np.argsort(importances)[-i:]
        # print(top_features)

        X_train_selected = X_train.iloc[:, top_features]
        X_test_selected = X_test.iloc[:, top_features]

        model_selected = RandomForestClassifier(random_state=42)
        model_selected.fit(X_train_selected, y_train)
        y_pred_selected = model_selected.predict(X_test_selected)
        selected_score = accuracy_score(y_test, y_pred_selected)

        # print("Full features score:", full_score)
        # print("Selected features score:", selected_score)
        score = full_score - selected_score
        # print(score)

        if (score < best):
            best = score
            best_i = i
            best_features = X_train.columns[top_features]
            
    # print(best, best_i)
    
    return best, best_i, best_features

In [27]:
print(feature_importance(df, "KOR_HIGH", 3, 667))
print(feature_importance(df, 'ENG_HIGH', 3, 667))
print(feature_importance(df, 'MATH_HIGH', 3, 667))
print(feature_importance(df2, "KOR_HIGH", 3, 667))
print(feature_importance(df2, 'ENG_HIGH', 3, 667))
print(feature_importance(df2, 'MATH_HIGH', 3, 667))
print(feature_importance(df3, "KOR_HIGH", 3, 667))
print(feature_importance(df3, 'ENG_HIGH', 3, 667))
print(feature_importance(df3, 'MATH_HIGH', 3, 667))

(-0.06201550387596899, 19, Index(['ST11_5', 'DB2_3W', 'DB7_1', 'DB7_3', 'ST59_6', 'ST14', 'PA20',
       'ST12_2_1', 'ST2_4', 'PA2', 'ST35K_3', 'ST35E_3', 'ST32M_3', 'ST53',
       'ST12_1_1', 'ST0', 'ST30_2', 'ST30_3', 'ST30_1'],
      dtype='object'))
(-0.03875968992248069, 164, Index(['ST23_19', 'ST37_8', 'ST29_1', 'ST36M_3', 'ST28_1', 'ST13C_3', 'ST27_6',
       'ST32M_5', 'ST36K_2', 'ST36E_2',
       ...
       'ST32E_1', 'ST31_2', 'ST35E_1', 'PA5', 'ST35E_2', 'ST53', 'PA2', 'PA6',
       'ST30_3', 'ST30_2'],
      dtype='object', length=164))
(-0.03100775193798455, 289, Index(['TR16_7', 'ST38_11', 'ST18_1', 'ST34M_8', 'ST11_9', 'TR28_9', 'ST34M_5',
       'ST13A_6', 'TR26_1', 'ST34E_1',
       ...
       'ST35E_2', 'ST53', 'ST39_2', 'ST31_3', 'PA2', 'ST39_3', 'PA5', 'ST30_2',
       'PA6', 'ST30_3'],
      dtype='object', length=289))
(-0.031890660592255204, 613, Index(['PR14_14', 'ST39_4', 'ST10_1', 'ST12_1', 'TR30_1', 'ST56A_9', 'PR10',
       'PA1A_4', 'TR16_5', 'PA10',
      

Model Ensemble

In [31]:
# adaboost
acc_sum = 0
sen_sum = 0
spe_sum = 0
kap_sum = 0
auc_sum = 0

for grade in grade_list:
    for subject in subject_list:
        accuracy, sensitivity, specificity, kappa, auc = model_predict(ada_model, grade, subject)
        print(f"Subject: {subject}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Sensitvity: {sensitivity:.4f}")
        print(f"Specificity: {specificity:.4f}")
        print(f"Kappa: {kappa:.4f}")
        print(f"AUC: {auc:.4f} \n")
        acc_sum += accuracy
        sen_sum += sensitivity
        spe_sum += specificity
        kap_sum += kappa
        auc_sum += auc

print(f"avg_Accuracy: {acc_sum/9:.4f}")
print(f"avg_Sensitvity: {sen_sum/9:.4f}")
print(f"avg_Specificity: {spe_sum/9:.4f}")
print(f"avg_Kappa: {kap_sum/9:.4f}")
print(f"avg_AUC: {auc_sum/9:.4f} \n")



Subject: KOR_HIGH
Accuracy: 0.6770
Sensitvity: 0.6808
Specificity: 0.6724
Kappa: 0.3511
AUC: 0.7269 





Subject: ENG_HIGH
Accuracy: 0.7494
Sensitvity: 0.7784
Specificity: 0.7202
Kappa: 0.4986
AUC: 0.8375 





Subject: MATH_HIGH
Accuracy: 0.8165
Sensitvity: 0.8145
Specificity: 0.8193
Kappa: 0.6286
AUC: 0.8832 





Subject: KOR_HIGH
Accuracy: 0.6879
Sensitvity: 0.7366
Specificity: 0.6276
Kappa: 0.3658
AUC: 0.7563 





Subject: ENG_HIGH
Accuracy: 0.8178
Sensitvity: 0.8432
Specificity: 0.7882
Kappa: 0.6327
AUC: 0.8949 





Subject: MATH_HIGH
Accuracy: 0.7927
Sensitvity: 0.8119
Specificity: 0.7738
Kappa: 0.5855
AUC: 0.8821 





Subject: KOR_HIGH
Accuracy: 0.7387
Sensitvity: 0.8160
Specificity: 0.6603
Kappa: 0.4768
AUC: 0.8004 





Subject: ENG_HIGH
Accuracy: 0.8290
Sensitvity: 0.8148
Specificity: 0.8439
Kappa: 0.6581
AUC: 0.8946 





Subject: MATH_HIGH
Accuracy: 0.8147
Sensitvity: 0.7688
Specificity: 0.8559
Kappa: 0.6270
AUC: 0.8754 

avg_Accuracy: 0.7693
avg_Sensitvity: 0.7850
avg_Specificity: 0.7513
avg_Kappa: 0.5360
avg_AUC: 0.8390 



In [32]:
# xgboost
acc_sum = 0
sen_sum = 0
spe_sum = 0
kap_sum = 0
auc_sum = 0

for grade in grade_list:
    for subject in subject_list:
        accuracy, sensitivity, specificity, kappa, auc = model_predict(xgb_model, grade, subject)
        print(f"Subject: {subject}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Sensitvity: {sensitivity:.4f}")
        print(f"Specificity: {specificity:.4f}")
        print(f"Kappa: {kappa:.4f}")
        print(f"AUC: {auc:.4f} \n")
        acc_sum += accuracy
        sen_sum += sensitivity
        spe_sum += specificity
        kap_sum += kappa
        auc_sum += auc

print(f"avg_Accuracy: {acc_sum/9:.4f}")
print(f"avg_Sensitvity: {sen_sum/9:.4f}")
print(f"avg_Specificity: {spe_sum/9:.4f}")
print(f"avg_Kappa: {kap_sum/9:.4f}")
print(f"avg_AUC: {auc_sum/9:.4f} \n")

Subject: KOR_HIGH
Accuracy: 0.6744
Sensitvity: 0.7183
Specificity: 0.6207
Kappa: 0.3401
AUC: 0.7222 

Subject: ENG_HIGH
Accuracy: 0.7881
Sensitvity: 0.8041
Specificity: 0.7720
Kappa: 0.5762
AUC: 0.8569 

Subject: MATH_HIGH
Accuracy: 0.8243
Sensitvity: 0.8281
Specificity: 0.8193
Kappa: 0.6435
AUC: 0.8880 

Subject: KOR_HIGH
Accuracy: 0.6834
Sensitvity: 0.7654
Specificity: 0.5816
Kappa: 0.3514
AUC: 0.7592 

Subject: ENG_HIGH
Accuracy: 0.8428
Sensitvity: 0.8517
Specificity: 0.8325
Kappa: 0.6840
AUC: 0.9231 

Subject: MATH_HIGH
Accuracy: 0.8292
Sensitvity: 0.8303
Specificity: 0.8281
Kappa: 0.6583
AUC: 0.8946 

Subject: KOR_HIGH
Accuracy: 0.7340
Sensitvity: 0.8019
Specificity: 0.6651
Kappa: 0.4674
AUC: 0.8204 

Subject: ENG_HIGH
Accuracy: 0.8290
Sensitvity: 0.8056
Specificity: 0.8537
Kappa: 0.6582
AUC: 0.9088 

Subject: MATH_HIGH
Accuracy: 0.7957
Sensitvity: 0.7437
Specificity: 0.8423
Kappa: 0.5885
AUC: 0.8783 

avg_Accuracy: 0.7779
avg_Sensitvity: 0.7943
avg_Specificity: 0.7573
avg_Kappa: 

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, cohen_kappa_score, roc_auc_score
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

def model_voting(df, subject, model_names=None, num_models=3):

    y = df[subject]
    X = df.drop(columns=['KOR_HIGH', 'ENG_HIGH', 'MATH_HIGH', 'KOR_S', 'ENG_S', 'MATH_S'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    all_models = {
        'svm': SVC(probability=True, random_state=42),
        'decision_tree': DecisionTreeClassifier(random_state=42),
        'random_forest': RandomForestClassifier(random_state=42),
        'knn': KNeighborsClassifier(),
        'mlp': MLPClassifier(random_state=42, max_iter=500),
        'adaboost': AdaBoostClassifier(random_state=42),
        'xgboost': XGBClassifier(random_state=42)
    }

    if model_names is None:
        model_names = list(all_models.keys())  
    selected_models = model_names[:num_models]  

    estimators = [(name, all_models[name]) for name in selected_models]

    voting_clf = VotingClassifier(
        estimators=estimators,
        voting='soft'  
    )

    voting_clf.fit(X_train, y_train)

    y_pred = voting_clf.predict(X_test)
    y_pred_proba = voting_clf.predict_proba(X_test)[:, 1]  

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    accuracy = accuracy_score(y_test, y_pred)
    sensitivity = tp / (tp + fn) 
    specificity = tn / (tn + fp)
    kappa = cohen_kappa_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)

    return accuracy, sensitivity, specificity, kappa, auc


In [None]:
acc_sum = 0
sen_sum = 0
spe_sum = 0
kap_sum = 0
auc_sum = 0

for grade in grade_list:
    for subject in subject_list:
        accuracy, sensitivity, specificity, kappa, auc = model_voting(grade, subject, ['svm', 'decision_tree', 'random_forest', 'knn', 'mlp', 'adaboost', 'xgboost'],7)
        print(f"Subject: {subject}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Sensitvity: {sensitivity:.4f}")
        print(f"Specificity: {specificity:.4f}")
        print(f"Kappa: {kappa:.4f}")
        print(f"AUC: {auc:.4f} \n")
        acc_sum += accuracy
        sen_sum += sensitivity
        spe_sum += specificity
        kap_sum += kappa
        auc_sum += auc

print(f"avg_Accuracy: {acc_sum/9:.4f}")
print(f"avg_Sensitvity: {sen_sum/9:.4f}")
print(f"avg_Specificity: {spe_sum/9:.4f}")
print(f"avg_Kappa: {kap_sum/9:.4f}")
print(f"avg_AUC: {auc_sum/9:.4f} \n")



Subject: KOR_HIGH
Accuracy: 0.6641
Sensitvity: 0.8075
Specificity: 0.4885
Kappa: 0.3037
AUC: 0.6910 





Subject: ENG_HIGH
Accuracy: 0.7752
Sensitvity: 0.8196
Specificity: 0.7306
Kappa: 0.5503
AUC: 0.8489 





Subject: MATH_HIGH
Accuracy: 0.8295
Sensitvity: 0.8597
Specificity: 0.7892
Kappa: 0.6508
AUC: 0.9037 





Subject: KOR_HIGH
Accuracy: 0.6743
Sensitvity: 0.7531
Specificity: 0.5765
Kappa: 0.3334
AUC: 0.7471 





Subject: ENG_HIGH
Accuracy: 0.8269
Sensitvity: 0.8432
Specificity: 0.8079
Kappa: 0.6516
AUC: 0.8994 





Subject: MATH_HIGH
Accuracy: 0.8041
Sensitvity: 0.7752
Specificity: 0.8326
Kappa: 0.6080
AUC: 0.8846 





Subject: KOR_HIGH
Accuracy: 0.7150
Sensitvity: 0.7736
Specificity: 0.6555
Kappa: 0.4294
AUC: 0.7915 





Subject: ENG_HIGH
Accuracy: 0.8314
Sensitvity: 0.8102
Specificity: 0.8537
Kappa: 0.6629
AUC: 0.9037 





Subject: MATH_HIGH
Accuracy: 0.8242
Sensitvity: 0.7638
Specificity: 0.8784
Kappa: 0.6456
AUC: 0.8945 

avg_Accuracy: 0.7716
avg_Sensitvity: 0.8007
avg_Specificity: 0.7348
avg_Kappa: 0.5373
avg_AUC: 0.8405 



In [35]:
from sklearn.metrics import accuracy_score, confusion_matrix, cohen_kappa_score, roc_auc_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

def model_stacking(df, subject, model_names, num_models=3):

    y = df[subject]
    X = df.drop(columns=['KOR_HIGH', 'ENG_HIGH', 'MATH_HIGH', 'KOR_S', 'ENG_S', 'MATH_S'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    all_models = {
        'svm': SVC(probability=True, random_state=42),
        'decision_tree': DecisionTreeClassifier(random_state=42),
        'random_forest': RandomForestClassifier(random_state=42),
        'knn': KNeighborsClassifier(),
        'mlp': MLPClassifier(random_state=42, max_iter=500),
        'adaboost': AdaBoostClassifier(random_state=42),
        'xgboost': XGBClassifier(random_state=42)
    }

    selected_models = model_names[:num_models] 

    base_estimators = [(name, all_models[name]) for name in selected_models]

    meta_model = LogisticRegression()

    stacking_clf = StackingClassifier(
        estimators=base_estimators,
        final_estimator=meta_model,
        cv=5  
    )

    stacking_clf.fit(X_train, y_train)

    y_pred = stacking_clf.predict(X_test)
    y_pred_proba = stacking_clf.predict_proba(X_test)[:, 1]  

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    accuracy = accuracy_score(y_test, y_pred)
    sensitivity = tp / (tp + fn)  
    specificity = tn / (tn + fp)
    kappa = cohen_kappa_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)

    return accuracy, sensitivity, specificity, kappa, auc


In [None]:
acc_sum = 0
sen_sum = 0
spe_sum = 0
kap_sum = 0
auc_sum = 0

for grade in grade_list:
    for subject in subject_list:
        accuracy, sensitivity, specificity, kappa, auc = model_stacking(grade, subject, ['svm', 'decision_tree', 'random_forest', 'knn', 'mlp', 'adaboost', 'xgboost'], 7)
        print(f"Subject: {subject}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Sensitvity: {sensitivity:.4f}")
        print(f"Specificity: {specificity:.4f}")
        print(f"Kappa: {kappa:.4f}")
        print(f"AUC: {auc:.4f} \n")
        acc_sum += accuracy
        sen_sum += sensitivity
        spe_sum += specificity
        kap_sum += kappa
        auc_sum += auc

print(f"avg_Accuracy: {acc_sum/9:.4f}")
print(f"avg_Sensitvity: {sen_sum/9:.4f}")
print(f"avg_Specificity: {spe_sum/9:.4f}")
print(f"avg_Kappa: {kap_sum/9:.4f}")
print(f"avg_AUC: {auc_sum/9:.4f} \n")



Subject: KOR_HIGH
Accuracy: 0.6744
Sensitvity: 0.7183
Specificity: 0.6207
Kappa: 0.3401
AUC: 0.7280 





Subject: ENG_HIGH
Accuracy: 0.7959
Sensitvity: 0.8093
Specificity: 0.7824
Kappa: 0.5917
AUC: 0.8664 





Subject: MATH_HIGH
Accuracy: 0.8320
Sensitvity: 0.8462
Specificity: 0.8133
Kappa: 0.6579
AUC: 0.9086 





Subject: KOR_HIGH
Accuracy: 0.6902
Sensitvity: 0.7695
Specificity: 0.5918
Kappa: 0.3657
AUC: 0.7782 





Subject: ENG_HIGH
Accuracy: 0.8405
Sensitvity: 0.8390
Specificity: 0.8424
Kappa: 0.6799
AUC: 0.9144 





Subject: MATH_HIGH
Accuracy: 0.8314
Sensitvity: 0.8028
Specificity: 0.8597
Kappa: 0.6627
AUC: 0.8944 





Subject: KOR_HIGH
Accuracy: 0.7292
Sensitvity: 0.7972
Specificity: 0.6603
Kappa: 0.4579
AUC: 0.8197 





Subject: ENG_HIGH
Accuracy: 0.8432
Sensitvity: 0.8241
Specificity: 0.8634
Kappa: 0.6866
AUC: 0.9157 





Subject: MATH_HIGH
Accuracy: 0.8195
Sensitvity: 0.7437
Specificity: 0.8874
Kappa: 0.6354
AUC: 0.8900 

avg_Accuracy: 0.7840
avg_Sensitvity: 0.7944
avg_Specificity: 0.7690
avg_Kappa: 0.5642
avg_AUC: 0.8573 



Stacking + feature selection

In [40]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, cohen_kappa_score, roc_auc_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

def model_stacking_with_feature_selection(df, subject, model_names, num_models, n_features):
    y = df[subject]
    X = df.drop(columns=['KOR_HIGH', 'ENG_HIGH', 'MATH_HIGH', 'KOR_S', 'ENG_S', 'MATH_S'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    importance_model = RandomForestClassifier(random_state=42)
    importance_model.fit(X_train, y_train)
    feature_importances = importance_model.feature_importances_

    important_indices = np.argsort(feature_importances)[-n_features:] 
    important_features = X.columns[important_indices]  

    X_train = X_train[important_features]
    X_test = X_test[important_features]

    all_models = {
        'svm': SVC(probability=True, random_state=42),
        'decision_tree': DecisionTreeClassifier(random_state=42),
        'random_forest': RandomForestClassifier(random_state=42),
        'knn': KNeighborsClassifier(),
        'mlp': MLPClassifier(random_state=42, max_iter=500),
        'adaboost': AdaBoostClassifier(random_state=42),
        'xgboost': XGBClassifier(random_state=42)
    }

    selected_models = model_names[:num_models]

    base_estimators = [(name, all_models[name]) for name in selected_models]

    meta_model = LogisticRegression()

    
    stacking_clf = StackingClassifier(
        estimators=base_estimators,
        final_estimator=meta_model,
        cv=5  
    )

    stacking_clf.fit(X_train, y_train)

    y_pred = stacking_clf.predict(X_test)
    y_pred_proba = stacking_clf.predict_proba(X_test)[:, 1]

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    accuracy = accuracy_score(y_test, y_pred)
    sensitivity = tp / (tp + fn)  
    specificity = tn / (tn + fp)
    kappa = cohen_kappa_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)

    return accuracy, sensitivity, specificity, kappa, auc

In [41]:
n_feature_list = [19, 164, 289, 613, 41, 17, 12, 154, 59]

acc_sum = 0
sen_sum = 0
spe_sum = 0
kap_sum = 0
auc_sum = 0

for grade in grade_list:
    for subject, n_feature in zip(subject_list, n_feature_list):
        accuracy, sensitivity, specificity, kappa, auc = model_stacking_with_feature_selection(grade, subject, ['svm', 'decision_tree', 'random_forest', 'knn', 'mlp', 'adaboost', 'xgboost'], 7, n_feature)
        print(f"Subject: {subject}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Sensitvity: {sensitivity:.4f}")
        print(f"Specificity: {specificity:.4f}")
        print(f"Kappa: {kappa:.4f}")
        print(f"AUC: {auc:.4f} \n")
        acc_sum += accuracy
        sen_sum += sensitivity
        spe_sum += specificity
        kap_sum += kappa
        auc_sum += auc

print(f"avg_Accuracy: {acc_sum/9:.4f}")
print(f"avg_Sensitvity: {sen_sum/9:.4f}")
print(f"avg_Specificity: {spe_sum/9:.4f}")
print(f"avg_Kappa: {kap_sum/9:.4f}")
print(f"avg_AUC: {auc_sum/9:.4f} \n")



Subject: KOR_HIGH
Accuracy: 0.6977
Sensitvity: 0.7653
Specificity: 0.6149
Kappa: 0.3836
AUC: 0.7668 





Subject: ENG_HIGH
Accuracy: 0.7933
Sensitvity: 0.8196
Specificity: 0.7668
Kappa: 0.5865
AUC: 0.8650 





Subject: MATH_HIGH
Accuracy: 0.8450
Sensitvity: 0.8552
Specificity: 0.8313
Kappa: 0.6845
AUC: 0.9129 





Subject: KOR_HIGH
Accuracy: 0.7062
Sensitvity: 0.7613
Specificity: 0.6378
Kappa: 0.4016
AUC: 0.7731 





Subject: ENG_HIGH
Accuracy: 0.8337
Sensitvity: 0.8178
Specificity: 0.8522
Kappa: 0.6670
AUC: 0.9122 





Subject: MATH_HIGH
Accuracy: 0.8292
Sensitvity: 0.8303
Specificity: 0.8281
Kappa: 0.6583
AUC: 0.8970 





Subject: KOR_HIGH
Accuracy: 0.7506
Sensitvity: 0.7972
Specificity: 0.7033
Kappa: 0.5008
AUC: 0.8282 





Subject: ENG_HIGH
Accuracy: 0.8171
Sensitvity: 0.7778
Specificity: 0.8585
Kappa: 0.6348
AUC: 0.9129 





Subject: MATH_HIGH
Accuracy: 0.8314
Sensitvity: 0.7688
Specificity: 0.8874
Kappa: 0.6598
AUC: 0.8942 

avg_Accuracy: 0.7893
avg_Sensitvity: 0.7992
avg_Specificity: 0.7756
avg_Kappa: 0.5752
avg_AUC: 0.8625 



In [42]:
n_feature_list = [19, 164, 289, 613, 41, 17, 12, 154, 59]

acc_sum = 0
sen_sum = 0
spe_sum = 0
kap_sum = 0
auc_sum = 0

for grade in grade_list:
    for subject, n_feature in zip(subject_list, n_feature_list):
        accuracy, sensitivity, specificity, kappa, auc = model_stacking_with_feature_selection(grade, subject, ['svm', 'decision_tree', 'random_forest', 'knn', 'mlp'], 5, n_feature)
        print(f"Subject: {subject}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Sensitvity: {sensitivity:.4f}")
        print(f"Specificity: {specificity:.4f}")
        print(f"Kappa: {kappa:.4f}")
        print(f"AUC: {auc:.4f} \n")
        acc_sum += accuracy
        sen_sum += sensitivity
        spe_sum += specificity
        kap_sum += kappa
        auc_sum += auc

print(f"avg_Accuracy: {acc_sum/9:.4f}")
print(f"avg_Sensitvity: {sen_sum/9:.4f}")
print(f"avg_Specificity: {spe_sum/9:.4f}")
print(f"avg_Kappa: {kap_sum/9:.4f}")
print(f"avg_AUC: {auc_sum/9:.4f} \n")

Subject: KOR_HIGH
Accuracy: 0.6977
Sensitvity: 0.7840
Specificity: 0.5920
Kappa: 0.3810
AUC: 0.7618 

Subject: ENG_HIGH
Accuracy: 0.8010
Sensitvity: 0.8093
Specificity: 0.7927
Kappa: 0.6020
AUC: 0.8654 

Subject: MATH_HIGH
Accuracy: 0.8450
Sensitvity: 0.8552
Specificity: 0.8313
Kappa: 0.6845
AUC: 0.9070 

Subject: KOR_HIGH
Accuracy: 0.6902
Sensitvity: 0.7449
Specificity: 0.6224
Kappa: 0.3695
AUC: 0.7735 

Subject: ENG_HIGH
Accuracy: 0.8383
Sensitvity: 0.8093
Specificity: 0.8719
Kappa: 0.6768
AUC: 0.9093 

Subject: MATH_HIGH
Accuracy: 0.8223
Sensitvity: 0.8303
Specificity: 0.8145
Kappa: 0.6447
AUC: 0.8937 

Subject: KOR_HIGH
Accuracy: 0.7482
Sensitvity: 0.7925
Specificity: 0.7033
Kappa: 0.4961
AUC: 0.8275 

Subject: ENG_HIGH
Accuracy: 0.8195
Sensitvity: 0.7731
Specificity: 0.8683
Kappa: 0.6397
AUC: 0.9106 

Subject: MATH_HIGH
Accuracy: 0.8361
Sensitvity: 0.7588
Specificity: 0.9054
Kappa: 0.6689
AUC: 0.8922 

avg_Accuracy: 0.7887
avg_Sensitvity: 0.7953
avg_Specificity: 0.7780
avg_Kappa: 