In [6]:
import lightgbm
import glob, os
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

------
## 1. 전체 석사 졸업 예측

In [7]:
# Collect the master's-program CSV files under ./original_data.
# glob.glob() returns matches in arbitrary order, and the next cell picks the
# files by position, so sort the paths to make that position deterministic.
total_ms_list = sorted(glob.glob(os.path.join(os.getcwd(), 'original_data', '*ms.csv')))
print(total_ms_list)

['f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대_예측\\original_data\\std_info_grad_ms.csv', 'f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대_예측\\original_data\\std_info_ms.csv']


In [8]:
# Read both CSVs in one pass: position 0 is loaded as the graduates table and
# position 1 as the enrolled-students table (std_info_grad_ms.csv and
# std_info_ms.csv in the listing printed by the previous cell).
ms_grad, ms_undergrad = (pd.read_csv(total_ms_list[pos]) for pos in (0, 1))


In [9]:
# Column names as they were before renaming, kept for reference.
'''
ms_grad.columns = 
['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
'학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec014_nation_cd',
'과정', '수료년도', '수료학기', 'rec014_birth_dt', '졸업년도', '졸업학기', 'gpa',
'rec014_school_term', '서류점수', '면접점수', '학부출신', '대학원출신', '인건비총액']
'''
# Rename columns by position so the rec014_-prefixed names become the plain
# 'nation_cd' / 'school_term' used for training.
ms_grad.columns = [
    'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
    '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수',
    'nation_cd', '과정', '수료년도', '수료학기', 'rec014_birth_dt',
    '졸업년도', '졸업학기', 'gpa', 'school_term',
    '서류점수', '면접점수', '학부출신', '대학원출신', '인건비총액', '월별인건비',
]

# Categorical features
grad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']
# Continuous features
grad_num_features = ['gpa', '서류점수', '면접점수', '월별인건비']
# Prediction target
grad_term = ['school_term']

# Cast every categorical feature to pandas' category dtype in one call.
ms_grad = ms_grad.astype(dict.fromkeys(grad_cat_features, 'category'))

### 졸업자 정보에 대하여 Stratified 5-Fold 학습


import numpy as np  # NOTE(review): np is not referenced in the visible cells; kept in case later cells use it

# Scratch preparation of the training data: graduates with a non-zero
# 'school_term' and no missing feature values.
# NOTE(review): the next training cell rebuilds X_grad / y_grad from ms_grad,
# so the result of this cell is overwritten there.
nonzero_term = ms_grad['school_term'] != 0

# .loc with a boolean mask selects by label (the original used .iloc with index
# labels, which only works while the index is the default 0..n-1 range).
X_grad = ms_grad.loc[nonzero_term, grad_cat_features + grad_num_features].dropna().copy()

# Store the float cast of the last feature column; the original computed it
# and discarded the result.
last_feature = X_grad.columns[-1]
X_grad[last_feature] = X_grad[last_feature].astype(float)

# Target rows aligned with the rows that survived dropna().
y_grad = ms_grad.loc[X_grad.index, grad_term]

In [10]:
# Stratified 5-fold cross-validation of a LightGBM classifier that predicts
# 'school_term' for graduated master's students.
model = lightgbm.LGBMClassifier()
skfold = StratifiedKFold(n_splits=5)

cv_accuracy = []

# Keep graduates whose 'school_term' is non-zero. A boolean mask with .loc
# selects by label, so features and target stay aligned whatever the index is;
# .copy() gives an independent frame that can be modified without
# SettingWithCopy warnings.
nonzero_term = ms_grad['school_term'] != 0
X_grad = ms_grad.loc[nonzero_term, grad_cat_features + grad_num_features].copy()
y_grad = ms_grad.loc[nonzero_term, grad_term]

# Cast the last feature column to float and round it to whole numbers.
last_feature = X_grad.columns[-1]
X_grad[last_feature] = X_grad[last_feature].astype(float).round(0)

# enumerate() replaces the manual counter that shadowed the builtin iter().
for fold_no, (train_idx, test_idx) in enumerate(skfold.split(X_grad, y_grad)):
    model.fit(X_grad.iloc[train_idx], y_grad.iloc[train_idx].values.ravel())
    pred = model.predict(X_grad.iloc[test_idx])
    cv_accuracy.append(accuracy_score(y_grad.iloc[test_idx], pred))
    print(f"Fold {fold_no}:\n",classification_report(y_grad.iloc[test_idx], pred, zero_division=0))

print("masters cross validation result : ", cv_accuracy)

# After the loop `model` only holds the fit from the last fold's training
# split. Refit on every graduate so that cells which reuse `model` for
# prediction get a model trained on all available data.
model.fit(X_grad, y_grad.values.ravel())



Fold 0:
               precision    recall  f1-score   support

           2       1.00      0.33      0.50         3
           3       0.50      0.33      0.40         6
           4       0.78      0.87      0.82       590
           5       0.37      0.34      0.35       201
           6       0.20      0.11      0.14        62
           7       0.25      0.15      0.19        26
           8       0.22      0.15      0.18        13
           9       0.00      0.00      0.00         7
          10       0.00      0.00      0.00         3
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         3

    accuracy                           0.66       915
   macro avg       0.30      0.21      0.24       915
weighted avg       0.61      0.66      0.63       915

Fold 1:
               precision    recall  f1-score   support

           2       0.00      0.00      0.00         3
           3       0.80      0.57      0.67         7
      

### 재학자에 대한 예측

In [11]:
# Prepare the enrolled master's students and predict their 'school_term' with
# `model`, which must already have been fitted by the training cell.
'''
ms_undergrad.columns = 
['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
       '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec012_nation_cd',
       '과정', '수료년도', '수료학기', 'rec012_birth_dt', 'gpa', 'rec012_school_term',
       '인건비총액', '서류점수', '면접점수', '학부출신', '대학원출신']
'''
# Rename columns by position so the names match the ones used for training.
ms_undergrad.columns = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
       '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'nation_cd',
       '과정', '수료년도', '수료학기', 'rec012_birth_dt', 'gpa', 'school_term',
       '인건비총액', '서류점수', '면접점수', '학부출신', '대학원출신', '월별인건비']

undergrad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
undergrad_num_features = ['gpa', '서류점수', '면접점수', '월별인건비']       # continuous features
undergrad_term = ['school_term']

for col in undergrad_cat_features:
    ms_undergrad[col] = ms_undergrad[col].astype('category')

# .copy() so the preprocessing below does not write into a slice of ms_undergrad.
X_undergrad = ms_undergrad[undergrad_cat_features + undergrad_num_features].copy()

# Apply the same preprocessing the training cell applies to the last feature
# column (float cast + rounding), so inference sees the values the model was
# trained on.
last_feature = X_undergrad.columns[-1]
X_undergrad[last_feature] = X_undergrad[last_feature].astype(float).round(0)

y_undergrad = pd.DataFrame(model.predict(X_undergrad))

In [12]:
# Attach the predictions as a new column and write the table to disk.
ms_undergrad['predicted_school_term'] = y_undergrad

# Create the output folder first: to_csv() fails if the directory is missing.
output_dir = os.path.join(os.getcwd(), 'prediction_result', 'original_data')
os.makedirs(output_dir, exist_ok=True)
# NOTE(review): the index is written as an extra leading column; left as-is
# because the notebook assigns column names by position.
ms_undergrad.to_csv(os.path.join(output_dir, 'std_info_ms_predicted.csv'))

------
## 2. 전체 박사 졸업 예측

In [13]:
# Collect the PhD-program CSV files under ./original_data.
# glob.glob() returns matches in arbitrary order, and the next cell picks the
# files by position, so sort the paths to make that position deterministic.
total_phd_list = sorted(glob.glob(os.path.join(os.getcwd(), 'original_data', '*phd.csv')))
print(total_phd_list)

['f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대_예측\\original_data\\std_info_grad_phd.csv', 'f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대_예측\\original_data\\std_info_phd.csv']


In [14]:
# Read both CSVs in one pass: position 0 is loaded as the graduates table and
# position 1 as the enrolled-students table (std_info_grad_phd.csv and
# std_info_phd.csv in the listing printed by the previous cell).
phd_grad, phd_undergrad = (pd.read_csv(total_phd_list[pos]) for pos in (0, 1))


In [15]:
# Column names as they were before renaming, kept for reference.
'''
phd_grad.columns = 
['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
'학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec014_nation_cd',
'과정', '수료년도', '수료학기', 'rec014_birth_dt', '졸업년도', '졸업학기', 'gpa',
'rec014_school_term', '서류점수', '면접점수', '학부출신', '대학원출신', '인건비총액']
'''
# Rename columns by position so the rec014_-prefixed names become the plain
# 'nation_cd' / 'school_term' used for training.
phd_grad.columns = [
    'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
    '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수',
    'nation_cd', '과정', '수료년도', '수료학기', 'rec014_birth_dt',
    '졸업년도', '졸업학기', 'gpa', 'school_term',
    '서류점수', '면접점수', '학부출신', '대학원출신', '인건비총액', '월별인건비',
]

# Categorical features
grad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']
# Continuous features
grad_num_features = ['gpa', '서류점수', '면접점수', '월별인건비']
# Prediction target
grad_term = ['school_term']

# Cast every categorical feature to pandas' category dtype in one call.
phd_grad = phd_grad.astype(dict.fromkeys(grad_cat_features, 'category'))

### 졸업자 정보에 대하여 Stratified 5-Fold 학습


In [16]:
# Stratified 5-fold cross-validation of a LightGBM classifier that predicts
# 'school_term' for graduated PhD students.
model = lightgbm.LGBMClassifier()
skfold = StratifiedKFold(n_splits=5)

cv_accuracy = []

# Keep graduates whose 'school_term' is non-zero. A boolean mask with .loc
# selects by label, so features and target stay aligned whatever the index is;
# .copy() gives an independent frame that can be modified without
# SettingWithCopy warnings.
nonzero_term = phd_grad['school_term'] != 0
X_grad = phd_grad.loc[nonzero_term, grad_cat_features + grad_num_features].copy()
y_grad = phd_grad.loc[nonzero_term, grad_term]

# Cast the last feature column to float and round it to whole numbers.
last_feature = X_grad.columns[-1]
X_grad[last_feature] = X_grad[last_feature].astype(float).round(0)

# enumerate() replaces the manual counter that shadowed the builtin iter().
for fold_no, (train_idx, test_idx) in enumerate(skfold.split(X_grad, y_grad)):
    model.fit(X_grad.iloc[train_idx], y_grad.iloc[train_idx].values.ravel())
    pred = model.predict(X_grad.iloc[test_idx])
    cv_accuracy.append(accuracy_score(y_grad.iloc[test_idx], pred))
    print(f"Fold {fold_no}:\n",classification_report(y_grad.iloc[test_idx], pred, zero_division=0))

# The label previously said "masters" (copy-paste from the master's section).
print("phd cross validation result : ", cv_accuracy)

# After the loop `model` only holds the fit from the last fold's training
# split. Refit on every graduate so that cells which reuse `model` for
# prediction get a model trained on all available data.
model.fit(X_grad, y_grad.values.ravel())

Fold 0:
               precision    recall  f1-score   support

           4       0.00      0.00      0.00         3
           5       0.12      0.08      0.10        12
           6       0.41      0.50      0.45        18
           7       0.27      0.32      0.29        22
           8       0.24      0.36      0.29        22
           9       0.12      0.14      0.13        14
          10       0.25      0.20      0.22        15
          11       0.00      0.00      0.00         7
          12       0.00      0.00      0.00         4
          13       0.00      0.00      0.00         3

    accuracy                           0.25       120
   macro avg       0.14      0.16      0.15       120
weighted avg       0.21      0.25      0.23       120

Fold 1:
               precision    recall  f1-score   support

           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00        12
           6       0.19      0.28      0.23        18
      

### 재학자에 대한 예측

In [17]:
# Prepare the enrolled PhD students and predict their 'school_term' with
# `model`, which must already have been fitted by the training cell.
'''
phd_undergrad.columns = 
['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
       '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec012_nation_cd',
       '과정', '수료년도', '수료학기', 'rec012_birth_dt', 'gpa', 'rec012_school_term',
       '서류점수', '면접점수', '학부출신', '대학원출신', '인건비총액']
'''
# Rename columns by position so the names match the ones used for training.
phd_undergrad.columns = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
       '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'nation_cd',
       '과정', '수료년도', '수료학기', 'rec012_birth_dt', 'gpa', 'school_term',
       '서류점수', '면접점수', '학부출신', '대학원출신', '인건비총액', '월별인건비']

undergrad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
undergrad_num_features = ['gpa', '서류점수', '면접점수', '월별인건비']       # continuous features
undergrad_term = ['school_term']

for col in undergrad_cat_features:
    phd_undergrad[col] = phd_undergrad[col].astype('category')

# .copy() so the preprocessing below does not write into a slice of phd_undergrad.
X_undergrad = phd_undergrad[undergrad_cat_features + undergrad_num_features].copy()

# Apply the same preprocessing the training cell applies to the last feature
# column (float cast + rounding), so inference sees the values the model was
# trained on.
last_feature = X_undergrad.columns[-1]
X_undergrad[last_feature] = X_undergrad[last_feature].astype(float).round(0)

y_undergrad = pd.DataFrame(model.predict(X_undergrad))

In [18]:
# Attach the predictions as a new column and write the table to disk.
phd_undergrad['predicted_school_term'] = y_undergrad

# Create the output folder first: to_csv() fails if the directory is missing.
output_dir = os.path.join(os.getcwd(), 'prediction_result', 'original_data')
os.makedirs(output_dir, exist_ok=True)
# NOTE(review): the index is written as an extra leading column; left as-is
# because the notebook assigns column names by position.
phd_undergrad.to_csv(os.path.join(output_dir, 'std_info_phd_predicted.csv'))

------
## 3. 전체 석박통합 졸업 예측

In [19]:
# Collect the combined MS/PhD-program CSV files under ./original_data.
# glob.glob() returns matches in arbitrary order, and the next cell picks the
# files by position, so sort the paths to make that position deterministic.
total_combined_list = sorted(glob.glob(os.path.join(os.getcwd(), 'original_data', '*combined.csv')))
print(total_combined_list)

['f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대_예측\\original_data\\std_info_combined.csv', 'f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대_예측\\original_data\\std_info_grad_combined.csv']


In [20]:
# Read both CSVs in one pass. Here the graduates file sits at position 1 and
# the enrolled-students file at position 0 (std_info_grad_combined.csv and
# std_info_combined.csv in the listing printed by the previous cell).
combined_grad, combined_undergrad = (pd.read_csv(total_combined_list[pos]) for pos in (1, 0))


In [21]:
# Column names as they were before renaming, kept for reference.
'''
combined_grad.columns = 
['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
       '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec014_nation_cd',
       '과정', '수료년도', '수료학기', 'rec014_birth_dt', '졸업년도', '졸업학기', 'gpa',
       'rec014_school_term', '인건비총액', '서류점수', '면접점수', '학부출신', '대학원출신']
'''
# Rename columns by position so the rec014_-prefixed names become the plain
# 'nation_cd' / 'school_term' used for training.
combined_grad.columns = [
    'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
    '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수',
    'nation_cd', '과정', '수료년도', '수료학기', 'rec014_birth_dt',
    '졸업년도', '졸업학기', 'gpa', 'school_term',
    '인건비총액', '서류점수', '면접점수', '학부출신', '대학원출신', '월별인건비',
]

# Categorical features
grad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']
# Continuous features
grad_num_features = ['gpa', '서류점수', '면접점수', '월별인건비']
# Prediction target
grad_term = ['school_term']

# Cast every categorical feature to pandas' category dtype in one call.
combined_grad = combined_grad.astype(dict.fromkeys(grad_cat_features, 'category'))

### 졸업자 정보에 대하여 Stratified 5-Fold 학습


In [22]:
# Stratified 5-fold cross-validation of a LightGBM classifier that predicts
# 'school_term' for graduates of the combined MS/PhD program.
model = lightgbm.LGBMClassifier()
skfold = StratifiedKFold(n_splits=5)

cv_accuracy = []

# Keep graduates whose 'school_term' is non-zero. A boolean mask with .loc
# selects by label, so features and target stay aligned whatever the index is;
# .copy() gives an independent frame that can be modified without
# SettingWithCopy warnings.
nonzero_term = combined_grad['school_term'] != 0
X_grad = combined_grad.loc[nonzero_term, grad_cat_features + grad_num_features].copy()
y_grad = combined_grad.loc[nonzero_term, grad_term]

# Cast the last feature column to float and round it to whole numbers.
last_feature = X_grad.columns[-1]
X_grad[last_feature] = X_grad[last_feature].astype(float).round(0)

# enumerate() replaces the manual counter that shadowed the builtin iter().
for fold_no, (train_idx, test_idx) in enumerate(skfold.split(X_grad, y_grad)):
    model.fit(X_grad.iloc[train_idx], y_grad.iloc[train_idx].values.ravel())
    pred = model.predict(X_grad.iloc[test_idx])
    cv_accuracy.append(accuracy_score(y_grad.iloc[test_idx], pred))
    print(f"Fold {fold_no}:\n",classification_report(y_grad.iloc[test_idx], pred, zero_division=0))

# The label previously said "masters" (copy-paste from the master's section).
print("combined cross validation result : ", cv_accuracy)

# After the loop `model` only holds the fit from the last fold's training
# split. Refit on every graduate so that cells which reuse `model` for
# prediction get a model trained on all available data.
model.fit(X_grad, y_grad.values.ravel())

Fold 0:
               precision    recall  f1-score   support

           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         4
           9       0.29      0.33      0.31         6
          10       0.18      0.29      0.22         7
          11       0.25      0.14      0.18         7
          12       0.30      0.33      0.32         9
          13       0.00      0.00      0.00         5

    accuracy                           0.20        41
   macro avg       0.13      0.14      0.13        41
weighted avg       0.18      0.20      0.18        41

Fold 1:
               precision    recall  f1-score   support

           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         4
           9       0.20      0.17      0.18         6
          10       0.00      0.00      0.00         8
      

### 재학자에 대한 예측

In [23]:
# Prepare the enrolled combined-program students and predict their
# 'school_term' with `model`, which must already have been fitted by the
# training cell.
'''
combined_undergrad.columns = 
['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
       '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec012_nation_cd',
       '과정', '수료년도', '수료학기', 'rec012_birth_dt', 'gpa', 'rec012_school_term',
       '인건비총액', '서류점수', '면접점수', '학부출신', '대학원출신']
'''
# Rename columns by position so the names match the ones used for training.
combined_undergrad.columns = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
       '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'nation_cd',
       '과정', '수료년도', '수료학기', 'rec012_birth_dt', 'gpa', 'school_term',
       '인건비총액', '서류점수', '면접점수', '학부출신', '대학원출신', '월별인건비']

undergrad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
undergrad_num_features = ['gpa', '서류점수', '면접점수', '월별인건비']       # continuous features
undergrad_term = ['school_term']

for col in undergrad_cat_features:
    combined_undergrad[col] = combined_undergrad[col].astype('category')

# .copy() so the preprocessing below does not write into a slice of combined_undergrad.
X_undergrad = combined_undergrad[undergrad_cat_features + undergrad_num_features].copy()

# Apply the same preprocessing the training cell applies to the last feature
# column (float cast + rounding), so inference sees the values the model was
# trained on.
last_feature = X_undergrad.columns[-1]
X_undergrad[last_feature] = X_undergrad[last_feature].astype(float).round(0)

y_undergrad = pd.DataFrame(model.predict(X_undergrad))

In [24]:
# Attach the predictions as a new column and write the table to disk.
combined_undergrad['predicted_school_term'] = y_undergrad

# Create the output folder first: to_csv() fails if the directory is missing.
output_dir = os.path.join(os.getcwd(), 'prediction_result', 'original_data')
os.makedirs(output_dir, exist_ok=True)
# NOTE(review): the index is written as an extra leading column; left as-is
# because the notebook assigns column names by position.
combined_undergrad.to_csv(os.path.join(output_dir, 'std_info_combined_predicted.csv'))

------
------
## 1. 공학계열 석사 졸업 예측

In [25]:
# Collect the engineering master's-program CSV files under ./eng_data.
# glob.glob() returns matches in arbitrary order, and the next cell picks the
# files by position, so sort the paths to make that position deterministic.
total_ms_list = sorted(glob.glob(os.path.join(os.getcwd(), 'eng_data', '*ms_eng.csv')))
print(total_ms_list)

['f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대_예측\\eng_data\\std_info_grad_ms_eng.csv', 'f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대_예측\\eng_data\\std_info_ms_eng.csv']


In [26]:
# Read both CSVs in one pass: position 0 is loaded as the graduates table and
# position 1 as the enrolled-students table (std_info_grad_ms_eng.csv and
# std_info_ms_eng.csv in the listing printed by the previous cell).
ms_grad, ms_undergrad = (pd.read_csv(total_ms_list[pos]) for pos in (0, 1))


In [27]:
# Column names as they were before renaming, kept for reference.
'''
ms_grad.columns = 
['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
'학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec014_nation_cd',
'과정', '수료년도', '수료학기', 'rec014_birth_dt', '졸업년도', '졸업학기', 'gpa',
'rec014_school_term', '서류점수', '면접점수', '학부출신', '대학원출신', '인건비총액']
'''
# Rename columns by position so the rec014_-prefixed names become the plain
# 'nation_cd' / 'school_term' used for training.
ms_grad.columns = [
    'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
    '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수',
    'nation_cd', '과정', '수료년도', '수료학기', 'rec014_birth_dt',
    '졸업년도', '졸업학기', 'gpa', 'school_term',
    '서류점수', '면접점수', '학부출신', '대학원출신', '인건비총액', '월별인건비',
]

# Categorical features
grad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']
# Continuous features
grad_num_features = ['gpa', '서류점수', '면접점수', '월별인건비']
# Prediction target
grad_term = ['school_term']

# Cast every categorical feature to pandas' category dtype in one call.
ms_grad = ms_grad.astype(dict.fromkeys(grad_cat_features, 'category'))

### 졸업자 정보에 대하여 Stratified 5-Fold 학습


In [28]:
# Stratified 5-fold cross-validation of a LightGBM classifier that predicts
# 'school_term' for graduated engineering master's students.
model = lightgbm.LGBMClassifier()
skfold = StratifiedKFold(n_splits=5)

cv_accuracy = []

# Keep graduates whose 'school_term' is non-zero. A boolean mask with .loc
# selects by label, so features and target stay aligned whatever the index is;
# .copy() gives an independent frame that can be modified without
# SettingWithCopy warnings.
nonzero_term = ms_grad['school_term'] != 0
X_grad = ms_grad.loc[nonzero_term, grad_cat_features + grad_num_features].copy()
y_grad = ms_grad.loc[nonzero_term, grad_term]

# Cast the last feature column to float and round it to whole numbers.
last_feature = X_grad.columns[-1]
X_grad[last_feature] = X_grad[last_feature].astype(float).round(0)

# enumerate() replaces the manual counter that shadowed the builtin iter().
for fold_no, (train_idx, test_idx) in enumerate(skfold.split(X_grad, y_grad)):
    model.fit(X_grad.iloc[train_idx], y_grad.iloc[train_idx].values.ravel())
    pred = model.predict(X_grad.iloc[test_idx])
    cv_accuracy.append(accuracy_score(y_grad.iloc[test_idx], pred))
    print(f"Fold {fold_no}:\n",classification_report(y_grad.iloc[test_idx], pred, zero_division=0))

print("masters cross validation result : ", cv_accuracy)

# After the loop `model` only holds the fit from the last fold's training
# split. Refit on every graduate so that cells which reuse `model` for
# prediction get a model trained on all available data.
model.fit(X_grad, y_grad.values.ravel())



Fold 0:
               precision    recall  f1-score   support

           4       0.97      1.00      0.99       289
           5       0.00      0.00      0.00         6
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1

    accuracy                           0.97       297
   macro avg       0.24      0.25      0.25       297
weighted avg       0.95      0.97      0.96       297

Fold 1:
               precision    recall  f1-score   support

           4       0.97      1.00      0.99       289
           5       0.00      0.00      0.00         6
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1

    accuracy                           0.97       297
   macro avg       0.24      0.25      0.25       297
weighted avg       0.95      0.97      0.96       297

Fold 2:
               precision    recall  f1-score   support

           3       0.00      0.00      0.00     

### 재학자에 대한 예측

In [29]:
# Prepare the enrolled engineering master's students and predict their
# 'school_term' with `model`, which must already have been fitted by the
# training cell.
'''
ms_undergrad.columns = 
['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
       '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec012_nation_cd',
       '과정', '수료년도', '수료학기', 'rec012_birth_dt', 'gpa', 'rec012_school_term',
       '인건비총액', '서류점수', '면접점수', '학부출신', '대학원출신']
'''
# Rename columns by position so the names match the ones used for training.
ms_undergrad.columns = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
       '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'nation_cd',
       '과정', '수료년도', '수료학기', 'rec012_birth_dt', 'gpa', 'school_term',
       '인건비총액', '서류점수', '면접점수', '학부출신', '대학원출신', '월별인건비']

undergrad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
undergrad_num_features = ['gpa', '서류점수', '면접점수', '월별인건비']       # continuous features
undergrad_term = ['school_term']

for col in undergrad_cat_features:
    ms_undergrad[col] = ms_undergrad[col].astype('category')

# .copy() so the preprocessing below does not write into a slice of ms_undergrad.
X_undergrad = ms_undergrad[undergrad_cat_features + undergrad_num_features].copy()

# Apply the same preprocessing the training cell applies to the last feature
# column (float cast + rounding), so inference sees the values the model was
# trained on.
last_feature = X_undergrad.columns[-1]
X_undergrad[last_feature] = X_undergrad[last_feature].astype(float).round(0)

y_undergrad = pd.DataFrame(model.predict(X_undergrad))

In [30]:
# Attach the predictions as a new column and write the table to disk.
ms_undergrad['predicted_school_term'] = y_undergrad

# Create the output folder first: to_csv() fails if the directory is missing.
output_dir = os.path.join(os.getcwd(), 'prediction_result', 'eng_data')
os.makedirs(output_dir, exist_ok=True)
# NOTE(review): the index is written as an extra leading column; left as-is
# because the notebook assigns column names by position.
ms_undergrad.to_csv(os.path.join(output_dir, 'std_info_ms_eng_predicted.csv'))

------
## 2. 공학계열 박사 졸업 예측

In [31]:
# Collect the engineering PhD-program CSV files under ./eng_data.
# glob.glob() returns matches in arbitrary order, and the next cell picks the
# files by position, so sort the paths to make that position deterministic.
total_phd_list = sorted(glob.glob(os.path.join(os.getcwd(), 'eng_data', '*phd_eng.csv')))
print(total_phd_list)

['f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대_예측\\eng_data\\std_info_grad_phd_eng.csv', 'f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대_예측\\eng_data\\std_info_phd_eng.csv']


In [32]:
# Read both CSVs in one pass: position 0 is loaded as the graduates table and
# position 1 as the enrolled-students table (std_info_grad_phd_eng.csv and
# std_info_phd_eng.csv in the listing printed by the previous cell).
phd_grad, phd_undergrad = (pd.read_csv(total_phd_list[pos]) for pos in (0, 1))


In [33]:
# Column names as they were before renaming, kept for reference.
'''
phd_grad.columns = 
['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
'학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec014_nation_cd',
'과정', '수료년도', '수료학기', 'rec014_birth_dt', '졸업년도', '졸업학기', 'gpa',
'rec014_school_term', '서류점수', '면접점수', '학부출신', '대학원출신', '인건비총액']
'''
# Rename columns by position so the rec014_-prefixed names become the plain
# 'nation_cd' / 'school_term' used for training.
phd_grad.columns = [
    'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
    '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수',
    'nation_cd', '과정', '수료년도', '수료학기', 'rec014_birth_dt',
    '졸업년도', '졸업학기', 'gpa', 'school_term',
    '서류점수', '면접점수', '학부출신', '대학원출신', '인건비총액', '월별인건비',
]

# Categorical features
grad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']
# Continuous features
grad_num_features = ['gpa', '서류점수', '면접점수', '월별인건비']
# Prediction target
grad_term = ['school_term']

# Cast every categorical feature to pandas' category dtype in one call.
phd_grad = phd_grad.astype(dict.fromkeys(grad_cat_features, 'category'))

### 졸업자 정보에 대하여 Stratified 5-Fold 학습


In [34]:
# Stratified 5-fold cross-validation of a LightGBM classifier that predicts
# 'school_term' for graduated engineering PhD students.
model = lightgbm.LGBMClassifier()
skfold = StratifiedKFold(n_splits=5)

cv_accuracy = []

# Keep graduates whose 'school_term' is non-zero. A boolean mask with .loc
# selects by label, so features and target stay aligned whatever the index is;
# .copy() gives an independent frame that can be modified without
# SettingWithCopy warnings.
nonzero_term = phd_grad['school_term'] != 0
X_grad = phd_grad.loc[nonzero_term, grad_cat_features + grad_num_features].copy()
y_grad = phd_grad.loc[nonzero_term, grad_term]

# Cast the last feature column to float and round it to whole numbers.
last_feature = X_grad.columns[-1]
X_grad[last_feature] = X_grad[last_feature].astype(float).round(0)

# enumerate() replaces the manual counter that shadowed the builtin iter().
for fold_no, (train_idx, test_idx) in enumerate(skfold.split(X_grad, y_grad)):
    model.fit(X_grad.iloc[train_idx], y_grad.iloc[train_idx].values.ravel())
    pred = model.predict(X_grad.iloc[test_idx])
    cv_accuracy.append(accuracy_score(y_grad.iloc[test_idx], pred))
    print(f"Fold {fold_no}:\n",classification_report(y_grad.iloc[test_idx], pred, zero_division=0))

# The label previously said "masters" (copy-paste from the master's section).
print("phd cross validation result : ", cv_accuracy)

# After the loop `model` only holds the fit from the last fold's training
# split. Refit on every graduate so that cells which reuse `model` for
# prediction get a model trained on all available data.
model.fit(X_grad, y_grad.values.ravel())



Fold 0:
               precision    recall  f1-score   support

           4       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         4
           8       0.29      0.80      0.42         5
           9       0.00      0.00      0.00         3
          10       0.00      0.00      0.00         3
          11       0.00      0.00      0.00         2
          13       0.00      0.00      0.00         1

    accuracy                           0.20        20
   macro avg       0.04      0.10      0.05        20
weighted avg       0.07      0.20      0.11        20

Fold 1:
               precision    recall  f1-score   support

           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.14      0.33      0.20         3
           8       0.00      0.00      0.00         6
           9       0.00      0.00      0.00         3
      

### 재학자에 대한 예측

In [35]:
# Prepare the enrolled engineering PhD students and predict their
# 'school_term' with `model`, which must already have been fitted by the
# training cell.
'''
phd_undergrad.columns = 
['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
       '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec012_nation_cd',
       '과정', '수료년도', '수료학기', 'rec012_birth_dt', 'gpa', 'rec012_school_term',
       '서류점수', '면접점수', '학부출신', '대학원출신', '인건비총액']
'''
# Rename columns by position so the names match the ones used for training.
phd_undergrad.columns = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
       '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'nation_cd',
       '과정', '수료년도', '수료학기', 'rec012_birth_dt', 'gpa', 'school_term',
       '서류점수', '면접점수', '학부출신', '대학원출신', '인건비총액', '월별인건비']

undergrad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
undergrad_num_features = ['gpa', '서류점수', '면접점수', '월별인건비']       # continuous features
undergrad_term = ['school_term']

for col in undergrad_cat_features:
    phd_undergrad[col] = phd_undergrad[col].astype('category')

# .copy() so the preprocessing below does not write into a slice of phd_undergrad.
X_undergrad = phd_undergrad[undergrad_cat_features + undergrad_num_features].copy()

# Apply the same preprocessing the training cell applies to the last feature
# column (float cast + rounding), so inference sees the values the model was
# trained on.
last_feature = X_undergrad.columns[-1]
X_undergrad[last_feature] = X_undergrad[last_feature].astype(float).round(0)

y_undergrad = pd.DataFrame(model.predict(X_undergrad))

In [36]:
# Attach the predictions as a new column and write the table to disk.
phd_undergrad['predicted_school_term'] = y_undergrad

# Create the output folder first: to_csv() fails if the directory is missing.
output_dir = os.path.join(os.getcwd(), 'prediction_result', 'eng_data')
os.makedirs(output_dir, exist_ok=True)
# NOTE(review): the index is written as an extra leading column; left as-is
# because the notebook assigns column names by position.
phd_undergrad.to_csv(os.path.join(output_dir, 'std_info_phd_eng_predicted.csv'))

------
## 3. 공학계열 석박통합 졸업 예측

In [37]:
# Collect the engineering combined MS/PhD-program CSV files under ./eng_data.
# glob.glob() returns matches in arbitrary order, and the next cell picks the
# files by position, so sort the paths to make that position deterministic.
total_combined_list = sorted(glob.glob(os.path.join(os.getcwd(), 'eng_data', '*combined_eng.csv')))
print(total_combined_list)

['f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대_예측\\eng_data\\std_info_combined_eng.csv', 'f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대_예측\\eng_data\\std_info_grad_combined_eng.csv']


In [38]:
# Read both CSVs in one pass. Here the graduates file sits at position 1 and
# the enrolled-students file at position 0 (std_info_grad_combined_eng.csv and
# std_info_combined_eng.csv in the listing printed by the previous cell).
combined_grad, combined_undergrad = (pd.read_csv(total_combined_list[pos]) for pos in (1, 0))


In [39]:
# Column names as they were before renaming, kept for reference.
'''
combined_grad.columns = 
['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
       '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec014_nation_cd',
       '과정', '수료년도', '수료학기', 'rec014_birth_dt', '졸업년도', '졸업학기', 'gpa',
       'rec014_school_term', '인건비총액', '서류점수', '면접점수', '학부출신', '대학원출신']
'''
# Rename columns by position so the rec014_-prefixed names become the plain
# 'nation_cd' / 'school_term' used for training.
combined_grad.columns = [
    'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
    '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수',
    'nation_cd', '과정', '수료년도', '수료학기', 'rec014_birth_dt',
    '졸업년도', '졸업학기', 'gpa', 'school_term',
    '인건비총액', '서류점수', '면접점수', '학부출신', '대학원출신', '월별인건비',
]

# Categorical features
grad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']
# Continuous features
grad_num_features = ['gpa', '서류점수', '면접점수', '월별인건비']
# Prediction target
grad_term = ['school_term']

# Cast every categorical feature to pandas' category dtype in one call.
combined_grad = combined_grad.astype(dict.fromkeys(grad_cat_features, 'category'))

### 졸업자 정보에 대하여 Stratified 5-Fold 학습


In [40]:
# Stratified 5-fold cross-validation of a LightGBM classifier that predicts
# 'school_term' for engineering graduates of the combined MS/PhD program.
model = lightgbm.LGBMClassifier()
skfold = StratifiedKFold(n_splits=5)

cv_accuracy = []

# Keep graduates whose 'school_term' is non-zero. A boolean mask with .loc
# selects by label, so features and target stay aligned whatever the index is;
# .copy() gives an independent frame that can be modified without
# SettingWithCopy warnings.
nonzero_term = combined_grad['school_term'] != 0
X_grad = combined_grad.loc[nonzero_term, grad_cat_features + grad_num_features].copy()
y_grad = combined_grad.loc[nonzero_term, grad_term]

# Cast the last feature column to float and round it to whole numbers.
last_feature = X_grad.columns[-1]
X_grad[last_feature] = X_grad[last_feature].astype(float).round(0)

# enumerate() replaces the manual counter that shadowed the builtin iter().
for fold_no, (train_idx, test_idx) in enumerate(skfold.split(X_grad, y_grad)):
    model.fit(X_grad.iloc[train_idx], y_grad.iloc[train_idx].values.ravel())
    pred = model.predict(X_grad.iloc[test_idx])
    cv_accuracy.append(accuracy_score(y_grad.iloc[test_idx], pred))
    print(f"Fold {fold_no}:\n",classification_report(y_grad.iloc[test_idx], pred, zero_division=0))

# The label previously said "masters" (copy-paste from the master's section).
print("combined cross validation result : ", cv_accuracy)

# After the loop `model` only holds the fit from the last fold's training
# split. Refit on every graduate so that cells which reuse `model` for
# prediction get a model trained on all available data.
model.fit(X_grad, y_grad.values.ravel())



Fold 0:
               precision    recall  f1-score   support

           6       0.89      1.00      0.94        16
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1

    accuracy                           0.89        18
   macro avg       0.30      0.33      0.31        18
weighted avg       0.79      0.89      0.84        18

Fold 1:
               precision    recall  f1-score   support

           6       0.94      1.00      0.97        16
           7       0.00      0.00      0.00         1
           8       1.00      1.00      1.00         1

    accuracy                           0.94        18
   macro avg       0.65      0.67      0.66        18
weighted avg       0.89      0.94      0.92        18

Fold 2:
               precision    recall  f1-score   support

           6       0.89      1.00      0.94        16
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00     

### 재학자에 대한 예측

In [41]:
# Prepare the enrolled engineering combined-program students and predict their
# 'school_term' with `model`, which must already have been fitted by the
# training cell.
'''
combined_undergrad.columns = 
['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
       '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec012_nation_cd',
       '과정', '수료년도', '수료학기', 'rec012_birth_dt', 'gpa', 'rec012_school_term',
       '인건비총액', '서류점수', '면접점수', '학부출신', '대학원출신']
'''
# Rename columns by position so the names match the ones used for training.
combined_undergrad.columns = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명',
       '학생교육부계열', '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'nation_cd',
       '과정', '수료년도', '수료학기', 'rec012_birth_dt', 'gpa', 'school_term',
       '인건비총액', '서류점수', '면접점수', '학부출신', '대학원출신', '월별인건비']

undergrad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
undergrad_num_features = ['gpa', '서류점수', '면접점수', '월별인건비']       # continuous features
undergrad_term = ['school_term']

for col in undergrad_cat_features:
    combined_undergrad[col] = combined_undergrad[col].astype('category')

# .copy() so the preprocessing below does not write into a slice of combined_undergrad.
X_undergrad = combined_undergrad[undergrad_cat_features + undergrad_num_features].copy()

# Apply the same preprocessing the training cell applies to the last feature
# column (float cast + rounding), so inference sees the values the model was
# trained on.
last_feature = X_undergrad.columns[-1]
X_undergrad[last_feature] = X_undergrad[last_feature].astype(float).round(0)

y_undergrad = pd.DataFrame(model.predict(X_undergrad))

In [42]:
# Attach the predictions as a new column and write the table to disk.
combined_undergrad['predicted_school_term'] = y_undergrad

# Create the output folder first: to_csv() fails if the directory is missing.
output_dir = os.path.join(os.getcwd(), 'prediction_result', 'eng_data')
os.makedirs(output_dir, exist_ok=True)
# NOTE(review): the index is written as an extra leading column; left as-is
# because the notebook assigns column names by position.
combined_undergrad.to_csv(os.path.join(output_dir, 'std_info_combined_eng_predicted.csv'))