In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/프로젝트 1/MBA.csv')

In [4]:
# numeric_cols = df.select_dtypes(include=['float64', 'int64'])
# for element in numeric_cols:
#     sns.boxplot(data=df[element])
#     plt.title(element)
#     plt.show()

In [5]:
# 'race' 컬럼에서 결측치를 'unknown'으로 채움
df['race'] = df['race'].fillna('unknown')

# 'admission' 컬럼에서 결측치를 'denied'로 채움
df['admission'] = df['admission'].fillna('denied')

# 'admission' 컬럼에서 'Waitlist' 값을 'denied'로 변환
df['admission'] = df['admission'].apply(lambda x : 'Admit' if x == 'Waitlist' else x)

# 'admission' 컬럼을 이진 변수로 변환 ('denied' = 1, 그 외 = 0)
df['admission'] = df['admission'].apply(lambda x : 1 if x == 'denied' else 0)

# 'application_id' 컬럼은 필요 없으므로 제거
df = df.drop('application_id', axis = 1)

# 'gender' 컬럼을 이진 변수로 변환 ('Male' = 0, 'Female' = 1)
df['gender'] = df['gender'].apply(lambda x : 0 if x == 'Male' else 1)

# 'international' 컬럼을 이진 변수로 변환 (True = 0, False = 1)
df['international'] = df['international'].apply(lambda x : 0 if x == True else 1)

# LabelEncoder 객체 생성
le = LabelEncoder()

# 범주형 컬럼(object 타입)을 모두 찾아서 레이블 인코딩 수행
categorical_cols = df.select_dtypes(include=['object']).columns

# 각 범주형 컬럼에 Label Encoding 적용
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])  # 인코딩 적용
    print(f"Classes (범주) for {col}: {le.classes_}")  # 원래 범주들 출력
    encoded_data = df[col]  # 인코딩된 값을 데이터프레임에 저장
    mapping = dict(zip(le.classes_, range(len(le.classes_))))  # 범주와 인코딩 값 매핑
    print(f"Mapping (범주 -> 인코딩 값) for {col}: {mapping}\n")

# X에 'admission'을 제외한 모든 특성 데이터를 저장
X = df.drop(['admission'], axis=1)

# y에 'admission' 컬럼을 저장 (타겟 변수)
y = df['admission']


Classes (범주) for major: ['Business' 'Humanities' 'STEM']
Mapping (범주 -> 인코딩 값) for major: {'Business': 0, 'Humanities': 1, 'STEM': 2}

Classes (범주) for race: ['Asian' 'Black' 'Hispanic' 'Other' 'White' 'unknown']
Mapping (범주 -> 인코딩 값) for race: {'Asian': 0, 'Black': 1, 'Hispanic': 2, 'Other': 3, 'White': 4, 'unknown': 5}

Classes (범주) for work_industry: ['CPG' 'Consulting' 'Energy' 'Financial Services' 'Health Care'
 'Investment Banking' 'Investment Management' 'Media/Entertainment'
 'Nonprofit/Gov' 'Other' 'PE/VC' 'Real Estate' 'Retail' 'Technology']
Mapping (범주 -> 인코딩 값) for work_industry: {'CPG': 0, 'Consulting': 1, 'Energy': 2, 'Financial Services': 3, 'Health Care': 4, 'Investment Banking': 5, 'Investment Management': 6, 'Media/Entertainment': 7, 'Nonprofit/Gov': 8, 'Other': 9, 'PE/VC': 10, 'Real Estate': 11, 'Retail': 12, 'Technology': 13}



In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [None]:
from scipy.stats import chi2_contingency

selected_columns = []

for element in df.columns:

    contingency_table = pd.crosstab(df[element], df['admission'])

    # 카이제곱 검정 수행
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    if p < 0.03:
        selected_columns.append(element)

    print(element)
    print("Chi-Square Statistic:", chi2)
    print("p-value:", p)
    print("Degrees of Freedom:", dof)
    print("")

gender
Chi-Square Statistic: 95.4601654520653
p-value: 1.5089725785759555e-22
Degrees of Freedom: 1

international
Chi-Square Statistic: 0.5839856987918092
p-value: 0.4447542811936849
Degrees of Freedom: 1

gpa
Chi-Square Statistic: 677.0609368349182
p-value: 1.629006490177676e-86
Degrees of Freedom: 100

major
Chi-Square Statistic: 0.28451932190137114
p-value: 0.8673959983722536
Degrees of Freedom: 2

race
Chi-Square Statistic: 51.11047016621665
p-value: 8.209554766013871e-10
Degrees of Freedom: 5

gmat
Chi-Square Statistic: 939.5027943511665
p-value: 2.1223845341083515e-185
Degrees of Freedom: 21

work_exp
Chi-Square Statistic: 10.390943281471268
p-value: 0.2386514807174321
Degrees of Freedom: 8

work_industry
Chi-Square Statistic: 17.649920799709022
p-value: 0.17126972345693742
Degrees of Freedom: 13

admission
Chi-Square Statistic: 6186.61567232219
p-value: 0.0
Degrees of Freedom: 1



In [None]:
features = []
remaining_features = list(range(X.shape[1]))
best_score = 0

while remaining_features:
    scores = []
    for feature in remaining_features:
        # 현재 선택된 특성 + 새 특성
        selected_features = features + [feature]
        model = LogisticRegression(max_iter=200)
        model.fit(X_train[:, selected_features], y_train)
        predictions = model.predict(X_test[:, selected_features])
        score = f1_score(y_test, predictions)
        scores.append((score, feature))

    # 가장 성능이 좋은 특성 선택
    scores.sort(reverse=True)
    best_new_score, best_new_feature = scores[0]

    if best_new_score > best_score:
        best_score = best_new_score
        features.append(best_new_feature)
        remaining_features.remove(best_new_feature)
    else:
        break


print(f"최고 높은 점수: {best_score}")
for element in features:
    print(f"선택된 특성: {df.columns[element]}")

최고 높은 점수: 0.9069042316258352
선택된 특성: gpa


In [None]:
# 랜덤 포레스트 모델 학습
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# 특성 중요도 추출
importances = model.feature_importances_
print(len(importances))
# 특성 중요도를 데이터프레임으로 변환
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})

# 중요도 순으로 정렬
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# 결과 출력
print(feature_importance_df)


8
         Feature  Importance
2            gpa    0.315986
5           gmat    0.290446
7  work_industry    0.133386
6       work_exp    0.093658
4           race    0.063968
3          major    0.059652
0         gender    0.030113
1  international    0.012791


In [None]:
lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)

lasso_coef = pd.Series(lasso.coef_, index=X.columns)
selected_features = lasso_coef[lasso_coef != 0].index

print(f"Selected Features by Lasso: {selected_features}")


Selected Features by Lasso: Index(['gender', 'gpa', 'race', 'gmat'], dtype='object')


# 최종 선택된 변수들

gpa (수치형)

gmat (수치형)

gender (수치형 또는 이진 변수)

race

work_industry

In [None]:
models = [
    ("Logistic Regression", LogisticRegression(class_weight='balanced')),
    ("K-Nearest Neighbors", KNeighborsClassifier(n_neighbors=3)),
    ("Decision Tree", DecisionTreeClassifier(class_weight='balanced', random_state=42)),
    ("Random Forest", RandomForestClassifier(class_weight='balanced', random_state=42)),
    ("Support Vector Machine", SVC(probability=True)),
    ("XGBOOST", XGBClassifier(scale_pos_weight=10, random_state=42)),
    ("GaussianNB", GaussianNB()),
    ("GradientBoostingClassifier", GradientBoostingClassifier()),
    ("AdaBoostClassifier", AdaBoostClassifier())
]

for model_name, model in models:
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nModel: {model_name}")
    print(f"Accuracy: {accuracy * 100:.2f}%")

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    if hasattr(model, "predict_proba"):
        y_probs = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_probs)
        print(f"ROC-AUC: {roc_auc:.2f}")


Model: Logistic Regression
Accuracy: 72.48%
Classification Report:
              precision    recall  f1-score   support

           0       0.36      0.77      0.49       214
           1       0.94      0.72      0.81      1025

    accuracy                           0.72      1239
   macro avg       0.65      0.74      0.65      1239
weighted avg       0.84      0.72      0.76      1239

ROC-AUC: 0.82

Model: K-Nearest Neighbors
Accuracy: 81.44%
Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.29      0.35       214
           1       0.86      0.92      0.89      1025

    accuracy                           0.81      1239
   macro avg       0.65      0.61      0.62      1239
weighted avg       0.79      0.81      0.80      1239

ROC-AUC: 0.71

Model: Decision Tree
Accuracy: 79.02%
Classification Report:
              precision    recall  f1-score   support

           0       0.37      0.31      0.34       214
           

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Model: XGBOOST
Accuracy: 83.05%
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.06      0.10       214
           1       0.83      0.99      0.91      1025

    accuracy                           0.83      1239
   macro avg       0.72      0.52      0.50      1239
weighted avg       0.79      0.83      0.77      1239

ROC-AUC: 0.83

Model: GaussianNB
Accuracy: 82.73%
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.29      0.36       214
           1       0.86      0.94      0.90      1025

    accuracy                           0.83      1239
   macro avg       0.68      0.61      0.63      1239
weighted avg       0.80      0.83      0.81      1239

ROC-AUC: 0.82

Model: GradientBoostingClassifier
Accuracy: 82.57%
Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.14      0.21       214
           1       




Model: AdaBoostClassifier
Accuracy: 82.32%
Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.26      0.33       214
           1       0.86      0.94      0.90      1025

    accuracy                           0.82      1239
   macro avg       0.67      0.60      0.62      1239
weighted avg       0.79      0.82      0.80      1239

ROC-AUC: 0.83


# 언더샘플링

In [16]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# 오버샘플링

In [18]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# SMOTE

In [20]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [15]:
pd.Series(y_resampled).value_counts()

Unnamed: 0_level_0,count
admission,Unnamed: 1_level_1
1,4169
0,4169


In [21]:
models = [
    ("Logistic Regression", LogisticRegression(class_weight='balanced')),
    ("K-Nearest Neighbors", KNeighborsClassifier(n_neighbors=3)),
    ("Decision Tree", DecisionTreeClassifier(class_weight='balanced', random_state=42)),
    ("Random Forest", RandomForestClassifier(class_weight='balanced', random_state=42)),
    ("Support Vector Machine", SVC(probability=True)),
    ("XGBOOST", XGBClassifier(scale_pos_weight=10, random_state=42)),
    ("GaussianNB", GaussianNB()),
    ("GradientBoostingClassifier", GradientBoostingClassifier()),
    ("AdaBoostClassifier", AdaBoostClassifier())
]

for model_name, model in models:
    model.fit(X_resampled, y_resampled)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nModel: {model_name}")
    print(f"Accuracy: {accuracy * 100:.2f}%")

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    if hasattr(model, "predict_proba"):
        y_probs = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_probs)
        print(f"ROC-AUC: {roc_auc:.2f}")


Model: Logistic Regression
Accuracy: 72.48%
Classification Report:
              precision    recall  f1-score   support

           0       0.36      0.76      0.49       214
           1       0.94      0.72      0.81      1025

    accuracy                           0.72      1239
   macro avg       0.65      0.74      0.65      1239
weighted avg       0.84      0.72      0.76      1239

ROC-AUC: 0.82

Model: K-Nearest Neighbors
Accuracy: 73.85%
Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.62      0.45       214
           1       0.91      0.76      0.83      1025

    accuracy                           0.74      1239
   macro avg       0.63      0.69      0.64      1239
weighted avg       0.81      0.74      0.76      1239

ROC-AUC: 0.73

Model: Decision Tree
Accuracy: 78.05%
Classification Report:
              precision    recall  f1-score   support

           0       0.37      0.38      0.38       214
           




Model: AdaBoostClassifier
Accuracy: 78.45%
Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.74      0.54       214
           1       0.94      0.79      0.86      1025

    accuracy                           0.78      1239
   macro avg       0.68      0.77      0.70      1239
weighted avg       0.85      0.78      0.80      1239

ROC-AUC: 0.84


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# 랜덤 포레스트 모델 정의
rf = RandomForestClassifier(random_state=42)

# 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [100, 200, 300],        # 트리 개수
    'max_depth': [10, 20, 30, None],        # 트리 최대 깊이
    'min_samples_split': [2, 5, 10],        # 노드 분할 최소 샘플 수
    'min_samples_leaf': [1, 2, 4]           # 리프 노드 최소 샘플 수
}

# GridSearchCV 설정
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# 학습 데이터로 튜닝 수행
grid_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터와 성능 확인
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Accuracy: {grid_search.best_score_:.2f}")

# 최적의 모델로 테스트 데이터 성능 평가
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# XGBoost 모델 정의
xgb = XGBClassifier(random_state=42)

# 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [100, 200, 300],        # 트리 개수
    'max_depth': [3, 6, 9],                 # 트리 최대 깊이
    'learning_rate': [0.01, 0.1, 0.2],      # 학습 속도
    'subsample': [0.6, 0.8, 1.0],           # 각 트리마다 사용할 샘플 비율
    'colsample_bytree': [0.6, 0.8, 1.0]     # 각 트리마다 사용할 피처 비율
}

# GridSearchCV 설정
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# 학습 데이터로 튜닝 수행
grid_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터와 성능 확인
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Accuracy: {grid_search.best_score_:.2f}")

# 최적의 모델로 테스트 데이터 성능 평가
best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))
