In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:.4f}'.format
pd.options.display.max_rows = 500   
pd.options.display.max_columns = 20   
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score

In [2]:
df = pd.read_csv('230102_company3.csv')
df.drop(['기업명','기업구분','Unnamed: 0'], axis=1, inplace=True)
df

Unnamed: 0,기업구분코드,사원수,업력,입사율(%),퇴사율(%),이직율(%),별점,추천
0,3.0,52,4.0,114.71,64.71,56.41,4.3,1.0
1,3.0,930,17.0,89.94,97.84,108.78,2.3,0.0
2,3.0,20,10.0,40.0,30.0,75.0,3.0,1.0
3,3.0,140,0.0,120.0,100.0,83.33,3.3,1.0
4,3.0,3,6.0,33.33,0.0,0.0,3.0,1.0
5,3.0,67,23.0,42.86,50.0,116.66,2.4,0.0
6,3.0,80,4.0,164.29,84.29,51.31,2.3,0.0
7,3.0,75,12.0,89.19,62.16,69.69,2.1,0.0
8,3.0,27,10.0,11.11,3.7,33.3,4.5,1.0
9,3.0,39,22.0,12.5,25.0,200.0,3.3,1.0


In [3]:
X = df.drop(columns='추천')
Y = df['추천']

# x_train, x_test 데이터 생성
x_train, x_test, y_train, y_test  = train_test_split(X, Y, test_size=0.2, stratify= Y, random_state=0)

x_train.shape, y_train.shape, x_test.shape

((139, 7), (139,), (35, 7))

In [4]:
scaler = StandardScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train),columns=x_train.columns)
x_test = pd.DataFrame(scaler.transform(x_test),columns=x_test.columns)

x_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
기업구분코드,139.0,-0.0,1.0036,-2.6867,0.2195,0.2195,0.2195,1.6727
사원수,139.0,-0.0,1.0036,-0.3065,-0.2774,-0.2382,-0.1185,8.485
업력,139.0,-0.0,1.0036,-1.239,-0.7399,-0.1501,0.6665,3.7515
입사율(%),139.0,0.0,1.0036,-0.9812,-0.6121,-0.3597,0.3922,5.3723
퇴사율(%),139.0,0.0,1.0036,-1.0662,-0.5957,-0.2615,0.2995,6.3101
이직율(%),139.0,-0.0,1.0036,-1.3048,-0.5471,-0.2226,0.2078,5.6733
별점,139.0,-0.0,1.0036,-2.69,-0.6547,-0.0732,0.6537,3.1251


In [5]:
# train score, test score, roc_auc_score 
def get_scores(model, xtrain, xtest, ytrain, ytest):
    A = model.score(xtrain, ytrain)
    B = model.score(xtest, ytest)
    ypred = model.predict(xtest)
    C = roc_auc_score(ytest, ypred)  
    return '{:.4f} {:.4f} {:.4f}'.format(A, B, C)
# 모델선택 
def make_models(xtrain, xtest, ytrain, ytest):
    model1 = LogisticRegression(max_iter=1000).fit(xtrain, ytrain)
    print('model1', get_scores(model1, xtrain, xtest, ytrain, ytest))
    for k in range(1, 10):
        model2 = KNeighborsClassifier(k).fit(xtrain, ytrain)
        print('model2', k, get_scores(model2, xtrain, xtest, ytrain, ytest))

    # overfitting
    model3 = DecisionTreeClassifier(random_state=0).fit(xtrain, ytrain)
    print('model3', get_scores(model3, xtrain, xtest, ytrain, ytest))

    # overfitting 해결
    for d in range(3, 8):
        model3 = DecisionTreeClassifier(max_depth=d,random_state=0).fit(xtrain, ytrain)
        print('model3', d, get_scores(model3, xtrain, xtest, ytrain, ytest))

    # overfitting
    model4 = RandomForestClassifier(random_state=0).fit(xtrain, ytrain)
    print('model4', get_scores(model4, xtrain, xtest, ytrain, ytest))

    # overfitting 해결
    for d in range(3, 8):
        model4 = RandomForestClassifier(500, max_depth=d, random_state=0).fit(xtrain, ytrain)
        print('model4', d, get_scores(model4, xtrain, xtest, ytrain, ytest))

    model5 = XGBClassifier(eval_metric='logloss', use_label_encoder=False).fit(xtrain, ytrain)
    print('model5', get_scores(model5, xtrain, xtest, ytrain, ytest))

In [8]:
#성능평가 
def get_other_scores(model, xtest, ytest):
    y_pred = model.predict(xtest)
    accuracy = accuracy_score(ytest, y_pred)
    precision = precision_score(ytest, y_pred)
    recall = recall_score(ytest, y_pred)
    f1 = f1_score(ytest, y_pred)
    print(f'accuracy  : {accuracy:7.4f}')
    print(f'precision : {precision:7.4f}')
    print(f'recall    : {recall:7.4f}')
    print(f'f1        : {f1:7.4f}')

In [9]:
make_models(x_train,x_test,y_train,y_test)

model1 0.9856 0.9714 0.9750
model2 1 1.0000 0.8857 0.8833
model2 2 0.9209 0.7143 0.7417
model2 3 0.9424 0.8000 0.7917
model2 4 0.9353 0.8000 0.8000
model2 5 0.8849 0.8571 0.8500
model2 6 0.8777 0.8000 0.8083
model2 7 0.8489 0.8571 0.8583
model2 8 0.8849 0.8857 0.8917
model2 9 0.8705 0.8571 0.8500
model3 1.0000 1.0000 1.0000
model3 3 1.0000 1.0000 1.0000
model3 4 1.0000 1.0000 1.0000
model3 5 1.0000 1.0000 1.0000
model3 6 1.0000 1.0000 1.0000
model3 7 1.0000 1.0000 1.0000
model4 1.0000 1.0000 1.0000
model4 3 1.0000 1.0000 1.0000
model4 4 1.0000 1.0000 1.0000
model4 5 1.0000 1.0000 1.0000
model4 6 1.0000 1.0000 1.0000
model4 7 1.0000 1.0000 1.0000
model5 1.0000 1.0000 1.0000


In [10]:
# model1 = LogisticRegression(max_iter=1000).fit(x_train, y_train)
# model2 = KNeighborsClassifier(8).fit(x_train, y_train)

In [33]:
model = LogisticRegression(max_iter=1000).fit(x_train, y_train)
print(get_scores(model, x_train, x_test, y_train, y_test))  # train score, test score, roc_auc_score 
get_other_scores(model,x_test,y_test)

0.9856 0.9714 0.9750
accuracy  :  0.9714
precision :  1.0000
recall    :  0.9500
f1        :  0.9744


In [34]:
y_pred = model.predict(x_test)
label=['비추천', '추천']
a = confusion_matrix(y_test, y_pred)
b = pd.DataFrame(a, columns=label, index=label)
b

Unnamed: 0,비추천,추천
비추천,15,0
추천,1,19


In [35]:
model = KNeighborsClassifier(8).fit(x_train, y_train)
print(get_scores(model, x_train, x_test, y_train, y_test))  # train score, test score, roc_auc_score 
get_other_scores(model,x_test,y_test)

0.8849 0.8857 0.8917
accuracy  :  0.8857
precision :  0.9444
recall    :  0.8500
f1        :  0.8947


In [36]:
y_pred = model.predict(x_test)
label=['비추천', '추천']
a = confusion_matrix(y_test, y_pred)
b = pd.DataFrame(a, columns=label, index=label)
b

Unnamed: 0,비추천,추천
비추천,14,1
추천,3,17
