### Module & FIle Import

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
rc("font", family = font_manager.FontProperties(fname="C:/Windows/Fonts/KoPubWorld Dotum_Pro Medium.otf").get_name())

get_ipython().run_line_magic('matplotlib', 'inline')

In [3]:
df = pd.read_csv('230104_company_weight_rec.csv')
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.head()

Unnamed: 0,기업명,기업구분,기업구분코드,사원수,업력,입사율(%),퇴사율(%),이직율(%),별점,가중치,추천
0,(주)페이타랩,중소기업,3.0,52,4.0,114.71,64.71,56.41,4.3,22.5,1.0
1,(주)엠제이플렉스,중소기업,3.0,930,17.0,89.94,97.84,108.78,2.3,22.5,1.0
2,(주)앰진,중소기업,3.0,20,10.0,40.0,30.0,75.0,3.0,21.5,0.0
3,(주)헥토이노베이션,중소기업,3.0,140,14.0,120.0,100.0,83.33,3.3,22.5,1.0
4,(주)엑스큐어넷,중소기업,3.0,67,23.0,42.86,50.0,116.66,2.4,22.5,1.0


### 전처리 전

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler

X = df[['기업구분코드', '사원수', '업력', '이직율(%)', '별점']]
y = df['추천']

X_train, X_test, y_train, y_test = \
    train_test_split(X,y, test_size = 0.2, random_state = 13)

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

for i in range(2,10):
    dt = DecisionTreeClassifier(max_depth = i, random_state = 13)
    dt.fit(X_train, y_train)

    y_pred_test = dt.predict(X_test)
    print(f'Test max_depth: {i} - accuracy_score: {accuracy_score(y_test, y_pred_test).round(3)}')

Test max_depth: 2 - accuracy_score: 1.0
Test max_depth: 3 - accuracy_score: 1.0
Test max_depth: 4 - accuracy_score: 1.0
Test max_depth: 5 - accuracy_score: 1.0
Test max_depth: 6 - accuracy_score: 1.0
Test max_depth: 7 - accuracy_score: 1.0
Test max_depth: 8 - accuracy_score: 1.0
Test max_depth: 9 - accuracy_score: 1.0


In [10]:
from sklearn.neighbors import KNeighborsClassifier

for i in range(2,10):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)

    y_pred_test = knn.predict(X_test)
    print(f'Test n_neighbors: {i} - accuracy_score: {accuracy_score(y_test, y_pred_test).round(3)}')

Test n_neighbors: 2 - accuracy_score: 0.943
Test n_neighbors: 3 - accuracy_score: 0.914
Test n_neighbors: 4 - accuracy_score: 0.914
Test n_neighbors: 5 - accuracy_score: 0.943
Test n_neighbors: 6 - accuracy_score: 0.943
Test n_neighbors: 7 - accuracy_score: 0.914
Test n_neighbors: 8 - accuracy_score: 0.914
Test n_neighbors: 9 - accuracy_score: 0.914


In [11]:
from sklearn.ensemble import RandomForestClassifier

for i in range(2,10):
    rf = RandomForestClassifier(n_estimators=i, random_state=0)
    rf.fit(X_train, y_train)

    y_pred_test = knn.predict(X_test)
    print(f'Test n_neighbors: {i} - accuracy_score: {accuracy_score(y_test, y_pred_test).round(3)}')

Test n_neighbors: 2 - accuracy_score: 0.914
Test n_neighbors: 3 - accuracy_score: 0.914
Test n_neighbors: 4 - accuracy_score: 0.914
Test n_neighbors: 5 - accuracy_score: 0.914
Test n_neighbors: 6 - accuracy_score: 0.914
Test n_neighbors: 7 - accuracy_score: 0.914
Test n_neighbors: 8 - accuracy_score: 0.914
Test n_neighbors: 9 - accuracy_score: 0.914


### 전처리 후 (Robust Scaler)

In [18]:
X = df[['기업구분코드', '사원수', '업력', '이직율(%)', '별점']]
y = df['추천']

X_train, X_test, y_train, y_test = \
    train_test_split(X,y, test_size = 0.2, random_state = 13)

RS = RobustScaler()

X_rs_train = RS.fit_transform(X_train)
X_rs_test = RS.transform(X_test)

X_rs_pd = pd.DataFrame(X_test, columns= X.columns)
X_rs_pd.head()

Unnamed: 0,기업구분코드,사원수,업력,이직율(%),별점
170,4.0,115,27.0,59.98,2.2
7,3.0,27,10.0,33.3,4.5
104,3.0,223,18.0,85.72,2.5
93,3.0,162,20.0,50.0,2.5
10,3.0,173,13.0,59.7,2.8


In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

for i in range(2,10):
    dt = DecisionTreeClassifier(max_depth = i, random_state = 13)
    dt.fit(X_rs_train, y_train)

    y_pred_test = dt.predict(X_rs_test)
    print(f'Test max_depth: {i} - accuracy_score: {accuracy_score(y_test, y_pred_test).round(3)}')

Test max_depth: 2 - accuracy_score: 1.0
Test max_depth: 3 - accuracy_score: 1.0
Test max_depth: 4 - accuracy_score: 1.0
Test max_depth: 5 - accuracy_score: 1.0
Test max_depth: 6 - accuracy_score: 1.0
Test max_depth: 7 - accuracy_score: 1.0
Test max_depth: 8 - accuracy_score: 1.0
Test max_depth: 9 - accuracy_score: 1.0


In [20]:
from sklearn.neighbors import KNeighborsClassifier

for i in range(2,10):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_rs_train, y_train)

    y_pred_test = knn.predict(X_rs_test)
    print(f'Test n_neighbors: {i} - accuracy_score: {accuracy_score(y_test, y_pred_test).round(3)}')

Test n_neighbors: 2 - accuracy_score: 0.886
Test n_neighbors: 3 - accuracy_score: 0.971
Test n_neighbors: 4 - accuracy_score: 0.943
Test n_neighbors: 5 - accuracy_score: 1.0
Test n_neighbors: 6 - accuracy_score: 1.0
Test n_neighbors: 7 - accuracy_score: 1.0
Test n_neighbors: 8 - accuracy_score: 0.971
Test n_neighbors: 9 - accuracy_score: 0.914


In [21]:
from sklearn.ensemble import RandomForestClassifier

for i in range(2,10):
    rf = RandomForestClassifier(n_estimators=i, random_state=0)
    rf.fit(X_rs_train, y_train)

    y_pred_test = knn.predict(X_rs_test)
    print(f'Test n_neighbors: {i} - accuracy_score: {accuracy_score(y_test, y_pred_test).round(3)}')

Test n_neighbors: 2 - accuracy_score: 0.914
Test n_neighbors: 3 - accuracy_score: 0.914
Test n_neighbors: 4 - accuracy_score: 0.914
Test n_neighbors: 5 - accuracy_score: 0.914
Test n_neighbors: 6 - accuracy_score: 0.914
Test n_neighbors: 7 - accuracy_score: 0.914
Test n_neighbors: 8 - accuracy_score: 0.914
Test n_neighbors: 9 - accuracy_score: 0.914
