In [3]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# Load the enrollee dataset (relative path: the CSV must be in the working directory)
# and preview the first two rows.
df = pd.read_csv("edu_enrollees.csv")
df.head(2)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevant_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target,Xgrp
0,8949.0,city_103,0.92,Male,Has relevant experience,no_enrollment,Graduate,STEM,>20,,,1,36.0,1.0,train
1,29725.0,city_40,0.776,Male,No relevant experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47.0,0.0,train


### 전처리

In [5]:
# Discard three columns before any further preprocessing.
df = df.drop(
    labels = ["city", "company_size", "company_type"],
    axis = "columns",
)

In [6]:
# Count missing values per column.
df.isna().sum()

enrollee_id                  0
city_development_index       0
gender                    4508
relevant_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
last_new_job               423
training_hours               0
target                       0
Xgrp                         0
dtype: int64

In [7]:
# Keep only the non-numeric columns and list their dtypes.
df_str = df.select_dtypes(exclude = "number")
df_str.dtypes

gender                 object
relevant_experience    object
enrolled_university    object
education_level        object
major_discipline       object
experience             object
last_new_job           object
Xgrp                   object
dtype: object

In [8]:
# Preview the non-numeric columns; `experience` and `last_new_job` contain
# labels such as ">20" and ">4", which is why they are not numeric yet.
df_str.head(2)

Unnamed: 0,gender,relevant_experience,enrolled_university,education_level,major_discipline,experience,last_new_job,Xgrp
0,Male,Has relevant experience,no_enrollment,Graduate,STEM,>20,1,train
1,Male,No relevant experience,no_enrollment,Graduate,STEM,15,>4,train


In [9]:
# Missing-value counts restricted to the non-numeric columns.
df_str.isna().sum()

gender                 4508
relevant_experience       0
enrolled_university     386
education_level         460
major_discipline       2813
experience               65
last_new_job            423
Xgrp                      0
dtype: int64

In [11]:
# Numeric columns only: count their missing values.
df_num = df.select_dtypes(include = "number")
df_num.isna().sum()

enrollee_id               0
city_development_index    0
training_hours            0
target                    0
dtype: int64

In [12]:
# Drop every row containing at least one missing value, then report the remaining row count.
df = df.dropna()
len(df)

12477

In [14]:
# Remove rows whose `experience` or `last_new_job` value is an open-ended
# label rather than a plain number, so both columns can later be cast to int.
df = df.loc[
    ~(
        df["experience"].isin([">20", "<1"])
        | df["last_new_job"].isin([">4", "never"])
    )
]

In [16]:
# Cast both columns to integers in one step. `astype` returns a new frame, so
# nothing is assigned into the filtered result of the previous cell (assigning
# columns there can trigger pandas' SettingWithCopyWarning).
df = df.astype({"experience": "int", "last_new_job": "int"})

In [17]:
# Row count after removing the open-ended labels.
len(df)

7522

In [18]:
# Independent snapshot of the cleaned frame; the question sections below work from this copy.
df_base = df.copy()

### Q1.

In [None]:
# NOTE(review): this cell has no execution count (In [None]); it looks like an
# earlier draft of the split that is rebuilt in the cell executed as In [20].
# It splits on "No relevant experience" versus every other label.
df_q1_no  = df_base[df_base["relevant_experience"].eq("No relevant experience")]
df_q1_has = df_base[df_base["relevant_experience"].ne("No relevant experience")]

In [20]:
# Q1: split enrollees by relevant experience, matching each label explicitly.
df_q1_no  = df_base[df_base["relevant_experience"].eq("No relevant experience")]
df_q1_has = df_base[df_base["relevant_experience"].eq("Has relevant experience")]

In [19]:
# List the distinct labels in `relevant_experience`.
# NOTE(review): executed as In [19], i.e. before the In [20] cell shown above it.
df_base["relevant_experience"].unique()

array(['Has relevant experience', 'No relevant experience'], dtype=object)

In [21]:
# Count of each target class among enrollees without relevant experience.
df_q1_no["target"].value_counts()

0.0    872
1.0    541
Name: target, dtype: int64

In [22]:
# Proportion of each target class among enrollees without relevant experience.
df_q1_no["target"].value_counts(normalize = True)

0.0    0.617127
1.0    0.382873
Name: target, dtype: float64

In [23]:
# Proportion of each target class among enrollees with relevant experience.
df_q1_has["target"].value_counts(normalize = True)

0.0    0.784089
1.0    0.215911
Name: target, dtype: float64

In [25]:
# Q1 answer: share of target == 1 in the "no relevant experience" group divided
# by the same share in the "has relevant experience" group, to 2 decimals.
# `.loc[1.0]` selects the class by its label rather than by an integer key.
stat_1 = df_q1_no["target"].value_counts(normalize = True).loc[1.0]
stat_2 = df_q1_has["target"].value_counts(normalize = True).loc[1.0]
round(stat_1 / stat_2, 2)

1.77

In [28]:
# Cross-check: target distribution per `relevant_experience` level, normalised within each row.
pd.crosstab(df_base["relevant_experience"], df_base["target"], normalize = "index")

target,0.0,1.0
relevant_experience,Unnamed: 1_level_1,Unnamed: 2_level_1
Has relevant experience,0.784089,0.215911
No relevant experience,0.617127,0.382873


In [27]:
# Cross-check: mean of `target` per group (the share of target == 1, given the 0/1 coding).
df_base.groupby("relevant_experience")["target"].mean()

relevant_experience
Has relevant experience    0.215911
No relevant experience     0.382873
Name: target, dtype: float64

### Q2.

In [33]:
# Renumber the rows from 0 and discard the old index labels.
df_base = df_base.reset_index(drop = True)

In [34]:
# Q2: pick out the categorical predictors that will be one-hot encoded.
df_q2_cat = df_base.loc[
    :,
    [
        "gender",
        "relevant_experience",
        "enrolled_university",
        "education_level",
        "major_discipline",
    ],
]
df_q2_cat.head(2)

Unnamed: 0,gender,relevant_experience,enrolled_university,education_level,major_discipline
0,Male,Has relevant experience,no_enrollment,Graduate,STEM
1,Male,Has relevant experience,no_enrollment,Graduate,STEM


In [35]:
# One-hot encode every categorical column (one indicator column per level).
df_q2_cat_dum = pd.get_dummies(df_q2_cat)
df_q2_cat_dum.head(2)

Unnamed: 0,gender_Female,gender_Male,gender_Other,relevant_experience_Has relevant experience,relevant_experience_No relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,enrolled_university_no_enrollment,education_level_Graduate,education_level_Masters,education_level_Phd,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other,major_discipline_STEM
0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1
1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1


In [36]:
# Drop one indicator per categorical variable so the dropped level acts as the
# reference category and the remaining dummies are not perfectly collinear.
drop_cols = [
    "gender_Other",
    "relevant_experience_No relevant experience",
    "enrolled_university_no_enrollment",
    "education_level_Phd",
    "major_discipline_STEM",
]
df_q2_cat_dum = df_q2_cat_dum.drop(drop_cols, axis = "columns")
df_q2_cat_dum.head(2)

Unnamed: 0,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,0,1,1,0,0,1,0,0,0,0,0,0
1,0,1,1,0,0,1,0,0,0,0,0,0


In [37]:
# Assemble the modelling frame: response, train/test marker and numeric
# predictors from df_base, followed by the dummy indicators (aligned on the row index).
df_q2 = pd.concat(
    [
        df_base[["target", "Xgrp", "city_development_index", "experience", "last_new_job", "training_hours"]],
        df_q2_cat_dum,
    ],
    axis = "columns",
).copy()
df_q2.head(2)

Unnamed: 0,target,Xgrp,city_development_index,experience,last_new_job,training_hours,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,1.0,train,0.92,7,1,46.0,0,1,1,0,0,1,0,0,0,0,0,0
1,0.0,train,0.92,5,1,108.0,0,1,1,0,0,1,0,0,0,0,0,0


In [38]:
# Independent copy of the Q2 modelling frame; the model fit and the train/test split below use it.
df_job2 = df_q2.copy()

In [39]:
# Number of columns: target, Xgrp, 4 numeric predictors and 12 dummy indicators.
df_job2.shape[1]

18

In [42]:
# Q2: logistic regression of `target` on every predictor, fitted on all rows of
# df_job2 (both Xgrp groups). C is the inverse regularisation strength, so a
# very large C leaves the fit almost unpenalised.
model_lr = LogisticRegression(
    solver = "liblinear",
    C = 100000,
    max_iter = 1000,
    random_state = 123,
)
model_lr.fit(df_job2.drop(columns = ["target", "Xgrp"]), df_job2["target"])

In [43]:
# Fitted coefficients (log-odds scale), one per predictor column in the order used for fitting.
model_lr.coef_

array([[-6.11732384e+00, -2.85015530e-02,  9.56531984e-02,
        -9.26206002e-04, -1.62975865e-01, -1.36723955e-01,
        -7.60567385e-01,  5.14109835e-01, -2.81496522e-01,
         3.23116412e-01,  1.67418791e-02,  2.61665001e-01,
         1.07607195e-01,  2.43479152e-01,  3.97900227e-01,
        -4.39387752e-01]])

In [48]:
# Largest odds ratio: exponentiate the coefficients and take the maximum.
np.exp(model_lr.coef_).max()

1.6721493496611732

In [50]:
# Label each coefficient with its predictor name. The names come from the same
# `drop(columns = ["target", "Xgrp"])` expression used to build X for the fit,
# not from a positional slice, so the labels stay correct even if the two
# non-predictor columns are not the first two columns of df_job2.
pd.DataFrame(model_lr.coef_,
             columns = df_job2.drop(columns = ["target", "Xgrp"]).columns).transpose()

Unnamed: 0,0
city_development_index,-6.117324
experience,-0.028502
last_new_job,0.095653
training_hours,-0.000926
gender_Female,-0.162976
gender_Male,-0.136724
relevant_experience_Has relevant experience,-0.760567
enrolled_university_Full time course,0.51411
enrolled_university_Part time course,-0.281497
education_level_Graduate,0.323116


### Q3.

In [53]:
# Q3: split into train and test partitions using the Xgrp marker,
# then drop the marker so only the response and predictors remain.
df_train = df_job2[df_job2["Xgrp"].eq("train")].drop("Xgrp", axis = "columns")
df_test  = df_job2[df_job2["Xgrp"].eq("test")].drop("Xgrp", axis = "columns")

In [54]:
# Preview the training partition.
df_train.head(2)

Unnamed: 0,target,city_development_index,experience,last_new_job,training_hours,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,1.0,0.92,7,1,46.0,0,1,1,0,0,1,0,0,0,0,0,0
1,0.0,0.92,5,1,108.0,0,1,1,0,0,1,0,0,0,0,0,0


In [58]:
# Fit a 5-nearest-neighbour classifier on the training partition and predict
# the test partition (`fit` returns the fitted estimator itself).
# NOTE(review): predictors are used unscaled, so wide-range columns such as
# training_hours weigh more in the distance than the 0/1 dummies — confirm this is intended.
model_knn = KNeighborsClassifier(n_neighbors = 5).fit(
    X = df_train.drop(columns = "target"),
    y = df_train["target"],
)
pred = model_knn.predict(X = df_test.drop(columns = "target"))
pred[:4]

array([0., 0., 0., 0.])

In [59]:
# Put true labels and predictions side by side (rows keep the index of df_test).
df_pair = pd.DataFrame({"true": df_test["target"], "pred": pred})
df_pair.head()

Unnamed: 0,true,pred
4706,0.0,0.0
4707,1.0,0.0
4708,0.0,0.0
4709,0.0,0.0
4710,0.0,0.0


In [60]:
# Confusion table: rows are the true classes, columns the predicted classes.
pd.crosstab(index = df_test["target"], columns = pred)

col_0,0.0,1.0
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1899,193
1.0,616,108


In [61]:
from sklearn.metrics import accuracy_score

In [63]:
# Q3 answer: test-set accuracy of the KNN classifier, rounded to 2 decimals.
round(accuracy_score(df_test["target"], pred), 2)

0.71

In [64]:
import sklearn.metrics as skm

In [66]:
# Exploration helper: list the names in sklearn.metrics that end in "score".
ser_skm = pd.Series(dir(skm))
ser_skm.loc[ser_skm.str.endswith("score")]

26                           accuracy_score
27               adjusted_mutual_info_score
28                      adjusted_rand_score
30                  average_precision_score
31                  balanced_accuracy_score
33                  calinski_harabasz_score
38                        cohen_kappa_score
39                       completeness_score
41                          consensus_score
43                  d2_absolute_error_score
44                         d2_pinball_score
45                         d2_tweedie_score
46                     davies_bouldin_score
47                                dcg_score
50                 explained_variance_score
51                                 f1_score
52                              fbeta_score
53                    fowlkes_mallows_score
59                        homogeneity_score
60                            jaccard_score
61    label_ranking_average_precision_score
77                        mutual_info_score
79                              

In [68]:
# Exploration helper: public (non-underscore) attribute names of a pandas Series.
# NOTE(review): this produces a very long output and is unrelated to Q1-Q3.
[met for met in dir(ser_skm) if not met.startswith("_")]

['T',
 'abs',
 'add',
 'add_prefix',
 'add_suffix',
 'agg',
 'aggregate',
 'align',
 'all',
 'any',
 'append',
 'apply',
 'argmax',
 'argmin',
 'argsort',
 'array',
 'asfreq',
 'asof',
 'astype',
 'at',
 'at_time',
 'attrs',
 'autocorr',
 'axes',
 'backfill',
 'between',
 'between_time',
 'bfill',
 'bool',
 'clip',
 'combine',
 'combine_first',
 'compare',
 'convert_dtypes',
 'copy',
 'corr',
 'count',
 'cov',
 'cummax',
 'cummin',
 'cumprod',
 'cumsum',
 'describe',
 'diff',
 'div',
 'divide',
 'divmod',
 'dot',
 'drop',
 'drop_duplicates',
 'droplevel',
 'dropna',
 'dtype',
 'dtypes',
 'duplicated',
 'empty',
 'eq',
 'equals',
 'ewm',
 'expanding',
 'explode',
 'factorize',
 'ffill',
 'fillna',
 'filter',
 'first',
 'first_valid_index',
 'flags',
 'floordiv',
 'ge',
 'get',
 'groupby',
 'gt',
 'hasnans',
 'head',
 'hist',
 'iat',
 'idxmax',
 'idxmin',
 'iloc',
 'index',
 'infer_objects',
 'info',
 'interpolate',
 'is_monotonic',
 'is_monotonic_decreasing',
 'is_monotonic_increasing',