In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn import set_config
from catboost import CatBoostRegressor

#### Load data

In [8]:
X_train = pd.read_csv('X_train.csv', encoding='cp949').drop(columns='ID')
y_train = pd.read_csv('y_train.csv', encoding='cp949').Salary

X_test = pd.read_csv('X_test.csv', encoding='cp949')
test_id = X_test.ID
X_test = X_test.drop(columns='ID')

In [9]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16570 entries, 0 to 16569
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   직종      16570 non-null  object 
 1   세부직종    16570 non-null  object 
 2   직무태그    14055 non-null  object 
 3   근무경력    16570 non-null  object 
 4   근무형태    6661 non-null   object 
 5   근무지역    16570 non-null  object 
 6   출신대학    16570 non-null  object 
 7   대학전공    16570 non-null  object 
 8   어학시험    4988 non-null   object 
 9   자격증     16570 non-null  object 
 10  대학성적    14600 non-null  float64
dtypes: float64(1), object(10)
memory usage: 1.4+ MB


In [10]:
X_train.head()

Unnamed: 0,직종,세부직종,직무태그,근무경력,근무형태,근무지역,출신대학,대학전공,어학시험,자격증,대학성적
0,문화·예술·신문·방송,영상·음향·사진·카메라,"취재기자, 편집 기사, 유아 사이트 기획, 시나리오 작성",0개월,,"서울,경기,서울",성균관대학교,국문,,無,70.0
1,경영·기획·회계·사무,사무·총무·법무,,2년 11개월,정규직,"부산,서울,일본",신라대학교,관광경영,JLPT,無,
2,IT·게임,하드웨어설계·개발·관리,"하드웨어 , 무선통신 , MICOM , ASM , RF , CDMA , Firm W...",0개월,,"서울,경기,",수원대학교,정보통신공,,有,60.0
3,영업·판매·TM,기술영업,반도체,1년 0개월,정규직,"경기,서울,충북",수원대학교,정보통신,,有,70.0
4,기술·과학·산업,기타 기술·과학·산업,기술직,0개월,,"전국,전국,전국",한밭대학교,화학공학과,,無,70.0


# 대학 전공

In [2]:
major = pd.read_csv('major.csv').T

In [3]:
major

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,100,101,102,103,104
1,2,3,4,5,6,7,8,9,10,11,...,97,98,99,100,101,102,103,104,105,106
국어국문학과,독일어문학과,러시아어문학과,스페인어문학과,언어학과,영미어문학과,일본어문학과,중국어문학과,프랑스어문학과,기타동양어문학과,기타서양어문학과,...,국악학과,기악과,성악과,실용음악학과,음악학과,작곡과,영상콘텐츠학과,체육교육학과,음악교육학과,미술교육학과
국문,독문(영문),노문학,스페인어,관광통역,영어영문학과,일어일문,중국학,불어불문학과,동양어문학,서양학과,...,국악,F-Horn,음악(성악),교회음악,Music Theory,작곡,다중매체,체육교육,음악교육,미&#49715;교육과
국어/전산,"독문,심리",노어노문,스페인어 중남미학과,관광통역학,영어영문,일본어,중국학과,불문,동양어문학부,,...,국악과,기악,성악,교회음악과,음악학,작곡학과,다중매체영상학,체육교육과,,미술교육
국어국문학,독문학,노어노문학,스페인어중남미,관광통역학과,영어영문학,관광일본어,중국학과(중퇴),불문학,,,...,한국음악 거문고전공,기악(관악),성악과,실용음악과,음악과,,디지털 컨텐츠,체육교육학,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Unnamed: 109,,,,,,,,,,,...,,,,,,,,,,
Unnamed: 110,,,,,,,,,,,...,,,,,,,,,,
Unnamed: 111,,,,,,,,,,,...,,,,,,,,,,
Unnamed: 112,,,,,,,,,,,...,,,,,,,,,,


In [4]:
major = major.drop('1')

In [5]:
major = major.rename(columns=major.iloc[0])

In [6]:
major

Unnamed: 0,독일어문학과,러시아어문학과,스페인어문학과,언어학과,영미어문학과,일본어문학과,중국어문학과,프랑스어문학과,기타동양어문학과,기타서양어문학과,...,국악학과,기악과,성악과,실용음악학과,음악학과,작곡과,영상콘텐츠학과,체육교육학과,음악교육학과,미술교육학과
국어국문학과,독일어문학과,러시아어문학과,스페인어문학과,언어학과,영미어문학과,일본어문학과,중국어문학과,프랑스어문학과,기타동양어문학과,기타서양어문학과,...,국악학과,기악과,성악과,실용음악학과,음악학과,작곡과,영상콘텐츠학과,체육교육학과,음악교육학과,미술교육학과
국문,독문(영문),노문학,스페인어,관광통역,영어영문학과,일어일문,중국학,불어불문학과,동양어문학,서양학과,...,국악,F-Horn,음악(성악),교회음악,Music Theory,작곡,다중매체,체육교육,음악교육,미&#49715;교육과
국어/전산,"독문,심리",노어노문,스페인어 중남미학과,관광통역학,영어영문,일본어,중국학과,불문,동양어문학부,,...,국악과,기악,성악,교회음악과,음악학,작곡학과,다중매체영상학,체육교육과,,미술교육
국어국문학,독문학,노어노문학,스페인어중남미,관광통역학과,영어영문학,관광일본어,중국학과(중퇴),불문학,,,...,한국음악 거문고전공,기악(관악),성악과,실용음악과,음악과,,디지털 컨텐츠,체육교육학,,
"국문,사회학",독문학과,노어러시아,스페인어중남미학,관광통역학과 중국어,영문학,관광일어,중문,불어,,,...,,기악과,,,음악,,디지털 컨텐츠학,체육교육학과,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Unnamed: 109,,,,,,,,,,,...,,,,,,,,,,
Unnamed: 110,,,,,,,,,,,...,,,,,,,,,,
Unnamed: 111,,,,,,,,,,,...,,,,,,,,,,
Unnamed: 112,,,,,,,,,,,...,,,,,,,,,,


* * *

In [13]:
col = major.columns

In [15]:
for i in range(105):
    cat = major.iloc[:, i].values.tolist()
    cat = list(map(str, cat))
    name = [j for j in cat if j != 'nan']
    
    for k in col:
        X_train = X_train.replace(name, k)

KeyboardInterrupt: 

In [None]:
cat = major.iloc[:, 0].values.tolist()

In [None]:
cat = list(map(str, cat))

In [None]:
name = [i for i in cat if i != 'nan']

In [None]:
name

In [None]:
X_train = X_train.replace(name, '독일어문학과')

In [None]:
X_train['대학전공'].value_counts()