In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations 

#### Load data

In [2]:
X_train = pd.read_csv('X_train_111.csv', encoding='cp949')
y_train = pd.read_csv('y_train.csv', encoding='cp949').Salary

X_test = pd.read_csv('X_test_111.csv', encoding='cp949')
test_id = pd.read_csv('X_test.csv', encoding='cp949').ID

In [3]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16570 entries, 0 to 16569
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   직종      16570 non-null  object 
 1   세부직종    16570 non-null  object 
 2   직무태그    16570 non-null  object 
 3   근무경력    16570 non-null  object 
 4   근무형태    16570 non-null  object 
 5   근무지역    16570 non-null  object 
 6   출신대학    16570 non-null  object 
 7   대학전공    16570 non-null  object 
 8   어학시험    16570 non-null  object 
 9   자격증     16570 non-null  object 
 10  대학성적    16570 non-null  float64
dtypes: float64(1), object(10)
memory usage: 1.4+ MB


***

# 수치형 피쳐생성

### 근무경력 =>수치로 변환해서 '근무개월'열 생성

In [4]:
### 경력을 수치로 변환
import re

def f1(x):
    numbers = re.findall(r'\d+', x)
    return int(numbers[0])*12 if len(numbers) == 1 else int(numbers[0])*12+int(numbers[1])

X_train['근무개월'] = X_train['근무경력'].apply(f1)
X_test['근무개월'] = X_test['근무경력'].apply(f1)

### 근무개월log

In [5]:
for df in [X_train, X_test]:
    df['근무개월log'] = np.log1p(1+df['근무개월'])

### 대학랭킹

In [6]:
top_rank = ['성균관대학교', '연세대학교', '중앙대학교', '이화여자대학교']
mid_rank = ['세종대학교', '성신여자대학교']
low_rank = ['한성대학교', '동덕여자대학교', '서울여자대학교', '서울과학기술대학교']
province = ['제주대학교', '인천대학교', '군산대학교', '한국산업기술대학교']
etc= ['수원대학교', '전주대학교', '세명대학교', '신라대학교', '상지대학교', '한밭대학교', '경기대학교', '선문대학교', '성공회대학교', '호원대학교', '한림대학교', '목포대학교', '동아대학교', '동의대학교', '성결대학교', '호남대학교', '광주대학교', '서원대학교', '동서대학교', '한일장신대학교', '한세대학교', '용인대학교', '경주대학교', '협성대학교', '서울신학대학교', '송원대학교', '부산디지털대학교','남부대학교','기타']

university = []
for i in X_train['출신대학']:
    if i in top_rank:
        university.append(1)
    elif i in mid_rank:
        university.append(2)
    elif i in low_rank:
        university.append(3)
    elif i in province:
        university.append(4)
    elif i in etc:
        university.append(5)  
        
X_train['대학랭킹'] = university

university = []
for i in X_test['출신대학']:
    if i in top_rank:
        university.append(1)
    elif i in mid_rank:
        university.append(2)
    elif i in low_rank:
        university.append(3)
    elif i in province:
        university.append(4)
    elif i in etc:
        university.append(5)
        
X_test['대학랭킹'] = university

### 출신대학수치*대학성적

In [7]:
university = []
for i in X_train['출신대학']:
    if i in top_rank:
        university.append(5)
    elif i in mid_rank:
        university.append(4)
    elif i in low_rank:
        university.append(3)
    elif i in province:
        university.append(2)
    elif i in etc:
        university.append(1)
        
X_train['출신대학수치'] = university

university = []
for i in X_test['출신대학']:
    if i in top_rank:
        university.append(5)
    elif i in mid_rank:
        university.append(4)
    elif i in low_rank:
        university.append(3)
    elif i in province:
        university.append(2)
    elif i in etc:
        university.append(1) 
        
X_test['출신대학수치'] = university

a = X_train['출신대학수치'] * X_train['대학성적']
X_train['출신대학수치*대학성적'] = a
a = X_test['출신대학수치'] * X_test['대학성적']
X_test['출신대학수치*대학성적'] = a


#출신대학수치 삭제
#X_train = X_train.drop(['출신대학수치'],axis=1)
#X_test = X_test.drop(['출신대학수치'], axis=1)

### (범주형) 근무지역 1,2,3으로 나눈 열 생성

In [8]:
result1=[]
result2=[]
result3=[]

for x in X_train.근무지역:
    L = x.split(',')
    if (L[1]==''):
        result1.append(L[0])
        result2.append(L[0])
        result3.append(L[0])
    elif (L[2]==''):
        result1.append(L[0])
        result2.append(L[1])
        result3.append(L[1])
    else: 
        result1.append(L[0])
        result2.append(L[1])
        result3.append(L[2])
        
X_train['근무지역_1'] = result1
X_train['근무지역_2'] = result2
X_train['근무지역_3'] = result3

In [9]:
result1=[]
result2=[]
result3=[]

for x in X_test.근무지역:
    L = x.split(',')
    if (L[1]==''):
        result1.append(L[0])
        result2.append(L[0])
        result3.append(L[0])
    elif (L[2]==''):
        result1.append(L[0])
        result2.append(L[1])
        result3.append(L[1])
    else: 
        result1.append(L[0])
        result2.append(L[1])
        result3.append(L[2])
        
X_test['근무지역_1'] = result1
X_test['근무지역_2'] = result2
X_test['근무지역_3'] = result3

### (범주형)마지막근무형태

In [10]:
X_train['마지막근무형태'] = 0
for i in range(0,len(X_train.근무형태),1):
    X_train['마지막근무형태'][i] = X_train.근무형태[i].split(', ')[-1]
    if (X_train.근무형태[i].split(', ')[-1] == ''):
        X_train['마지막근무형태'][i] = X_train.근무형태[i].split(', ')[-2]

X_test['마지막근무형태'] = 0
for i in range(0,len(X_test.근무형태),1):
    X_test['마지막근무형태'][i] = X_test.근무형태[i].split(', ')[-1]
    if (X_test.근무형태[i].split(', ')[-1] == ''):
        X_test['마지막근무형태'][i] = X_test.근무형태[i].split(', ')[-2]

### 직무태그 하나씩 열 생성

In [11]:
List=[]
name = []
for i in range(1,28,1):
    name.append('직무태그_'+ str(i))
    
for n in X_train.직무태그:
    n1 = (n.split(','))
    for i in range(0, len(n1), 1):
        n1[i] = n1[i].strip()
    while(len(n1)<27):
        n1.append(n1[-1])
    List.append(n1)
df1 = pd.DataFrame(List, columns=name)
X_train = pd.concat([X_train,df1],axis=1)

List=[]
name = []
for i in range(1,28,1):
    name.append('직무태그_'+ str(i))
    
for n in X_test.직무태그:
    n1 = (n.split(','))
    for i in range(0, len(n1), 1):
        n1[i] = n1[i].strip()
    while(len(n1)<27):
        n1.append(n1[-1])
    List.append(n1)
df2 = pd.DataFrame(List, columns=name)
X_test = pd.concat([X_test,df2],axis=1)

### 직무태그개수

In [12]:
List = []
for n in X_train.직무태그:
    L = len(n.split(', '))
    List.append(L)
X_train['직무태그개수'] = List

List = []
for n in X_test.직무태그:
    L = len(n.split(', '))
    List.append(L)
X_test['직무태그개수'] = List

### 대학성적*0.1 +근무개월

In [13]:
X_train['대학성적0.1+근무개월'] = X_train.대학성적*0.1 + X_train.근무개월
X_test['대학성적0.1+근무개월'] = X_test.대학성적*0.1 + X_test.근무개월

## 범주형 변수 groupby로 수치형 변수 sum, mean, variance, standard deviation 피쳐 생성

In [14]:
def inf_num(cat, num):
    global X_train
    global X_test
    
    group_object1 = X_train.groupby(by=[cat])[num].agg('sum').reset_index()
    group_object2 = X_test.groupby(by=[cat])[num].agg('sum').reset_index()

    group_object1 = group_object1.rename(columns = {num:(cat+'_'+num+'_sum')})
    group_object2 = group_object2.rename(columns = {num:(cat+'_'+num+'_sum')})

    X_train = pd.merge(X_train, group_object1, on = cat, how='left')
    X_test = pd.merge(X_test, group_object2, on = cat, how='left')

    # mean

    group_object3 = X_train.groupby(by=[cat])[num].agg('mean').reset_index()
    group_object4 = X_test.groupby(by=[cat])[num].agg('mean').reset_index()

    group_object3 = group_object3.rename(columns = {num:(cat+'_'+num+'_mean')})
    group_object4 = group_object4.rename(columns = {num:(cat+'_'+num+'_mean')})

    X_train = pd.merge(X_train, group_object3, on = cat, how='left')
    X_test = pd.merge(X_test, group_object4, on = cat, how='left')

    # variance

    group_object5 = X_train.groupby(by=[cat])[num].agg('var').reset_index()
    group_object6 = X_test.groupby(by=[cat])[num].agg('var').reset_index()

    group_object5 = group_object5.rename(columns = {num:(cat+'_'+num+'_variance')})
    group_object6 = group_object6.rename(columns = {num:(cat+'_'+num+'_variance')})

    X_train = pd.merge(X_train, group_object5, on = cat, how='left')
    X_test = pd.merge(X_test, group_object6, on = cat, how='left')

    # standard deviation

    group_object7 = X_train.groupby(by=[cat])[num].agg('std').reset_index()
    group_object8 = X_test.groupby(by=[cat])[num].agg('std').reset_index()

    group_object7 = group_object7.rename(columns = {num:(cat+'_'+num+'_standard_deviation')})
    group_object8 = group_object8.rename(columns = {num:(cat+'_'+num+'_standard_deviation')})

    X_train = pd.merge(X_train, group_object7, on = cat, how='left')
    X_test = pd.merge(X_test, group_object8, on = cat, how='left')

    X_train[(cat+'_'+num+'_variance')] = X_train[(cat+'_'+num+'_variance')].fillna(0)
    X_test[(cat+'_'+num+'_variance')] = X_test[(cat+'_'+num+'_variance')].fillna(0)
    X_train[(cat+'_'+num+'_standard_deviation')] = X_train[(cat+'_'+num+'_standard_deviation')].fillna(0)
    X_test[(cat+'_'+num+'_standard_deviation')] = X_test[(cat+'_'+num+'_standard_deviation')].fillna(0)

In [15]:
def inf_num2(cat1,cat2, num):
    global X_train
    global X_test
    
    group_object1 = X_train.groupby(by=[cat1,cat2])[num].agg('sum').reset_index()
    group_object2 = X_test.groupby(by=[cat1,cat2])[num].agg('sum').reset_index()

    group_object1 = group_object1.rename(columns = {num:(cat1+'_'+cat2+'_'+num+'_sum')})
    group_object2 = group_object2.rename(columns = {num:(cat1+'_'+cat2+'_'+num+'_sum')})

    X_train = pd.merge(X_train, group_object1, on = [cat1,cat2], how='left')
    X_test = pd.merge(X_test, group_object2, on = [cat1,cat2], how='left')

    # mean

    group_object3 = X_train.groupby(by=[cat1,cat2])[num].agg('mean').reset_index()
    group_object4 = X_test.groupby(by=[cat1,cat2])[num].agg('mean').reset_index()

    group_object3 = group_object3.rename(columns = {num:(cat1+'_'+cat2+'_'+num+'_mean')})
    group_object4 = group_object4.rename(columns = {num:(cat1+'_'+cat2+'_'+num+'_mean')})

    X_train = pd.merge(X_train, group_object3, on = [cat1,cat2], how='left')
    X_test = pd.merge(X_test, group_object4, on = [cat1,cat2], how='left')

    # variance

    group_object5 = X_train.groupby(by=[cat1,cat2])[num].agg('var').reset_index()
    group_object6 = X_test.groupby(by=[cat1,cat2])[num].agg('var').reset_index()

    group_object5 = group_object5.rename(columns = {num:(cat1+'_'+cat2+'_'+num+'_variance')})
    group_object6 = group_object6.rename(columns = {num:(cat1+'_'+cat2+'_'+num+'_variance')})

    X_train = pd.merge(X_train, group_object5, on = [cat1,cat2], how='left')
    X_test = pd.merge(X_test, group_object6, on = [cat1,cat2], how='left')

    # standard deviation

    group_object7 = X_train.groupby(by=[cat1,cat2])[num].agg('std').reset_index()
    group_object8 = X_test.groupby(by=[cat1,cat2])[num].agg('std').reset_index()

    group_object7 = group_object7.rename(columns = {num:(cat1+'_'+cat2+'_'+num+'_standard_deviation')})
    group_object8 = group_object8.rename(columns = {num:(cat1+'_'+cat2+'_'+num+'_standard_deviation')})

    X_train = pd.merge(X_train, group_object7, on = [cat1,cat2], how='left')
    X_test = pd.merge(X_test, group_object8, on = [cat1,cat2], how='left')

    X_train[(cat1+'_'+cat2+'_'+num+'_variance')] = X_train[(cat1+'_'+cat2+'_'+num+'_variance')].fillna(0)
    X_test[(cat1+'_'+cat2+'_'+num+'_variance')] = X_test[(cat1+'_'+cat2+'_'+num+'_variance')].fillna(0)
    X_train[(cat1+'_'+cat2+'_'+num+'_standard_deviation')] = X_train[(cat1+'_'+cat2+'_'+num+'_standard_deviation')].fillna(0)
    X_test[(cat1+'_'+cat2+'_'+num+'_standard_deviation')] = X_test[(cat1+'_'+cat2+'_'+num+'_standard_deviation')].fillna(0)

In [16]:
#범주형 변수 하나 별 근무개월
inf_num('출신대학','근무개월')
inf_num('직종','근무개월')
inf_num('세부직종','근무개월')
inf_num('근무지역_1','근무개월')
inf_num('근무지역_2','근무개월')
inf_num('근무지역_3','근무개월')
inf_num('마지막근무형태','근무개월')
inf_num('직무태그_1','근무개월')
inf_num('대학성적','근무개월')
inf_num('대학전공','근무개월')
inf_num('어학시험','근무개월')

#범주형 변수 두개 별 근무개월
inf_num2('직종','세부직종','근무개월')
inf_num2('직종','근무지역_1','근무개월')
inf_num2('직종','근무지역_2','근무개월')
inf_num2('직종','근무지역_3','근무개월')
inf_num2('직종','마지막근무형태','근무개월')
inf_num2('직종','직무태그_1','근무개월')
inf_num2('직종','대학성적','근무개월')

inf_num2('세부직종','근무지역_1','근무개월')
inf_num2('세부직종','근무지역_2','근무개월')
inf_num2('세부직종','근무지역_3','근무개월')
inf_num2('세부직종','마지막근무형태','근무개월')

inf_num2('근무지역_1','근무지역_2','근무개월')
inf_num2('근무지역_1','근무지역_3','근무개월')
inf_num2('근무지역_2','근무지역_3','근무개월')

inf_num2('자격증','어학시험','근무개월')

In [17]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16570 entries, 0 to 16569
Columns: 153 entries, 직종 to 자격증_어학시험_근무개월_standard_deviation
dtypes: float64(82), int64(30), object(41)
memory usage: 19.5+ MB


### 수치형 변수 통합 열 생성

In [18]:
X_train['수치형통합'] = np.log1p(1+X_train.근무개월+X_train.대학성적+X_train.출신대학수치)
X_test['수치형통합'] = np.log1p(1+X_test.근무개월+X_test.대학성적+X_test.출신대학수치)

In [19]:
#범주형 변수 하나 별 근무개월
inf_num('출신대학','수치형통합')
inf_num('직종','수치형통합')
inf_num('세부직종','수치형통합')
inf_num('근무지역_1','수치형통합')
inf_num('근무지역_2','수치형통합')
inf_num('근무지역_3','수치형통합')
inf_num('마지막근무형태','수치형통합')
inf_num('직무태그_1','수치형통합')
inf_num('대학성적','수치형통합')
inf_num('대학전공','수치형통합')
inf_num('어학시험','수치형통합')

#범주형 변수 두개 별 근무개월
inf_num2('직종','세부직종','수치형통합')
inf_num2('직종','근무지역_1','수치형통합')
inf_num2('직종','근무지역_2','수치형통합')
inf_num2('직종','근무지역_3','수치형통합')
inf_num2('직종','마지막근무형태','수치형통합')
inf_num2('직종','직무태그_1','수치형통합')
inf_num2('직종','대학성적','수치형통합')

inf_num2('세부직종','근무지역_1','수치형통합')
inf_num2('세부직종','근무지역_2','수치형통합')
inf_num2('세부직종','근무지역_3','수치형통합')
inf_num2('세부직종','마지막근무형태','수치형통합')

inf_num2('근무지역_1','근무지역_2','수치형통합')
inf_num2('근무지역_1','근무지역_3','수치형통합')
inf_num2('근무지역_2','근무지역_3','수치형통합')

inf_num2('자격증','어학시험','수치형통합')

# 범주형 피쳐생성

### 정규직비정규직  
- 정규직 외 비정규직 처리

In [20]:
X_train['정규직비정규직'] = np.where(X_train.근무형태 == "정규직", '정규직', '비정규직')
X_test['정규직비정규직'] = np.where(X_test.근무형태 == "정규직", '정규직', '비정규직')

### 대학+전공

In [21]:
X_train['대학+전공'] = X_train['출신대학'] + X_train['대학전공']
X_test['대학+전공'] = X_test['출신대학'] + X_test['대학전공']

### 직종+세부직종

In [22]:
X_train['직종+세부직종']=(X_train.직종 + X_train.세부직종)
X_test['직종+세부직종']=(X_test.직종 + X_test.세부직종)

### 근무형태+직종

In [23]:
X_train['근무형태+직종'] = X_train['근무형태']+X_train['직종']
X_test['근무형태+직종'] = X_test['근무형태']+X_test['직종']

### 대학교 인서울 여부

In [24]:
in_seoul_lst = ['성균관대학교', '중앙대학교', '세종대학교', '연세대학교', '이화여자대학교']

X_train['인서울여부'] = X_train['출신대학'].map(lambda x: '0' if x in in_seoul_lst else '1')

X_test['인서울여부'] = X_test['출신대학'].map(lambda x: '0' if x in in_seoul_lst else '1')

### 자격증또는어학시험유무

In [25]:
X_train['자격증또는어학시험유무'] = np.where((X_train.자격증 == "無")& (X_train.어학시험 == '없음'), 0, 1)
X_test['자격증또는어학시험유무'] = np.where((X_test.자격증 == "無")&(X_test.어학시험 == '없음'), 0, 1)

### 범주형 변수 2개 합침

In [26]:
X_train['직종+직무태그'] = X_train['직종'] + X_train['직무태그']
X_train['직종+근무경력'] = X_train['직종'] + X_train['근무경력']
X_train['직종+대학전공'] = X_train['직종'] + X_train['대학전공']
X_train['직종+어학시험'] = X_train['직종'] + X_train['어학시험']
X_train['직종+자격증'] = X_train['직종'] + X_train['자격증']
#X_train['직종+근무개월'] = X_train['직종'] + X_train['근무개월']
X_train['직종+근무지역_1'] = X_train['직종'] + X_train['근무지역_1']
X_train['직종+근무지역_2'] = X_train['직종'] + X_train['근무지역_2']
X_train['직종+근무지역_3'] = X_train['직종'] + X_train['근무지역_3']
X_train['직종+출신대학'] = X_train['직종'] + X_train['출신대학']
X_train['직종+마지막근무형태'] = X_train['직종'] + X_train['마지막근무형태']

X_test['직종+직무태그'] = X_test['직종'] + X_test['직무태그']
X_test['직종+근무경력'] = X_test['직종'] + X_test['근무경력']
X_test['직종+대학전공'] = X_test['직종'] + X_test['대학전공']
X_test['직종+어학시험'] = X_test['직종'] + X_test['어학시험']
X_test['직종+자격증'] = X_test['직종'] + X_test['자격증']
#X_test['직종+근무개월'] = X_test['직종'] + X_test['근무개월']
X_test['직종+근무지역_1'] = X_test['직종'] + X_test['근무지역_1']
X_test['직종+근무지역_2'] = X_test['직종'] + X_test['근무지역_2']
X_test['직종+근무지역_3'] = X_test['직종'] + X_test['근무지역_3']
X_test['직종+출신대학'] = X_test['직종'] + X_test['출신대학']
X_test['직종+마지막근무형태'] = X_test['직종'] + X_test['마지막근무형태']

X_train['세부직종+직무태그'] = X_train['세부직종'] + X_train['직무태그']
X_train['세부직종+근무경력'] = X_train['세부직종'] + X_train['근무경력']
X_train['세부직종+대학전공'] = X_train['세부직종'] + X_train['대학전공']
X_train['세부직종+어학시험'] = X_train['세부직종'] + X_train['어학시험']
X_train['세부직종+자격증'] = X_train['세부직종'] + X_train['자격증']
#X_train['세부직종+근무개월'] = X_train['세부직종'] + X_train['근무개월']
X_train['세부직종+근무지역_1'] = X_train['세부직종'] + X_train['근무지역_1']
X_train['세부직종+근무지역_2'] = X_train['세부직종'] + X_train['근무지역_2']
X_train['세부직종+근무지역_3'] = X_train['세부직종'] + X_train['근무지역_3']
X_train['세부직종+출신대학'] = X_train['세부직종'] + X_train['출신대학']
X_train['세부직종+근무형태'] = X_train['세부직종'] + X_train['근무형태']
X_train['세부직종+마지막근무형태'] = X_train['세부직종'] + X_train['마지막근무형태']

X_test['세부직종+직무태그'] = X_test['세부직종'] + X_test['직무태그']
X_test['세부직종+근무경력'] = X_test['세부직종'] + X_test['근무경력']
X_test['세부직종+대학전공'] = X_test['세부직종'] + X_test['대학전공']
X_test['세부직종+어학시험'] = X_test['세부직종'] + X_test['어학시험']
X_test['세부직종+자격증'] = X_test['세부직종'] + X_test['자격증']
#X_test['세부직종+근무개월'] = X_test['세부직종'] + X_test['근무개월']
X_test['세부직종+근무지역_1'] = X_test['세부직종'] + X_test['근무지역_1']
X_test['세부직종+근무지역_2'] = X_test['세부직종'] + X_test['근무지역_2']
X_test['세부직종+근무지역_3'] = X_test['세부직종'] + X_test['근무지역_3']
X_test['세부직종+출신대학'] = X_test['세부직종'] + X_test['출신대학']
X_test['세부직종+근무형태'] = X_test['세부직종'] + X_test['근무형태']
X_test['세부직종+마지막근무형태'] = X_test['세부직종'] + X_test['마지막근무형태']

X_train['직무태그+근무경력'] = X_train['직무태그'] + X_train['근무경력']
X_train['직무태그+대학전공'] = X_train['직무태그'] + X_train['대학전공']
X_train['직무태그+어학시험'] = X_train['직무태그'] + X_train['어학시험']
X_train['직무태그+자격증'] = X_train['직무태그'] + X_train['자격증']
#X_train['직무태그+근무개월'] = X_train['직무태그'] + X_train['근무개월']
X_train['직무태그+근무지역_1'] = X_train['직무태그'] + X_train['근무지역_1']
X_train['직무태그+근무지역_2'] = X_train['직무태그'] + X_train['근무지역_2']
X_train['직무태그+근무지역_3'] = X_train['직무태그'] + X_train['근무지역_3']
X_train['직무태그+출신대학'] = X_train['직무태그'] + X_train['출신대학']
X_train['직무태그+근무형태'] = X_train['직무태그'] + X_train['근무형태']
X_train['직무태그+마지막근무형태'] = X_train['직무태그'] + X_train['마지막근무형태']

X_test['직무태그+근무경력'] = X_test['직무태그'] + X_test['근무경력']
X_test['직무태그+대학전공'] = X_test['직무태그'] + X_test['대학전공']
X_test['직무태그+어학시험'] = X_test['직무태그'] + X_test['어학시험']
X_test['직무태그+자격증'] = X_test['직무태그'] + X_test['자격증']
#X_test['직무태그+근무개월'] = X_test['직무태그'] + X_test['근무개월']
X_test['직무태그+근무지역_1'] = X_test['직무태그'] + X_test['근무지역_1']
X_test['직무태그+근무지역_2'] = X_test['직무태그'] + X_test['근무지역_2']
X_test['직무태그+근무지역_3'] = X_test['직무태그'] + X_test['근무지역_3']
X_test['직무태그+출신대학'] = X_test['직무태그'] + X_test['출신대학']
X_test['직무태그+근무형태'] = X_test['직무태그'] + X_test['근무형태']
X_test['직무태그+마지막근무형태'] = X_test['직무태그'] + X_test['마지막근무형태']

X_train['근무경력+대학전공'] = X_train['근무경력'] + X_train['대학전공']
X_train['근무경력+어학시험'] = X_train['근무경력'] + X_train['어학시험']
X_train['근무경력+자격증'] = X_train['근무경력'] + X_train['자격증']
#X_train['근무경력+근무개월'] = X_train['근무경력'] + X_train['근무개월']
X_train['근무경력+근무지역_1'] = X_train['근무경력'] + X_train['근무지역_1']
X_train['근무경력+근무지역_2'] = X_train['근무경력'] + X_train['근무지역_2']
X_train['근무경력+근무지역_3'] = X_train['근무경력'] + X_train['근무지역_3']
X_train['근무경력+출신대학'] = X_train['근무경력'] + X_train['출신대학']
X_train['근무경력+근무형태'] = X_train['근무경력'] + X_train['근무형태']
X_train['근무경력+마지막근무형태'] = X_train['근무경력'] + X_train['마지막근무형태']

X_test['근무경력+대학전공'] = X_test['근무경력'] + X_test['대학전공']
X_test['근무경력+어학시험'] = X_test['근무경력'] + X_test['어학시험']
X_test['근무경력+자격증'] = X_test['근무경력'] + X_test['자격증']
#X_test['근무경력+근무개월'] = X_test['근무경력'] + X_test['근무개월']
X_test['근무경력+근무지역_1'] = X_test['근무경력'] + X_test['근무지역_1']
X_test['근무경력+근무지역_2'] = X_test['근무경력'] + X_test['근무지역_2']
X_test['근무경력+근무지역_3'] = X_test['근무경력'] + X_test['근무지역_3']
X_test['근무경력+출신대학'] = X_test['근무경력'] + X_test['출신대학']
X_test['근무경력+근무형태'] = X_test['근무경력'] + X_test['근무형태']
X_test['근무경력+마지막근무형태'] = X_test['근무경력'] + X_test['마지막근무형태']

X_train['대학전공+어학시험'] = X_train['대학전공'] + X_train['어학시험']
X_train['대학전공+자격증'] = X_train['대학전공'] + X_train['자격증']
#X_train['대학전공+근무개월'] = X_train['대학전공'] + X_train['근무개월']
X_train['대학전공+근무지역_1'] = X_train['대학전공'] + X_train['근무지역_1']
X_train['대학전공+근무지역_2'] = X_train['대학전공'] + X_train['근무지역_2']
X_train['대학전공+근무지역_3'] = X_train['대학전공'] + X_train['근무지역_3']
X_train['대학전공+출신대학'] = X_train['대학전공'] + X_train['출신대학']
X_train['대학전공+근무형태'] = X_train['대학전공'] + X_train['근무형태']
X_train['대학전공+마지막근무형태'] = X_train['대학전공'] + X_train['마지막근무형태']

X_test['대학전공+어학시험'] = X_test['대학전공'] + X_test['어학시험']
X_test['대학전공+자격증'] = X_test['대학전공'] + X_test['자격증']
#X_test['대학전공+근무개월'] = X_test['대학전공'] + X_test['근무개월']
X_test['대학전공+근무지역_1'] = X_test['대학전공'] + X_test['근무지역_1']
X_test['대학전공+근무지역_2'] = X_test['대학전공'] + X_test['근무지역_2']
X_test['대학전공+근무지역_3'] = X_test['대학전공'] + X_test['근무지역_3']
X_test['대학전공+출신대학'] = X_test['대학전공'] + X_test['출신대학']
X_test['대학전공+근무형태'] = X_test['대학전공'] + X_test['근무형태']
X_test['대학전공+마지막근무형태'] = X_test['대학전공'] + X_test['마지막근무형태']

X_train['어학시험+자격증'] = X_train['어학시험'] + X_train['자격증']
#X_train['어학시험+근무개월'] = X_train['어학시험'] + X_train['근무개월']
X_train['어학시험+근무지역_1'] = X_train['어학시험'] + X_train['근무지역_1']
X_train['어학시험+근무지역_2'] = X_train['어학시험'] + X_train['근무지역_2']
X_train['어학시험+근무지역_3'] = X_train['어학시험'] + X_train['근무지역_3']
X_train['어학시험+출신대학'] = X_train['어학시험'] + X_train['출신대학']
X_train['어학시험+근무형태'] = X_train['어학시험'] + X_train['근무형태']
X_train['어학시험+마지막근무형태'] = X_train['어학시험'] + X_train['마지막근무형태']

X_test['어학시험+자격증'] = X_test['어학시험'] + X_test['자격증']
#X_test['어학시험+근무개월'] = X_test['어학시험'] + X_test['근무개월']
X_test['어학시험+근무지역_1'] = X_test['어학시험'] + X_test['근무지역_1']
X_test['어학시험+근무지역_2'] = X_test['어학시험'] + X_test['근무지역_2']
X_test['어학시험+근무지역_3'] = X_test['어학시험'] + X_test['근무지역_3']
X_test['어학시험+출신대학'] = X_test['어학시험'] + X_test['출신대학']
X_test['어학시험+근무형태'] = X_test['어학시험'] + X_test['근무형태']
X_test['어학시험+마지막근무형태'] = X_test['어학시험'] + X_test['마지막근무형태']

#X_train['자격증+근무개월'] = X_train['자격증'] + X_train['근무개월']
X_train['자격증+근무지역_1'] = X_train['자격증'] + X_train['근무지역_1']
X_train['자격증+근무지역_2'] = X_train['자격증'] + X_train['근무지역_2']
X_train['자격증+근무지역_3'] = X_train['자격증'] + X_train['근무지역_3']
X_train['자격증+출신대학'] = X_train['자격증'] + X_train['출신대학']
X_train['자격증+근무형태'] = X_train['자격증'] + X_train['근무형태']
X_train['자격증+마지막근무형태'] = X_train['자격증'] + X_train['마지막근무형태']

#X_test['자격증+근무개월'] = X_test['자격증'] + X_test['근무개월']
X_test['자격증+근무지역_1'] = X_test['자격증'] + X_test['근무지역_1']
X_test['자격증+근무지역_2'] = X_test['자격증'] + X_test['근무지역_2']
X_test['자격증+근무지역_3'] = X_test['자격증'] + X_test['근무지역_3']
X_test['자격증+출신대학'] = X_test['자격증'] + X_test['출신대학']
X_test['자격증+근무형태'] = X_test['자격증'] + X_test['근무형태']
X_test['자격증+마지막근무형태'] = X_test['자격증'] + X_test['마지막근무형태']

X_train['근무지역_1+출신대학'] = X_train['근무지역_1'] + X_train['출신대학']
X_train['근무지역_1+근무형태'] = X_train['근무지역_1'] + X_train['근무형태']
X_train['근무지역_1+마지막근무형태'] = X_train['근무지역_1'] + X_train['마지막근무형태']

X_test['근무지역_1+출신대학'] = X_test['근무지역_1'] + X_test['출신대학']
X_test['근무지역_1+근무형태'] = X_test['근무지역_1'] + X_test['근무형태']
X_test['근무지역_1+마지막근무형태'] = X_test['근무지역_1'] + X_test['마지막근무형태']

X_train['근무지역_2+출신대학'] = X_train['근무지역_2'] + X_train['출신대학']
X_train['근무지역_2+근무형태'] = X_train['근무지역_2'] + X_train['근무형태']
X_train['근무지역_2+마지막근무형태'] = X_train['근무지역_2'] + X_train['마지막근무형태']

X_test['근무지역_2+출신대학'] = X_test['근무지역_2'] + X_test['출신대학']
X_test['근무지역_2+근무형태'] = X_test['근무지역_2'] + X_test['근무형태']
X_test['근무지역_2+마지막근무형태'] = X_test['근무지역_2'] + X_test['마지막근무형태']

X_train['근무지역_3+출신대학'] = X_train['근무지역_3'] + X_train['출신대학']
X_train['근무지역_3+근무형태'] = X_train['근무지역_3'] + X_train['근무형태']
X_train['근무지역_3+마지막근무형태'] = X_train['근무지역_3'] + X_train['마지막근무형태']

X_test['근무지역_3+출신대학'] = X_test['근무지역_3'] + X_test['출신대학']
X_test['근무지역_3+근무형태'] = X_test['근무지역_3'] + X_test['근무형태']
X_test['근무지역_3+마지막근무형태'] = X_test['근무지역_3'] + X_test['마지막근무형태']

X_train['근무형태+출신대학'] = X_train['근무형태'] + X_train['출신대학']

X_test['근무형태+출신대학'] = X_test['근무형태'] + X_test['출신대학']

X_train['마지막근무형태+출신대학'] = X_train['마지막근무형태'] + X_train['출신대학']

X_test['마지막근무형태+출신대학'] = X_test['마지막근무형태'] + X_test['출신대학']

In [27]:
a = [ '근무지역_1', '근무지역_2', '근무지역_3']
b = ['직종', '세부직종', '직무태그', '근무경력', '근무형태', '출신대학', '대학전공', '어학시험', '자격증']

from itertools import combinations 

b_com = list(combinations(b,3))
b_wecom = list(combinations(b,2))
a_1 = []
a_2 = []
a_3 = []

for i in b_wecom:
    x = ('근무지역_1',)
    a_1.append(i+x)
    
for i in b_wecom:
    x = ('근무지역_2',)
    a_2.append(i+x)

for i in b_wecom:
    x = ('근무지역_3',)
    a_3.append(i+x)
    
c = b_com + a_1 + a_2 + a_3

df = pd.DataFrame()
df_2 = pd.DataFrame()

for i in c:
    df[i] = X_train[i[0]] + X_train[i[1]] + X_train[i[2]]
    df_2[i] = X_test[i[0]] + X_test[i[1]] + X_test[i[2]]

In [28]:
X_train = pd.concat([X_train,df],axis=1)
X_test = pd.concat([X_test,df_2],axis=1)

In [29]:
def sinfo(X):    
    for i, j in enumerate(X):
        print(i,j)

In [30]:
sinfo(X_train)

0 직종
1 세부직종
2 직무태그
3 근무경력
4 근무형태
5 근무지역
6 출신대학
7 대학전공
8 어학시험
9 자격증
10 대학성적
11 근무개월
12 근무개월log
13 대학랭킹
14 출신대학수치
15 출신대학수치*대학성적
16 근무지역_1
17 근무지역_2
18 근무지역_3
19 마지막근무형태
20 직무태그_1
21 직무태그_2
22 직무태그_3
23 직무태그_4
24 직무태그_5
25 직무태그_6
26 직무태그_7
27 직무태그_8
28 직무태그_9
29 직무태그_10
30 직무태그_11
31 직무태그_12
32 직무태그_13
33 직무태그_14
34 직무태그_15
35 직무태그_16
36 직무태그_17
37 직무태그_18
38 직무태그_19
39 직무태그_20
40 직무태그_21
41 직무태그_22
42 직무태그_23
43 직무태그_24
44 직무태그_25
45 직무태그_26
46 직무태그_27
47 직무태그개수
48 대학성적0.1+근무개월
49 출신대학_근무개월_sum
50 출신대학_근무개월_mean
51 출신대학_근무개월_variance
52 출신대학_근무개월_standard_deviation
53 직종_근무개월_sum
54 직종_근무개월_mean
55 직종_근무개월_variance
56 직종_근무개월_standard_deviation
57 세부직종_근무개월_sum
58 세부직종_근무개월_mean
59 세부직종_근무개월_variance
60 세부직종_근무개월_standard_deviation
61 근무지역_1_근무개월_sum
62 근무지역_1_근무개월_mean
63 근무지역_1_근무개월_variance
64 근무지역_1_근무개월_standard_deviation
65 근무지역_2_근무개월_sum
66 근무지역_2_근무개월_mean
67 근무지역_2_근무개월_variance
68 근무지역_2_근무개월_standard_deviation
69 근무지역_3_근무개월_sum
70 근무지역_3_근무개월_mean
71 근무지역_3_근무개월_variance
72

In [31]:
cat = ['직종', '세부직종', '직무태그', '근무지역','근무경력', '출신대학', '대학전공','어학시험', '자격증']

# Label Encoding 528,536
from sklearn.preprocessing import LabelEncoder

for i in cat:
    X_train[i+'label'] = LabelEncoder().fit_transform(X_train[i])
    
for i in cat:
    X_test[i+'label'] = LabelEncoder().fit_transform(X_test[i])

In [32]:
for i in X_train.iloc[:,528:-1].columns:
    print(type(X_train[i][0]))

<class 'numpy.int32'>
<class 'numpy.int32'>
<class 'numpy.int32'>
<class 'numpy.int32'>
<class 'numpy.int32'>
<class 'numpy.int32'>
<class 'numpy.int32'>
<class 'numpy.int32'>


In [33]:
len(X_train)

16570

In [34]:
X_train.iloc[:,528].value_counts()

2     7332
0     2396
7     2352
8     1266
1     1193
9      882
5      605
10     188
3      150
6      119
4       87
Name: 직종label, dtype: int64

In [35]:
for i in X_train.iloc[:,528:-1].columns:
    X_train[i] = X_train[i] +1

In [36]:
#537, 774
a = X_train.iloc[:,528:-1].columns
a_com2 = list(combinations(a,2))
a_com3 = list(combinations(a,3))
a_com4 = list(combinations(a,4))
a_com5 = list(combinations(a,5))
a_com6 = list(combinations(a,6))

for i in a_com2:
    X_train[i[0]+i[1]] = X_train[i[0]] * X_train[i[1]]
    X_test[i[0]+i[1]] = X_test[i[0]] * X_test[i[1]]

for i in a_com3:
    X_train[i[0]+i[1]+i[2]] = X_train[i[0]] * X_train[i[1]] * X_train[i[2]]
    X_test[i[0]+i[1]+i[2]] = X_test[i[0]] * X_test[i[1]] * X_test[i[2]]

for i in a_com4:
    X_train[i[0]+i[1]+i[2]+i[3]] = X_train[i[0]] * X_train[i[1]] * X_train[i[2]]*X_train[i[3]]
    X_test[i[0]+i[1]+i[2]+i[3]] = X_test[i[0]] * X_test[i[1]] * X_test[i[2]]*X_test[i[3]]

for i in a_com5:
    X_train[i[0]+i[1]+i[2]+i[3]+i[4]] = X_train[i[0]] * X_train[i[1]] * X_train[i[2]]*X_train[i[3]]*X_train[i[4]]
    X_test[i[0]+i[1]+i[2]+i[3]+i[4]] = X_test[i[0]] * X_test[i[1]] * X_test[i[2]]*X_test[i[3]]*X_test[i[4]]

for i in a_com6:
    X_train[i[0]+i[1]+i[2]+i[3]+i[4]+i[5]] = X_train[i[0]] * X_train[i[1]] * X_train[i[2]]*X_train[i[3]]*X_train[i[4]]*X_train[i[5]]
    X_test[i[0]+i[1]+i[2]+i[3]+i[4]+i[5]] = X_test[i[0]] * X_test[i[1]] * X_test[i[2]]*X_test[i[3]]*X_test[i[4]]*X_test[i[5]]

In [37]:
a_com2[0][1]

'세부직종label'

In [38]:
sinfo(X_train)

0 직종
1 세부직종
2 직무태그
3 근무경력
4 근무형태
5 근무지역
6 출신대학
7 대학전공
8 어학시험
9 자격증
10 대학성적
11 근무개월
12 근무개월log
13 대학랭킹
14 출신대학수치
15 출신대학수치*대학성적
16 근무지역_1
17 근무지역_2
18 근무지역_3
19 마지막근무형태
20 직무태그_1
21 직무태그_2
22 직무태그_3
23 직무태그_4
24 직무태그_5
25 직무태그_6
26 직무태그_7
27 직무태그_8
28 직무태그_9
29 직무태그_10
30 직무태그_11
31 직무태그_12
32 직무태그_13
33 직무태그_14
34 직무태그_15
35 직무태그_16
36 직무태그_17
37 직무태그_18
38 직무태그_19
39 직무태그_20
40 직무태그_21
41 직무태그_22
42 직무태그_23
43 직무태그_24
44 직무태그_25
45 직무태그_26
46 직무태그_27
47 직무태그개수
48 대학성적0.1+근무개월
49 출신대학_근무개월_sum
50 출신대학_근무개월_mean
51 출신대학_근무개월_variance
52 출신대학_근무개월_standard_deviation
53 직종_근무개월_sum
54 직종_근무개월_mean
55 직종_근무개월_variance
56 직종_근무개월_standard_deviation
57 세부직종_근무개월_sum
58 세부직종_근무개월_mean
59 세부직종_근무개월_variance
60 세부직종_근무개월_standard_deviation
61 근무지역_1_근무개월_sum
62 근무지역_1_근무개월_mean
63 근무지역_1_근무개월_variance
64 근무지역_1_근무개월_standard_deviation
65 근무지역_2_근무개월_sum
66 근무지역_2_근무개월_mean
67 근무지역_2_근무개월_variance
68 근무지역_2_근무개월_standard_deviation
69 근무지역_3_근무개월_sum
70 근무지역_3_근무개월_mean
71 근무지역_3_근무개월_variance
72

In [39]:
#776,1012
for i in X_train.iloc[:,537:775].columns:
    X_train[i,'_log'] = np.log1p(1+X_train[i])
    X_test[i,'_log'] = np.log1p(1+X_test[i])

In [40]:
sinfo(X_train)

0 직종
1 세부직종
2 직무태그
3 근무경력
4 근무형태
5 근무지역
6 출신대학
7 대학전공
8 어학시험
9 자격증
10 대학성적
11 근무개월
12 근무개월log
13 대학랭킹
14 출신대학수치
15 출신대학수치*대학성적
16 근무지역_1
17 근무지역_2
18 근무지역_3
19 마지막근무형태
20 직무태그_1
21 직무태그_2
22 직무태그_3
23 직무태그_4
24 직무태그_5
25 직무태그_6
26 직무태그_7
27 직무태그_8
28 직무태그_9
29 직무태그_10
30 직무태그_11
31 직무태그_12
32 직무태그_13
33 직무태그_14
34 직무태그_15
35 직무태그_16
36 직무태그_17
37 직무태그_18
38 직무태그_19
39 직무태그_20
40 직무태그_21
41 직무태그_22
42 직무태그_23
43 직무태그_24
44 직무태그_25
45 직무태그_26
46 직무태그_27
47 직무태그개수
48 대학성적0.1+근무개월
49 출신대학_근무개월_sum
50 출신대학_근무개월_mean
51 출신대학_근무개월_variance
52 출신대학_근무개월_standard_deviation
53 직종_근무개월_sum
54 직종_근무개월_mean
55 직종_근무개월_variance
56 직종_근무개월_standard_deviation
57 세부직종_근무개월_sum
58 세부직종_근무개월_mean
59 세부직종_근무개월_variance
60 세부직종_근무개월_standard_deviation
61 근무지역_1_근무개월_sum
62 근무지역_1_근무개월_mean
63 근무지역_1_근무개월_variance
64 근무지역_1_근무개월_standard_deviation
65 근무지역_2_근무개월_sum
66 근무지역_2_근무개월_mean
67 근무지역_2_근무개월_variance
68 근무지역_2_근무개월_standard_deviation
69 근무지역_3_근무개월_sum
70 근무지역_3_근무개월_mean
71 근무지역_3_근무개월_variance
72

867 ('직종label세부직종label근무지역label어학시험label', '_log')
868 ('직종label세부직종label근무경력label출신대학label', '_log')
869 ('직종label세부직종label근무경력label대학전공label', '_log')
870 ('직종label세부직종label근무경력label어학시험label', '_log')
871 ('직종label세부직종label출신대학label대학전공label', '_log')
872 ('직종label세부직종label출신대학label어학시험label', '_log')
873 ('직종label세부직종label대학전공label어학시험label', '_log')
874 ('직종label직무태그label근무지역label근무경력label', '_log')
875 ('직종label직무태그label근무지역label출신대학label', '_log')
876 ('직종label직무태그label근무지역label대학전공label', '_log')
877 ('직종label직무태그label근무지역label어학시험label', '_log')
878 ('직종label직무태그label근무경력label출신대학label', '_log')
879 ('직종label직무태그label근무경력label대학전공label', '_log')
880 ('직종label직무태그label근무경력label어학시험label', '_log')
881 ('직종label직무태그label출신대학label대학전공label', '_log')
882 ('직종label직무태그label출신대학label어학시험label', '_log')
883 ('직종label직무태그label대학전공label어학시험label', '_log')
884 ('직종label근무지역label근무경력label출신대학label', '_log')
885 ('직종label근무지역label근무경력label대학전공label', '_log')
886 ('직종label근무지역label근무경력label

In [None]:
for i in cat:
    for j in X_train.iloc[:,528:1013].columns:
        inf_num(i,j)

cat_com = list(combinations(cat,2))
for i in cat_com:
    for j in X_train.iloc[:,528:1013].columns:
        inf_num2(i[0],i[1],j)

***

In [29]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16570 entries, 0 to 16569
Columns: 528 entries, 직종 to ('어학시험', '자격증', '근무지역_3')
dtypes: float64(187), int64(31), object(310)
memory usage: 66.9+ MB


In [30]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11048 entries, 0 to 11047
Columns: 528 entries, 직종 to ('어학시험', '자격증', '근무지역_3')
dtypes: float64(187), int64(31), object(310)
memory usage: 44.6+ MB


***

### csv로 저장

In [31]:
X_train.to_csv('X_train_fe_1124.csv',encoding='cp949', index=False)
X_test.to_csv('X_test_fe_1124.csv',encoding='cp949', index=False)