In [66]:
import math
import pandas as pd
import numpy as np


import warnings
warnings.simplefilter('ignore')

## 데이터 불러오기

In [67]:
df = pd.read_csv('../data/aug_test.csv')
df

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,32403,city_41,0.827,Male,Has relevent experience,Full time course,Graduate,STEM,9,<10,,1,21
1,9858,city_103,0.920,Female,Has relevent experience,no_enrollment,Graduate,STEM,5,,Pvt Ltd,1,98
2,31806,city_21,0.624,Male,No relevent experience,no_enrollment,High School,,<1,,Pvt Ltd,never,15
3,27385,city_13,0.827,Male,Has relevent experience,no_enrollment,Masters,STEM,11,10/49,Pvt Ltd,1,39
4,27724,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,10000+,Pvt Ltd,>4,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2124,1289,city_103,0.920,Male,No relevent experience,no_enrollment,Graduate,Humanities,16,,Public Sector,4,15
2125,195,city_136,0.897,Male,Has relevent experience,no_enrollment,Masters,STEM,18,,,2,30
2126,31762,city_100,0.887,Male,No relevent experience,no_enrollment,Primary School,,3,,Pvt Ltd,never,18
2127,7873,city_102,0.804,Male,Has relevent experience,Full time course,High School,,7,100-500,Public Sector,1,84


In [68]:
df.shape

(2129, 13)

### enrollee_id, city 컬럼 제거

In [69]:
df.drop(['enrollee_id','city'],axis=1,inplace=True)

In [70]:
df.shape

(2129, 11)

## Nan값 처리

### gender : Nan값 처리 (삭제)
- Other은 삭제

In [71]:
df['gender'].unique()

array(['Male', 'Female', nan, 'Other'], dtype=object)

In [72]:
df['gender'].replace('Other', np.NaN, inplace=True)

In [73]:
df['gender'].unique()

array(['Male', 'Female', nan], dtype=object)

In [74]:
# Nan값 처리
df = df.dropna(subset=['gender'])

In [75]:
df['gender'].unique()

array(['Male', 'Female'], dtype=object)

In [76]:
df.shape

(1597, 11)

### major_discipline : Other로 넣기

In [77]:
df['major_discipline'].unique()

array(['STEM', nan, 'Other', 'Business Degree', 'Arts', 'Humanities',
       'No Major'], dtype=object)

In [78]:
df['major_discipline']=df['major_discipline'].fillna('Other')

In [79]:
df.shape

(1597, 11)

In [80]:
df['major_discipline'].isnull().sum()

0

In [81]:
df['major_discipline'].unique()

array(['STEM', 'Other', 'Business Degree', 'Arts', 'Humanities',
       'No Major'], dtype=object)

### enrolled_university : no_enrollment로 넣기

In [82]:
df['enrolled_university'].unique()

array(['Full time course', 'no_enrollment', 'Part time course', nan],
      dtype=object)

In [83]:
df['enrolled_university']=df['enrolled_university'].fillna('no_enrollment')

In [84]:
df.shape

(1597, 11)

In [85]:
df['enrolled_university'].isnull().sum()

0

In [86]:
df['enrolled_university'].unique()

array(['Full time course', 'no_enrollment', 'Part time course'],
      dtype=object)

### education_level : 최빈값 처리

In [87]:
df['education_level'].unique()

array(['Graduate', 'High School', 'Masters', nan, 'Phd', 'Primary School'],
      dtype=object)

In [88]:
# 최빈값 확인
df['education_level'].mode()

0    Graduate
dtype: object

In [89]:
df['education_level']=df['education_level'].fillna('Graduate')

In [90]:
df['education_level'].isnull().sum()

0

In [91]:
df.shape

(1597, 11)

In [92]:
df['education_level'].unique()

array(['Graduate', 'High School', 'Masters', 'Phd', 'Primary School'],
      dtype=object)

### experience 값 처리
- '>20' => 21
- '<1'  =>  1

In [93]:
df['experience'].unique()

array(['9', '5', '<1', '11', '>20', '10', '14', '3', '20', '8', '4', '2',
       '6', '1', '19', '15', '7', '13', '16', '18', '12', '17', nan],
      dtype=object)

In [94]:
df['experience'].replace('<1',0, inplace=True)

In [95]:
df['experience'].replace('>20',21, inplace=True)

In [96]:
df = df.dropna(subset=['experience'])

In [97]:
df['experience'].isnull().sum()

0

In [98]:
# int형으로 변경
df = df.astype({'experience': 'int'})

In [99]:
df['experience'].unique()

array([ 9,  5,  0, 11, 21, 10, 14,  3, 20,  8,  4,  2,  6,  1, 19, 15,  7,
       13, 16, 18, 12, 17])

In [100]:
df.shape

(1595, 11)

### company_type : 
- New의 조건 (last_new_job == 'never' and company_type = Nan)

그 외에 Nan값은 제거 

In [101]:
df['company_type'].unique()

array([nan, 'Pvt Ltd', 'Funded Startup', 'Other', 'Public Sector',
       'Early Stage Startup', 'NGO'], dtype=object)

In [102]:
# 조건문을 사용하기 위해 Nan값을 문자열 'nan' 값으로 지정
df['company_type'].replace(np.NaN,'nan', inplace=True)

In [103]:
for i in range(0,len(df)):
    if (df['company_type'].iloc[i] == 'nan') and (df['last_new_job'].iloc[i] == 'never'):
        df['company_type'].replace('nan','New', inplace=True)

In [104]:
# 'nan'값 nan으로 변경
df['company_type'].replace('nan', np.NaN, inplace=True)

In [105]:
df = df.dropna(subset=['company_type'])

In [106]:
df['company_type'].isnull().sum()

0

In [107]:
df['company_type'].unique()

array(['New', 'Pvt Ltd', 'Funded Startup', 'Other', 'Public Sector',
       'Early Stage Startup', 'NGO'], dtype=object)

In [108]:
df.shape

(1595, 11)

### company_size : 
- 0의 조건 (company_type = 'New')

그 외에 Nan값은 제거 

In [109]:
df['company_size'].unique()

array(['<10', nan, '10/49', '10000+', '50-99', '100-500', '1000-4999',
       '500-999', '5000-9999'], dtype=object)

In [110]:
# 조건문을 사용하기 위해 Nan값을 문자열 'nan' 값으로 지정
df['company_size'].replace(np.NaN,'nan', inplace=True)

In [111]:
for i in range(0,len(df)):
    if (df['company_type'].iloc[i] == 'New'):
        df['company_size'].replace('nan','0', inplace=True)

In [112]:
# 'nan'값 nan으로 변경
df['company_size'].replace('nan', np.NaN, inplace=True)

In [113]:
df = df.dropna(subset=['company_size'])

In [114]:
df['company_size'].isnull().sum()

0

In [115]:
df['company_size'].unique()

array(['<10', '0', '10/49', '10000+', '50-99', '100-500', '1000-4999',
       '500-999', '5000-9999'], dtype=object)

In [116]:
df.shape

(1595, 11)

### last_new_job (문자열)
- 'never' => 0
- '>4'    => 5

In [117]:
df['last_new_job'].unique()

array(['1', 'never', '>4', '2', '4', '3', nan], dtype=object)

In [118]:
df = df.dropna(subset=['last_new_job'])

In [119]:
df['last_new_job'].replace('never', 0, inplace=True)

In [120]:
df['last_new_job'].replace('>4', 5, inplace=True)

In [121]:
df = df.astype({'last_new_job': 'int'})

In [122]:
df['last_new_job'].isnull().sum()

0

In [123]:
df['last_new_job'].unique()

array([1, 0, 5, 2, 4, 3])

## 데이터 csv로 저장

In [124]:
df.to_csv('aug_test_1.csv', index=False)

In [125]:
print(df.dtypes)

city_development_index    float64
gender                     object
relevent_experience        object
enrolled_university        object
education_level            object
major_discipline           object
experience                  int64
company_size               object
company_type               object
last_new_job                int64
training_hours              int64
dtype: object


In [126]:
df.isnull().sum()

city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
dtype: int64