# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier

import pycountry
import pycountry_convert as pc

### 데이터 셋 읽어오기

In [5]:
df_train = pd.read_csv("../data/train.csv") # 학습용 데이터
df_test = pd.read_csv("../data/submission.csv") # 테스트 데이터(제출파일의 데이터)

In [6]:
df_train.head() # 학습용 데이터 살펴보기

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.0,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.0,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.0,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.0,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True


## 2. 데이터 전처리

### 쓸데없는거 빠르게 제거

In [7]:
# df_train = df_train.drop('bant_submit', axis=1)
# df_test = df_test.drop('bant_submit', axis=1)
# df_train = df_train.drop('inquiry_type', axis=1)
# df_test = df_test.drop('inquiry_type', axis=1)
##################################################
# df_train = df_train.drop('ver_win_ratio_per_bu', axis=1)
# df_test = df_test.drop('ver_win_ratio_per_bu', axis=1)
# df_train = df_train.drop('business_area', axis=1)
# df_test = df_test.drop('business_area', axis=1)
# df_train = df_train.drop('ver_cus', axis=1)
# df_test = df_test.drop('ver_cus', axis=1)
# df_train = df_train.drop('idit_strategic_ver', axis=1)
# df_test = df_test.drop('idit_strategic_ver', axis=1)
# df_train = df_train.drop('it_strategic_ver', axis=1)
# df_test = df_test.drop('it_strategic_ver', axis=1)
# df_train = df_train.drop('id_strategic_ver', axis=1)
# df_test = df_test.drop('id_strategic_ver', axis=1)
####################################################
df_train['customer_country'][df_train['customer_country'].str.contains('@', na=False)] = np.nan
df_test['customer_country'][df_test['customer_country'].str.contains('@', na=False)] = np.nan
df_train['response_corporate'] = df_train['response_corporate'].replace('LGEBT', 'LGEPT')
df_train['business_unit'][df_train['business_unit'] == 'CM'] = np.nan
df_test['business_unit'][df_test['business_unit'] == 'CM'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['customer_country'][df_train['customer_country'].str.contains('@', na=False)] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['customer_country'][df_test['customer_country'].str.contains('@', na=False)] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['business_unit'][df_train['business_unit'] == 'CM'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pan

### Customer Job

In [8]:
# customer job 1
# c-level executive
temp = df_train.loc[df_train['customer_job'] == 'c-level executive', 'customer_job'].copy()
df_train.loc[df_train['customer_job'] == 'c-level executive', 'customer_job'] = df_train.loc[df_train['customer_job'] == 'c-level executive', 'customer_position']
df_train.loc[df_train['customer_job'] == 'c-level executive', 'customer_position'] = temp

# the big boss
df_train = df_train[df_train['customer_job'] != 'the big boss']

# 'customer_job'이 'coo'인 경우에 대해 값을 변경
df_train.loc[df_train['customer_job'] == 'coo', 'customer_job'] = np.nan
df_train.loc[df_train['customer_job'] == 'coo', 'customer_position'] = 'c-level executive'

# 'ceo'
df_train.loc[df_train['customer_job'] == 'ceo', 'customer_position'] = 'ceo / founder'
df_train.loc[df_train['customer_job'] == 'ceo', 'customer_job'] = np.nan

# ceo/founder
df_train.loc[(df_train['customer_job'] == 'ceo/founder') & (df_train['customer_position'] == 'none'), 'customer_job'] = np.nan
df_train.loc[(df_train['customer_job'] == 'ceo/founder') & (df_train['customer_position'] == 'none'), 'customer_position'] = 'ceo / founder'
df_train.loc[(df_train['customer_job'] == 'ceo/founder') & (df_train['customer_position'] == 'administrative'), 'customer_job'] = 'administrative'
df_train.loc[(df_train['customer_job'] == 'ceo/founder') & (df_train['customer_position'] == 'administrative'), 'customer_position'] = 'ceo / founder'
df_train.loc[(df_train['customer_job'] == 'ceo/founder') & (df_train['customer_position'] == 'entrepreneurship'), 'customer_job'] = 'entrepreneurship'
df_train.loc[(df_train['customer_job'] == 'ceo/founder') & (df_train['customer_position'] == 'entrepreneurship'), 'customer_position'] = 'ceo / founder'

# intern
temp = df_train.loc[df_train['customer_job'] == 'intern', 'customer_job'].copy()
df_train.loc[df_train['customer_job'] == 'intern', 'customer_job'] = df_train.loc[df_train['customer_job'] == 'intern', 'customer_position']
df_train.loc[df_train['customer_job'] == 'intern', 'customer_position'] = temp

# vice president
temp = df_train.loc[df_train['customer_job'] == 'vice president', 'customer_job'].copy()
df_train.loc[df_train['customer_job'] == 'vice president', 'customer_job'] = df_train.loc[df_train['customer_job'] == 'vice president', 'customer_position']
df_train.loc[df_train['customer_job'] == 'vice president', 'customer_position'] = temp
df_train.loc[df_train['customer_job'] == 'vice president', 'customer_job'] = 'administrative'

# vp/gm
df_train.loc[df_train['customer_job'] == 'vp/gm', 'customer_position'] = 'vice president'
df_train.loc[df_train['customer_job'] == 'vp/gm', 'customer_job'] = 'administrative'

# gm
df_train.loc[df_train['customer_job'] == 'gm', 'customer_position'] = 'vice president'
df_train.loc[df_train['customer_job'] == 'gm', 'customer_job'] = 'administrative'

# chief
df_train.loc[df_train['customer_job'] == 'chief', 'customer_position'] = 'ceo / founder'
df_train.loc[df_train['customer_job'] == 'chief', 'customer_job'] = 'administrative'

# chief eng.
df_train.loc[df_train['customer_job'] == 'chief eng.', 'customer_position'] = 'ceo / founder'
df_train.loc[df_train['customer_job'] == 'chief eng.', 'customer_job'] = 'administrative'

# entry level
df_train.loc[df_train['customer_job'] == 'entry level', 'customer_position'] = 'entry level'
df_train.loc[df_train['customer_job'] == 'entry level', 'customer_job'] = np.nan

# corporate / office
df_train.loc[df_train['customer_job'] == 'corporate / office', 'customer_job'] = 'other'

# director
df_train.loc[(df_train['customer_job'] == 'director') & (df_train['customer_position'] == 'none'), 'customer_job'] = np.nan
df_train.loc[(df_train['customer_job'] == 'director') & (df_train['customer_position'] == 'none'), 'customer_position'] = 'director'
df_train.loc[(df_train['customer_job'] == 'director') & (df_train['customer_position'] == 'administrative'), 'customer_job'] = np.nan
df_train.loc[(df_train['customer_job'] == 'director') & (df_train['customer_position'] == 'administrative'), 'customer_position'] = 'administrative'
df_train.loc[(df_train['customer_job'] == 'director') & (df_train['customer_position'] == 'director'), 'customer_job'] = np.nan
df_train.loc[(df_train['customer_job'] == 'director') & (df_train['customer_position'] == 'director'), 'customer_position'] = 'director'

# project director
df_train.loc[(df_train['customer_job'] == 'project director') & (df_train['customer_position'] == 'none'), 'customer_job'] = np.nan
df_train.loc[(df_train['customer_job'] == 'project director') & (df_train['customer_position'] == 'none'), 'customer_position'] = 'director'

# director of lodging
df_train.loc[(df_train['customer_job'] == 'director of lodging') & (df_train['customer_position'] == 'none'), 'customer_job'] = np.nan
df_train.loc[(df_train['customer_job'] == 'director of lodging') & (df_train['customer_position'] == 'none'), 'customer_position'] = 'director'

# technical / decision maker
df_train.loc[df_train['customer_job'] == 'technical / decision maker', 'customer_position'] = 'decision-maker'
df_train.loc[df_train['customer_job'] == 'technical / decision maker', 'customer_job'] = np.nan

# solutions provider and specifier
df_train.loc[df_train['customer_job'] == 'solutions provider and specifier', 'customer_position'] = 'technical'
df_train.loc[df_train['customer_job'] == 'solutions provider and specifier', 'customer_job'] = 'consulting'

# end user
df_train.loc[df_train['customer_job'] == 'end user', 'customer_position'] = 'end-user'
df_train.loc[df_train['customer_job'] == 'end user', 'customer_job'] = np.nan

# decision maker
df_train.loc[df_train['customer_job'] == 'decision maker', 'customer_position'] = 'decision-maker'
df_train.loc[df_train['customer_job'] == 'decision maker', 'customer_job'] = np.nan

# managing director
df_train.loc[df_train['customer_job'] == 'managing director', 'customer_position'] = 'director'
df_train.loc[df_train['customer_job'] == 'managing director', 'customer_job'] = np.nan

# director comercial
df_train.loc[df_train['customer_job'] == 'director comercial', 'customer_position'] = 'director'
df_train.loc[df_train['customer_job'] == 'director comercial', 'customer_job'] = np.nan

# managing partner
df_train.loc[df_train['customer_job'] == 'managing partner', 'customer_position'] = 'partner'
df_train.loc[df_train['customer_job'] == 'managing partner', 'customer_job'] = np.nan

# general manager (decision maker)
df_train.loc[df_train['customer_job'] == 'general manager (decision maker)', 'customer_position'] = 'decision-maker'
df_train.loc[df_train['customer_job'] == 'general manager (decision maker)', 'customer_job'] = np.nan

# renewable energy
df_train.loc[df_train['customer_job'] == 'renewable energy', 'customer_position'] = 'ceo / founder'
df_train.loc[df_train['customer_job'] == 'renewable energy', 'customer_job'] = np.nan

# directeur technique
df_train.loc[df_train['customer_job'] == 'directeur technique', 'customer_position'] = 'director'
df_train.loc[df_train['customer_job'] == 'directeur technique', 'customer_job'] = np.nan

# test4
df_train = df_train[df_train['customer_job'] != 'test4']

# need 1 tv 55" edge led 4k uhd
df_train = df_train[df_train['customer_job'] != 'need 1 tv 55" edge led 4k uhd']

# executive
df_train.loc[df_train['customer_job'] == 'executive', 'customer_job'] = np.nan
df_train.loc[df_train['customer_job'] == 'executive', 'customer_position'] = 'c-level executive'

# for confrence
df_train = df_train[df_train['customer_job'] != 'for confrence']

# menu
df_train = df_train[df_train['customer_job'] != 'menu']

# digital display vs signage need
df_train = df_train[df_train['customer_job'] != 'digital display vs signage need']

# tradeshow event
df_train = df_train[df_train['customer_job'] != 'tradeshow event']

# requirement close
df_train.loc[df_train['customer_job'] == 'requirement close', 'customer_job'] = np.nan

# no respoxse on phone will try again
df_train.loc[df_train['customer_job'] == 'no respoxse on phone will try again', 'customer_job'] = np.nan

# railway & metro station
df_train.loc[df_train['customer_job'] == 'railway & metro station', 'customer_job'] = 'other'

# fixing tv
df_train = df_train[df_train['customer_job'] != 'fixing tv']

# fixing tv
df_train = df_train[df_train['customer_job'] != 'change tv']

# president for sennco
df_train = df_train[df_train['customer_job'] != 'president for sennco']

# digital signage
df_train = df_train[df_train['customer_job'] != 'digital signage']

# we are in iceland
df_train = df_train[df_train['customer_job'] != 'we are in iceland']

# ranger 2
df_train = df_train[df_train['customer_job'] != 'ranger 2']

# sme
df_train = df_train[df_train['customer_job'] != 'sme']

# display screen from control
df_train = df_train[df_train['customer_job'] != 'display screen from control']

# costar av team
df_train = df_train[df_train['customer_job'] != 'costar av team']

# following up
df_train = df_train[df_train['customer_job'] != 'following up']

# 5% of hotel needs
df_train = df_train[df_train['customer_job'] != '5% of hotel needs']

# asking for quote for client
df_train = df_train[df_train['customer_job'] != 'asking for quote for client']

# need one tv
df_train = df_train[df_train['customer_job'] != 'need one tv']

# liason
df_train = df_train[df_train['customer_job'] != 'liason']

# tester
df_train = df_train[df_train['customer_job'] != 'tester']

# user
df_train = df_train[df_train['customer_job'] != 'user']

# execution
df_train = df_train[df_train['customer_job'] != 'execution']

# overseer
df_train = df_train[df_train['customer_job'] != 'overseer']

# the person with the credit card
df_train = df_train[df_train['customer_job'] != 'the person with the credit card']

# replacing tv
df_train = df_train[df_train['customer_job'] != 'replacing tv']

# supplier
df_train = df_train[df_train['customer_job'] != 'supplier']

# requisition
df_train = df_train[df_train['customer_job'] != 'requisition']

# hardware
df_train = df_train[df_train['customer_job'] != 'hardware']

# repair uhd 120 hz units
df_train = df_train[df_train['customer_job'] != 'repair uhd 120 hz units']

# display
df_train.loc[df_train['customer_job'] == 'display', 'customer_job'] = 'other'

# serving food
df_train = df_train[df_train['customer_job'] != 'serving food']

# underboss
df_train = df_train[df_train['customer_job'] != 'underboss']

# cliente final
df_train.loc[(df_train['customer_job'] == 'cliente final') & (df_train['customer_position'] == 'manager'), 'customer_job'] = 'other'
df_train.loc[(df_train['customer_job'] == 'cliente final') & (df_train['customer_position'] == 'gerente'), 'customer_job'] = 'other'
df_train.loc[(df_train['customer_job'] == 'cliente final') & (df_train['customer_position'] == 'gerente'), 'customer_position'] = 'manager'

# contractor
df_train = df_train.drop(df_train[(df_train['customer_position'] == 'none') & (df_train['customer_job'] == 'contractor')].index)
df_train.loc[(df_train['customer_job'] == 'contractor') , 'customer_job'] = 'other'

# managing contractor
df_train = df_train[df_train['customer_job'] != 'managing contractor']

# signage subcontractor p/m
df_train.loc[df_train['customer_job'] == 'signage subcontractor p/m', 'customer_job'] = 'program and project management'

# electrical contractor
df_train = df_train[df_train['customer_job'] != 'electrical contractor']

# federal government contractor
df_train = df_train[df_train['customer_job'] != 'federal government contractor']

# contractor/owner
df_train = df_train[df_train['customer_job'] != 'contractor/owner']

# gc
df_train = df_train[df_train['customer_job'] != 'gc']

# sub contractor
df_train = df_train[df_train['customer_job'] != 'sub contractor']

# general contractor
df_train = df_train[df_train['customer_job'] != 'general contractor']

# details send
df_train.loc[(df_train['customer_job'] == 'details send') & (df_train['customer_position'] == 'manager'), 'customer_job'] = 'other'
df_train = df_train.drop(7871)

# cintractor
df_train = df_train[df_train['customer_job'] != 'cintractor']

# master mind
df_train = df_train[df_train['customer_job'] != 'master mind']

# hardware selection
df_train = df_train[df_train['customer_job'] != 'hardware selection']

# principal
df_train = df_train[df_train['customer_job'] != 'principal']

# authorize (you are responsible for making the final decision)
df_train = df_train[df_train['customer_job'] != 'authorize (you are responsible for making the final decision)']

# quoting project
df_train = df_train[df_train['customer_job'] != 'quoting project']

# implement
df_train = df_train[df_train['customer_job'] != 'implement']

# energy
df_train = df_train[df_train['customer_job'] != 'energy']

# for presentations
df_train = df_train[df_train['customer_job'] != 'for presentations']

# customer experience
df_train = df_train[df_train['customer_job'] != 'customer experience']

# nothing
df_train = df_train[df_train['customer_job'] != 'nothing']

# distributor
df_train = df_train[df_train['customer_job'] != 'distributor']

# waiter
df_train = df_train[df_train['customer_job'] != 'waiter']

# chef
df_train = df_train[df_train['customer_job'] != 'chef']

# estimator
df_train = df_train[df_train['customer_job'] != 'estimator']

# serving
df_train = df_train[df_train['customer_job'] != 'serving']

# conference room
df_train = df_train[df_train['customer_job'] != 'conference room']

# final approval
df_train = df_train[df_train['customer_job'] != 'final approval']

# elevator company
df_train = df_train[df_train['customer_job'] != 'elevator company']

# principal in charge
df_train = df_train[df_train['customer_job'] != 'principal in charge']

# restaurant display
df_train = df_train[df_train['customer_job'] != 'restaurant display']

# serving robot
df_train = df_train[df_train['customer_job'] != 'serving robot']

# recommendation
df_train = df_train[df_train['customer_job'] != 'recommendation']

# equipment selection
df_train = df_train[df_train['customer_job'] != 'equipment selection']

# stakeholder
df_train = df_train[df_train['customer_job'] != 'stakeholder']

# architect
df_train = df_train[df_train['customer_job'] != 'architect']

# solutions architect
df_train = df_train[df_train['customer_job'] != 'solutions architect']

# project architect
df_train = df_train[df_train['customer_job'] != 'project architect']

# 'customer_country.1' 열에서 NaN 값을 빈 문자열로 대체합니다.
df_train['customer_country.1'] = df_train['customer_country.1'].fillna('')

# '@'가 포함된 행을 필터링하여 삭제합니다.
df_train = df_train[~df_train['customer_country.1'].str.contains('@')]

# using for window display
df_train = df_train[df_train['customer_job'] != 'using for window display']

# community theater
df_train = df_train[df_train['customer_job'] != 'community theater']

# equipment and app provider
df_train = df_train[df_train['customer_job'] != 'equipment and app provider']

# infrastructure
df_train = df_train[df_train['customer_job'] != 'infrastructure']

# tech
df_train = df_train[df_train['customer_job'] != 'tech']

# recommend
df_train = df_train[df_train['customer_job'] != 'recommend']

# f&b director for bicycle casino
df_train = df_train[df_train['customer_job'] != 'f&b director for bicycle casino']

# mindenes
df_train = df_train[df_train['customer_job'] != 'mindenes']

# genel müdür
df_train = df_train[df_train['customer_job'] != 'genel müdür']

# mindenes
df_train = df_train[df_train['customer_job'] != 'mindenes']

# proprietário(a)
df_train = df_train[df_train['customer_job'] != 'proprietário(a)']

# decider
df_train = df_train[df_train['customer_job'] != 'decider']

# project head
df_train = df_train[df_train['customer_job'] != 'project head']

# pricing
df_train = df_train[df_train['customer_job'] != 'pricing']

# solution provider
df_train = df_train[df_train['customer_job'] != 'solution provider']

# gm/part owner
df_train = df_train[df_train['customer_job'] != 'gm/part owner']

# conference table
df_train = df_train[df_train['customer_job'] != 'conference table']

# maintenance technician
df_train = df_train[df_train['customer_job'] != 'maintenance technician']

# submitting proposal
df_train = df_train[df_train['customer_job'] != 'submitting proposal']

# av estimator
df_train = df_train[df_train['customer_job'] != 'av estimator']

# public bidder
df_train = df_train[df_train['customer_job'] != 'public bidder']

# cctv monetoring
df_train = df_train[df_train['customer_job'] != 'cctv monetoring']

# display screen
df_train = df_train[df_train['customer_job'] != 'display screen']

# primary
df_train = df_train[df_train['customer_job'] != 'primary']

# investigator
df_train = df_train[df_train['customer_job'] != 'investigator']

# organizer
df_train = df_train[df_train['customer_job'] != 'organizer']

# appliance specialist
df_train = df_train[df_train['customer_job'] != 'appliance specialist']

# facilities
df_train = df_train[df_train['customer_job'] != 'facilities']

# technical sales
df_train = df_train[df_train['customer_job'] != 'technical sales']

# reviewer
df_train.loc[(df_train['customer_job'] == 'reviewer') , 'customer_job'] = 'other'

# lead
df_train = df_train[df_train['customer_job'] != 'lead']

# head
df_train = df_train[df_train['customer_job'] != 'head']

# influencer
df_train = df_train[df_train['customer_job'] != 'influencer']

# owning company
df_train = df_train[df_train['customer_job'] != 'owning company']

# manger
df_train = df_train[df_train['customer_job'] != 'manger']

# leader
df_train = df_train[df_train['customer_job'] != 'leader']

# part of video wall
df_train = df_train[df_train['customer_job'] != 'part of video wall']

# enterprise resource planning
df_train = df_train[df_train['customer_job'] != 'enterprise resource planning']

# component of video wall
df_train = df_train[df_train['customer_job'] != 'component of video wall']

# correspondence
df_train = df_train[df_train['customer_job'] != 'correspondence']

# quote gathering/proposer to owner
df_train = df_train[df_train['customer_job'] != 'quote gathering/proposer to owner']

# engagement executive
df_train = df_train[df_train['customer_job'] != 'engagement executive']

# supervisor
df_train = df_train[df_train['customer_job'] != 'supervisor']

# coordinator
df_train = df_train[df_train['customer_job'] != 'coordinator']

# primary end-user
df_train = df_train[df_train['customer_job'] != 'primary end-user']

# wall mounted screen mirroring
df_train = df_train[df_train['customer_job'] != 'wall mounted screen mirroring']

# owner representation
df_train = df_train[df_train['customer_job'] != 'owner representation']

# facilitator
df_train = df_train[df_train['customer_job'] != 'facilitator']

# maintenance supervisor
df_train = df_train[df_train['customer_job'] != 'maintenance supervisor']

# team leader
df_train = df_train[df_train['customer_job'] != 'team leader']

# videowall
df_train = df_train[df_train['customer_job'] != 'videowall']

# requirements and buyer
df_train = df_train[df_train['customer_job'] != 'requirements and buyer']

# manufacturing factory / plant
df_train.loc[(df_train['customer_job'] == 'manufacturing factory / plant') , 'customer_job'] = 'other'

# solution advisor
df_train = df_train[df_train['customer_job'] != 'solution advisor']

# system installer
df_train = df_train[df_train['customer_job'] != 'system installer']

# purchaser, it and installer
df_train = df_train[df_train['customer_job'] != 'purchaser, it and installer']

# videowall
df_train = df_train[df_train['customer_job'] != 'installer/sales rep']

# retailer/installer
df_train = df_train[df_train['customer_job'] != 'retailer/installer']

# installer
df_train = df_train[df_train['customer_job'] != 'installer']

# installer.
df_train = df_train[df_train['customer_job'] != 'installer.']

# seller installer
df_train = df_train[df_train['customer_job'] != 'seller installer']

# installer/ system integrater
df_train = df_train[df_train['customer_job'] != 'installer/ system integrater']

# designer/installer
df_train = df_train[df_train['customer_job'] != 'designer/installer']

# furnish and install
df_train = df_train[df_train['customer_job'] != 'furnish and install']

# supplier and installation
df_train = df_train[df_train['customer_job'] != 'supplier and installation']

# planning and installation
df_train = df_train[df_train['customer_job'] != 'planning and installation']

# design and install
df_train = df_train[df_train['customer_job'] != 'design and install']

# design and installation company
df_train = df_train[df_train['customer_job'] != 'design and installation company']

# art installation
df_train = df_train[df_train['customer_job'] != 'art installation']

# engineering, design, and install
df_train = df_train[df_train['customer_job'] != 'engineering, design, and install']

# design/install/training/support
df_train = df_train[df_train['customer_job'] != 'design/install/training/support']

# installation and purchaser
df_train = df_train[df_train['customer_job'] != 'installation and purchaser']

# purchase and install
df_train = df_train[df_train['customer_job'] != 'purchase and install']

# research/install
df_train.loc[(df_train['customer_job'] == 'research/install') , 'customer_job'] = 'research'

# post install support and service
df_train = df_train[df_train['customer_job'] != 'post install support and service']

# vendor / reseller
df_train = df_train[df_train['customer_job'] != 'vendor / reseller']

# technical advisor, reseller
df_train = df_train[df_train['customer_job'] != 'technical advisor, reseller']

# reseller
df_train = df_train[df_train['customer_job'] != 'reseller']

# integrator
df_train = df_train[df_train['customer_job'] != 'integrator']

# specifier / integrator
df_train = df_train[df_train['customer_job'] != 'specifier / integrator']

# intergrator
df_train = df_train[df_train['customer_job'] != 'intergrator']

# integration
df_train = df_train[df_train['customer_job'] != 'integration']

# it integrator
df_train = df_train[df_train['customer_job'] != 'it integrator']

# system designer, integrator
df_train.loc[(df_train['customer_job'] == 'system designer, integrator') , 'customer_position'] = 'others'
df_train.loc[(df_train['customer_job'] == 'system designer, integrator') , 'customer_position'] = 'others'

# management
df_train = df_train[df_train['customer_job'] != 'management']

# general management
df_train = df_train[df_train['customer_job'] != 'general management']

# construction manager
df_train = df_train[df_train['customer_job'] != 'construction manager']

# reseorot general manager
df_train = df_train[df_train['customer_job'] != 'reseorot general manager']

# producer/project manager
df_train = df_train[df_train['customer_job'] != 'producer/project manager']

# facility manager
df_train = df_train[df_train['customer_job'] != 'facility manager']

# asset management
df_train = df_train[df_train['customer_job'] != 'asset management']

# studio manager
df_train = df_train[df_train['customer_job'] != 'studio manager']

# hotel manager
df_train = df_train[df_train['customer_job'] != 'hotel manager']

# managing employee
df_train = df_train[df_train['customer_job'] != 'managing employee']

# managgere
df_train = df_train[df_train['customer_job'] != 'managgere']

# ordering manager
df_train = df_train[df_train['customer_job'] != 'ordering manager']

# general manager
df_train = df_train[df_train['customer_job'] != 'general manager']

# comanager
df_train = df_train[df_train['customer_job'] != 'comanager']

# office manager
df_train = df_train[df_train['customer_job'] != 'office manager']

# project sales/manage
df_train = df_train[df_train['customer_job'] != 'project sales/manage']

# genera manager
df_train = df_train[df_train['customer_job'] != 'genera manager']

# signage manager
df_train = df_train[df_train['customer_job'] != 'signage manager']

# resource manager
df_train = df_train[df_train['customer_job'] != 'resource manager']

# site manager
df_train = df_train[df_train['customer_job'] != 'site manager']

# it manager
df_train = df_train[df_train['customer_job'] != 'it manager']

# tv studio manager
df_train = df_train[df_train['customer_job'] != 'tv studio manager']

# projection manager
df_train = df_train[df_train['customer_job'] != 'projection manager']

# operations manager
df_train = df_train[df_train['customer_job'] != 'operations manager']

# general manager- purchaser
df_train = df_train[df_train['customer_job'] != 'general manager- purchaser']

# purchasing manager
df_train = df_train[df_train['customer_job'] != 'purchasing manager']

# sales manager
df_train = df_train[df_train['customer_job'] != 'sales manager']

# cctv view
df_train = df_train[df_train['customer_job'] != 'cctv view']

# buyer
df_train = df_train[df_train['customer_job'] != 'buyer']

# owner
df_train = df_train[df_train['customer_job'] != 'owner']

# team lead
df_train = df_train[df_train['customer_job'] != 'team lead']

# home theater
df_train = df_train[df_train['customer_job'] != 'home theater']

# emerging technology / innovation
df_train = df_train[df_train['customer_job'] != 'emerging technology / innovation']

# electronics evaluator
df_train = df_train[df_train['customer_job'] != 'electronics evaluator']

# design/build
df_train = df_train[df_train['customer_job'] != 'design/build']

# sourcing & quoting for end user
df_train = df_train[df_train['customer_job'] != 'sourcing & quoting for end user']

# project team member
df_train = df_train[df_train['customer_job'] != 'project team member']

# development coordinator/procurement
df_train = df_train[df_train['customer_job'] != 'development coordinator/procurement']

# video wall
df_train.loc[(df_train['customer_job'] == 'video wall') & (df_train['customer_position'] == 'ceo/founder'), 'customer_position'] = 'ceo / founder'
df_train.loc[(df_train['customer_job'] == 'video wall') & (df_train['customer_position'] == 'ceo / founder'), 'customer_job'] = 'others'

# manufacturer
df_train.loc[(df_train['customer_job'] == 'manufacturer') & (df_train['customer_position'] == 'partner'), 'customer_position'] = 'manufacture'
df_train.loc[(df_train['customer_job'] == 'manufacturer') & (df_train['customer_position'] == 'manufacture'), 'customer_job'] = 'others'

# manufacturer
df_train.loc[(df_train['customer_job'] == 'manufacturer') & (df_train['customer_position'] == 'partner'), 'customer_position'] = 'manufacture'
df_train.loc[(df_train['customer_job'] == 'manufacturer') & (df_train['customer_position'] == 'manufacture'), 'customer_job'] = 'others'

# technical
df_train = df_train[df_train['customer_job'] != 'technical']

# revendedor
df_train.loc[(df_train['customer_job'] == 'revendedor') & (df_train['customer_position'] == 'installer'), 'customer_position'] = 'technical'
df_train.loc[(df_train['customer_job'] == 'revendedor') & (df_train['customer_position'] == 'consultant'), 'customer_position'] = 'consultant'
df_train.loc[(df_train['customer_job'] == 'revendedor') & (df_train['customer_position'] == 'technical'), 'customer_job'] = 'others'
df_train.loc[(df_train['customer_job'] == 'revendedor') & (df_train['customer_position'] == 'consultant'), 'customer_job'] = 'others'

# associate/analyst
temp = df_train.loc[df_train['customer_job'] == 'associate/analyst', 'customer_job'].copy()
df_train.loc[df_train['customer_job'] == 'associate/analyst', 'customer_job'] = df_train.loc[df_train['customer_job'] == 'c-level executive', 'customer_position']
df_train.loc[df_train['customer_job'] == 'associate/analyst', 'customer_position'] = temp

# other stores
df_train = df_train[df_train['customer_job'] != 'other stores']

# facilitator installation services
df_train.loc[(df_train['customer_job'] == 'facilitator installation services'), 'customer_job'] = 'others'

# reseller/integrator
df_train.loc[(df_train['customer_job'] == 'reseller/integrator') & (df_train['customer_position'] == 'ceo/founder'), 'customer_position'] = 'ceo / founder'
df_train.loc[(df_train['customer_job'] == 'reseller/integrator') & (df_train['customer_position'] == 'ceo / founder'), 'customer_job'] = 'others'

# integrador
df_train.loc[(df_train['customer_job'] == 'integrador') & (df_train['customer_position'] == 'ceo/founder'), 'customer_position'] = 'ceo / founder'
df_train.loc[(df_train['customer_job'] == 'integrador') & (df_train['customer_position'] == 'ceo / founder'), 'customer_job'] = 'others'
df_train.loc[(df_train['customer_job'] == 'integrador') & (df_train['customer_position'] == 'gerente'), 'customer_position'] = 'manager'
df_train.loc[(df_train['customer_job'] == 'integrador') & (df_train['customer_position'] == 'manager'), 'customer_job'] = 'others'
df_train.loc[(df_train['customer_job'] == 'integrador') & (df_train['customer_position'] == 'ceo/fundador'), 'customer_position'] = 'ceo / founder'
df_train.loc[(df_train['customer_job'] == 'integrador') & (df_train['customer_position'] == 'ceo / founder'), 'customer_job'] = 'others'
df_train.loc[(df_train['customer_job'] == 'integrador') & (df_train['customer_position'] == 'installer'), 'customer_position'] = 'technical'
df_train.loc[(df_train['customer_job'] == 'integrador') & (df_train['customer_position'] == 'technical'), 'customer_job'] = 'others'
df_train.loc[(df_train['customer_job'] == 'integrador') & (df_train['customer_position'] == 'others'), 'customer_job'] = 'others'
df_train.loc[(df_train['customer_job'] == 'integrador') & (df_train['customer_position'] == 'consultant'), 'customer_job'] = 'others'

# manager
temp = df_train.loc[df_train['customer_job'] == 'manager', 'customer_job'].copy()
df_train.loc[df_train['customer_job'] == 'manager', 'customer_job'] = df_train.loc[df_train['customer_job'] == 'c-level executive', 'customer_position']
df_train.loc[df_train['customer_job'] == 'manager', 'customer_position'] = temp

# medical solution  provider
df_train = df_train[df_train['customer_job'] != 'owner / project manager']

# it support
df_train = df_train[df_train['customer_job'] != 'it support']

# bidder
df_train.loc[(df_train['customer_job'] == 'bidder') & (df_train['customer_position'] == 'manager'), 'customer_job'] = 'others'

# assist in serving food
df_train = df_train[df_train['customer_job'] != 'assist in serving food']

# support/facilitator, designer
df_train = df_train[df_train['customer_job'] != 'support/facilitator, designer']

# department secretary
df_train = df_train[df_train['customer_job'] != 'department secretary']

# service coordinator
df_train = df_train[df_train['customer_job'] != 'service coordinator']

# designer/ engineer
df_train = df_train[df_train['customer_job'] != 'designer/ engineer']

# design engineer
df_train = df_train[df_train['customer_job'] != 'design engineer']

# accounts payable
df_train = df_train[df_train['customer_job'] != 'accounts payable']

# adminisztráció
df_train = df_train[df_train['customer_job'] != 'adminisztráció']

## True

# account exec/manager # True
df_train = df_train[df_train['customer_job'] != 'account exec/manager']

# account management # True
df_train = df_train[df_train['customer_job'] != 'account management']

# 삭제할 직업 목록
jobs_to_remove = ['adminisztráció', 'administración', 'amministrativo', 'it administrator', 'project administrator', 
                  'administration', 'admin assistant', 'admin', 'administrative assistant', 'facility administrator', 
                  'imaging administrator', 'it admin', 'systems administrator', 'network administrator', 
                  'platform administrator', 'pacs administrator', 'művészet_és_design', 'arte y diseño', 
                  'arte_e_design', 'exhibition / convention center', 'graphic design', 'kreation und design', 
                  'kreation_und_design', 'designere / budget', 'sign company', 'interior stylist', 
                  'digital display vs signage need', 'master mind', 'var', 'sho lyrics', 'hardware selection', 
                  'replacement tv', 'guestroom tv', 'photos', 'developer', 'signage for an attraction', 
                  'architect ass interiores', 'art and design', 'designers', 'inquiry-to-buy/contact-us test', 
                  'design', 'design/decision maker', 'interior designer', 'designer, creative technologist', 
                  'creative director', 'lead designer', 'designer, producer', 'sliding pictures of beauty salon', 
                  'producer', 'fashion', 'design and provide equipment', 'designer', 'artist, lead on equipment selection',
                  'business owner', 'content creation, eq consultant', 'technology consultant', 'design consultant', 
                  'consultent', 'consultant,cabinet fabricator', 'consultant', 'consultant / purchaser', 
                  'quotation curator', 'educator', 'instructor', 'teacher', 'teaching', 'institute & academy', 
                  'higher education (college & university)', 'system engineer', 'systems engineer', 'director of engineering', 
                  'chief of engineering', 'principal engineer', 'hardware design engineer', 'senior design engineer', 
                  'solution engineer', 'engineering & technical', 'sales engineering', 'project engineer', 'lead engineer', 
                  'engineering & technical executive', 'chief engineer', 'engineer', 'engineering director', 'finanzas', 
                  'finanzen', 'finance executive', 'pénzügy', 'director of finance', 'cirugano', 'tierarzt', 'főorvos', 
                  'spécialiste_en_imagerie_médicale', 'profesional de cirugía', 'chirurgien', 'surgery professional\u200b', 
                  'profesional de radiología', 'healthcare professionals', 'mental health', 'doctor', 'hr', 'hr posting', 
                  'information technology\u200b', 'si', 'help desk / desktop services', 'application development', 
                  'software developer', 'cloud / mobility', 'collaboration & web apps', 'it tech.', 'director it', 
                  'it specialist', 'it director', 'director,it', 'director of it', "i'm directing it", 'it dairector', 
                  'it department', 'it project lead', 'it hardware technician', 'it - information technology', 'office it', 
                  'computing & it', 'systems designer', 'systems design', 'deputy cio', 'technical director', 
                  'informatics, touch capability', 'helpdesk specialist', 'head of technology', 'av technician', 
                  'technology designer', 'av tech', 'tech service', 'ownner-marketing director', 'product marketing', 
                  'technical marketing', 'marketing operations', 'marketing executive', 'store promotions', 
                  'signage for an attraction', 'event marketing', 'field marketing', 'advertising', 
                  'advertising and promotions team', 'marketing coordinator', 'strategic communications', 
                  'media_e_comunicazione', 'média_és_kommunikáció', 'medien_und_kommunikation', 'medios_de_comunicación', 
                  'media_and_communication', 'military_and_protective_services', 'commander', 'regional director of operations', 
                  'director of operations', 'strategy & operations specialist', 'operaciones', 'üzemeltetés', 'ops mgr', 
                  'equipment planner', 'operations executive', 'sales operations', 'facilities and operations', 
                  'parts coordinator', 'maintenance', 'hotel tv', 'equipment custodian', 'main end user of the product', 
                  'global lead of production', 'designer/pm/gc', 'product owner', 'digital project manager', 'gestión_de_proyectos', 
                  'program-_és_projektmenedzsment', 'programm- und projektmanagement', 'projektmenedzsment\tprogram and project management', 
                  'project manage', 'project designer', 'project sales/manage', 'genera manager', 'projectr mgmt', 
                  'programm-_und_projektmanagement', 'program_and_project_management', 'designer/ project manager', 
                  'a/v project manager', 'project manager/designer', 'r&d project manager', 'general manager - project manager', 
                  'project manager / principal', 'display our products', 'project manager / estimator', 'pm', 'project coordinator', 
                  'project lead', 'project facilitator', 'drop, purchase maxhub', 'buyer, coordinating', 'replacement tv', 
                  'guestroom tv', 'purchase dept', 'designer purchaser', 'purchsing', 'design/purchaser', 'purchasing supervisor', 
                  'procurement specialist', 'procurment', 'purchasers', 'sourcing/procurement', 'purchasing director', 
                  'sourcing', 'purchasing authority', 'purchasing coordinator', 'obtain quotes, process purchase', 
                  'planner/purchaser', 'procurement', 'director purchaser', 'purchase', 'purchasing agent', 
                  'testing and troubleshooting', 'property owner', 'building owner', 'architect/owner', 'product researcher', 
                  'project researcher', 'research products and prices', 'research & development', 'research and developement', 
                  'product research', 'research and instalaltion', 'értékesítés', 'vendite', 'recommender', 'vertrieb', 
                  'field / outside sales', 'sourcing / procurement', 'distributor quotation', 'distribuidor', 'sale', 
                  'sales rep', 'sales executive', 'salesman','it','president','system designer, integrator']

# 삭제할 직업 목록에 해당하는 행을 필터링하여 삭제합니다.
df_train = df_train[~df_train['customer_job'].isin(jobs_to_remove)]

In [9]:
# customer_job 2
# Accounting
change_list = ['accounting']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'accounting'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'accounting'
# Administrative
change_list = ['administrative']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'administrative'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'administrative'
    
# Arts and Design
change_list = ['museum / gallery', 'colorist', 'photographer', 'arts_and_design',
'arts and design', 'graphic/color art']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'arts and design'
    df_test.loc[df_test['customer_job'] == item,'customer_job'] =  'arts and design'
    
# Business Development
change_list = ['business_development', 'business development']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'business development'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'business development'
    
# Community and Social Services
change_list = ['community_and_social_services','community and social services']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'community and social services'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'community and social services'
    
# consulting
change_list = ['arquitecto/consultor',  'consulting']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'consulting'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'consulting'
    
# Curation
change_list = [ 'curation']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'curation'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'curation'
# Education
change_list = ['education', 'k12 school']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'education'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'education'
    
# Engineering
change_list = ['engineering']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'engineering'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'engineering'
    
# Entrepreneurship
change_list = ['entrepreneurship']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'entrepreneurship'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'entrepreneurship'
    
# Finance
change_list = ['finance']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'finance'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'finance'
    
# healthcare services
change_list = ['healthcare','healthcare_services','healthcare services','healthcare services']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'healthcare services'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'healthcare services'

# Human Resources
change_list = ['human resources', 'human_resources']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'human resources'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'human resources'

# information technology
change_list = ['information technology', 'software solution', 'it/software',
'information_technology', 'developer/property']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'information technology'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'information technology'
    
# Legal
change_list = ['legal']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'legal'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'legal'
    
# Marketing
change_list = ['marketing']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'marketing'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'marketing'
    
# Media and Communication
change_list = ['media and communications', 'media and communication', 'broadcasting & media']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'media and communication'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'media and communication'
    
#  Military and Protective Services
change_list = ['military and protective services']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'military and protective services'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'military and protective services'
    
# Operations
change_list = ['operations']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'operations'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'operations'
    
# Product Management
change_list = ['product management', 'recommend (you recommend specific products or technologies for the solution)',
'product_management']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'product management'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'product management'

# Program and Project Management
change_list = ['program and project management', 'planner',  'av project manager',     
'program_and_project_manager','program directors','project manager']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'program and project management'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'program and project management'
    
# Purchasing
change_list = ['purchasing', 'purchaser']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'purchasing'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'purchasing'
    
# Quality Assurance
change_list = ['quality assurance', 'quality_assurance']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'quality assurance'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'quality assurance'
    
# Real Estate
change_list = ['real estate']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'real estate'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'real estate'
    
# Research
change_list = ['research']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'research'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'research'
    
# Sales
change_list = ['sales', 'car dealership']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'sales'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'sales'
    
# Support
change_list = ['support']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'support'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'support'
    
# Others
change_list = ['others', 'other', 'egyéb','autres', 'n.a', 'sonstiges', 'altro', 'var', 
'contributor', 'otros', 'no requirment', 'otro', 'bidder']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'other'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'other'
    
# 'electronics & telco'
change_list = ['electronics & telco']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'electronics & telco'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'electronics & telco'

# 'surgery professional'
change_list = ['surgery professional']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'surgery professional'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'surgery professional'
    
# 'film productionl'
change_list = ['film production']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'film production'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'film production'
    
# 'medical imaging specialist'
change_list = ['medical imaging specialist', 'medical imaging specialist', 'medical imaging  specialist']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'medical imaging specialist'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'medical imaging specialist'
    
# 'clinical specialist'
change_list = ['clinical specialist','clinic','clinical specialist','clinical specialist']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'clinical specialist'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'clinical specialist'
    
# 'radiology professional'
change_list = ['radiology professional','radiology professional',
'radiology  professional', 'radiology_professional','radiology professional']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'radiology professional'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'radiology professional'
    
# 'pathologist'
change_list = ['pathologist']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'pathologist'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'pathologist'
    
# 'medical solution provider'
change_list = ['medical solution provider','medical solution  provider','medical solution provider\u200b']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = 'medical solution provider'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  'medical solution provider'

# '3d/vfx art'
change_list = ['3d/vfx art']
for item in change_list:
    df_train.loc[df_train['customer_job'] == item, 'customer_job'] = '3d/vfx art'
    df_test.loc[df_test['customer_job'] == item, 'customer_job'] =  '3d/vfx art'

### Customer_type

In [10]:
# Customer_type (hyuk)
# End Customer
change_list = ['End Customer', 'End-Customer','End-user','Corporate']
for item in change_list:
    df_train.loc[df_train['customer_type'] == item, 'customer_type'] = 'End-Customer'
    df_test.loc[df_test['customer_type'] == item, 'customer_type'] =  'End-Customer'
# Specifier / Influencer
change_list = ['Specifier / Influencer', 'Specifier/ Influencer','Consultant','Installer',
               'Technician','Installer/Contractor','Architect/Consultant']
for item in change_list:
    df_train.loc[df_train['customer_type'] == item, 'customer_type'] = 'Specifier/ Influencer'
    df_test.loc[df_test['customer_type'] == item, 'customer_type'] =  'Specifier/ Influencer'
# Channel Partner
change_list = ['Channel Partner','Reseller','Distributor']
for item in change_list:
    df_train.loc[df_train['customer_type'] == item, 'customer_type'] = 'channel partner'
    df_test.loc[df_test['customer_type'] == item, 'customer_type'] =  'channel partner'
# Service Partner
change_list = ['Service Partner']
for item in change_list:
    df_train.loc[df_train['customer_type'] == item, 'customer_type'] = 'Service Partner'
    df_test.loc[df_test['customer_type'] == item, 'customer_type'] =  'Service Partner'
# Solution Eco-Partner
change_list = ['Solution Eco-Partner']
for item in change_list:
    df_train.loc[df_train['customer_type'] == item, 'customer_type'] = 'Solution Eco-Partner'
    df_test.loc[df_test['customer_type'] == item, 'customer_type'] =  'Solution Eco-Partner'
# Developer
change_list = ['Developer']
for item in change_list:
    df_train.loc[df_train['customer_type'] == item, 'customer_type'] = 'Developer'
    df_test.loc[df_test['customer_type'] == item, 'customer_type'] =  'Developer'
# np.nan
change_list = ['Homeowner','Others','Etc.']
for item in change_list:
    df_train.loc[df_train['customer_type'] == item, 'customer_type'] = np.nan
    df_test.loc[df_test['customer_type'] == item, 'customer_type'] =  np.nan
    
# Administrator
df_train.loc[(df_train['customer_type'] == 'Administrator'), 'customer_job'] = 'administrative'        
df_train.loc[(df_train['customer_type'] == 'Administrator'), 'customer_type'] =np.nan

# Other
df_train = df_train[df_train['customer_type'] != 'Other']   

# 'System Integrator'
df_train = df_train[df_train['customer_type'] != 'System Integrator']   

# Commercial end-user
df_train = df_train[df_train['customer_type'] != 'Commercial end-user'] 

# Engineer
df_train = df_train[df_train['customer_type'] != 'Engineer'] 

# Manager / Director
df_train = df_train[df_train['customer_type'] != 'Manager / Director'] 

# HVAC Engineer
df_train = df_train[df_train['customer_type'] != 'HVAC Engineer'] 

# Software/Solution Provider
df_train = df_train[df_train['customer_type'] != 'Software/Solution Provider']

# Software / Solution Provider
df_train = df_train[df_train['customer_type'] != 'Software / Solution Provider'] 

# Dealer/Distributor
df_train = df_train[df_train['customer_type'] != 'Dealer/Distributor'] 

# Technical Assistant
df_train = df_train[df_train['customer_type'] != 'Technical Assistant'] 

# Interior Designer
df_train = df_train[df_train['customer_type'] != 'Interior Designer'] 

# Home Owner
df_train = df_train[df_train['customer_type'] != 'Home Owner']

### Customer Country

In [11]:
import pycountry

# 나라 채울 리스트 
countries = []

# 나라이름 불러오기
for country in pycountry.countries:
    countries.append(country.name)

# 나라이름에 이메일 들어가는 관측치 삭제
df_train = df_train[~df_train['customer_country'].str.contains('@', na=False)]

# 아예 결측치 인 값에 슬래시 두개 넣기
df_train['customer_country'][df_train['customer_country'].isna()] = '//'
df_test['customer_country'][df_test['customer_country'].isna()] = '//'

# , 를 /로 대치하고 맨 뒤에있는거 그냥 부름
df_train['country'] = df_train['customer_country'].str.replace(',', '/').str.split('/').map(lambda x: x[-1]).str.strip()
df_test['country'] = df_test['customer_country'].str.replace(',', '/').str.split('/').map(lambda x: x[-1]).str.strip()

# 기준 나라에 있는 이름이면 이름 불러오고 아니면 ''불러오기
df_train['customer_country2'] = np.where(df_train['country'].isin(countries), df_train['country'], '')
df_test['customer_country2'] = np.where(df_test['country'].isin(countries), df_test['country'], '')

# 나라의 딕셔너리 구성
country_dict = {"LGERA": "Russia",
"LGEUR": "Ukraine",
"LGEAP": "Australia",
"LGECH": "China",
"LGEHK": "China",
"LGEIL": "India",
"LGEIN": "Indonesia",
"LGEJP": "Japan",
"LGEKR": "Korea",
"LGEML": "Malaysia",
"LGEPH": "Philippines",
"LGESL": "Singapore",
"LGETT": "Taiwan, Province of China",
"LGETH": "Thailand",
"LGEAR": "Argentina",
"LGECZ": "Czechia",
"LGEFS": "France",
"LGEDG": "Germany",
"LGEAG": "Austria",
"LGEHS": "Greece",
"LGEMK": "Hungary",
"LGEIS": "Italy",
"LGEBN": "Netherlands",
"LGEPL": "Poland",
"LGEPT": "Portugal",
"LGERO": "Romania",
"LGEES": "Spain",
"LGESW": "Sweden",
"LGEUK": "United Kingdom",
"LGEAS": "Algeria",
"LGEEG": "Egypt",
"LGELF": "Jordan",
"LGEMC": "Morocco",
"LGESA": "The Republic of South Africa",
"LGEGF": "United Arab Emirates",
"LGEAF": "United Arab Emirates",
"LGETK": "Turkiye",
"LGECI": "Canada",
"LGEMX": "Mexico",
"LGEMS": "Mexico",
"LGEUS": "United States",
"LGECL": "Chile",
"LGECB": "Colombia",
"LGEPS": "Panama",
"LGEPR": "Peru",
"LGESJ": "Saudi Arabia",
"LGESP": "Brazil",
"LGEEF": "Kenya",
"LGEYK": "Palestine, State of",
"LGEEB": "en_EU", 
"LGEVH": "Viet Nam",
"LGELA": "Latvia",
"LGEIR": "Iran, Islamic Republic of",
"LGEBT": "Portugal"
               }

# 지사에 맞게 갑슬 넣어주기
df_train['customer_country3'] = np.where(df_train['customer_country2'] == '',
                                         df_train['response_corporate'].apply(lambda x: country_dict[x]),
                                         df_train['customer_country2'])
df_test['customer_country3'] = np.where(df_test['customer_country2'] == '',
                                         df_test['response_corporate'].apply(lambda x: country_dict[x]),
                                         df_test['customer_country2'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['customer_country'][df_train['customer_country'].isna()] = '//'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['customer_country'][df_test['customer_country'].isna()] = '//'


### Expected_Timeline

In [12]:
# 동혁
# Expected_Timeline
# Test에 존재하는 Data 중 Test에서 비슷한 구문이 있는 데이터 수정
df_train.loc[(df_train["expected_timeline"]=="less_than_3_months") | 
          (df_train["expected_timeline"]=="less than 3 months. customer not answered . to call back") |
          (df_train["expected_timeline"]=="less than 3 months- outdoor led requiment") |
          (df_train["expected_timeline"]=="3 months") |
          (df_train["expected_timeline"]=="less than 3 months ,meeting with the customer for the more details and tentative boq will ne 32 and 43") |
          (df_train["expected_timeline"]=="duplicate lead - il220100042906. less than 3 months"), "expected_timeline"] = "less than 3 months"
df_train.loc[df_train["expected_timeline"]=="3_months_~_6_months", "expected_timeline"] = "3 months ~ 6 months"
df_train.loc[df_train["expected_timeline"]== "more_than_a_year", "expected_timeline"] = "more than a year"
df_train.loc[(df_train["expected_timeline"]== "9_months_~_1_year") |
          (df_train["expected_timeline"]== "9 months - 1 year"), "expected_timeline"] = "9 months ~ 1 year"
df_train.loc[df_train["expected_timeline"]== "6_months_~_9_months", "expected_timeline"] = "6 months ~ 9 months"
df_train.loc[~df_train["expected_timeline"].isin(df_test["expected_timeline"].unique()), "expected_timeline"] = np.nan
# Train과 Test 모두 NaN이 절반 정도 차지하고 있고, 
# NaN 중 is_converted가 True인 경우가 존재하므로 NaN을 Unknown으로 채워줌
df_train["expected_timeline"].fillna("Unknown", inplace=True)
df_test["expected_timeline"].fillna("Unknown", inplace=True)

# Test에 있는 Data만 따로 뽑기, 나머지는 삭제 처리 
# / 현재 Column 삭제 시 Test에 있는 Unique Value가 삭제되는 상황이 있으므로 삭제 처리는 보류
# train = train[train["expected_timeline"].isin(test["expected_timeline"].unique())]

### Customer Position

In [13]:
# customer postition (new/hyuk)
# none + others 어떠한가?
# 1
# 'manager'
df_train = df_train[df_train['customer_position'] != 'av management']
df_train = df_train[df_train['customer_position'] != 'gerente']
# Manager
change_list = ['manager',  'gerente']
for item in change_list:
    df_train.loc[df_train['customer_position'] == item, 'customer_position'] = 'manager'
    df_test.loc[df_test['customer_position'] == item, 'customer_position'] =  'manager'

# 2
# 'others' 'homeowner','commercial consultant','software /solution provider','manufacturer','unpaid','medical device manufacturer','consultant','installer','hospital','distributor','employee','clinical specialist','medical solution provider'
df_train = df_train[df_train['customer_position'] != 'other - please specify - cedia association']
# other
change_list = ['others', 'other','homeowner','commercial consultant','software /solution provider','manufacturer','unpaid','medical device manufacturer','consultant','installer','hospital','distributor','sales','employee','clinical specialist','medical solution provider']
for item in change_list:
    df_train.loc[df_train['customer_position'] == item, 'customer_position'] = 'others'
    df_test.loc[df_test['customer_position'] == item, 'customer_position'] =  'others'

# 3
# 'director'
df_train = df_train[df_train['customer_position'] != 'principal & director']
df_train = df_train[df_train['customer_position'] != 'business unit director']
# Director
change_list = ['director']
for item in change_list:
    df_train.loc[df_train['customer_position'] == item, 'customer_position'] = 'director'
    df_test.loc[df_test['customer_position'] == item, 'customer_position'] =  'director'
   
# 4
# 'entry level'
df_train = df_train[df_train['customer_position'] != 'entrylevel']
# Entry Level
change_list = ['entry level']
for item in change_list:
    df_train.loc[df_train['customer_position'] == item, 'customer_position'] = 'entry level'
    df_test.loc[df_test['customer_position'] == item, 'customer_position'] =  'entry level'

# 5
# 'ceo/founder','president'
df_train = df_train[df_train['customer_position'] != 'founder']
df_train = df_train[df_train['customer_position'] != 'entrepreneurship']
# CEO / Founder
change_list = ['ceo/founder', 'ceo/fundador','president']
for item in change_list:
    df_train.loc[df_train['customer_position'] == item, 'customer_position'] = 'ceo / founder'
    df_test.loc[df_test['customer_position'] == item, 'customer_position'] =  'ceo / founder'

# 6
# 'partner'
df_train = df_train[df_train['customer_position'] != 'business partner']
# Partner
change_list = ['partner']
for item in change_list:
    df_train.loc[df_train['customer_position'] == item, 'customer_position'] = 'partner'
    df_test.loc[df_test['customer_position'] == item, 'customer_position'] =  'partner'

# 7
# 'c-level executive' , 'decision maker', 'decision-maker','decision influencer', 'decision-influencer'
df_train = df_train[df_train['customer_position'] != 'chairman']
df_train = df_train[df_train['customer_position'] != 'chief executive officer']
df_train = df_train[df_train['customer_position'] != 'leadership/executive office/owner']
# C-Level Executive
change_list = ['c-level executive', 'c-levelexecutive', 'decision maker', 'decision-maker','decision influencer', 'decision-influencer']
for item in change_list:
    df_train.loc[df_train['customer_position'] == item, 'customer_position'] = 'c-level executive'
    df_test.loc[df_test['customer_position'] == item, 'customer_position'] =  'c-level executive'

# 8
# 'vice president' 
df_train = df_train[df_train['customer_position'] != 'vp']
df_train = df_train[df_train['customer_position'] != 'vicepresident']
# Vice President
change_list = ['vice president']
for item in change_list:
    df_train.loc[df_train['customer_position'] == item, 'customer_position'] = 'vice president'
    df_test.loc[df_test['customer_position'] == item, 'customer_position'] =  'vice president'

# 9
# 'educator'
df_train = df_train[df_train['customer_position'] != 'education professional']
df_train = df_train[df_train['customer_position'] != 'teacher']
df_train = df_train[df_train['customer_position'] != 'senior lecturer']
df_train = df_train[df_train['customer_position'] != 'guest faculty']
df_train = df_train[df_train['customer_position'] != 'english trainer for ielts,toefl,pte,gre,sat exams.']
df_train = df_train[df_train['customer_position'] != 'academic specialist']
df_train = df_train[df_train['customer_position'] != 'neet/ olympiad expert faculty']
df_train = df_train[df_train['customer_position'] != 'quantitative aptitude faculty']
df_train = df_train[df_train['customer_position'] != 'professional trainer']
df_train = df_train[df_train['customer_position'] != 'career coach']
df_train = df_train[df_train['customer_position'] != 'teacher/middle school coordinator']
df_train = df_train[df_train['customer_position'] != 'principal at oxford integrated pu science college']
df_train = df_train[df_train['customer_position'] != 'english trainer for ielts,toefl,pte,gre,sat exams.']
# Educator
change_list = ['education', 'educator']
for item in change_list:
    df_train.loc[df_train['customer_position'] == item, 'customer_position'] = 'educator'
    df_test.loc[df_test['customer_position'] == item, 'customer_position'] =  'educator'

# 10
# customer
df_train = df_train[df_train['customer_position'] != 'end-user']
df_train = df_train[df_train['customer_position'] != 'commercial end-user']
# Customer
change_list = ['customer']
for item in change_list:
    df_train.loc[df_train['customer_position'] == item, 'customer_position'] = 'customer'
    df_test.loc[df_test['customer_position'] == item, 'customer_position'] =  'customer'  

# 11
# 'pgt chemistry'
change_list = ['pgt chemistry','chemistry teacher']
for item in change_list:
    df_train.loc[df_train['customer_position'] == item, 'customer_position'] = 'pgt chemistry'
    df_test.loc[df_test['customer_position'] == item, 'customer_position'] =  'pgt chemistry'  

# 12, 13
# 'assistant professor' , 'asst prof.'
df_train = df_train[df_train['customer_position'] != 'assistant professor of enlish']
change_list = ['assistant professor','asst prof.']
for item in change_list:
    df_train.loc[df_train['customer_position'] == item, 'customer_position'] = 'assistant professor'
    df_test.loc[df_test['customer_position'] == item, 'customer_position'] =  'assistant professor'  

# 14
# 'math and physics teacher' 
df_train = df_train[df_train['customer_position'] != 'maths lecturer']
# 'math and physics teacher' 
change_list = ['math and physics teacher','physics and mathematics teacher','physics teacher','science teacher','physics faculty']
for item in change_list:
    df_train.loc[df_train['customer_position'] == item, 'customer_position'] = 'math and physics teacher'
    df_test.loc[df_test['customer_position'] == item, 'customer_position'] =  'math and physics teacher'  

# 15
# 'academic coordinatorpost graduate teacher (accountancy, business studies)/ tgt (ict)' 
df_train = df_train[df_train['customer_position'] != 'hon dean']

# 16
# 'professor'
df_train = df_train[df_train['customer_position'] != 'associate professor']
df_train = df_train[df_train['customer_position'] != 'associate professor in electronics engg']
# professor' 
change_list = ['professor', 'prof.','professor of mathematics']
for item in change_list:
    df_train.loc[df_train['customer_position'] == item, 'customer_position'] = 'professor'
    df_test.loc[df_test['customer_position'] == item, 'customer_position'] =  'professor' 

# 17
# none
change_list = ['none','not applicable']
for item in change_list:
    df_train.loc[df_train['customer_position'] == item, 'customer_position'] = 'none'

##############################################################################################################

# 'administrative'
df_train = df_train[df_train['customer_position'] != 'administrative']
    
# 'pathologist'
df_train.loc[(df_train['customer_position'] == 'pathologist'), 'customer_job'] = 'pathologist'            
df_train.loc[(df_train['customer_position'] == 'pathologist'), 'customer_position'] = 'none'            

# 'research'
df_train.loc[(df_train['customer_position'] == 'research'), 'customer_position'] = 'others'     

#  'engineering'
df_train.loc[(df_train['customer_position'] == 'engineering'), 'customer_job'] = 'engineering'            
df_train.loc[(df_train['customer_position'] == 'engineering'), 'customer_position'] = 'others'    
            
# 'radiology professional''
df_train.loc[(df_train['customer_position'] == 'radiology professional'), 'customer_job'] = 'radiology professional'        
df_train.loc[(df_train['customer_position'] == 'radiology professional'), 'customer_position'] = 'others'   

# 'surgery professional'
df_train.loc[(df_train['customer_position'] == 'surgery professional'), 'customer_job'] = 'surgery professional'        
df_train.loc[(df_train['customer_position'] == 'surgery professional'), 'customer_position'] = 'none'   
            
# 'medical imaging specialist'
df_train.loc[(df_train['customer_job'] == 'other') & (df_train['customer_position'] == 'medical imaging specialist'), 'customer_job'] = 'medical imaging specialist'        
df_train.loc[(df_train['customer_job'] == 'medical imaging specialist') & (df_train['customer_position'] == 'medical imaging specialist'), 'customer_position'] = 'other' 
df_train.loc[(df_train['customer_position'] == 'medical imaging specialist'), 'customer_job'] = 'medical imaging specialist'     
df_train.loc[(df_train['customer_job'] == 'medical imaging specialist') & (df_train['customer_position'] == 'medical imaging specialist'), 'customer_position'] = 'none'             

# 'exhibition' 
df_train = df_train[df_train['customer_position'] != 'exhibition']
            
# 'cargo'
df_train = df_train[df_train['customer_position'] != 'cargo']
            
# 'architecture/consult'
df_train = df_train[df_train['customer_position'] != 'architecture/consult']

# 'no influence'
df_train = df_train[df_train['customer_position'] != 'no influence']
            
# 'architect/consultant'
df_train = df_train[df_train['customer_position'] != 'architecture/consultant']
            
# 'government'
temp = df_train.loc[df_train['customer_position'] == 'government', 'customer_position'].copy()
df_train.loc[df_train['customer_position'] == 'government', 'customer_position'] = df_train.loc[df_train['customer_position'] == 'government', 'customer_job']
df_train.loc[df_train['customer_position'] == 'government', 'customer_job'] = temp

# 'bulgaria'         
df_train = df_train[df_train['customer_position'] != 'bulgaria']

# 'pgt physics'
df_train = df_train[df_train['customer_position'] != 'pgt physics']

# exhibitiontv
df_train = df_train[df_train['customer_position'] != 'exhibitiontv']
            
# this is a consume display requirement for home purpose.
df_train.loc[(df_train['customer_position'] == 'this is a consume display requirement for home purpose.'), 'customer_position'] = 'others'   

# 'technical',
df_train = df_train[df_train['customer_position'] != 'technical']
            
# 'operations'
df_train = df_train[df_train['customer_position'] != 'operations']

# 'business development/sales'
df_train = df_train[df_train['customer_position'] != 'business development/sales']
            
# 'técnico'
df_train = df_train[df_train['customer_position'] !='técnico']
            
# 'system integrator'
df_train = df_train[df_train['customer_position'] !='system integrator']
            
# hospital
df_train = df_train[df_train['customer_position'] != 'hospital']
            
# 'lider de desarrollo'
df_train = df_train[df_train['customer_position'] != 'lider de desarrollo']         

# 'consulting'
df_train = df_train[df_train['customer_position'] != 'consulting']  
            
# 'manufacture'
df_train = df_train[df_train['customer_position'] != 'manufacture']      

# 'business development'
df_train = df_train[df_train['customer_position'] != 'business development']     
            
# 'subsidiary sales (ise)'
df_train = df_train[df_train['customer_position'] != 'subsidiary sales (ise)']      

# 'architect/consultant
df_train = df_train[df_train['customer_position'] != 'architect/consultant']

### inquiry_type

In [14]:
# inquiry_type
# Quotation or purchase consultation
change_list = ['Quotation or purchase consultation', 'quotation_or_purchase_consultation', 'Quotation or Purchase Consultation', 'Quotation or Purchase consultation', 'Purchase or Quotation', 'quotation_', 'Request for quotation or purchase', 'Purchase', 'Sales Inquiry', 'sales', 'Sales inquiry']
for item in change_list:
    df_train.loc[df_train['inquiry_type'] == item, 'inquiry_type'] = 'quotation or purchase consultation'
# Others
change_list = ['others', 'Others', 'other_' , 'Etc.' , 'Other', 'ETC.', 'other']
for item in change_list:
    df_train.loc[df_train['inquiry_type'] == item, 'inquiry_type'] =  'others'
    df_test.loc[df_test['inquiry_type'] == item, 'inquiry_type'] =  'others'
# Usage or Technical Consultation
change_list = ['Usage or technical consultation', 'Technical Support', 'usage or technical consultation', 'usage_or_technical_consultation', 'Usage or Technical Consultation', 'technical_consultation', 'Request for technical consulting', 'Technical Consultation', 'technical']
for item in change_list:
    df_train.loc[df_train['inquiry_type'] == item, 'inquiry_type'] = 'usage or technical consultation'
    df_test.loc[df_test['inquiry_type'] == item, 'inquiry_type'] = 'usage or technical consultation'
# Product Information
change_list = ['Product Information', 'i want to know the details about it', 
'Toi muon tim hieu thong tin ky thuat, gia ca cua sp de su dung', 
'first Info and pricing', 'estoy buscando para Ecuador este producto LG MAGNIT micro LED, para un cliente de 138 pulgadas, con envió marítimo.',
'Hola me pueden cotizar 19 pantallas interactivas de 100 pulgadas entregadas en Guayaquil -Ecuador.',
'display product', 'TV interactive', 'Display Textbook and photos', 'Hotel TV products',
'LED Signage', 'Video Wall', 'IDB', 'High inch 86 / 98 or 110' , 'AIO', 'VRF', 'Standalone', 'window facing product', 'Hospital TV', 'Pantallas Interactivas para Clinicas', 
'Preciso de um monitor médico para radiografia convencional e tomogrtafia.', 'Probeam precio', 'One Quick:Flex', 'Pantallas Interactivas para Clinicas']
for item in change_list:
    df_train.loc[df_train['inquiry_type'] == item, 'inquiry_type'] = 'product information'
# error
change_list = ['(Select ID_Needs)', 'Needs', 'Digital platform', 'Not specified', 'teach', 'for school', 'EDUCATIONAL EQUIPMENTS', 
'Evento_SdelEstero', 'Event Inquiry', 'tôi cần tham khảo giá và giải pháp từ LG','Solicito apoyo para realizar cotizacion de los dispositivos que ofrecen en la solución\xa0One Quick:\xa0',
'Vui lòng báo giá giúp mình sản phẩm đo thân nhiệt Xin cảm ơn' ]
for item in change_list:
    df_train.loc[df_train['inquiry_type'] == item, 'inquiry_type'] = 'error'

# 이상치 제거
df_train = df_train[df_train['inquiry_type'] != 'Intégrateur historique du George V']

### product_category

In [15]:
train = df_train.copy()
test = df_test.copy()

In [16]:
# NA는 Not Available 집단으로 반영
train['product_category'][train['product_category'].isna()] = 'NOT-AVAILABLE'
test['product_category'][test['product_category'].isna()] = 'NOT-AVAILABLE'

# 다중 선택은 다중선택 끼리 묶자
train['product_category'][train['product_category'].str.contains(',')] = 'MULTI-SELECTED'
test['product_category'][test['product_category'].str.contains(',')] = 'MULTI-SELECTED'

# 데톱 & 노트북
train['product_category'][train['product_category'].isin(["notebook", 'laptop', 'pc'])] = 'PC'
test['product_category'][test['product_category'].isin(["notebook", 'laptop', 'pc'])] = 'PC'

# display
train['product_category'][train['product_category'].str.contains('display')] = 'DISPLAY'
test['product_category'][test['product_category'].str.contains('display')] = 'DISPLAY'

train['product_category'][train['product_category'].isin(['oled 顯示屏', 
                                                          'led aio 136',
                                                          'khác', 'id',
                                                          '互動式顯示屏',
                                                          '標準顯示屏', 'lg magnit',
                                                          'medical- surgical', 'corpouh5f',
                                                          '高亮度顯示屏', 'corpuh5f-',
                                                         ])] = 'DISPLAY'
test['product_category'][test['product_category'].isin(['oled 顯示屏',
                                                        'led aio 136',
                                                        'khác', 'id',
                                                        '互動式顯示屏',
                                                        '標準顯示屏', 'lg magnit',
                                                        'medical- surgical', 'corpouh5f',
                                                        '高亮度顯示屏', 'corpuh5f-',
                                                       ])] = 'DISPLAY'

# 기타
train['product_category'][train['product_category'].isin(['etc.', 'other', 'others', 'error',
                                                         'not specified', 'vb.'])] = 'O/W'
test['product_category'][test['product_category'].isin(['etc.', 'other', 'others', 'error',
                                                       'not specified', 'vb.'])] = 'O/W'

# 태양열 전지는 Solar로
train['product_category'][train['product_category'].isin(['ess', 'solar', 'energy storage system'])] = 'SOLAR'
test['product_category'][test['product_category'].isin(['ess', 'solar', 'energy storage system'])] = 'SOLAR'

# solution
train['product_category'][train['product_category'].str.contains('solution')] = 'SOLUTION'
test['product_category'][test['product_category'].str.contains('solution')] = 'SOLUTION'

train['product_category'][train['product_category'].isin(['pro centric hotel',
                                                          'single package',
                                                          'lg customer care program',
                                                          'technical support',
                                                          'sales inquiry', 'services', 'inne',
                                                         ])] = 'SOLUTION'
test['product_category'][test['product_category'].isin(['pro centric hotel',
                                                        'single package',
                                                        'lg customer care program',
                                                        'technical support',
                                                        'sales inquiry', 'services', 'inne',
                                                       ])] = 'SOLUTION'

# soft ware
train['product_category'][train['product_category'].isin(['webos',
                                                          'cloud device',
                                                          'pro:centric',
                                                          '軟體',
                                                          'procentric',
                                                         ])] = 'SOFTWARE'
test['product_category'][test['product_category'].isin(['webos',
                                                        'cloud device',
                                                        'pro:centric',
                                                        '軟體',
                                                        'procentric',
                                                        ])] = 'SOFTWARE'

# 간판
train['product_category'][train['product_category'].str.contains('signage')] = 'SIGNAGE'
test['product_category'][test['product_category'].str.contains('signage')] = 'SIGNAGE'

train['product_category'][train['product_category'].str.contains(':')] = 'SIGNAGE'
test['product_category'][test['product_category'].str.contains(':')] = 'SIGNAGE'

train['product_category'][train['product_category'].isin(['video wall', 'ur640', 'ur640s',
                                                          'transparent oled', 'high brightness',
                                                          'meeting & screen sharedirect view leddirect view led',
                                                          'lg led bloc', 'videwall', 'אחר',
                                                          'pantalla led outdoor', 'ultra stretch series',
                                                          'led cinema', 'led', 'standard',
                                                          'ledallinone', 'video wall + aio',
                                                          'interactive digital board', 'education createboard',
                                                          '110 + video wall', 'idb', 'aio',
                                                          'aio | one quick', 'allinone_rmk', 'leadallin',
                                                          'one quick works', 'onequick series',
                                                          'laec015', 'laec015-gn.awz', '43uh5f-h.awzm',
                                                          'virtual production', 'gscd046', '49vl5g-m.awzm',
                                                          '49vl5g-m', '55vm5e-a', 'laec15', '55vm5j-h',
                                                          '49vl5f', '特別顯示屏', 'videowall_rmk',
                                                          '86uh5f', '55tc3d', 'retaildigital',
                                                          '55svh7f-a', 'tr3', '98uh5e', 'gsca046',
                                                          'gscd100', 'lsca039', 'essential series',
                                                         ])] = 'SIGNAGE'
test['product_category'][test['product_category'].isin(['video wall', 'ur640', 'ur640s',
                                                        'transparent oled', 'high brightness',
                                                        'meeting & screen sharedirect view leddirect view led',
                                                        'lg led bloc', 'videwall', 'אחר',
                                                        'pantalla led outdoor', 'ultra stretch series',
                                                        'led cinema', 'led', 'standard',
                                                        'ledallinone', 'video wall + aio',
                                                        'interactive digital board', 'education createboard',
                                                        '110 + video wall', 'idb', 'aio',
                                                        'aio | one quick', 'allinone_rmk', 'leadallin',
                                                        'one quick works', 'onequick series',
                                                        'laec015', 'laec015-gn.awz', '43uh5f-h.awzm',
                                                        'virtual production', 'gscd046', '49vl5g-m.awzm',
                                                        '49vl5g-m', '55vm5e-a', 'laec15', '55vm5j-h',
                                                        '49vl5f', '特別顯示屏', 'videowall_rmk',
                                                        '86uh5f', '55tc3d', 'retaildigital',
                                                        '55svh7f-a', 'tr3', '98uh5e', 'gsca046',
                                                        'gscd100', 'lsca039', 'essential series',
                                                       ])] = 'SIGNAGE'

# tv
train['product_category'][train['product_category'].str.contains('tv')] = 'TV'
test['product_category'][test['product_category'].str.contains('tv')] = 'TV'

train['product_category'][train['product_category'].str.contains('pol')] = 'TV'
test['product_category'][test['product_category'].str.contains('pol')] = 'TV'

train['product_category'][train['product_category'].isin(['led 顯示屏',
                                                          'led 70m2', 'high inch 86 / 98 or 110',
                                                          'standalone', '32lq621cbsb.awz',
                                                          'fhd series', '50us660h0sd.bwz',
                                                          '43us660h0sd.awz', '49uh / 49xf',
                                                          '50uq801c0sb.bwz', 'lainnya', 'uh',
                                                          '55us660h0sd.bwz', '55uq801c0sb.bwz',
                                                          '43uq751c0sf.bwz', '43uq751c0sb.bwz',
                                                          'hospitality', '43us660h (na)',
                                                          'hoteleria_us670h', '醫院電視',
                                                          'autre',
                                                         ])] = 'TV'
test['product_category'][test['product_category'].isin(['led 顯示屏',
                                                        'led 70m2', 'high inch 86 / 98 or 110',
                                                        'standalone', '32lq621cbsb.awz',
                                                        'fhd series', '50us660h0sd.bwz',
                                                        '43us660h0sd.awz', '49uh / 49xf',
                                                        '50uq801c0sb.bwz', 'lainnya', 'uh',
                                                        '55us660h0sd.bwz', '55uq801c0sb.bwz',
                                                        '43uq751c0sf.bwz', '43uq751c0sb.bwz',
                                                        'hospitality', '43us660h (na)',
                                                        'hoteleria_us670h', '醫院電視',
                                                        'autre',
                                                       ])] = 'TV'

# vrf
train['product_category'][train['product_category'].str.contains('vrf')] = 'VRF'
test['product_category'][test['product_category'].str.contains('vrf')] = 'VRF'

train['product_category'][train['product_category'].str.contains('multi v')] = 'VRF'
test['product_category'][test['product_category'].str.contains('multi v')] = 'VRF'

train['product_category'][train['product_category'].isin(['حلول التدفئة',
                                                         ])] = 'VRF'
test['product_category'][test['product_category'].isin(['حلول التدفئة',
                                                       ])] = 'VRF'
# 악세서리
train['product_category'][train['product_category'].isin(['accessories',
                                                          'outros', 'otros',
                                                          '酒店電視',
                                                          'ฯลฯ', 'parts'
                                                         ])] = 'ACCESSORIES'
test['product_category'][test['product_category'].isin(['accessories',
                                                        'outros', 'otros',
                                                        '酒店電視',
                                                        'ฯลฯ', 'parts'
                                                       ])] = 'ACCESSORIES'


# 에어컨
train['product_category'][train['product_category'].isin(["single-split", "multi-split", 
                                                          'single split', 'multi split',
                                                          "rac", "residential air conditioner",
                                                          'system air conditioner', 'تكييف وتبريد',
                                                          'multi-split (plusieurs pièces)',
                                                          'klimatyzacja multi-split',
                                                          'ar condicionado residencial',
                                                          'teto ou cassete inverter', 'scroll compressor',
                                                          'điều hòa trung tâm multi',
                                                          'multi inverter', 'MULTI-SELECTED',
                                                          'split tunggal', 'מזגנים למקום מגורים',
                                                          'pendingin', 'تكييفات', 'điều hòa gia dụng',
                                                          'เครื่องปรับอากาศเผื่อที่อยู่อาศัย', 'climatiseur résidentiel', 
                                                          'điều hòa cục bộ',
                                                         ])] = 'AIRCON'

test['product_category'][test['product_category'].isin(["single-split", "multi-split", 
                                                        'single split', 'multi split',
                                                        "rac", "residential air conditioner",
                                                        'system air conditioner', 'تكييف وتبريد',
                                                        'multi-split (plusieurs pièces)',
                                                        'klimatyzacja multi-split',
                                                        'ar condicionado residencial',
                                                        'teto ou cassete inverter', 'scroll compressor',
                                                        'multi inverter', 'MULTI-SELECTED',
                                                        'split tunggal', 'מזגנים למקום מגורים',
                                                        'pendingin', 'تكييفات',
                                                        'điều hòa trung tâm multi', 'điều hòa gia dụng',
                                                        'เครื่องปรับอากาศเผื่อที่อยู่อาศัย', 'climatiseur résidentiel', 
                                                        'điều hòa cục bộ','điều hòa cục bộ',
                                                       ])] = 'AIRCON'

train['product_category'][train['product_category'].str.contains('ac')] = 'AIRCON'
test['product_category'][test['product_category'].str.contains('ac')] = 'AIRCON'

# heating
train['product_category'][train['product_category'].str.contains('heat')] = 'HEATING'
test['product_category'][test['product_category'].str.contains('heat')] = 'HEATING'

train['product_category'][train['product_category'].isin(['חימום', 
                                                          'isıtma',
                                                          'ogrzewanie (pompy ciepła)',
                                                          'aquecimento',
                                                         ])] = 'HEATING'
test['product_category'][test['product_category'].isin(['חימום',
                                                        'isıtma',
                                                        'ogrzewanie (pompy ciepła)',
                                                        'aquecimento',
                                                       ])] = 'HEATING'

# robot
train['product_category'][train['product_category'].isin(['robots', 'cloud device'])] = 'O/W'
test['product_category'][test['product_category'].isin(['robots', 'cloud device'])] = 'O/W'

# monitor
train['product_category'][train['product_category'].str.contains('monitor')] = 'MONITOR'
test['product_category'][test['product_category'].str.contains('monitor')] = 'MONITOR'

train['product_category'][train['product_category'].isin(['28mq780',
                                                         'פיצול מרובה',
                                                         'a definir',
                                                         ])] = 'MONITOR'
test['product_category'][test['product_category'].isin(['28mq780',
                                                        'פיצול מרובה',
                                                        'a definir',
                                                       ])] = 'MONITOR'

# control
train['product_category'][train['product_category'].str.contains('control')] = 'CONTROL'
test['product_category'][test['product_category'].str.contains('control')] = 'CONTROL'


train['product_category'][train['product_category'].isin(['unitario'])] = 'CONTROL'
test['product_category'][test['product_category'].isin(['unitario'])] = 'CONTROL'


# control
train['product_category'][train['product_category'].str.contains('ventilation')] = 'VENTILATION'
test['product_category'][test['product_category'].str.contains('ventilation')] = 'VENTILATION'

# chiller
train['product_category'][train['product_category'].str.contains('chiller')] = 'CHILLER'
test['product_category'][test['product_category'].str.contains('chiller')] = 'CHILLER'

train['product_category'][train['product_category'].isin(['مبرد (تشيلر)',
                                                          'refrigerator', 'ahu',
                                                          'soğutucu', 'آخر',
                                                          'systèmes de débit à réfrigérant variable (drv)',
                                                         ])] = 'CHILLER'
test['product_category'][test['product_category'].isin(['مبرد (تشيلر)',
                                                        'refrigerator', 'ahu',
                                                        'soğutucu', 'آخر',
                                                        'systèmes de débit à réfrigérant variable (drv)',
                                                       ])] = 'CHILLER'

# projector
train['product_category'][train['product_category'].str.contains('projector')] = 'PROJECTOR'
test['product_category'][test['product_category'].str.contains('projector')] = 'PROJECTOR'

train['product_category'][train['product_category'].isin(['bu50nst',
                                                         ])] = 'PROJECTOR'
test['product_category'][test['product_category'].isin(['bu50nst',
                                                       ])] = 'PROJECTOR'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['product_category'][train['product_category'].isna()] = 'NOT-AVAILABLE'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['product_category'][test['product_category'].isna()] = 'NOT-AVAILABLE'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['product_category'][train['product_category'].str.contains(',')] = 'MULTI-SELECTED'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/p

In [17]:
df_train = train.copy()
df_test = test.copy()

### Enterprise

In [18]:
# Enterprise (Label Incoding 덕분에 필요없음 ㅇㅇ)
# 'Enterprise' 이면 1, 'SMB' 이면 0으로 변환
df_train['enterprise'] = df_train['enterprise'].map({'Enterprise' : 1, 'SMB' : 0})
df_test['enterprise'] = df_test['enterprise'].map({'Enterprise' : 1, 'SMB' : 0})

### Continent

In [19]:
train = df_train.copy()
test = df_test.copy()

In [20]:
# 반영이 안되는 값들 반영 되는 값으로 변경
train['customer_country3'][train['customer_country3'] == 'Turkiye'] = 'Türkiye'
train['customer_country3'][train['customer_country3'] == 'The Republic of South Africa'] = 'South Africa'
train['customer_country3'][train['customer_country3'] == 'Korea'] = 'South Korea'

# 함수 만들기
def country_to_continent(country_name):
    country_alpha2 = pc.country_name_to_country_alpha2(country_name)
    country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
    country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    return country_continent_name

# 있으면 대륙이름 없으면 나라이름 대입하기
cont = []
for i in train['customer_country3']:
    try:
        cont.append(country_to_continent(i))
    except:
        cont.append(i)

# test에서도 똑같이
cont2 = []
for i in test['customer_country3']:
    try:
        cont2.append(country_to_continent(i))
    except:
        cont2.append(i)

# 칼럼 만들기
train['continent'] = cont
test['continent'] = cont2

# EU는 유럽
train['continent'][train['continent'] == 'en_EU'] = 'Europe'
test['continent'][test['continent'] == 'en_EU'] = 'Europe'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['customer_country3'][train['customer_country3'] == 'Turkiye'] = 'Türkiye'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['customer_country3'][train['customer_country3'] == 'The Republic of South Africa'] = 'South Africa'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['customer_country3'][train['customer_country3'] == 'Korea'] = 'South Korea'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: htt

In [21]:
df_train = train.copy()
df_test = test.copy()

In [22]:
# 생성한 열 삭제
df_train = df_train.drop(['customer_country2', 'country'
                          ,'customer_country.1'
                          ,'customer_country'], axis = 1)
df_test = df_test.drop(['customer_country2', 'country'
                        ,'customer_country.1'
                        ,'customer_country'], axis = 1)

In [23]:
# 생성한 열 삭제
df_train = df_train.drop(['id_strategic_ver','it_strategic_ver', 'ver_win_rate_x', 'ver_win_ratio_per_bu',
                          'com_reg_ver_win_rate', 'product_subcategory' , 'product_modelname'
                          ], axis = 1)
df_test = df_test.drop(['id_strategic_ver','it_strategic_ver', 'ver_win_rate_x', 'ver_win_ratio_per_bu',
                          'com_reg_ver_win_rate', 'product_subcategory' , 'product_modelname'
                          ], axis = 1)

In [24]:
# 임시 파일 저장
df_train.to_csv('new_imputation_train2.csv', index = False)
df_test.to_csv('new_imputation_test2.csv', index = False)