# 영업 성공 여부 분류 경진대회

# 1. 데이터 확인

### 필수 라이브러리

In [None]:
import pandas as pd
import numpy as np
import re
from pycaret.classification import *
import category_encoders as ce
from time import time

### 데이터 셋 읽어오기

In [None]:
# Lift pandas' display limits so that every row and every column of a
# DataFrame is printed instead of being truncated.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

In [None]:
# Load the two competition files from the working directory: the training
# data, and the submission file whose rows serve as the test data.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "submission.csv")
)

# Alternative for Google Colab: mount Drive and read the files from there.
#from google.colab import drive
#drive.mount('/content/drive')

#df_train = pd.read_csv("/content/drive/My Drive/lg_aimers/train.csv")
#df_test = pd.read_csv("/content/drive/My Drive/lg_aimers/original_submission.csv")
#city_list = pd.read_csv('/content/drive/My Drive/lg_aimers/oecd_city.csv').iloc[1:,0:1]

# 2. 전처리

### inquiry_type 전처리

In [None]:
# Clean up inquiry_type.
# Lower-case first: the alias lists below are all lower-case, so running the
# replacements before lower-casing would miss capitalised variants.
df_train['inquiry_type'] = df_train['inquiry_type'].str.lower()

# Spellings that all mean "other".
similar_categories = ['others', 'other_', 'etc.']
# Assign the result back rather than calling replace(..., inplace=True) on the
# selected column: the chained in-place form is deprecated and does not
# update df_train when pandas Copy-on-Write is enabled.
df_train['inquiry_type'] = df_train['inquiry_type'].replace(similar_categories, 'other')

# Near-duplicate categories and the canonical label each one is merged into.
similar_categories_mapping = {
    'quotation_or_purchase_consultation': 'quotation or purchase consultation',
    'technical consultation': 'technical support',
    'technical': 'technical support',
    'sales': 'sales inquiry'
}
df_train['inquiry_type'] = df_train['inquiry_type'].replace(similar_categories_mapping)

# Fold every category with 100 or fewer rows into a single 'undefined' bucket.
category_counts = df_train['inquiry_type'].value_counts()
categories_to_remove = category_counts[category_counts <= 100].index
df_train.loc[df_train['inquiry_type'].isin(categories_to_remove), 'inquiry_type'] = 'undefined'
df_train['inquiry_type'].value_counts()

quotation or purchase consultation    42138
sales inquiry                         10081
other                                  1401
product information                    1237
usage or technical consultation        1180
technical support                       547
trainings                               434
services                                415
request for partnership                 297
request for quotation or purchase       230
undefined                               214
request a demo                          184
Name: inquiry_type, dtype: int64

### customer_type 전처리

In [None]:
# Clean up customer_type: lower-case, then strip spaces, slashes, underscores
# and hyphens so that spelling variants collapse into one token.
# The original chain also replaced '//' and leading/trailing '/', but those
# steps ran after every '/' had already been removed and could never match;
# a single character-class substitution yields the same values.
df_train['customer_type'] = (
    df_train['customer_type']
    .str.lower()
    .str.replace(r'[ /_-]', '', regex=True)
)

# Fold every category with 100 or fewer rows into a single 'undefined' bucket.
category_counts = df_train['customer_type'].value_counts()
categories_to_remove = category_counts[category_counts <= 100].index
df_train.loc[df_train['customer_type'].isin(categories_to_remove), 'customer_type'] = 'undefined'
df_train['customer_type'].value_counts()

endcustomer            10643
specifierinfluencer     2568
channelpartner          1368
servicepartner           349
undefined                264
solutionecopartner       146
Name: customer_type, dtype: int64

### customer_job 전처리

In [None]:
        # Merge spelling variants, translations and near-synonyms of customer_job
        # into canonical labels.
        #
        # The rules run strictly top to bottom and later rules see the output of
        # earlier ones (e.g. rule 9 produces 'avtechnician', which rule 83 then
        # turns into 'tech'), so the order must not be changed.
        #
        # Fixed here: an earlier search-and-replace had corrupted several string
        # literals ('x' -> 'df_train', 'all' -> 'train'), e.g. 'edf_trainecutive'
        # for 'executive' and 'insttrainer' for 'installer'; those aliases could
        # never match. The per-element lambdas, whose parameter shadowed the
        # DataFrame name df_train, are replaced by isin/mask.
        #
        # NOTE(review): this cell is indented as if lifted from a function body;
        #   it needs to be dedented (or wrapped in a def) to run as its own cell.
        # NOTE(review): the aliases are lower-case and space-free, but the cell
        #   that lower-cases/strips customer_job appears *after* this one, and that
        #   cell also removes '/', which several aliases still contain. Confirm
        #   the intended execution order and normalisation.
        jobs = df_train['customer_job']

        # 1-3.
        jobs = jobs.replace('others', 'other')
        jobs = jobs.replace('accountexec/manager', 'accountmanagement')
        jobs = jobs.replace('accountspayable', 'accounting')
        specific_values = ['admin', 'administración', 'administration', 'administrative', 'adminisztráció', 'amministrativo',
                           'imagingadministrator', 'itadmin', 'itadministrator', 'networkadministrator', 'pacsadministrator',
                           'platformadministrator', 'systemsadministrator']
        jobs = jobs.mask(jobs.isin(specific_values), 'admin')

        # 4.
        jobs = jobs.replace('administrativeassistant', 'adminassistant')

        # 5.
        jobs = jobs.replace('advertisingandpromotionsteam', 'advertising')
        jobs = jobs.replace('storepromotions', 'advertising')
        jobs = jobs.replace('tradeshowevent', 'advertising')

        # 6.
        specific_values2 = ['architect', 'architect/owner', 'architectassinteriores', 'arquitecto/consultor', 'projectarchitect']
        jobs = jobs.mask(jobs.isin(specific_values2), 'architect')

        # 7.
        specific_values3 = ['artanddesign','arte_e_design', 'arteydiseño','artist,leadonequipmentselection', 'arts_and_design', 'artsanddesign']
        jobs = jobs.mask(jobs.isin(specific_values3), 'artist')

        # 8.
        specific_values4 = ['assistinservingfood', 'serving', 'servingfood', 'servingrobot', 'waiter']
        jobs = jobs.mask(jobs.isin(specific_values4), 'serving')

        # 9.
        specific_values5 = ['a/vprojectmanager', 'avestimator', 'avprojectmanager', 'avtech', 'avtechnician']
        jobs = jobs.mask(jobs.isin(specific_values5), 'avtechnician')

        # 10.
        specific_values6 = ['authorize(youareresponsibleformakingthefinaldecision)', 'purchasingauthority']
        jobs = jobs.mask(jobs.isin(specific_values6), 'authorizer')

        # 11.
        jobs = jobs.replace('publicbidder', 'bidder')

        # 12.
        specific_values7 = ['business_development', 'businessdevelopment']
        jobs = jobs.mask(jobs.isin(specific_values7), 'businessdevelopment')

        # 13.
        specific_values8 = ['cctvmonetoring', 'cctvview']
        jobs = jobs.mask(jobs.isin(specific_values8), 'cctvoperator')

        # 14.
        specific_values9 = ['ceo', 'ceo/founder', 'chief', 'clevelexecutive', 'coo', 'decider', 'decisionmaker',
                            'finalapproval', 'head', 'president', 'presidentforsennco', 'thebigboss', 'underboss',
                            'vicepresident', 'vp/gm']
        jobs = jobs.mask(jobs.isin(specific_values9), 'chief')

        # 15.
        specific_values10 = ['chiefeng', 'chiefengineer', 'chiefofengineering']
        jobs = jobs.mask(jobs.isin(specific_values10), 'chiefengineer')

        # 16.
        specific_values11 = ['chirurgien', 'cirugano', 'doctor', 'főorvos',  'profesionaldecirugía', 'surgeryprofessional',
                             'surgeryprofessional\u200b']
        jobs = jobs.mask(jobs.isin(specific_values11), 'doctor')

        # 17.
        specific_values12 = ['contractor', 'cintractor', 'managingcontractor']
        jobs = jobs.mask(jobs.isin(specific_values12), 'contractor')

        # 18.
        specific_values13 = ['consultant', 'consultant,cabinetfabricator', 'consultant/purchaser', 'consultent', 'consulting']
        jobs = jobs.mask(jobs.isin(specific_values13), 'consultant')

        # 19.
        specific_values14 = ['coordinator', 'corporate/office', 'correspondence', 'costaravteam']
        jobs = jobs.mask(jobs.isin(specific_values14), 'corporate')

        # 20.
        specific_values15 = ['contentcreation,eqconsultant', 'creativedirector']
        jobs = jobs.mask(jobs.isin(specific_values15), 'creator')

        # 21.
        specific_values16 = ['design', 'design/build', 'design/purchaser', 'designandprovideequipment', 'designer',
                             'designer,creativetechnologist', 'designer,producer', 'designers', 'designere/budget',
                             'designerpurchaser', 'design/purchaser', 'graphicdesign', 'kreation_und_design',
                             'kreationunddesign', 'művészet_és_design']
        jobs = jobs.mask(jobs.isin(specific_values16), 'designer')

        # 22.
        specific_values17 = ['design/install/training/support', 'designandinstall', 'designandinstallationcompany', 'designer/installer']
        jobs = jobs.mask(jobs.isin(specific_values17), 'designinstaller')

        # 23.
        specific_values18 = ['designengineer', 'designer/engineer']
        jobs = jobs.mask(jobs.isin(specific_values18), 'designengineer')

        # 24.
        specific_values19 = ['desicionmaker', 'design/decisionmaker', 'generalmanager(decisionmaker)', 'technical/decisionmaker']
        jobs = jobs.mask(jobs.isin(specific_values19), 'decisionmaker')

        # 25.
        specific_values20 = ['designer/pm/gc', 'designer/projectmanager']
        jobs = jobs.mask(jobs.isin(specific_values20), 'desingpm')

        # 26.
        specific_values21 = ['developer', 'developer/property', 'softwaredeveloper']
        jobs = jobs.mask(jobs.isin(specific_values21), 'developer')

        # 27.
        specific_values22 = ['digitaldisplayvssignageneed', 'digitalsignage']
        jobs = jobs.mask(jobs.isin(specific_values22), 'digitalsignage')

        # 28.
        specific_values23 = ['directeurtechnique', 'director', 'director,it', 'directorcomercial',
                             'directorit', 'directorofengineering', 'directoroffinance', 'directorofit',
                             'directoroflodging','directorofoperations', 'directorpurchaser', 'engineeringdirector',
                             'f&bdirectorforbicyclecasino', "i'mdirectingit", 'itdairector',  'itdirector',  'managingdirector',
                             'overseer',  'projectdirector', 'purchasingdirector', 'supervisor', 'technicaldirector']
        jobs = jobs.mask(jobs.isin(specific_values23), 'director')

        # 29.
        specific_values24 = ['distribuidor', 'distributor', 'distributorquotation']
        jobs = jobs.mask(jobs.isin(specific_values24), 'distributor')

        # 30.
        specific_values25 = ['education', 'educator', 'highereducation(college&university)', 'institute&academy']
        jobs = jobs.mask(jobs.isin(specific_values25), 'educator')

        # 31.
        specific_values26 = ['engineer', 'engineering', 'engineering&technical', 'engineering,design,andinstall',
                             'hardwaredesignengineer', 'projectengineer', 'principalengineer', 'seniordesignengineer',
                             'systemengineer', 'systemsengineer']
        jobs = jobs.mask(jobs.isin(specific_values26), 'engineer')

        # 32.
        specific_values27 = ['engagementexecutive', 'engineering&technicalexecutive', 'executive', 'execution',
                             'marketingexecutive', 'financeexecutive', 'operationsexecutive', 'principal',  'principalincharge',
                             'salesexecutive']
        jobs = jobs.mask(jobs.isin(specific_values27), 'executive')

        # 33.
        specific_values28 = ['eventmarketing', 'fieldmarketing', 'marketing', 'marketingcoordinator',
                             'marketingoperations', 'productmarketing', 'technicalmarketing']
        jobs = jobs.mask(jobs.isin(specific_values28), 'marketing')

        # 34.
        specific_values29 = ['equipmentandappprovider', 'equipmentcustodian', 'equipmentplanner', 'equipmentselection']
        jobs = jobs.mask(jobs.isin(specific_values29), 'equipment')

        # 35.
        specific_values30 = ['facilitator', 'facilitatorinstallationservices', 'facilities', 'facilitiesandoperations',
                             'facilityadministrator', 'facilitymanager']
        jobs = jobs.mask(jobs.isin(specific_values30), 'facilitator')

        # 36.
        specific_values31 = ['field/outsidesales', 'sale', 'sales', 'salesman', 'salesmanager',
                             'salesoperations', 'technicalsales', 'salesrep', 'salesengineering', 'sellerinstaller',
                             'vendite', 'vertrieb', 'értékesítés']
        jobs = jobs.mask(jobs.isin(specific_values31), 'sales')

        # 37.
        specific_values32 = ['finance', 'finanzas', 'finanzen', 'pénzügy']
        jobs = jobs.mask(jobs.isin(specific_values32), 'finance')

        # 38.
        specific_values33 = ['gc', 'generalcontractor']
        jobs = jobs.mask(jobs.isin(specific_values33), 'generalcontractor')

        # 39.
        specific_values34 = ['genelmüdür', 'generalmanagement', 'generalmanager', 'generalmanagerpurchaser', 'generamanager',
                             'globalleadofproduction','gm', 'gm/partowner']
        jobs = jobs.mask(jobs.isin(specific_values34), 'generalmanager')

        # 40.
        specific_values35 = ['generalmanagerprojectmanager', 'gestión_de_proyectos']
        jobs = jobs.mask(jobs.isin(specific_values35), 'generalprojectmanager')

        # 41.
        specific_values36 = ['hr','human_resources', 'humanresources', 'hrposting']
        jobs = jobs.mask(jobs.isin(specific_values36), 'humanresource')

        # 42.
        specific_values37 = ['healthcare_services', 'healthcareprofessionals', 'healthcareservices', 'mentalhealth']
        jobs = jobs.mask(jobs.isin(specific_values37), 'healthcare')

        # 43.
        specific_values38 = ['helpdesk/desktopservices', 'helpdeskspecialist']
        jobs = jobs.mask(jobs.isin(specific_values38), 'helpdesk')

        # 44.
        specific_values39 = ['implement', 'informatics,touchcapability', 'information_technology', 'informationtechnology',
                             'informationtechnology\u200b', 'it', 'it/software', 'itdepartment', 'ittech', 'itsupport',
                             'itspecialist', 'itintegrator', 'ithardwaretechnician', 'itinformationtechnology', 'officeit']
        jobs = jobs.mask(jobs.isin(specific_values39), 'it')

        # 45.
        specific_values40 = ['installationandpurchaser', 'installer', 'installer/salesrep', 'installer/systemintegrater',
                             'postinstallsupportandservice',  'planningandinstallation', 'systeminstaller']
        jobs = jobs.mask(jobs.isin(specific_values40), 'install')

        # 46.
        specific_values41 = ['integrador', 'integration', 'integrator', 'intergrator', 'si', 'specifier/integrator',
                             'systemdesigner,integrator']
        jobs = jobs.mask(jobs.isin(specific_values41), 'integrator')

        # 47.
        specific_values42 = ['interiordesigner', 'interiorstylist']
        jobs = jobs.mask(jobs.isin(specific_values42), 'interior')

        # 48.
        specific_values43 = ['instructor', 'teacher', 'teaching']
        jobs = jobs.mask(jobs.isin(specific_values43), 'instructor')

        # 49.
        specific_values44 = ['lead', 'leaddesigner', 'leadengineer', 'leader', 'itprojectlead',
                             'projectlead', 'teamlead', 'teamleader']
        jobs = jobs.mask(jobs.isin(specific_values44), 'leader')

        # 50.
        specific_values45 = ['medicalsolutionprovider', 'medicalsolutionprovider\u200b']
        jobs = jobs.mask(jobs.isin(specific_values45), 'medicalsolutionprovider')

        # 51.
        specific_values46 = ['maintenance', 'maintenancesupervisor', 'maintenancetechnician']
        jobs = jobs.mask(jobs.isin(specific_values46), 'maintenance')

        # 52.
        specific_values47 = ['management',  'manager', 'managgere', 'managingemployee', 'managingpartner', 'manger',
                             'officemanager', 'üzemeltetés']
        jobs = jobs.mask(jobs.isin(specific_values47), 'manager')

        # 53.
        specific_values48 = ['manufacturer', 'manufacturingfactory/plant']
        jobs = jobs.mask(jobs.isin(specific_values48), 'manufacturer')

        # 54.
        specific_values49 = ['media_and_communication', 'media_e_comunicazione', 'mediaandcommunication',
                             'mediaandcommunications',  'medios_de_comunicación', 'medien_und_kommunikation',
                             'média_és_kommunikáció']
        jobs = jobs.mask(jobs.isin(specific_values49), 'mediaandcommunication')

        # 55.
        specific_values50 = ['military_and_protective_services', 'militaryandprotectiveservices']
        jobs = jobs.mask(jobs.isin(specific_values50), 'military')

        # 56.
        specific_values51 = ['obtainquotes,processpurchase', 'planner/purchaser', 'purchase', 'purchaseandinstall',
                             'purchasedept', 'purchaser', 'purchaser,itandinstaller', 'purchasers', 'purchasing',
                             'purchasingagent', 'purchasingcoordinator','purchsing']
        jobs = jobs.mask(jobs.isin(specific_values51), 'purchase')

        # 57.
        specific_values52 = ['operaciones', 'operations']
        jobs = jobs.mask(jobs.isin(specific_values52), 'operation')

        # 58.
        specific_values53 = ['operationsmanager', 'opsmgr']
        jobs = jobs.mask(jobs.isin(specific_values53), 'operationmanager')

        # 59.
        specific_values54 = ['product_management', 'productmanagement']
        jobs = jobs.mask(jobs.isin(specific_values54), 'productmanager')

        # 60.
        specific_values55 = ['pm', 'producer/projectmanager', 'program_and_project_management', 'program_and_project_manager',
                             'program_és_projektmenedzsment', 'programandprojectmanagement',  'programm_und_projektmanagement',
                             'programmundprojektmanagement',  'projectcoordinator', 'projectadministrator', 'programdirectors',
                             'projectionmanager', 'projectmanage', 'projectmanager', 'projectmanager/designer',
                             'projectmanager/estimator', 'projectmanager/principal','projectsales/manage',
                             'projektmenedzsment\tprogramandprojectmanagement', 'projectdesigner','projectfacilitator',
                             'projecthead', 'projectrmgmt', 'r&dprojectmanager']
        jobs = jobs.mask(jobs.isin(specific_values55), 'projectmanager')

        # 61.
        specific_values56 =  ['productresearch', 'productresearcher', 'projectresearcher', 'research/install',
                              'researchproductsandprices', 'researchandinstalaltion']
        jobs = jobs.mask(jobs.isin(specific_values56), 'research')

        # 62.
        specific_values57 = ['medicalimagingspecialist', 'profesionalderadiología', 'spécialiste_en_imagerie_médicale']
        jobs = jobs.mask(jobs.isin(specific_values57), 'medicalimagingspecialist')

        # 63.
        specific_values58 = ['proprietário(a)', 'propertyowner', 'ownerrepresentation', 'owningcompany',
                             'ownnermarketingdirector', 'owner/projectmanager', 'businessowner', 'productowner',
                             'buildingowner']
        jobs = jobs.mask(jobs.isin(specific_values58), 'owner')

        # 64.
        specific_values59 = ['partscoordinator', 'buyer,coordinating', 'servicecoordinator']
        jobs = jobs.mask(jobs.isin(specific_values59), 'coordinator')

        # 65.
        specific_values60 = ['procurement', 'procurementspecialist', 'procurment', 'sourcing/procurement']
        jobs = jobs.mask(jobs.isin(specific_values60), 'procurement')

        # 66.
        specific_values61 = ['quality_assurance', 'qualityassurance']
        jobs = jobs.mask(jobs.isin(specific_values61), 'qualityassurance')

        # 67.
        specific_values62 = [ 'quotationcurator',  'quotegathering/proposertoowner', 'quotingproject', 'sourcing',
                             'sourcing&quotingforenduser']
        jobs = jobs.mask(jobs.isin(specific_values62), 'quotation')

        # 68.
        specific_values63 = ['radiology_professional',  'radiologyprofessional']
        jobs = jobs.mask(jobs.isin(specific_values63), 'radiology')

        # 69.
        specific_values64 = ['recommend', 'recommend(yourecommendspecificproductsortechnologiesforthesolution)',
                             'recommendation', 'recommender']
        jobs = jobs.mask(jobs.isin(specific_values64), 'recommend')

        # 70.
        specific_values65 = ['requirementsandbuyer', 'buyer']
        jobs = jobs.mask(jobs.isin(specific_values65), 'buyer')

        # 71.
        specific_values66 = ['research&development', 'researchanddevelopement']
        jobs = jobs.mask(jobs.isin(specific_values66), 'researchanddevelopment')

        # 72.
        specific_values67 = ['reseller', 'reseller/integrator', 'technicaladvisor,reseller', 'vendor/reseller']
        jobs = jobs.mask(jobs.isin(specific_values67), 'reseller')

        # 73.
        specific_values68 = ['retailer/installer', 'revendedor']
        jobs = jobs.mask(jobs.isin(specific_values68), 'retailer')

        # 74.
        specific_values69 = ['display', 'displayourproducts', 'restaurantdisplay', 'usingforwindowdisplay']
        jobs = jobs.mask(jobs.isin(specific_values69), 'display')

        # 75.
        specific_values70 = ['energy', 'renewableenergy']
        jobs = jobs.mask(jobs.isin(specific_values70), 'energy')

        # 76.
        specific_values71 = ['changetv', 'replacementtv', 'replacingtv']
        jobs = jobs.mask(jobs.isin(specific_values71), 'changetv')

        # 77.
        specific_values72 = ['signageforanattraction', 'signagemanager', 'signagesubcontractorp/m', 'signcompany',
                             'slidingpicturesofbeautysalon']
        jobs = jobs.mask(jobs.isin(specific_values72), 'signage')

        # 78.
        specific_values73 = ['solutionadvisor', 'solutionengineer', 'solutionprovider', 'solutionsarchitect',
                             'solutionsproviderandspecifier', 'softwaresolution']
        jobs = jobs.mask(jobs.isin(specific_values73), 'solution')

        # 79.
        specific_values74 = ['systemsdesign', 'systemsdesigner']
        jobs = jobs.mask(jobs.isin(specific_values74), 'systemdesigner')

        # 80.
        specific_values75 = ['strategiccommunications', 'strategy&operationsspecialist']
        jobs = jobs.mask(jobs.isin(specific_values75), 'strategy')

        # 81.
        specific_values76 = ['support', 'support/facilitator,designer']
        jobs = jobs.mask(jobs.isin(specific_values76), 'support')

        # 82.
        specific_values77 = ['supplier', 'supplierandinstallation']
        jobs = jobs.mask(jobs.isin(specific_values77), 'supplier')

        # 83.
        specific_values78 = ['tech', 'technical', 'technologyconsultant', 'technologydesigner', 'techservice',
                             'avtechnician', 'fixingtv', 'emergingtechnology/innovation']
        jobs = jobs.mask(jobs.isin(specific_values78), 'tech')

        # 84.
        # NOTE(review): 'tierarzt' is German for veterinarian, so the label 'vat'
        # looks like a typo for 'vet'; left unchanged pending confirmation.
        jobs = jobs.replace('tierarzt', 'vat')

        # Write the merged labels back to the DataFrame.
        df_train['customer_job'] = jobs

In [None]:
# Clean up customer_job: lower-case, then strip spaces, slashes, underscores
# and hyphens so that spelling variants collapse into one token.
# The original chain also replaced '//' and leading/trailing '/', but those
# steps ran after every '/' had already been removed and could never match;
# a single character-class substitution yields the same values.
# NOTE(review): the value counts printed below this cell still list 'other'
# and 'others' separately — confirm whether the synonym-merging cell is meant
# to run after this normalisation rather than before it.
df_train['customer_job'] = (
    df_train['customer_job']
    .str.lower()
    .str.replace(r'[ /_-]', '', regex=True)
)

# Fold every category with 40 or fewer rows into a single 'undefined' bucket.
category_counts = df_train['customer_job'].value_counts()
categories_to_remove = category_counts[category_counts <= 40].index
df_train.loc[df_train['customer_job'].isin(categories_to_remove), 'customer_job'] = 'undefined'
df_train['customer_job'].value_counts()

engineering                      6342
other                            4852
administrative                   3359
education                        2287
sales                            2218
operations                       1961
informationtechnology            1945
purchasing                       1938
businessdevelopment              1806
consulting                       1349
artsanddesign                    1285
entrepreneurship                 1192
marketing                        1161
undefined                        1082
programandprojectmanagement       956
others                            763
mediaandcommunication             710
healthcareservices                636
productmanagement                 574
finance                           463
accounting                        396
support                           358
graphiccolorart                   331
realestate                        295
filmproduction                    241
3dvfxart                          235
clinicalspec

### product_category 전처리

In [None]:
# Merge product_category variants into canonical labels, then bucket rare
# categories and missing values as 'undefined'.
#
# The rules run top to bottom on the lower-cased column; each alias list is
# mapped to the label given in the mask() call that follows it.
#
# Fixed here: an earlier search-and-replace had turned 'all' into 'train'
# inside several aliases ('videowtrain...' for 'videowall...', 'ledtraininone'
# for 'ledallinone', 'trainlgvrfsystems' for 'alllgvrfsystems').
#
# NOTE(review): the aliases contain no spaces or punctuation, but the column
# is only lower-cased here, so multi-word values such as 'led signage' are not
# matched (the value counts printed below this cell still show them). Confirm
# whether spaces/punctuation should be stripped before these rules run.
product = df_train['product_category'].str.lower()

# 1.
specific2_values = ['videowallsignage', 'ledsignage', 'interactivesignage', 'oledsignage', 'standardsignage',
                    'highbrightnesssignage', 'specialsignage', 'ur640s', 'smarttvsignage', 'ur640', 'uhdsignage',
                    'digitalsignage', 'tvsignage', 'monitorsignagecommercialtv']
product = product.mask(product.isin(specific2_values), 'signage')

# 2.
specific2_values2 = ['hoteltv', 'hospitaltv', 'commercialtv', 'commercialtvtv', 'htv', '43us660h0sdawz']
product = product.mask(product.isin(specific2_values2), 'tv')

# 3.
specific2_values3 = ['pc', 'laptop']
product = product.mask(product.isin(specific2_values3), 'pclaptop')

# 4.
specific2_values4 = ['solaress', 'solarsystemac']
product = product.mask(product.isin(specific2_values4), 'solar')

# 5.
specific2_values5 = ['solarchiller', 'systemacchiller']
product = product.mask(product.isin(specific2_values5), 'chiller')

# 6.
specific2_values6 = ['monitorsignagemoniormonitortv', 'monitorpc', 'moniormonitortvtv', 'computermonitors']
product = product.mask(product.isin(specific2_values6), 'monitor')

# 7.
specific2_values7 = ['control', 'highbrightness', 'softwaresolution', 'signagecaresolution', 'technicalsupport',
                     'services', 'salesinquiry', 'solaraircare', 'chilleraircare', 'systemacaircare']
product = product.mask(product.isin(specific2_values7), 'support')

# 8.
specific2_values8 = ['medicaldisplay', 'commercialdisplay', 'medicaldisplays', 'led', 'ledallinone', 'fhdseries',
                     'oled']
product = product.mask(product.isin(specific2_values8), 'display')

# 9.
specific2_values9 = ['multisplit', 'singlesplit', 'multiinverter', 'alllgvrfsystems', 'multiv5air', 'multivwater5',
                     'multiv5vrf', 'vrfmultisplitsinglesplit', 'vrfmultisplitsinglesplitchiller', 'vrfsinglesplit',
                     'vrfmultisplit', 'ogrzewaniepompyciepa']
product = product.mask(product.isin(specific2_values9), 'vrf')

# 10.
specific2_values10 = ['videowall', 'videowallrmk', 'projector', 'video']
product = product.mask(product.isin(specific2_values10), 'videoprojector')

# 11.
specific2_values11 = ['heating', 'athermodynamicwaterheater']
product = product.mask(product.isin(specific2_values11), 'heater')

# 12.
specific2_values12 = ['idb', 'educationcreateboard']
product = product.mask(product.isin(specific2_values12), 'interactivedigitalboard')

# 13.
specific2_values13 = ['lgonequickseries', 'onequickseries', 'lgonequick']
product = product.mask(product.isin(specific2_values13), 'onequick')

# 14.
specific2_values14 = ['webos', 'procentric', 'clouddevice']
product = product.mask(product.isin(specific2_values14), 'oscloud')

# 15.
specific2_values15 = ['rac', 'tetooucasseteinverter', 'arcondicionadoresidencial', 'residentialairconditioner',
                      'aireacondicionadoresidencial']
product = product.mask(product.isin(specific2_values15), 'airconditioner')

# 16.
specific2_values16 = ['robots']
product = product.mask(product.isin(specific2_values16), 'robot')

# 17.
specific2_values17 = ['outros', 'otros', 'others']
product = product.mask(product.isin(specific2_values17), 'other')

df_train['product_category'] = product

# Bucket missing values and rare categories as 'undefined'. A category is
# rare when it has fewer than 50 rows or exactly 53 rows.
# NOTE(review): the "== 53" condition presumably targets one specific
# category in this dataset — confirm which one and name it explicitly.
value_counts = df_train['product_category'].value_counts()
rare_categories = value_counts[(value_counts < 50) | (value_counts == 53)].index
df_train['product_category'] = df_train['product_category'].mask(
    df_train['product_category'].isna() | df_train['product_category'].isin(rare_categories),
    'undefined',
)
df_train['product_category'].value_counts()

undefined                      20591
interactive digital board       6154
vrf                             5640
multi-split                     3674
video wall signage              2029
etc.                            2018
led signage                     1895
interactive signage             1862
single-split                    1569
airconditioner                  1447
oled signage                    1350
hotel tv                        1112
chiller                         1051
standard signage                 996
medical display                  962
lg one:quick series              768
monitor                          762
one:quick series                 668
heater                           655
high brightness signage          477
ventilation                      420
teto ou cassete inverter         308
support                          283
multi inverter                   257
ar condicionado residencial      224
high brightness                  219
software solution                191
a

### customer_position 전처리

In [None]:
# Canonical job-title label -> the raw (lowercase) spellings folded into it.
# Group order and alias order define the key order of position_mapping.
_position_groups = {
    'CEO/Founder': [
        'ceo/founder', 'founder', 'chief executive officer', 'ceo/fundador', 'the big boss',
    ],
    'Vice President': ['vice president', 'vicepresident', 'vp'],
    'C-Level Executive': [
        'c-level executive', 'c-levelexecutive', 'leadership/executive office/owner',
    ],
    'Director': ['director', 'business unit director'],
    'Associate/Analyst': [
        'associate/analyst', 'associate professor', 'assistant professor', 'asst prof.',
    ],
    'Entry Level': ['entry level', 'entrylevel'],
    'Manager': ['manager', 'gerente'],
    'Consultant': [
        'consultant', 'commercial consultant', 'architecture/consult', 'architect/consultant',
    ],
    'Education Professional': [
        'teacher', 'educator', 'professor', 'physics teacher', 'maths lecturer',
        'quantitative aptitude faculty',
        'english trainer for ielts,toefl,pte,gre,sat exams.',
        'pgt physics', 'chemistry teacher', 'math and physics teacher',
        'assistant professor of enlish', 'professor of mathematics',
        'physics and mathematics teacher',
    ],
    # Miscellaneous / catch-all answers
    'other': [
        'other', 'others', 'not applicable', 'other - please specify - cedia association',
    ],
}

# Flat lookup: raw spelling -> canonical label.
position_mapping = {
    alias: canonical
    for canonical, aliases in _position_groups.items()
    for alias in aliases
}




# Normalise customer_position into a small set of compact lowercase labels.
# Lowercase first: every key of position_mapping is lowercase, so applying the
# lookup to the raw text would silently skip mixed-case spellings.
df_train['customer_position'] = df_train['customer_position'].str.lower()
df_train['customer_position'] = df_train['customer_position'].replace(position_mapping)
# The canonical labels in position_mapping are title-cased; lowercase again so
# mapped and unmapped values share one casing.
df_train['customer_position'] = df_train['customer_position'].str.lower()
# Remove spaces, slashes, underscores and hyphens in a single pass. Once every
# '/' is gone there is nothing left for '//' or leading/trailing-slash cleanup
# to match, so no separate slash handling is needed.
df_train['customer_position'] = df_train['customer_position'].str.replace(r'[ /_-]', '', regex=True)

# Fold every label that occurs 400 times or fewer into 'undefined'.
category_counts = df_train['customer_position'].value_counts()
categories_to_remove = category_counts[category_counts <= 400].index
df_train.loc[df_train['customer_position'].isin(categories_to_remove), 'customer_position'] = 'undefined'

df_train['customer_position'].value_counts()

none                19680
manager              8220
ceofounder           7990
other                7472
director             4844
associateanalyst     2607
partner              2536
entrylevel           2219
clevelexecutive       865
trainee               849
undefined             626
vicepresident         521
intern                446
enduser               424
Name: customer_position, dtype: int64

### expected_timeline 전처리

In [None]:
# Time-frame answers: spaced and underscore-separated spellings share a label.
_timeframe_labels = {
    'less than 3 months': 'less than 3 months',
    '3 months ~ 6 months': '3 to 6 months',
    '6 months ~ 9 months': '6 to 9 months',
    '9 months ~ 1 year': '9 months to 1 year',
    'more than a year': 'more than a year',
    'less than 6 months': 'less than 6 months',
    '3_months_~_6_months': '3 to 6 months',
    'less_than_3_months': 'less than 3 months',
    '6_months_~_9_months': '6 to 9 months',
    '9_months_~_1_year': '9 months to 1 year',
    'more_than_a_year': 'more than a year',
}

# Free-text status notes that were typed into the timeline field.
# NOTE(review): keys containing 'wtrain', 'shtrain' and 'ctrain' look like
# 'wall', 'shall' and 'call' after a global find/replace of 'all' -> 'train';
# if so they can never match the raw data — confirm before relying on them.
_status_note_labels = {
    'quote has been sent to customer.': 'quote sent',
    'client not interested in product..': 'client not interested',
    'being followed up': 'being followed up',
    'update- 7th aug--demo given. customer will confirm next week': 'demo given - follow up',
    'details send': 'details sent',
    'requires detail for tender. no purchase requirement right now.': 'details required for tender',
    'the client is not having any requirement hence closig in system.': 'client has no requirement',
    'discussed with client details mailed.': 'details mailed to client',
    'he is looking for video wtrain & idb for his office.': 'looking for specific products',
    'details shared': 'details shared',
    'demo to be aligned': 'demo to be scheduled',
    'update- 13th spet--follow up to be done on 15th sept': 'follow up scheduled',
    'partner is already in touch with our rd, orno.': 'partner in touch with representative',
    'rnr': 'no response received',
    'scheduling a meeting': 'meeting scheduled',
    'customer want demo of idb.': 'customer wants demo',
    'already shared quotation through si.': 'quotation shared',
    'quotation shared.': 'quotation shared',
    'duplicate lead': 'duplicate lead',
    'invalid lead': 'invalid lead',
    'demo scheduled for first week feb': 'demo scheduled',
    'forwarded to bdo, being followed up': 'being followed up by business development officer',
    'spoke with custome he want 43" tv': 'customer wants specific product',
    'don’t have budget': 'no budget',
    'client shtrain get back for exploring demo of idb': 'client will get back regarding demo',
    'already in discussion with partner from bangalore': 'in discussion with partner',
    'require demo price send': 'demo and price inquiry',
    'size not available': 'specific size not available',
    'eol model new model quote requirment after 30 days.': 'end of life model, new model required after 30 days',
    'need to discuss with client in next two months.': 'discussion planned with client',
    'spoken to client, he will check if they need demo and confirm': 'client will confirm about demo',
    '29thsep2021:-no such requirement as of now': 'no requirement as of now',
    'purchase planning after 3 months': 'planning to purchase after 3 months',
    '09-02-2022 requested for boq of requirement': 'bill of quantities requested',
    'converted this lead into opportunity.': 'lead converted into opportunity',
    'demo scheduled for 24th oct': 'demo scheduled',
    'discussed with client. we need to align demo.': 'discussion with client about demo',
    'require demo': 'demo required',
    'client is looking for 86" display with vc solution': 'client looking for specific display with solution',
    'quote sent to customer.': 'quote sent to customer',
    'under discussion': 'under discussion',
    'meeting planned for further discussion': 'meeting planned',
    'customer will come for demo in next week': 'customer will come for demo',
    'he want demo next week': 'demo requested next week',
    'ctrain and discused to custome customer wants demo.': 'customer wants demo, discussed over ctrain',
    'demo planned, will update further status once its completed': 'demo planned and update pending',
    'quote sent, the client is required demo in june': 'quote and demo scheduled in june',
    'will come for the demo': 'client will come for demo',
    'customer has not answering ctrain': 'customer not answering',
}

# Single lookup for the expected_timeline column: time frames first, then notes.
timeline_mapping = {**_timeframe_labels, **_status_note_labels}


# Normalise expected_timeline into compact lowercase labels.
# Lowercase first: every key of timeline_mapping is lowercase.
df_train['expected_timeline'] = df_train['expected_timeline'].str.lower()
# Translate with the timeline lookup. position_mapping holds job titles and
# does not belong to this column; using it left timeline_mapping unused.
# NOTE(review): time-frame labels now come out as e.g. '3to6months' and
# '9monthsto1year'; any other frame that shares this column (such as the
# submission data) must go through the same mapping — confirm.
df_train['expected_timeline'] = df_train['expected_timeline'].replace(timeline_mapping)
# Remove '/', '.', '~', spaces, '_' and '-' in a single pass. Inside a
# character class '.' is literal, and with every '/' removed no separate
# '//' or leading/trailing-slash cleanup is needed.
df_train['expected_timeline'] = df_train['expected_timeline'].str.replace(r'[/.~ _-]', '', regex=True)

# Fold every label that occurs 110 times or fewer into 'undefined'.
category_counts = df_train['expected_timeline'].value_counts()
categories_to_remove = category_counts[category_counts <= 110].index
df_train.loc[df_train['expected_timeline'].isin(categories_to_remove), 'expected_timeline'] = 'undefined'

df_train['expected_timeline'].value_counts()

lessthan3months    17326
3months6months      5035
morethanayear       3027
9months1year        1108
6months9months      1102
undefined            838
Name: expected_timeline, dtype: int64