# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

### 데이터 셋 읽어오기

In [None]:
# Load the competition files from Google Drive (Colab runtime).
# Local alternative, kept for reference:
#   df_train = pd.read_csv("train.csv")       # training data
#   df_test = pd.read_csv("submission.csv")   # test data (rows of the submission file)
from google.colab import drive

drive.mount('/content/drive')

df_train = pd.read_csv("/content/drive/My Drive/lg_aimers/train.csv")
df_test = pd.read_csv("/content/drive/My Drive/lg_aimers/submission.csv")

# Only the first column of the OECD city listing is used, and its first data row is skipped.
_oecd_raw = pd.read_csv('/content/drive/My Drive/lg_aimers/oecd_city.csv')
city_list = _oecd_raw.iloc[1:, 0:1]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 고정현 전처리 (customer_country)

In [None]:
# Normalise the raw customer_country text: lower-case, no spaces, collapsed
# slashes, and no leading/trailing slash.
_country_text = df_train['customer_country'].str.lower().str.replace(' ', '')
# Two passes, as in the original pipeline, so that runs of up to four slashes collapse.
for _ in range(2):
    _country_text = _country_text.str.replace('//', '/')
_country_text = _country_text.str.replace(r'/$', '', regex=True)
_country_text = _country_text.str.replace(r'^/', '', regex=True)
df_train['customer_country'] = _country_text

In [None]:
# Build `country_cities`: a mapping from country name to the list of its city
# names, parsed from the single-column OECD listing in `city_list`.
#
# Each entry is split on ':' and the second field is used as the label.
# Entries without a digit start a new country; entries with a digit are cities
# that belong to the most recently seen country.
city_list = city_list.rename(columns={'#NAME?': 'name'})
# The .str accessor keeps missing values as NaN instead of raising on them.
city_list['name'] = city_list['name'].str.replace(' ', '', regex=False).str.lower()

country_cities = {}
current_country = None

for entry in city_list['name'].dropna():
    fields = entry.split(':')
    if len(fields) < 2:
        # No "<code>:<label>" structure, so there is nothing to extract.
        continue
    label = fields[1]

    if not re.search(r'\d', entry):
        current_country = label
        country_cities[current_country] = []
    elif current_country:
        # Drop parenthesised qualifiers from the city label.
        city_name_cleaned = re.sub(r'\s*\([^)]*\)', '', label).strip()
        # An empty name would later become an empty regex alternative that
        # matches every string, so it is not stored.
        if city_name_cleaned:
            country_cities[current_country].append(city_name_cleaned)

# Known country names. After normalisation below they are lower-case and
# contain no spaces. A few entries are already written in that form
# ('southkorea', 'u.a.e', 'us', 'usa', 'puertorico', 'türkiye', 'hongkong').
countries = [
    "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda", "Argentina", "Armenia", "Australia",
    "Austria", "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin",
    "Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burkina Faso", "Burundi",
    "Cabo Verde", "Cambodia", "Cameroon", "Canada", "Central African Republic", "Chad", "Chile", "China", "Colombia",
    "Comoros", "Congo", "Costa Rica", "Cote d'Ivoire", "Croatia",
    "Cuba", "Cyprus", "Czech Republic", "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "Egypt",
    "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Eswatini", "Ethiopia", "Fiji", "Finland", "France",
    "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-Bissau",
    "Guyana", "Haiti", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel",
    "Italy", "Jamaica", "Japan", "Jordan", "Kazakhstan", "Kenya", "Kiribati", "southkorea", "Kosovo",
    "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania",
    "Luxembourg", "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania",
    "Mauritius", "Mexico", "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique",
    "Myanmar", "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "North Macedonia",
    "Norway", "Oman", "Pakistan", "Palau", "Palestine", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines",
    "Poland", "Portugal", "Qatar", "Romania", "Russia", "Rwanda", "Saint Kitts and Nevis", "Saint Lucia", "Saint Vincent and the Grenadines",
    "Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone",
    "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa", "South Sudan", "Spain", "Sri Lanka",
    "Sudan", "Suriname", "Sweden", "Switzerland", "Syria", "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Timor-Leste",
    "Togo", "Tonga", "Trinidad and Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu", "Uganda", "Ukraine",
    "u.a.e", "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City",
    "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe",
    'us', 'puertorico', 'türkiye', 'usa',
    'hongkong',
]

# Same normalisation as the data column: no spaces, lower-case.
countries = [country.replace(' ', '').lower() for country in countries]

# One alternative per country: either a whole word or a suffix of the text.
# - Names are escaped so that regex metacharacters in them (the dots in
#   'u.a.e') match literally.
# - Longer names come first so that, at the same start position,
#   'guinea-bissau' wins over 'guinea'.
pattern_updated = '|'.join(
    f'\\b{re.escape(country)}\\b|{re.escape(country)}$'
    for country in sorted(countries, key=len, reverse=True)
)
# Two letters followed by five digits at the end of the text.
pattern2 = r'[A-Za-z]{2}\d{5}$'
# Place-name fragments, each tied to one country by classify_country below.
pattern3 = r'salem|ny|nevada|ohio|kansascity|chulavista|wichita|oh44483|keshena|andersenafb|goleta|cincinnati'
pattern4 = r'madurai|mumbai|ludhiana|bangalore'
pattern5 = r'medellin'
pattern6 = r'macedonia'
pattern7 = r'charlestown'
pattern8 = r'上海'

# Checked in order; the first pattern found anywhere in the text decides.
_PLACE_RULES = (
    (pattern3, 'unitedstates'),
    (pattern4, 'india'),
    (pattern5, 'colombia'),
    (pattern6, 'northmacedonia'),
    (pattern7, 'saintkittsandnevis'),
    (pattern8, 'china'),
)


def classify_country(x):
    """Map a location string to a country via hand-picked place-name fragments.

    The fragments are searched case-insensitively as plain substrings (no word
    boundaries), in the order of ``_PLACE_RULES``.

    Args:
        x: Location text.

    Returns:
        The country tied to the first matching pattern, or ``x`` unchanged
        when no pattern matches.
    """
    for place_pattern, country in _PLACE_RULES:
        if re.search(place_pattern, x, re.IGNORECASE):
            return country
    return x



# One alternation regex per country, built from its (escaped) city names.
# Countries without any usable city name are left out: an empty pattern, or an
# empty alternative inside one, matches every string and would assign that
# country to every input.
country_patterns = {
    country: '|'.join(re.escape(city) for city in cities if city)
    for country, cities in country_cities.items()
    if any(cities)
}


def classify_country2(sentence):
    """Map a location string to a country by looking for a known city name.

    Countries are tried in the iteration order of ``country_patterns``; the
    search is case-insensitive and matches city names as substrings.

    Args:
        sentence: Location text.

    Returns:
        The first country with a city name found in ``sentence``, or
        ``sentence`` unchanged when none matches.
    """
    for country, pattern in country_patterns.items():
        if re.search(pattern, sentence, re.IGNORECASE):
            return country
    return sentence

# Aliases folded into the spelling used in `countries`. Every target must be a
# member of `countries`, because the pipeline replaces anything else with
# 'undefined'. 'slovakrepublic' is therefore mapped *to* 'slovakia'; the
# reverse direction sent every Slovakia row to 'undefined'.
final_map = {
    'us': 'unitedstates',
    'usa': 'unitedstates',
    'korea': 'southkorea',
    'tã¼rkiye': 'turkey',  # mis-decoded form of 'türkiye'
    'czechia': 'czechrepublic',
    'slovakrepublic': 'slovakia',
    'türkiye': 'turkey',
}

In [None]:
# Resolve customer_country to a known country name, or 'undefined'.
_known_countries = set(countries)
_country_regex = re.compile(pattern_updated, re.IGNORECASE)


def _extract_known_country(text):
    """Return the first known country name found in *text*, else *text* unchanged."""
    found = _country_regex.search(text)
    return found.group() if found else text


_country_col = df_train['customer_country'].fillna('undefined')

# 1) A country name appearing in the text wins.
_country_col = _country_col.apply(_extract_known_country)

# 2) Text ending in two letters + five digits is treated as a US address.
_country_col = _country_col.apply(lambda x: 'unitedstates' if re.search(pattern2, x) else x)

# 3) Place-name and city fallbacks run only on values that are not already a
#    known country. Both helpers match substrings, so without this guard
#    'germany' and 'kenya' hit the 'ny' fragment and became 'unitedstates'.
_country_col = _country_col.apply(lambda x: x if x in _known_countries else classify_country(x))
_country_col = _country_col.apply(lambda x: x if x in _known_countries else classify_country2(x))

# 4) Fold aliases, then discard everything that is still not a known country.
_country_col = _country_col.apply(lambda x: final_map.get(x, x))
df_train['customer_country'] = _country_col.apply(lambda x: x if x in _known_countries else 'undefined')

In [None]:
# Conversion counts per country, with row total and share of converted leads,
# ordered from the highest conversion share downwards.
country_ratio = pd.crosstab(df_train['customer_country'], df_train['is_converted'])
_not_converted = country_ratio[False]
_converted = country_ratio[True]
country_ratio['sum'] = _not_converted + _converted
country_ratio['ratio'] = _converted / country_ratio['sum']
country_ratio = country_ratio.sort_values('ratio', ascending=False)
country_ratio.head(30)

is_converted,False,True,sum,ratio
customer_country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gambia,0,2,2,1.0
laos,1,2,3,0.666667
coted'ivoire,3,4,7,0.571429
taiwan,24,26,50,0.52
mali,1,1,2,0.5
gabon,2,2,4,0.5
myanmar,2,2,4,0.5
congo,4,4,8,0.5
seychelles,1,1,2,0.5
nigeria,102,84,186,0.451613


## 권순찬 전처리 (inquiry_type, customer_position, expected_timeline)

In [None]:
# inquiry_type: lower-case, bucket rare categories, then merge spelling variants.
df_train['inquiry_type'] = df_train['inquiry_type'].str.lower()

# Categories seen 12 times or fewer are folded into 'other'.
category_counts = df_train['inquiry_type'].value_counts()
categories_to_remove = category_counts[category_counts <= 12].index
df_train.loc[df_train['inquiry_type'].isin(categories_to_remove), 'inquiry_type'] = 'other'

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: that is chained assignment and does not update df_train
# when pandas Copy-on-Write is active.
similar_categories = ['others', 'other_', 'etc.']
df_train['inquiry_type'] = df_train['inquiry_type'].replace(similar_categories, 'other')

similar_categories_mapping = {
    'quotation_or_purchase_consultation': 'quotation or purchase consultation',
    'technical consultation': 'technical support',
    'technical': 'technical support',
    'sales': 'sales inquiry'
}

df_train['inquiry_type'] = df_train['inquiry_type'].replace(similar_categories_mapping)
df_train['inquiry_type'].unique()

array(['quotation or purchase consultation', 'product information',
       'other', 'usage or technical consultation', 'trainings',
       'services', 'sales inquiry', 'technical support',
       'request for partnership', nan, 'request a demo',
       'request for distributorship', 'request for quotation or purchase',
       'request for technical consulting'], dtype=object)

In [None]:
# Spelling variants of customer_position -> canonical label.
# All keys are lower-case.
position_mapping = {
    'ceo/founder': 'CEO/Founder', 'founder': 'CEO/Founder', 'chief executive officer': 'CEO/Founder',
    'ceo/fundador': 'CEO/Founder', 'the big boss': 'CEO/Founder',
    'vice president': 'Vice President', 'vicepresident': 'Vice President', 'vp': 'Vice President',
    'c-level executive': 'C-Level Executive', 'c-levelexecutive': 'C-Level Executive',
    'leadership/executive office/owner': 'C-Level Executive',
    'director': 'Director', 'business unit director': 'Director',
    'associate/analyst': 'Associate/Analyst', 'associate professor': 'Associate/Analyst',
    'assistant professor': 'Associate/Analyst', 'asst prof.': 'Associate/Analyst',
    'entry level': 'Entry Level', 'entrylevel': 'Entry Level',
    'manager': 'Manager', 'gerente': 'Manager',
    'consultant': 'Consultant', 'commercial consultant': 'Consultant',
    'architecture/consult': 'Consultant', 'architect/consultant': 'Consultant',

    'teacher': 'Education Professional', 'educator': 'Education Professional',
    'professor': 'Education Professional', 'physics teacher': 'Education Professional',
    'maths lecturer': 'Education Professional', 'quantitative aptitude faculty': 'Education Professional',
    'english trainer for ielts,toefl,pte,gre,sat exams.': 'Education Professional', 'pgt physics': 'Education Professional',
    'chemistry teacher': 'Education Professional', 'math and physics teacher': 'Education Professional',
    'assistant professor of enlish': 'Education Professional', 'professor of mathematics': 'Education Professional',
    'physics and mathematics teacher': 'Education Professional',
    # miscellaneous
    'other': 'Other', 'others': 'Other', 'not applicable': 'Other', 'no influence': 'Other',
    'other - please specify - cedia association': 'Other',
    'this is a consume display requirement for home purpose.': 'Not Specified',
    'bulgaria': 'Not Specified', 'exhibitiontv': 'Not Specified'
}


# Lower-case before the lookup: the keys above are lower-case, so without this
# any value containing a capital letter is never mapped.
df_train['customer_position'] = df_train['customer_position'].str.lower().replace(position_mapping)

In [None]:
# customer_position: lower-case everything, then fold positions that occur
# 30 times or fewer into 'other'.
df_train['customer_position'] = df_train['customer_position'].str.lower()

category_counts = df_train['customer_position'].value_counts()
_is_rare = category_counts <= 30
categories_to_remove = category_counts[_is_rare].index

_rare_rows = df_train['customer_position'].isin(categories_to_remove)
df_train.loc[_rare_rows, 'customer_position'] = 'other'

df_train['customer_position'].unique()

array(['entry level', 'ceo/founder', 'partner', 'manager',
       'vice president', 'associate/analyst', 'c-level executive', 'none',
       'director', 'other', 'intern', 'trainee', 'installer', 'hospital',
       'end-user'], dtype=object)

In [None]:
# Free-text expected_timeline values -> canonical label.
# All keys are lower-case.
timeline_mapping = {
    # time-frame values
    'less than 3 months': 'less than 3 months',
    '3 months ~ 6 months': '3 to 6 months',
    '6 months ~ 9 months': '6 to 9 months',
    '9 months ~ 1 year': '9 months to 1 year',
    'more than a year': 'more than a year',
    'less than 6 months': 'less than 6 months',
    '3_months_~_6_months': '3 to 6 months',
    'less_than_3_months': 'less than 3 months',
    '6_months_~_9_months': '6 to 9 months',
    '9_months_~_1_year': '9 months to 1 year',
    'more_than_a_year': 'more than a year',

    # status-description values
    'quote has been sent to customer.': 'quote sent',
    'client not interested in product..': 'client not interested',
    'being followed up': 'being followed up',
    'update- 7th aug--demo given. customer will confirm next week': 'demo given - follow up',
    'details send': 'details sent',
    'requires detail for tender. no purchase requirement right now.': 'details required for tender',
    'the client is not having any requirement hence closig in system.': 'client has no requirement',
    'discussed with client details mailed.': 'details mailed to client',
    'he is looking for video wall & idb for his office.': 'looking for specific products',
    'details shared': 'details shared',
    'demo to be aligned': 'demo to be scheduled',
    'update- 13th spet--follow up to be done on 15th sept': 'follow up scheduled',
    'partner is already in touch with our rd, orno.': 'partner in touch with representative',
    'rnr': 'no response received',
    'scheduling a meeting': 'meeting scheduled',
    'customer want demo of idb.': 'customer wants demo',
    'already shared quotation through si.': 'quotation shared',
    'quotation shared.': 'quotation shared',
    'duplicate lead': 'duplicate lead',
    'invalid lead': 'invalid lead',
    'demo scheduled for first week feb': 'demo scheduled',
    'forwarded to bdo, being followed up': 'being followed up by business development officer',
    'spoke with custome he want 43" tv': 'customer wants specific product',
    'don’t have budget': 'no budget',
    'client shall get back for exploring demo of idb': 'client will get back regarding demo',
    'already in discussion with partner from bangalore': 'in discussion with partner',
    'require demo price send': 'demo and price inquiry',
    'size not available': 'specific size not available',
    'eol model new model quote requirment after 30 days.': 'end of life model, new model required after 30 days',
    'need to discuss with client in next two months.': 'discussion planned with client',
    'spoken to client, he will check if they need demo and confirm': 'client will confirm about demo',
    '29thsep2021:-no such requirement as of now': 'no requirement as of now',
    'purchase planning after 3 months': 'planning to purchase after 3 months',
    '09-02-2022 requested for boq of requirement': 'bill of quantities requested',
    'converted this lead into opportunity.': 'lead converted into opportunity',
    'demo scheduled for 24th oct': 'demo scheduled',
    'discussed with client. we need to align demo.': 'discussion with client about demo',
    'require demo': 'demo required',
    'client is looking for 86" display with vc solution': 'client looking for specific display with solution',
    'quote sent to customer.': 'quote sent to customer',
    'under discussion': 'under discussion',
    'meeting planned for further discussion': 'meeting planned',
    'customer will come for demo in next week': 'customer will come for demo',
    'he want demo next week': 'demo requested next week',
    'call and discused to custome customer wants demo.': 'customer wants demo, discussed over call',
    'demo planned, will update further status once its completed': 'demo planned and update pending',
    'quote sent, the client is required demo in june': 'quote and demo scheduled in june',
    'will come for the demo': 'client will come for demo',
    'customer has not answering call': 'customer not answering'
}


# Apply the timeline mapping. The previous code passed position_mapping here,
# so timeline_mapping was never used. The column is lower-cased first because
# every key above is lower-case.
df_train['expected_timeline'] = df_train['expected_timeline'].str.lower().replace(timeline_mapping)

In [None]:
# expected_timeline: fill gaps, lower-case, strip '_', '.' and spaces, then
# fold values that occur 22 times or fewer into 'etc'.
#
# - The fill is assigned back rather than done with fillna(inplace=True) on the
#   selected column, which does not update df_train under Copy-on-Write.
# - regex=False makes '.' a literal dot on every pandas version. As a regular
#   expression it would match, and therefore delete, every character.
_timeline = df_train['expected_timeline'].fillna('undefined').str.lower()
for _char in ('_', '.', ' '):
    _timeline = _timeline.str.replace(_char, '', regex=False)
df_train['expected_timeline'] = _timeline

category_counts = df_train['expected_timeline'].value_counts()
categories_to_remove = category_counts[category_counts <= 22].index
df_train.loc[df_train['expected_timeline'].isin(categories_to_remove), 'expected_timeline'] = 'etc'

df_train['expected_timeline'].unique()

  df_train['expected_timeline'] = df_train['expected_timeline'].str.lower().str.replace('.', '')


array(['lessthan3months', 'undefined', '3months~6months', '9months~1year',
       'morethanayear', '6months~9months', 'etc', 'beingfollowedup',
       'lessthan6months'], dtype=object)

## 박소해 전처리 (customer_job, business_subarea)

In [None]:
# Drop the columns that are not used, from both frames.
_unused_columns = ['product_subcategory', 'product_modelname', 'customer_country.1']
df_train = df_train.drop(columns=_unused_columns)
df_test = df_test.drop(columns=_unused_columns)

# Missing values in these columns become 0.
_zero_fill_columns = ['historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver']
df_train.update(df_train[_zero_fill_columns].fillna(0))

# Object-typed (text) columns of the training frame.
str_cols = df_train.columns[df_train.dtypes==object]


def _clean_text_column(column):
    """Strip spaces, lower-case, then remove every character outside [a-z0-9]."""
    without_spaces = column.str.replace(' ', '')
    lowered = without_spaces.str.lower()
    # Case 1: remove all special characters (non-English letters included).
    # Case 2 (not used) would keep ',' and '/' for tokenizing: r'[^a-z0-9,/]'
    return lowered.str.replace(pat=r'[^a-z0-9]', repl=r'', regex=True)


# Missing text becomes 'undefined' before the clean-up is applied column by column.
df_train[str_cols] = df_train[str_cols].fillna('undefined').apply(_clean_text_column)

# Merge the 'etc' / 'others' spellings into 'other' across the whole frame.
df_train = df_train.replace(dict.fromkeys(['etc', 'others'], 'other'))

In [None]:
# Merge spelling variants of customer_job into canonical labels.
#
# Rules are applied in order, one pass per rule, so a later rule sees the
# output of the earlier ones (e.g. 'decisionmaker' is first folded into
# 'chief', and only afterwards do other spellings become 'decisionmaker').
#
# The earlier object-column clean-up removes spaces, lower-cases, and deletes
# every character outside [a-z0-9] (presumably customer_job is one of those
# columns; verify). Aliases written with '/', '_', ',', '&', '(' or accented
# letters could therefore never equal a cleaned value. Each alias is now
# matched both as written and in its cleaned form.
_JOB_MERGE_RULES = [
    ('accountmanagement', ['accountexec/manager']),
    ('accounting', ['accountspayable']),
    ('admin', ['admin', 'administración', 'administration', 'administrative', 'adminisztráció', 'amministrativo',
               'imagingadministrator', 'itadmin', 'itadministrator', 'networkadministrator', 'pacsadministrator',
               'platformadministrator', 'systemsadministrator']),
    ('adminassistant', ['administrativeassistant']),
    ('advertising', ['advertisingandpromotionsteam']),
    ('advertising', ['storepromotions']),
    ('advertising', ['tradeshowevent']),
    ('architect', ['architect', 'architect/owner', 'architectassinteriores', 'arquitecto/consultor',
                   'projectarchitect']),
    ('artist', ['artanddesign', 'arte_e_design', 'arteydiseño', 'artist,leadonequipmentselection',
                'arts_and_design', 'artsanddesign']),
    ('serving', ['assistinservingfood', 'serving', 'servingfood', 'servingrobot', 'waiter']),
    ('avtechnician', ['a/vprojectmanager', 'avestimator', 'avprojectmanager', 'avtech', 'avtechnician']),
    ('authorizer', ['authorize(youareresponsibleformakingthefinaldecision)', 'purchasingauthority']),
    ('bidder', ['publicbidder']),
    ('businessdevelopment', ['business_development', 'businessdevelopment']),
    ('cctvoperator', ['cctvmonetoring', 'cctvview']),
    ('chief', ['ceo', 'ceo/founder', 'chief', 'clevelexecutive', 'coo', 'decider', 'decisionmaker',
               'finalapproval', 'head', 'president', 'presidentforsennco', 'thebigboss', 'underboss',
               'vicepresident', 'vp/gm']),
    ('chiefengineer', ['chiefeng', 'chiefengineer', 'chiefofengineering']),
    ('doctor', ['chirurgien', 'cirugano', 'doctor', 'főorvos', 'profesionaldecirugía', 'surgeryprofessional',
                'surgeryprofessional\u200b']),
    ('contractor', ['contractor', 'cintractor', 'managingcontractor']),
    ('consultant', ['consultant', 'consultant,cabinetfabricator', 'consultant/purchaser', 'consultent',
                    'consulting']),
    ('corporate', ['coordinator', 'corporate/office', 'correspondence', 'costaravteam']),
    ('creator', ['contentcreation,eqconsultant', 'creativedirector']),
    ('designer', ['design', 'design/build', 'design/purchaser', 'designandprovideequipment', 'designer',
                  'designer,creativetechnologist', 'designer,producer', 'designers', 'designere/budget',
                  'designerpurchaser', 'design/purchaser', 'graphicdesign', 'kreation_und_design',
                  'kreationunddesign', 'művészet_és_design']),
    ('designinstaller', ['design/install/training/support', 'designandinstall', 'designandinstallationcompany',
                         'designer/installer']),
    ('designengineer', ['designengineer', 'designer/engineer']),
    ('decisionmaker', ['desicionmaker', 'design/decisionmaker', 'generalmanager(decisionmaker)',
                       'technical/decisionmaker']),
    ('desingpm', ['designer/pm/gc', 'designer/projectmanager']),
    ('developer', ['developer', 'developer/property', 'softwaredeveloper']),
    ('digitalsignage', ['digitaldisplayvssignageneed', 'digitalsignage']),
    ('director', ['directeurtechnique', 'director', 'director,it', 'directorcomercial',
                  'directorit', 'directorofengineering', 'directoroffinance', 'directorofit',
                  'directoroflodging', 'directorofoperations', 'directorpurchaser', 'engineeringdirector',
                  'f&bdirectorforbicyclecasino', "i'mdirectingit", 'itdairector', 'itdirector', 'managingdirector',
                  'overseer', 'projectdirector', 'purchasingdirector', 'supervisor', 'technicaldirector']),
    ('distributor', ['distribuidor', 'distributor', 'distributorquotation']),
    ('educator', ['education', 'educator', 'highereducation(college&university)', 'institute&academy']),
    ('engineer', ['engineer', 'engineering', 'engineering&technical', 'engineering,design,andinstall',
                  'hardwaredesignengineer', 'projectengineer', 'principalengineer', 'seniordesignengineer',
                  'systemengineer', 'systemsengineer']),
    ('executive', ['engagementexecutive', 'engineering&technicalexecutive', 'executive', 'execution',
                   'marketingexecutive', 'financeexecutive', 'operationsexecutive', 'principal',
                   'principalincharge', 'salesexecutive']),
    ('marketing', ['eventmarketing', 'fieldmarketing', 'marketing', 'marketingcoordinator',
                   'marketingoperations', 'productmarketing', 'technicalmarketing']),
    ('equipment', ['equipmentandappprovider', 'equipmentcustodian', 'equipmentplanner', 'equipmentselection']),
    ('facilitator', ['facilitator', 'facilitatorinstallationservices', 'facilities', 'facilitiesandoperations',
                     'facilityadministrator', 'facilitymanager']),
    ('sales', ['field/outsidesales', 'sale', 'sales', 'salesman', 'salesmanager',
               'salesoperations', 'technicalsales', 'salesrep', 'salesengineering', 'sellerinstaller',
               'vendite', 'vertrieb', 'értékesítés']),
    ('finance', ['finance', 'finanzas', 'finanzen', 'pénzügy']),
    ('generalcontractor', ['gc', 'generalcontractor']),
    ('generalmanager', ['genelmüdür', 'generalmanagement', 'generalmanager', 'generalmanagerpurchaser',
                        'generamanager', 'globalleadofproduction', 'gm', 'gm/partowner']),
    ('generalprojectmanager', ['generalmanagerprojectmanager', 'gestión_de_proyectos']),
    ('humanresource', ['hr', 'human_resources', 'humanresources', 'hrposting']),
    ('healthcare', ['healthcare_services', 'healthcareprofessionals', 'healthcareservices', 'mentalhealth']),
    ('helpdesk', ['helpdesk/desktopservices', 'helpdeskspecialist']),
    ('it', ['implement', 'informatics,touchcapability', 'information_technology', 'informationtechnology',
            'informationtechnology\u200b', 'it', 'it/software', 'itdepartment', 'ittech', 'itsupport',
            'itspecialist', 'itintegrator', 'ithardwaretechnician', 'itinformationtechnology', 'officeit']),
    ('install', ['installationandpurchaser', 'installer', 'installer/salesrep', 'installer/systemintegrater',
                 'postinstallsupportandservice', 'planningandinstallation', 'systeminstaller']),
    ('integrator', ['integrador', 'integration', 'integrator', 'intergrator', 'si', 'specifier/integrator',
                    'systemdesigner,integrator']),
    ('interior', ['interiordesigner', 'interiorstylist']),
    ('instructor', ['instructor', 'teacher', 'teaching']),
    ('leader', ['lead', 'leaddesigner', 'leadengineer', 'leader', 'itprojectlead',
                'projectlead', 'teamlead', 'teamleader']),
    ('medicalsolutionprovider', ['medicalsolutionprovider', 'medicalsolutionprovider\u200b']),
    ('maintenance', ['maintenance', 'maintenancesupervisor', 'maintenancetechnician']),
    ('manager', ['management', 'manager', 'managgere', 'managingemployee', 'managingpartner', 'manger',
                 'officemanager', 'üzemeltetés']),
    ('manufacturer', ['manufacturer', 'manufacturingfactory/plant']),
    ('mediaandcommunication', ['media_and_communication', 'media_e_comunicazione', 'mediaandcommunication',
                               'mediaandcommunications', 'medios_de_comunicación', 'medien_und_kommunikation',
                               'média_és_kommunikáció']),
    ('military', ['military_and_protective_services', 'militaryandprotectiveservices']),
    ('purchase', ['obtainquotes,processpurchase', 'planner/purchaser', 'purchase', 'purchaseandinstall',
                  'purchasedept', 'purchaser', 'purchaser,itandinstaller', 'purchasers', 'purchasing',
                  'purchasingagent', 'purchasingcoordinator', 'purchsing']),
    ('operation', ['operaciones', 'operations']),
    ('operationmanager', ['operationsmanager', 'opsmgr']),
    ('productmanager', ['product_management', 'productmanagement']),
    ('projectmanager', ['pm', 'producer/projectmanager', 'program_and_project_management',
                        'program_and_project_manager', 'program_és_projektmenedzsment',
                        'programandprojectmanagement', 'programm_und_projektmanagement',
                        'programmundprojektmanagement', 'projectcoordinator', 'projectadministrator',
                        'programdirectors', 'projectionmanager', 'projectmanage', 'projectmanager',
                        'projectmanager/designer', 'projectmanager/estimator', 'projectmanager/principal',
                        'projectsales/manage', 'projektmenedzsment\tprogramandprojectmanagement',
                        'projectdesigner', 'projectfacilitator', 'projecthead', 'projectrmgmt',
                        'r&dprojectmanager']),
    ('research', ['productresearch', 'productresearcher', 'projectresearcher', 'research/install',
                  'researchproductsandprices', 'researchandinstalaltion']),
    ('medicalimagingspecialist', ['medicalimagingspecialist', 'profesionalderadiología',
                                  'spécialiste_en_imagerie_médicale']),
]


def _clean_job_alias(alias):
    """Apply the object-column clean-up (no spaces, lower-case, [a-z0-9] only) to one alias."""
    return re.sub(r'[^a-z0-9]', '', alias.replace(' ', '').lower())


_job = df_train['customer_job']
for _label, _aliases in _JOB_MERGE_RULES:
    # Match each alias as written and in its cleaned form.
    _keys = set(_aliases) | {_clean_job_alias(alias) for alias in _aliases}
    _keys.discard('')
    # The default argument binds this rule's values (avoids late-binding closures).
    _job = _job.apply(lambda x, keys=_keys, label=_label: label if x in keys else x)
df_train['customer_job'] = _job
#63.
specific_values58 = ['proprietário(a)', 'propertyowner', 'ownerrepresentation', 'owningcompany',
                     'ownnermarketingdirector', 'owner/projectmanager', 'businessowner', 'productowner',
                     'buildingowner']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'owner' if x in specific_values58 else x)

#64.
specific_values59 = ['partscoordinator', 'buyer,coordinating', 'servicecoordinator']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'coordinator' if x in specific_values59 else x)

#65.
specific_values60 = ['procurement', 'procurementspecialist', 'procurment', 'sourcing/procurement']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'procurement' if x in specific_values60 else x)

#66.
specific_values61 = ['quality_assurance', 'qualityassurance']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'qualityassurance' if x in specific_values61 else x)

#67.
specific_values62 = [ 'quotationcurator',  'quotegathering/proposertoowner', 'quotingproject', 'sourcing',
                     'sourcing&quotingforenduser']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'quotation' if x in specific_values62 else x)

#68.
specific_values63 = ['radiology_professional',  'radiologyprofessional']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'radiology' if x in specific_values63 else x)

#69.
specific_values64 = ['recommend', 'recommend(yourecommendspecificproductsortechnologiesforthesolution)',
                     'recommendation', 'recommender']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'recommend' if x in specific_values64 else x)

#70.
specific_values65 = ['requirementsandbuyer', 'buyer']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'buyer' if x in specific_values65 else x)

#71.
specific_values66 = ['research&development', 'researchanddevelopement']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'researchanddevelopment' if x in specific_values66 else x)

#72.
specific_values67 = ['reseller', 'reseller/integrator', 'technicaladvisor,reseller', 'vendor/reseller']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'reseller' if x in specific_values67 else x)

#73.
specific_values68 = ['retailer/installer', 'revendedor']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'retailer' if x in specific_values68 else x)

#74.
specific_values69 = ['display', 'displayourproducts', 'restaurantdisplay', 'usingforwindowdisplay']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'display' if x in specific_values69 else x)

#75.
specific_values70 = ['energy', 'renewableenergy']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'energy' if x in specific_values70 else x)

#76.
specific_values71 = ['changetv', 'replacementtv', 'replacingtv']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'changetv' if x in specific_values71 else x)

#77.
specific_values72 = ['signageforanattraction', 'signagemanager', 'signagesubcontractorp/m', 'signcompany',
                     'slidingpicturesofbeautysalon']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'signage' if x in specific_values72 else x)

#78.
specific_values73 = ['solutionadvisor', 'solutionengineer', 'solutionprovider', 'solutionsarchitect',
                     'solutionsproviderandspecifier', 'softwaresolution']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'solution' if x in specific_values73 else x)

#79.
specific_values74 = ['systemsdesign', 'systemsdesigner']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'systemdesigner' if x in specific_values74 else x)

#80.
specific_values75 = ['strategiccommunications', 'strategy&operationsspecialist']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'strategy' if x in specific_values75 else x)

#81.
specific_values76 = ['support', 'support/facilitator,designer']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'support' if x in specific_values76 else x)

#82.
specific_values77 = ['supplier', 'supplierandinstallation']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'supplier' if x in specific_values77 else x)

#83.
specific_values78 = ['tech', 'technical', 'technologyconsultant', 'technologydesigner', 'techservice',
                     'avtechnician', 'fixingtv', 'emergingtechnology/innovation']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'tech' if x in specific_values78 else x)

#84.
df_train['customer_job'] = df_train['customer_job'].replace('tierarzt', 'vat')

#
specific_valuess = ['altro', 'egyéb', 'otros', 'n/a', 'nothing', 'norespoxseonphonewilltryagain',
                    'na', 'other', 'otro', 'otro', 'others', 'otherstores', 'requirementclose',
                    'var', 'askingforquoteforclient', 'autres', 'conferenceroom', 'conferencetable', 'entrylevel',
                    'enduser', 'forconfrence', 'hometheater', 'hoteltv', 'hardware', 'infrastructure',
                    'inquirytobuy/contactustest', 'mastermind', 'mainenduseroftheproduct', 'menu', 'mindenes',
                    'need1tv55"edgeled4kuhd', 'needonetv', 'norequirment', 'partofvideowall', 'photos',
                    'primary', 'primaryenduser',  'projectteammember', 'sonstiges', 'stakeholder', 'submittingproposal',
                    'test4', 'tester', 'thepersonwiththecreditcard', 'undefined', 'user', 'videowall', 'weareiniceland']
df_train['customer_job'] = df_train['customer_job'].apply(lambda x: 'other' if x in specific_valuess else x)

In [None]:
# Treat the 'undefined' placeholder of business_subarea as the generic 'other' bucket.
subarea_placeholders = ('other', 'undefined')
df_train['business_subarea'] = df_train['business_subarea'].apply(
    lambda subarea: subarea if subarea not in subarea_placeholders else 'other'
)

## 김규림 전처리 (product_category)

In [None]:
# Hand-curated product_category clean-up (steps 1-17): raw category strings are
# collapsed onto a short list of canonical product families.
# The groups are applied one after another, in the order listed below.
# NOTE(review): only df_train is rewritten here; verify that df_test receives
# the same normalisation before both frames are label-encoded together.
def _merge_category_aliases(column, canonical, aliases):
    """Return *column* with every value found in *aliases* replaced by *canonical*."""
    return column.apply(lambda category: canonical if category in aliases else category)


product_category_merges = [
    ('signage', (
        'videowallsignage', 'ledsignage', 'interactivesignage', 'oledsignage', 'standardsignage',
        'highbrightnesssignage', 'specialsignage', 'ur640s', 'smarttvsignage', 'ur640', 'uhdsignage',
        'digitalsignage', 'tvsignage', 'monitorsignagecommercialtv',
    )),
    ('tv', ('hoteltv', 'hospitaltv', 'commercialtv', 'commercialtvtv', 'htv', '43us660h0sdawz')),
    ('pclaptop', ('pc', 'laptop')),
    ('solar', ('solaress', 'solarsystemac')),
    ('chiller', ('solarchiller', 'systemacchiller')),
    ('monitor', ('monitorsignagemoniormonitortv', 'monitorpc', 'moniormonitortvtv', 'computermonitors')),
    ('support', (
        'control', 'highbrightness', 'softwaresolution', 'signagecaresolution', 'technicalsupport',
        'services', 'salesinquiry', 'solaraircare', 'chilleraircare', 'systemacaircare',
    )),
    ('display', (
        'medicaldisplay', 'commercialdisplay', 'medicaldisplays', 'led', 'ledallinone', 'fhdseries',
        'oled',
    )),
    ('vrf', (
        'multisplit', 'singlesplit', 'multiinverter', 'alllgvrfsystems', 'multiv5air', 'multivwater5',
        'multiv5vrf', 'vrfmultisplitsinglesplit', 'vrfmultisplitsinglesplitchiller', 'vrfsinglesplit',
        'vrfmultisplit', 'ogrzewaniepompyciepa',
    )),
    ('videoprojector', ('videowall', 'videowallrmk', 'projector', 'video')),
    ('heater', ('heating', 'athermodynamicwaterheater')),
    ('interactivedigitalboard', ('idb', 'educationcreateboard')),
    ('onequick', ('lgonequickseries', 'onequickseries', 'lgonequick')),
    ('oscloud', ('webos', 'procentric', 'clouddevice')),
    ('airconditioner', (
        'rac', 'tetooucasseteinverter', 'arcondicionadoresidencial', 'residentialairconditioner',
        'aireacondicionadoresidencial',
    )),
    ('robot', ('robots',)),
    ('other', ('outros', 'otros')),
]

for canonical_category, category_aliases in product_category_merges:
    df_train['product_category'] = _merge_category_aliases(
        df_train['product_category'], canonical_category, category_aliases
    )

# Fold infrequent product categories into 'other'.
# A category is folded when it occurs fewer than 50 times in df_train, or exactly
# 53 times.
# NOTE(review): the exact-count rule (== 53) presumably targets one specific
# category; confirm which one and consider naming it explicitly.
#
# value_counts() leaves missing values out of its index, so a per-row lookup
# value_counts[x] raises KeyError as soon as the column contains NaN. Mapping the
# column through the counts gives NaN for such rows instead; NaN fails both
# comparisons below, so missing categories are simply left untouched.
value_counts = df_train['product_category'].value_counts()
category_frequency = df_train['product_category'].map(value_counts)
is_rare_category = (category_frequency < 50) | (category_frequency == 53)
df_train['product_category'] = df_train['product_category'].mask(is_rare_category, 'other')

## 2. 데이터 전처리

### 레이블 인코딩

In [None]:
# pd.set_option('display.max_rows', None) # DataFrame의 모든 행 출력
# pd.set_option('display.max_columns', None) # DataFrame의 모든 열 출력

In [None]:
def label_encoding(series: pd.Series) -> pd.Series:
    """Encode a categorical Series as integer codes.

    Every element is first converted to ``str``. The distinct strings are then
    sorted and numbered from 0 in that order, and each element is replaced by
    the number of its string form.

    Args:
        series: Categorical values to encode.

    Returns:
        A Series of integer codes with the same index as *series*.
    """
    # Stringify everything so that mixed-type columns can be sorted together.
    as_text = series.astype(str)
    code_of = {label: code for code, label in enumerate(sorted(as_text.unique()))}
    return as_text.map(code_of)

In [None]:
# Columns that go through label encoding.
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# Stack the train rows on top of the test rows so that each column is encoded
# once over both frames and a given value gets the same code in either frame.
categorical_frames = [frame[label_columns] for frame in (df_train, df_test)]
df_all = pd.concat(categorical_frames)

for column_name in label_columns:
    df_all[column_name] = label_encoding(df_all[column_name])

다시 학습 데이터와 제출 데이터를 분리합니다.

In [None]:
# df_all holds the train rows first and the test rows after them, so the first
# len(df_train) rows go back to df_train and the remainder to df_test.
n_train_rows = len(df_train)
encoded_train = df_all.iloc[:n_train_rows]
encoded_test = df_all.iloc[n_train_rows:]

for col in label_columns:
    df_train[col] = encoded_train[col]
    df_test[col] = encoded_test[col]

### 2-2. 학습, 검증 데이터 분리

In [None]:
# Hold out 20% of the training rows for validation; the split is shuffled with a
# fixed seed so that it is the same on every run.
features = df_train.drop("is_converted", axis=1)
target = df_train["is_converted"]

x_train, x_val, y_train, y_val = train_test_split(
    features, target, test_size=0.2, shuffle=True, random_state=400
)

## 3. 모델 학습

### 모델 정의

In [None]:
# Baseline classifier. The seed is pinned (to the same value the train/validation
# split uses) because the tree builder draws random numbers while fitting; without
# a fixed random_state the fitted tree, and therefore the reported metrics and the
# submission, can differ from run to run.
model = DecisionTreeClassifier(random_state=400)

### 모델 학습

In [None]:
# Missing feature values are replaced by 0 before fitting.
x_train_filled = x_train.fillna(0)
model.fit(x_train_filled, y_train)

### 모델 성능 보기

In [None]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 of a prediction.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels for the same samples.

    The confusion matrix is laid out with the label order [True, False].
    All metrics are computed before anything is printed.
    """
    label_order = [True, False]
    confusion = confusion_matrix(y_test, y_pred, labels=label_order)

    # (output template, value) pairs, in the order they are printed.
    report_lines = (
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, labels=label_order)),
        ("재현율: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=label_order)),
    )

    print("오차행렬:\n", confusion)
    for template, score in report_lines:
        print(template.format(score))

In [None]:
# Score the validation split; missing feature values are replaced by 0 first.
x_val_filled = x_val.fillna(0)
pred = model.predict(x_val_filled)
get_clf_eval(y_val, pred)

오차행렬:
 [[  735   212]
 [  246 10667]]

정확도: 0.9614
정밀도: 0.7492
재현율: 0.7761
F1: 0.7624


## 4. 제출하기

### 테스트 데이터 예측

In [None]:
# Keep only the feature columns needed for prediction: the target placeholder
# and the row id are dropped.
non_feature_columns = ["is_converted", "id"]
x_test = df_test.drop(columns=non_feature_columns)

In [None]:
# Predict the test rows; missing feature values are replaced by 0 first.
x_test_filled = x_test.fillna(0)
test_pred = model.predict(x_test_filled)
sum(test_pred)  # number of rows predicted as True

934

### 제출 파일 작성

In [None]:
# Re-read the raw submission file (df_test now holds the preprocessed data),
# fill in the predictions and write the result back to the same path.
submission_path = "/content/drive/My Drive/lg_aimers/submission.csv"

df_sub = pd.read_csv(submission_path)
df_sub["is_converted"] = test_pred

# Save the submission file.
df_sub.to_csv(submission_path, index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**