# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

### 데이터 셋 읽어오기

In [2]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [3]:
df_train

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.00,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.00,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.00,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.00,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.00,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59294,1.00,/Sląskie/Poland,AS,,33747,End Customer,SMB,,,,...,LGEPL,3 months ~ 6 months,0,0,0.000026,0.028777,public facility,Others,694,False
59295,0.75,/Bogotá DC /Colombia,AS,0.040000,35420,Specifier/ Influencer,Enterprise,,,,...,LGECB,9 months ~ 1 year,0,0,0.000026,0.028777,public facility,,39,False
59296,0.75,/Pisco/Peru,AS,0.040000,19249,Specifier/ Influencer,Enterprise,,,,...,LGEPR,less than 3 months,0,0,0.000026,0.028777,public facility,,125,False
59297,1.00,/santa cruz bolivia/Peru,AS,0.040000,40327,,Enterprise,,,,...,LGEPR,more than a year,0,0,0.000026,0.028777,public facility,,134,False


# expected_timeline

In [4]:
df_train['expected_timeline'].isnull().sum()

30863

In [5]:
# 특정 키워드 매핑
def update_expected_timeline(value):
    if pd.notna(value):
        original_value = value

        mapping_rules = {
            ('requi', 'reqi'): 'requirement',
            ('no', 'not'): 'no requirement',
            ('follow',): 'follow',
            ('details',): 'details shared',
            ('call back', 'call later', 'call after'): 'call back'
        }

        for keywords, updated_value in mapping_rules.items():
            if any(keyword in value for keyword in keywords):
                return updated_value

        return original_value


#행의 값이 1인 경우에는 Other 로 분류
def classify_other(df):
    value_counts = df['expected_timeline'].value_counts()
    single_occurrence_values = value_counts[value_counts == 1].index
    df['expected_timeline'] = df['expected_timeline'].apply(lambda x: 'Other' if x in single_occurrence_values else x)
    return df



# 직접 매핑하는 함수
def mapping(df, timeline_mapping):
    df['expected_timeline'] = df['expected_timeline'].apply(update_expected_timeline)
    df['expected_timeline'] = df['expected_timeline'].map(timeline_mapping).fillna(df['expected_timeline'])
    return df
    
timeline_mapping = {
    "less than 3 months": "0-3m",
    "less_than_3_months": "0-3m",
    
    "3 months ~ 6 months": "3-6m",
    "3_months_~_6_months": "3-6m",
    
    "6 months ~ 9 months": "6-9m",
    "6_months_~_9_months": "6-9m",
    
    "9 months ~ 1 year": "9-12m",
    "9_months_~_1_year": "9-12m",
    
    "up to december": "1y",
    "more than a year": "1y",
    "more_than_a_year": "1y",
    
    "couldn't connect": "no response",
    "didn't respond" : "no response",
    "rnr": "no response",
    "reponse": "response"
}

df_train = mapping(df_train, timeline_mapping)
df_test = mapping(df_test, timeline_mapping)

df_train = classify_other(df_train)
df_test = classify_other(df_test)

#nan -> no requriemnet
df_train['expected_timeline'].fillna('no requirement', inplace=True)
df_test['expected_timeline'].fillna('no requirement', inplace=True)

In [6]:

unique_values = df_train['expected_timeline'].nunique()

print(f"unique 값: {unique_values}")

value_counts = df_train['expected_timeline'].value_counts(dropna=False)

# 결과 출력
print("빈도:")
for value, count in zip(value_counts.index, value_counts.values):
    print(f"{value}: {count}")

unique 값: 28
빈도:
no requirement: 30983
0-3m: 17326
3-6m: 5035
1y: 3028
9-12m: 1107
6-9m: 1102
Other: 176
requirement: 112
less than 6 months: 108
follow: 108
etc.: 95
details shared: 45
no response: 17
call back: 17
duplicate lead: 6
quote send: 5
budget issue: 4
december 2022: 3
already touch with customers: 3
assigned to partner. intial meeting done. will convert to opp post complete info: 3
price shared: 2
less than 5 months: 2
low budget: 2
fu under progress: 2
more then 3 months: 2
less then 6 months: 2
fu under progress.: 2
october 2022: 2


# business관련 결측치 처리

In [7]:
df_train['business_area'].isnull().sum()

40882

In [8]:
df_train['business_subarea'].isnull().sum()

53773

In [9]:
# df_train
grouped_data_train = df_train.groupby('business_area')['business_subarea'].apply(list)
mode_subarea_by_area_train = df_train.groupby('business_area')['business_subarea'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
df_train['business_subarea'] = df_train.apply(lambda row: mode_subarea_by_area_train.get(row['business_area'], row['business_subarea']), axis=1)

# df_test
grouped_data_test = df_test.groupby('business_area')['business_subarea'].apply(list)
mode_subarea_by_area_test = df_test.groupby('business_area')['business_subarea'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
df_test['business_subarea'] = df_test.apply(lambda row: mode_subarea_by_area_test.get(row['business_area'], row['business_subarea']), axis=1)



In [10]:
df_train['business_subarea'].isnull().sum()

40882

# customer_country

In [11]:
# 나라명으로 컬럼 대체하기.
df_train['customer_country'].isna().sum() # 135개 '' 처리
df_c = df_train.fillna('None')

df_c = df_c['customer_country'].str.replace(pat = '[ ]', 
                                            repl = '', regex = True).reset_index().drop('index', axis = 1)

# 분리해서 리스트 생성.
c_list = df_c['customer_country'].str.split('/')
c_list

result = []
for i in range(0, len(c_list)):        
    result.append(c_list[i][-1])
    
result = pd.DataFrame(result).rename(columns = {0:'customer_country'})

df_train['customer_country'] = result

replace_country = {
    'Manaus': 'Brazil',
    'Aparecida' : 'Brazil',
    'BR' : 'Brazil',
    'Dourados' : 'Brazil',
    'Cuiabá' : 'Brazil',
    'Recife' : 'Brazil',
    'SãoPaulo' : 'Brazil',
    'SaoPaulo' : 'Brazil',
    'JoãoPessoa' : 'Brazil',
    'Capãodacanoa' : 'Brazil',
    'BeloHorizonte' : 'Brazil',
    'SãoPaulo,Pinheiros' : 'Brazil',
    'CentrodeProduçãoAudiovisual-SescSãoPaulo' : 'Brazil',
    'FozdeIguaçu-PRAvenidaTancredoNeves6731JardimItaipu' : 'Brazil',
    'ViaE.DeAmicis,23.90044Carini(PA)' : 'Italy',
    'NewHampshire': 'UnitedStates',
    '1HoagDr.' : 'UnitedStates',
    '21903RanierLn' : 'UnitedStates',
    'JacksonvilleFlorida' : 'UnitedStates',
    'USVirginIslands' : 'UnitedStates',
    '3NassonAvenue' : 'UnitedStates',
    'Zip98433' : 'UnitedStates',
    'Nevada' : 'UnitedStates',
    'Ohio' : 'UnitedStates',
    'AnandViharDelhi' : 'India',
    'uttarpradesh' : 'India',
    'mumbai' : 'India',
    'Telangana' : 'India',
    'bangalore': 'India',
    'gujarat' : 'India',
    'gurgaon': 'India',
    'indore' : 'India',
    'kerela' : 'India',
    'Chennai' : 'India',
    'lucknow' : 'India',
    'Gujarat' : 'India',
    'Pune' : 'India',
    'odisha' : 'India',
    'hyderabad' : 'India',
    'bhilwara' : 'India',
    'CACERES' : 'Spain',
    'GRANCANARIASPLAYADELINGLES' : 'Spain',
    'VALENCIA' : 'Spain',
    'MADRID' : 'Spain',
    'SPAIN' : 'Spain',
    '1605Ave.PoncedeLeón,Suite400SanJuan,00909,PuertoRico' :'PuertoRico',
    'ΘέσηΠέτσαΒακαλοπούλουΒΙΟΠΑΠαλλήνης15351' : 'Greece',
    'Barranquilla' : 'Colombia',
    'Bucaramanga' : 'Colombia',
    'Cartagena' : 'Colombia',
    'COLOMBIA' : 'Colombia',
    'CARRERA11A94-46EDIFICIOCHICO3000PISO3BOGOTA' : 'Colombia',
    '1919MinnesotaCt,Mississauga,ONL5N' : 'Canada',
    'HaNoi' : 'Vietnam',
    'ARGENTINA' : 'Argentina',
    'EGYPT' : 'Egypt',
    'PerU' : 'Peru',
    'UAEDubai' : 'U.A.E',
    'Antigua' : 'AntiguaandBarbuda',
    'Dominicanrepublic' : 'DominicanRepublic',
    'NetherlandsAntilles' : 'Netherlands', 
    'A' : 'None',
    'country' : 'None',
    '':'None'
}

df_train['customer_country'] = df_train['customer_country'].replace(replace_country)

df_train.loc[df_train['customer_country'].str.contains('UnitedStates'), 'customer_country'] = 'UnitedStates'
df_train.loc[df_train['customer_country'].str.contains('Italy'), 'customer_country'] ='Italy'
df_train.loc[df_train['customer_country'].str.contains('Colombia'), 'customer_country'] ='Colombia'
df_train.loc[df_train['customer_country'].str.contains('ALICANTE'), 'customer_country'] ='Spain'

df_train.loc[df_train['customer_country'].str.contains(r'^[A-Z]{2}$|[A-Z]{2}\d{5}$'), 'customer_country'] = 'UnitedStates'
df_train.loc[df_train['customer_country'].str.contains(r'\b[A-Z]{2}\d{5}\b|USA'), 'customer_country'] = 'UnitedStates'
df_train['customer_country'][(df_train['customer_country'].str.contains(r'\d{5}'))
                             & (~df_train['customer_country'].str.contains('@'))] = 'UnitedStates'
df_train.loc[df_train['customer_country'].str.contains(r'\d{4}$|^\d{4}|^\d{3}'), 'customer_country'] = 'UnitedStates'

df_train['customer_country'][df_train['customer_country'].str.contains('@')] = 'None'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['customer_country'][(df_train['customer_country'].str.contains(r'\d{5}'))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['customer_country'][df_train['customer_country'].str.contains('@')] = 'None'


# customer_position

In [12]:
# 각 단어별로 분리해 보려고 했으나 그럼 여러 직위, 직무와 중복되는 현상이 발생.
# 단어 전체를 리스트에 넣어 각각 분류 진행.

none = ['none', 'unpaid', 'not applicable', 'bulgaria', ]

other = ['other', 'others', 'no influence']

customer = ['customer', 'end user', 'commercial end user',
                 'this is a consume display requirement for home purpose ']

# ceo/founder
ceo = ['ceo', 'founder', 'cofounder', 'ceo fundador', 'co founder',
           'ceo founder', 'c level executive', 'c levelexecutive', 
           'entrepreneurship', 'vp', 'proprietário a ', 'genel müdür', 
           'the big boss', 'chief executive officer']

# 의사결정자 decision maker(이사, 의장 등도 포함)
decision = ['decision', 'decision maker', 'decision influencer', 'gerente', 
                 'business unit director', 'chairman', 
                 'principal director', 'principal   director',
                'leadership executive office owner']

# 정부 government
government = ['government', 'vicepresident', 'vice president', 'president',]

# 인턴 intern
intern = ['intern', 'entrylevel', 'entry level']

# 교사, 교수 등 educator
educator = ['educator', 'math and physics teacher', 'physics teacher', 
            'professor of mathematics', 'assistant professor', 
            'principal at oxford integrated pu science college', 
            'physics and mathematics teacher', 
            'science teacher', 'prof', 'prof ', 'science teacher', 'academic specialist',
            'education', 'maths lecturer', 
            'academic coordinator  post graduate teacher  accountancy  business studies   tgt  ict ', 
            'english trainer for ielts toefl pte gre sat exams ', 
            'director cum faculty at gaining apex coaching centre',
            'associate professor in electronics engg', 'asst prof ', 
            'neet  olympiad expert faculty', 'teacher', 'guest faculty', 
            'physics faculty', 'teacher middle school coordinator', 
            'assistant professor of enlish', 'professor', 'quantitative aptitude faculty', 
            'associate professor', 'hon dean', 'chemistry teacher', 'education professional', 
            'senior lecturer', 'pgt physics', 'pgt chemistry']

# 컨설턴트 consultant
consultant = ['consultant', 'consulting', 'architecture consult', 
             'architect consultant', 'commercial consultant']

# 의료직 hospital
hospital = ['hospital', 'tierarzt', 'főorvos', 'surgery professional', 
                 'pathologist', 'radiology professional']

# 영업직 sales
sales = ['sales', 'business development sales', 'subsidiary sales  ise ', ]

# 전시회 exhibition
exhibition = ['exhibition', 'exhibitiontv', 'mindenes', 
             'other   please specify   cedia association']

# 리서치 research
research = ['research', 'market intelligence research']

# 제조
manufacturer = ['manufacturer', 'medical device manufacturer']

# 기술직 technical
technical = ['technical', 'técnico']

lists = [none, other, customer, ceo, decision, 
         government, intern, educator, consultant, hospital,
         sales, exhibition, research, manufacturer, technical]

def set_pos(df_col):
    for j in range(0, len(lists)):
        for i in range(0, len(df_col)):
             if df_col.loc[i] in lists[j]:
                df_col.loc[i] = lists[j][0]
                
cp_set = df_train['customer_position'].str.replace(pat = '[\W_]', repl = ' ', regex = True)
set_pos(cp_set)
df_train['customer_position'] = cp_set

cp_set_test = df_test['customer_position'].str.replace(pat = '[\W_]', repl = ' ', regex = True)
set_pos(cp_set_test)
df_test['customer_position'] = cp_set_test

# product_category

In [13]:
categories = {
    'ultra stretch signage': 'DIGITAL Signage',
    'ultra stretch series': 'DIGITAL Signage',
    'signage': 'Signage',

    'led signage': 'LED Signage',
    'led aio 136': 'LED Signage',
    'meeting & screen sharedirect view leddirect view led': 'LED Signage',
    'laec015': 'LED Signage',
    'leadallin': 'LED Signage',
    'ledallinone': 'LED Signage',
    'laec015-gn.awz': 'LED Signage',
    'gscd046': 'LED Signage',
    'laec15': 'LED Signage',
    'lg led bloc': 'LED Signage',
    'lg magnit': 'LED Signage',
    'led cinema': 'LED Signage',
    'pantalla led outdoor': 'LED Signage',
    'gsca046': 'LED Signage',
    'gscd100': 'LED Signage',
    'lsca039': 'LED Signage',
    'essential series': 'LED Signage',

    'oled signage': 'OLED Signage',
    'transparent oled': 'OLED Signage',

    'uhd signage': 'DIGITAL Signage',
    'uh': 'DIGITAL Signage',

    'digital signage': 'DIGITAL Signage',
    'one:quick series': 'DIGITAL Signage',
    'lg one:quick': 'DIGITAL Signage',
    'lg one:quick series': 'DIGITAL Signage',
    'one quick works': 'DIGITAL Signage',
    'onequick series': 'DIGITAL Signage',
    'one quick:flex': 'DIGITAL Signage',
    'one:quick flex': 'DIGITAL Signage',
    'one:quick': 'DIGITAL Signage',
    '43uh5f-h.awzm': 'DIGITAL Signage',
    '49vl5g-m.awzm': 'DIGITAL Signage',
    'corpouh5f': 'DIGITAL Signage',
    'corpuh5f-': 'DIGITAL Signage',
    '86uh5f': 'DIGITAL Signage',
    '55tc3d': 'DIGITAL Signage',
    '5svh7f-a': 'DIGITAL Signage',
    'tr3': 'DIGITAL Signage',
    '98uh5e': 'DIGITAL Signage',
    'standard signage': 'DIGITAL Signage',
    'high brightness signage': 'DIGITAL Signage',
    'interactive signage': 'DIGITAL Signage',
    'special signage': 'DIGITAL Signage',
    'accessories': 'DIGITAL Signage',
    'standard': 'DIGITAL Signage',
    'high brightness': 'DIGITAL Signage',

    'tv signage': 'DIGITAL Signage',
    'smart tv signage': 'DIGITAL Signage',
    'ur640': 'DIGITAL Signage',
    'ur640s': 'DIGITAL Signage',

    'aio': 'AIO',
    'allinone_rmk': 'AIO',

    'system air conditioner': 'CAC',
    'single cac': 'CAC',
    'điều hòa trung tâm multi': 'CAC',
    'cac': 'CAC',
    'system ac': 'CAC',

    'vrf': 'VRF',
    'all lg vrf systems': 'VRF',
    'multi v5 vrf': 'VRF',
    'نظام التدفق المتغيرvrf': 'VRF',
    'vrf - multi v s': 'VRF',
    'kimatyzacja vrf': 'VRF',
    'điều hòa trung tâm vrf': 'VRF',
    'systèmes de débit à réfrigérant variable (drv)': 'VRF',

    'điều hòa trung tâm chiller': 'Chiller',
    'chiller': 'Chiller',
    'مبرد (تشيلر)': 'Chiller',
    'chiller/enfriadoras': 'Chiller',

    'software solution': 'Solution',
    'signage care solution': 'Solution',
    'signage care solutions': 'Solution',
    'lg home bliss air solution': 'Solution',
    'lg paradise air solution': 'Solution',
    'lg salang air solution for dream homes': 'Solution',
    'حلول التدفئة': 'Solution',
    'pro:centric': 'Solution',
    'pro centric hotel': 'Solution',
    'procentric': 'Solution',

    'heating': 'Heating',
    'חימום': 'Heating',
    'isıtma': 'Heating',
    'ogrzewanie (pompy ciepła)': 'Heating',
    'calefacción': 'Heating',
    'aquecimento': 'Heating',

    'rac': 'RAC',
    'điều hòa gia dụng': 'RAC',
    'เครื่องปรับอากาศเผื่อที่อยู่อาศัย': 'RAC',
    'ac rumah': 'RAC',
    'aire acondicionado residencial': 'RAC',
    'climatiseur résidentiel': 'RAC',
    'điều hòa cục bộ': 'RAC',
    'residential air conditioner': 'RAC',
    'מזגנים למקום מגורים': 'RAC',
    'تكييفات': 'RAC',
    'ar condicionado residencial': 'RAC',
     'pendingin': 'RAC',

    'commercial display': 'Display',
    'medical display': 'Display',
    '互動式顯示屏': 'Display',
    'led 顯示屏': 'Display',
    '標準顯示屏': 'Display',
    'collaboration displays': 'Display',
    'oled 顯示屏': 'Display',
    'window facing display': 'Display',
    'medical displays': 'Display',
    'radiology displays': 'Display',
    'taa lcd lfd displays': 'Display',
    '特別顯示屏': 'Display',
    '高亮度顯示屏': 'Display',

    'hotel tv': 'TV',
    'commercial tv': 'TV',
    'hospital tv': 'TV',
    'tv': 'TV',
    '酒店電視': 'TV',
    'tv,commercial tv': 'TV',
    'htv': 'TV',
    'tv 60"': 'TV',
    'comercial tv': 'TV',
    '醫院電視': 'TV',
    'ctv': 'TV',
    'smart tv': 'TV',
    'tv 55"': 'TV',
    'tv 43 pol': 'TV',
    '43us660h0sd.awz': 'TV',
    '50uq801c0sb.bwz': 'TV',
    '55us660h0sd.bwz': 'TV',
    '32lq621cbsb.awz': 'TV',
    '55uq801c0sb.bwz': 'TV',
    '43uq751c0sf.bwz': 'TV',
    '43uq751c0sb.bwz': 'TV',
    '50us660h0sd.bwz': 'TV',
    '43us660h (na)': 'TV',
    'hoteleria_us670h': 'TV',

    'monitor': 'Monitor',
    'computer monitors': 'Monitor',
    'medical monitors': 'Monitor',
    'monitor & pc': 'Monitor',
    'medical monitor': 'Monitor',
    'surgical monitor': 'Monitor',
    '28mq780': 'Monitor',
    'medical- surgical': 'Monitor',
    'monitorindustrial_rmk': 'Monitor',

    'multi-split': 'Multi Split',
    'multi split': 'Multi Split',
    'פיצול מרובה': 'Multi Split',
    'multi-split (plusieurs pièces)': 'Multi Split',
    'klimatyzacja multi-split': 'Multi Split',

    'single-split': 'Single Split',
    'single split': 'Single Split',
    'split tunggal': 'Single Split',

    'videowall_rmk': 'Video Wall',
    'videwall': 'Video Wall',
    '110 + video wall': 'Video Wall',
    'video wall': 'Video Wall',
    '49vl5g-m': 'Video Wall',
    '55vm5e-a': 'Video Wall',
    '55vm5j-h': 'Video Wall',
    '49vl5f': 'Video Wall',
    'videowall signage': 'Video Wall',

    'etc.': 'Other',
    'lainnya': 'Other',
    'אחר': 'Other',
    'ฯลฯ': 'Other',
    'khác': 'Other',
    'outros': 'Other',
    'آخر': 'Other',
    'not specified': 'Other',
    'other': 'Other',
    'others': 'Other',
    'otros': 'Other',
    'autre': 'Other',

    'idb': 'IDB',
    'interactive digital board': 'IDB',

     'washing machine,dryer': 'Several',
     'solar,chiller': 'Several',
     'system ac,rac': 'Several',
     'monitor signage,commercial tv,monior/monitor tv': 'Several',
     'monitor signage,monior/monitor tv': 'Several',
     'aircare,built-in/cooking': 'Several',
     'monitor signage,commercial tv,monior/monitor tv,projector,tv': 'Several',
     'monitor signage,commercial tv,monior/monitor tv,tv': 'Several',
     'commercial tv,tv': 'Several',
     'monitor signage,commercial tv,solar,ess,monior/monitor tv,pc,projector,robot,system ac,ems,rac,chill': 'Several',
     'monior/monitor tv,tv': 'Several',
     'chiller,aircare': 'Several',
     'solar,aircare': 'Several',
     'commercial tv,audio/video': 'Several',
     'solar,ess': 'Several',
     'solar,system ac': 'Several',
     'vrf,multi-split': 'Several',
     'vrf,multi-split,chiller': 'Several',
     'vrf,multi-split,single-split,chiller,heating': 'Several',
     'system ac,solar,washing machine': 'Several',
     'solar,ess,ems': 'Several',
     'tv,mobile': 'Several',
     'commercial tv,projector': 'Several',
     'aircare,water care': 'Several',
     'monior/monitor tv,chiller': 'Several',
     'system ac,chiller': 'Several',
     'system ac,aircare': 'Several',
     'monitor signage,commercial tv': 'Several',
     'system ac,tv': 'Several',
     'monitor signage,audio/video': 'Several',
     'monitor signage,commercial tv,solar,ess,monior/monitor tv,pc': 'Several',
     'monitor signage,pc': 'Several',
     'monitor signage,commercial tv,solar,ess': 'Several',
     'monior/monitor tv,system ac,tv,refrigerator,washing machine,dryer,built-in/cooking': 'Several',
     'commercial tv,robot': 'Several',
     'monitor signage,solar': 'Several',
     'solar,projector': 'Several',
     'tv,audio/video': 'Several',
     'solar,dryer': 'Several',
     'solar,monior/monitor tv': 'Several',
     'chiller,dryer': 'Several',
     'monior/monitor tv,pc': 'Several',
     'solar,refrigerator': 'Several',
     'monitor signage,system ac': 'Several',
     'system air conditioner,solar': 'Several',
     'solar,tv': 'Several',
     'monitor signage,tv': 'Several',
     'dryer,chiller': 'Several',
     'monitor signage,monior/monitor tv,system ac,vacuum cleaner,tv,home beauty,commercial tv,mobile,audio': 'Several',
     'solar,vacuum cleaner': 'Several',
     'solar,monior/monitor tv,pc,tv,refrigerator,washing machine,dryer,home beauty': 'Several',
     'monitor signage,solar,robot,water care': 'Several',
     'mobile,audio/video': 'Several',
     'system ac,refrigerator,washing machine,dryer': 'Several',
     'solar,system ac,aircare': 'Several',
     'projector,system ac,water care': 'Several',
     'chiller,water care': 'Several',
     'monior/monitor tv,tv,commercial tv,pc,refrigerator,solar,rac,washing machine,mobile,ess,audio/video': 'Several',
     'ess,chiller': 'Several',
     'monitor signage,monior/monitor tv,pc,tv': 'Several',
     'solar,water care': 'Several',
     'digital signage or commercial tvs': 'Several',
     'monitor signage,commercial tv,audio/video': 'Several',
     'solar,built-in/cooking': 'Several',
     'monitor signage,monior/monitor tv,commercial tv': 'Several',
     'commercial tv,monior/monitor tv': 'Several',
     'robot,system ac': 'Several',
     'commercial tv,water care': 'Several',
     'pc,washing machine': 'Several',
     'monitor signage,solar,monior/monitor tv,pc,projector,robot,system ac,tv,refrigerator,washing machine': 'Several',
     'system ac,tv,refrigerator,washing machine,built-in/cooking,audio/video': 'Several',
     'system air conditioner,energy storage system': 'Several',
     'system ac,home beauty': 'Several',
     'information display,monitor': 'Several',
     'rac/cac': 'Several',
     'system ac,chiller,aircare': 'Several',
     'tv,refrigerator,washing machine': 'Several',
     'monior/monitor tv,refrigerator': 'Several',
     'monior/monitor tv,system ac,tv,pc,refrigerator,water care,solar,washing machine,mobile,chiller,built': 'Several',
     'projector,audio/video': 'Several',
     'monior/monitor tv,audio/video': 'Several',
     'monitor signage,commercial tv,monior/monitor tv,pc,tv,home beauty,audio/video': 'Several',
     'pc,robot,system ac,chiller,tv,refrigerator,washing machine,vacuum cleaner,styler,dryer,mobile,audio/': 'Several',
     'refrigerator,washing machine,built-in/cooking': 'Several',
     'monitor signage,monior/monitor tv,tv,audio/video': 'Several',
     'tv,refrigerator,washing machine,vacuum cleaner,audio/video': 'Several',
     'pc,tv': 'Several',
     'aircare,mobile': 'Several',
     'solar,ess,system ac': 'Several',
     'system ac,refrigerator': 'Several',
     'tv,pc': 'Several',
     'monitor signage,commercial tv,solar,pc,projector,system ac,ems,rac,chiller,refrigerator,washing mach': 'Several',
     'chiller,refrigerator': 'Several',
     'monitor signage,commercial tv,solar,projector,robot,chiller,refrigerator,built-in/cooking,water care': 'Several',
     'commercial tv,solar': 'Several',
     'monior/monitor tv,projector,audio/video': 'Several',
     'refrigerator,chiller': 'Several',
     'chiller,tv': 'Several',
     'projector,ems': 'Several',
     'vrf,chiller': 'Several',
     'monitor signage,mobile': 'Several',
     'robot,vacuum cleaner': 'Several',
     'monitor signage,commercial tv,monior/monitor tv,audio/video': 'Several',
     'video wall + aio': 'Several',
     'vrf,single-split': 'Several',
     'vrf,multi-split,single-split,chiller,etc.': 'Several',
     'multi-split,single-split': 'Several',
     'vrf,multi-split,single-split': 'Several',
     'vrf,multi-split,etc.': 'Several',
     'vrf,multi-split,heating': 'Several',
     'vrf,multi-split,single-split,chiller': 'Several',
     'vrf,multi-split,single-split,heating': 'Several',
     'vrf,single-split,chiller': 'Several',
     'commercial tv,solar,ess,projector,system ac,tv,washing machine,home beauty,audio/video': 'Several',
     'solar,robot': 'Several',
     'monitor signage,commercial tv,monior/monitor tv,pc,projector,tv,audio/video': 'Several',
     'solar,energy storage system': 'Several',
     'system ac,solar': 'Several',
     'solar,system ac,water care': 'Several',
     'vrf,heating': 'Several',
     'monitor signage,monior/monitor tv,vacuum cleaner,tv,home beauty,commercial tv,pc,refrigerator,styler': 'Several',
     'refrigerator,built-in/cooking': 'Several',
     'ems,audio/video': 'Several',
     'projector,ems,mobile,audio/video': 'Several',
     'commercial tv,solar,ess,monior/monitor tv,pc,projector,robot,system ac,ems,rac,chiller,refrigerator,': 'Several',
     'tv,refrigerator': 'Several',
     'monior/monitor tv,refrigerator,audio/video': 'Several'
}

# 주어진 값을 카테고리에 따라 변환
def map_to_category(value):
    return categories.get(value, value)

# 판다스 시리즈를 입력으로 받아 카테고리에 따라 변환된 값을 반환
def transform_column_values_series(column_series):
    return column_series.map(map_to_category)


# 변환된 칼럼 값 출력
df_train['product_category'] = transform_column_values_series(df_train['product_category'])
df_test['product_category'] = transform_column_values_series(df_test['product_category'])

# inquiry_type

In [14]:
## 주어진 값들을 범주에 따라 매핑하는 딕셔너리 생성
category_mapping = {
    'Quotation or purchase consultation': 'Quotation',
    'Quotation or Purchase Consultation': 'Quotation',
    'quotation_or_purchase_consultation': 'Quotation',
    'Request for quotation or purchase': 'Quotation',
    'Quotation or Purchase consultation': 'Quotation',
    'Purchase': 'Quotation',
    'quotation_': 'Quotation',
    'Purchase or Quotation': 'Quotation',
    'first Info and pricing': 'Quotation',
    'Hola me pueden cotizar 19 pantallas interactivas de 100 pulgadas entregadas en Guayaquil -Ecuador.': 'Quotation',
    'Vui lòng báo giá giúp mình sản phẩm đo thân nhiệt Xin cảm ơn': 'Quotation',
    'Probeam precio': 'Quotation',
    'Solicito apoyo para realizar cotizacion de los dispositivos que ofrecen en la solución\xa0One Quick:\xa0': 'Quotation',
    'Toi muon tim hieu thong tin ky thuat, gia ca cua sp de su dung': 'Quotation',
    'tôi cần tham khảo giá và giải pháp từ LG': 'Quotation',
    
    'Product Information': 'Product',
    'One Quick:Flex': 'Product',
    'AIO': 'Product',
    'Hospital TV': 'Product',
    'EDUCATIONAL EQUIPMENTS': 'Product',
    'Digital platform': 'Product',
    'TV interactive': 'Product',
    'Display Textbook and photos': 'Product',
    'High inch 86 / 98 or 110': 'Product',
    'display product': 'Product',
    'estoy buscando para Ecuador este producto LG MAGNIT micro LED, para un cliente de 138 pulgadas, con envió marítimo.': 'Product',
    'Hotel TV products': 'Product',
    'Pantallas Interactivas para Clinicas': 'Product',
    'IDB': 'Product',
    'LED Signage': 'Product',
    'Standalone': 'Product',
    'Video Wall': 'Product',
    'Preciso de um monitor médico para radiografia convencional e tomogrtafia.': 'Product',
    'VRF': 'Product',
    'window facing product': 'Product',
    
    'Usage or technical consultation': 'Technical',
    'Technical Support': 'Technical',
    'Usage or Technical Consultation': 'Technical',
    'Technical Consultation': 'Technical',
    'technical': 'Technical',
    'usage or technical consultation': 'Technical',
    'usage_or_technical_consultation': 'Technical',
    'Request for technical consulting': 'Technical',
    'technical_consultation': 'Technical',
    
    'Other': 'Other',
    'Etc.': 'Other',
    'other': 'Other',
    'other_': 'Other',
    'Others': 'Other',
    'others': 'Other',
    'ETC.': 'Other',
    'Not specified': 'Other',
    
    'Event Inquiry': 'Event',
    'Evento_SdelEstero': 'Event'
}

# 주어진 값을 카테고리에 따라 변환
def map_to_category(value):
    return category_mapping.get(value, value)

# 판다스 시리즈를 입력으로 받아 카테고리에 따라 변환된 값을 반환
def transform_column_values_series(column_series):
    return column_series.map(map_to_category)


# 변환된 칼럼 값 출력
df_train['inquiry_type'] = transform_column_values_series(df_train['inquiry_type'])
df_test['inquiry_type'] = transform_column_values_series(df_test['inquiry_type'])

# product_subcategory 
# product_modelname

In [15]:
# LED, OLED, TV 등으로 바꾸는 건 카테고리에서 확인 가능하므로 기존 값을 최대한 건드리지 않음.
# 이상한 문장만 other 처리.

# 대소문자 통일, 좌우 여백 제거
df_train['product_subcategory'] = df_train.product_subcategory.str.title().str.strip()
df_test['product_subcategory'] = df_test.product_subcategory.str.title().str.strip()
df_train['product_modelname'] = df_train.product_modelname.str.title().str.strip()
df_test['product_modelname'] = df_test.product_modelname.str.title().str.strip()

subcategory_mapping = {
    'Etc.':'other',
    'Etc ':'other',
    'etc ':'other',
    'Etc':'other',
    'etc':'other',
    'Other':'other',
    'Others':'other',
    '其他':'other',
    'Budget High For Vrf , He Will Buy The Split Ac':'other',
    'Budget Is Higher For Vrf':'other',
    'Architect , We Are Meeting For Enqiry Generation ( This Is Not A Inquiry)':'other',
    'This Is Being Dealt With By Lg Germany.':'other',
    'We Dont Offer T/F Spoke To Ha Pm':'other',
    'Inquiry Forwarded To Shaker':'other',
    'Want Split Ac':'other',
    'Passed On To Fixxy Distribution':'other',
    'This Is Being Dealt With By Lg Germany.':'other',
    
    'Pro:Centeric Tv':'Pro:Centric Tv',
    'Pro:Centrc Tv':'Pro:Centric Tv',
 
    'OLED 透明觸控顯示屏':'OLED transparent touch display',
    'All Monitors &Pcs': 'All Monitors & Pcs',
    '透明 LED 顯示貼':'Transparent LED display sticker',
    'SH7DD 系列':'SH7DD series',
    'จอภาพเพื่อการวินิจฉัย':'Monitor for navigation',
    'UH7F 系列':'UH7F series',
    'VL5F 系列':'VL5F series',
    'TR3BG 系列':'TR3BG series',
    'UT781H 系列':'UT781H series',
    'Indoor Versatile 系列':'Indoor Versatile series',
    'VL5D 系列':'VL5D series',
    'LT660H 系列':'LT660H series',
    'จอภาพสำหร บการตรวจสอบทางคล น ก':'Monitor for mechanical inspection',
    'จอภาพสำหรับการตรวจสอบทางคลินิก':'Monitor for mechanical inspection',
    'Curvable Oled Sigange':'Curvable Oled Signage',
    'Diagnostic Monitor':'Diagnostic Monitors',
    'Flat Oled Sigange':'Flat Oled Signage',
    'Interactive Digital Board':'Idb',
}

def sub_mapping(df):
    df.loc[:,'product_subcategory'] = df['product_subcategory'].map(subcategory_mapping).fillna(df['product_subcategory'])
    df.loc[:,'product_modelname'] = df['product_modelname'].map(subcategory_mapping).fillna(df['product_modelname'])

def subcategory_name_mapping(df):
    # 서브카테고리 결측치를 모델명으로 대체
    df.loc[(df['product_subcategory'].isna()) 
           & ~(df['product_modelname'].isna()), 'product_subcategory'] = df[(df['product_subcategory'].isna()) 
                                                                                    & ~(df['product_modelname'].isna())]['product_modelname']
    # 모델명 결측치를 서브카테고리로 대체
    df.loc[((df['product_modelname'].isna()) 
           & ~(df['product_subcategory'].isna())), 'product_modelname'] = df[(df['product_modelname'].isna()) 
                                                                                    & ~(df['product_subcategory'].isna())]['product_subcategory']

subcategory_name_mapping(df_train)
subcategory_name_mapping(df_test)
sub_mapping(df_train)
sub_mapping(df_test)

In [16]:
df_train['product_category'].value_counts()


product_category
IDB                         6185
VRF                         5736
DIGITAL Signage             5615
Multi Split                 3683
Other                       2132
                            ... 
32 / 43 pol                    1
sac                            1
high inch 86 / 98 or 110       1
refrigerator                   1
parts                          1
Name: count, Length: 74, dtype: int64

## 2. 데이터 전처리

### 레이블 인코딩

In [17]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [18]:
# 레이블 인코딩할 칼럼들
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

In [19]:
df_all

Unnamed: 0,customer_country,business_subarea,business_area,business_unit,customer_type,enterprise,customer_job,inquiry_type,product_category,product_subcategory,product_modelname,customer_country.1,customer_position,response_corporate,expected_timeline
0,2477,7,0,0,10,0,420,9,14,376,885,9070,21,33,0
1,2477,7,0,0,10,0,303,9,14,376,885,8406,7,33,0
2,2407,7,0,0,10,0,160,8,20,376,885,6535,29,21,0
3,2407,7,0,0,10,0,166,9,23,376,885,3388,7,21,0
4,2407,7,0,0,29,0,84,9,14,376,885,5799,29,21,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,1878,11,6,0,33,0,468,9,49,376,885,10650,1,43,22
5267,2378,11,6,3,33,0,323,9,49,376,885,16640,26,50,22
5268,901,11,6,0,29,0,166,20,14,376,885,1079,23,43,0
5269,19,11,6,3,9,1,323,20,30,376,885,24,26,12,22


다시 학습 데이터와 제출 데이터를 분리합니다.

In [20]:
for col in label_columns:  
    df_train[col] = df_all.iloc[: len(df_train)][col]
    df_test[col] = df_all.iloc[len(df_train) :][col]

In [21]:
df_train

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.00,2477,0,0.066667,32160,10,0,,,,...,33,0,1,0,0.003079,0.026846,0,7,0,True
1,1.00,2477,0,0.066667,23122,10,0,12.0,,,...,33,0,1,0,0.003079,0.026846,0,7,1,True
2,1.00,2407,0,0.088889,1755,10,0,144.0,,,...,21,0,1,0,0.003079,0.026846,0,7,2,True
3,1.00,2407,0,0.088889,4919,10,0,,,,...,21,0,1,0,0.003079,0.026846,0,7,3,True
4,1.00,2407,0,0.088889,17126,29,0,,,,...,21,0,0,0,0.003079,0.026846,0,7,4,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59294,1.00,2481,0,,33747,9,1,,,,...,34,2,0,0,0.000026,0.028777,8,7,694,False
59295,0.75,2345,0,0.040000,35420,29,0,,,,...,7,4,0,0,0.000026,0.028777,8,7,39,False
59296,0.75,2476,0,0.040000,19249,29,0,,,,...,35,0,0,0,0.000026,0.028777,8,7,125,False
59297,1.00,2476,0,0.040000,40327,33,0,,,,...,35,1,0,0,0.000026,0.028777,8,7,134,False


In [22]:
df_test

Unnamed: 0,id,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,19844,0.00,6,2,0.073248,47466,9,0,53.0,,...,43,22,1,0,0.001183,0.049840,10,6,278,False
1,9738,0.25,2151,3,,5405,9,1,,,...,50,22,0,0,0.000013,,12,7,437,True
2,8491,1.00,48,2,,13597,29,1,,,...,18,0,0,0,0.000060,0.131148,4,3,874,False
3,19895,0.50,743,2,0.118644,17204,33,0,,,...,50,1,0,0,0.001183,0.049840,10,6,194,False
4,10465,1.00,1168,2,0.074949,2329,9,0,2.0,1.0,...,43,0,1,1,0.003079,0.064566,0,7,167,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,13855,0.50,1878,0,,40292,33,0,10.0,,...,43,22,0,0,,,6,11,97,False
5267,7979,0.25,2378,3,,47466,33,0,0.0,,...,50,22,0,0,,,6,11,438,True
5268,12887,0.75,901,0,,46227,29,0,,,...,43,0,0,0,,,6,11,97,False
5269,17530,0.00,19,3,,45667,9,1,,,...,12,22,0,0,,,6,11,429,False


# AUTO ML

In [23]:
pip install h2o


Note: you may need to restart the kernel to use updated packages.


In [27]:
import h2o
from h2o.automl import H2OAutoML
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import os

# H2O 클러스터 시작
h2o.init()

# 데이터셋을 H2OFrame으로 변환
df_train_h2o = h2o.H2OFrame(df_train.fillna(0))

# 훈련 데이터와 검증 데이터로 나누기
train_h2o, val_h2o = df_train_h2o.split_frame(ratios=[0.8], seed=684050)

# AutoML 모델 생성 및 학습
automl_model_h2o = H2OAutoML(max_runtime_secs=120, stopping_metric='AUC')
x = train_h2o.columns
y = 'is_converted'
automl_model_h2o.train(x=x, y=y, training_frame=train_h2o)

# 검증 데이터로 예측
automl_pred_h2o = automl_model_h2o.predict(val_h2o).as_data_frame()["predict"].values

def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

# 검증 데이터의 실제값과 예측값으로 성능 평가
get_clf_eval(val_h2o['is_converted'].as_data_frame(), automl_pred_h2o)

# AutoML 리더보드 
leaderboard = automl_model_h2o.leaderboard
print(leaderboard.as_data_frame())

# 모델 저장할 디렉터리 생성
save_dir = "/Users/dessert_gomjelly/Desktop/LG Aimers 해커톤/LG-Aimers-Hackathon/2월16일"
os.makedirs(save_dir, exist_ok=True)

# 모델 저장
h2o.save_model(model=automl_model_h2o.leader, path=save_dir, force=True)

# 검증 데이터 예측
v_pred = automl_model_h2o.predict(val_h2o).as_data_frame()["predict"].values
get_clf_eval(val_h2o['is_converted'].as_data_frame(), v_pred)

# 테스트 데이터 예측 및 제출 파일 저장
t_pred = automl_model_h2o.predict(h2o.H2OFrame(df_test.drop(["is_converted", "id"], axis=1).fillna(0))).as_data_frame()["predict"].values
df_s = pd.read_csv("submission.csv")
df_s["is_converted"] = t_pred
df_s.to_csv("submission.csv", index=False)



Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,25 mins 07 secs
H2O_cluster_timezone:,Asia/Seoul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,1 month and 27 days
H2O_cluster_name:,H2O_from_python_dessert_gomjelly_tgbctu
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.519 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
00:04:56.196: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
오차행렬:
 [[  797   211]
 [   62 10792]]

정확도: 0.9770
정밀도: 0.9278
재현율: 0.7907
F1: 0.8538
                                             model_id       auc   logloss  \
0   StackedEnsemble_AllModels_3_AutoML_4_20240217_...  0.986728  0.066940   
1   StackedEnsemble_AllModels_4_AutoML_4_20240217_...  0.986604  0.066965   
2          GBM_grid_1_AutoML_4_20240217_00456_model_6  0.985444  0.074396   
3   StackedEnsemble_BestOfFamily_4_AutoML_4_202402...  0.985057  0.069549   
4   StackedEnsemble_AllModels_2_AutoML_4_20240217_...  0.985011  0.070356   
5   StackedEnsemble_BestOfFamily_3_AutoML_4_202402...  0.984798  0.070625   
6   StackedEnsemble_AllMode



███████████████████████████████████████████| (done) 100%
오차행렬:
 [[  797   211]
 [   62 10792]]

정확도: 0.9770
정밀도: 0.9278
재현율: 0.7907
F1: 0.8538
Parse progress: |



████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%




import h2o
from h2o.automl import H2OAutoML
import pandas as pd

# H2O 클러스터 시작
h2o.init()

# 저장된 모델 로드
saved_model_path = "/path/to/save/model/StackedEnsemble_AllModels_3_AutoML_3_20240216_234725"
loaded_model = h2o.load_model(saved_model_path)

# 테스트 데이터 불러오기 (전처리된 데이터를 사용하고, 필요에 따라 적절한 전처리를 수행해야 합니다.)
df_test = pd.read_csv("test.csv")

# H2OFrame으로 변환
test_h2o = h2o.H2OFrame(df_test.fillna(0))

# 예측 수행
test_pred_h2o = loaded_model.predict(test_h2o).as_data_frame()["predict"].values

# 제출용 데이터프레임에 예측 결과 추가
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred_h2o

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

# H2O 클러스터 종료
h2o.shutdown()


### 2-2. 학습, 검증 데이터 분리

In [None]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=684050,
)





## 3. 모델 학습

### 모델 정의 

In [None]:
model = DecisionTreeClassifier()

### 모델 학습

In [None]:
model.fit(x_train.fillna(0), y_train)

### 모델 성능 보기

In [None]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [None]:
pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, pred)

## 4. 제출하기

### 테스트 데이터 예측

In [None]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [None]:
test_pred = model.predict(x_test.fillna(0))
sum(test_pred) # True로 예측된 개수

### 제출 파일 작성

In [None]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**