In [None]:
! pip install optuna
! pip install catboost
! pip install pytorch-tabnet

In [None]:
import pandas as pd
import numpy as np
import optuna
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    precision_recall_curve,
    mean_squared_error
)
import catboost as cb
from catboost import CatBoostClassifier, Pool, cv
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import StratifiedKFold

import re
from collections import Counter
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords

import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.metrics import Metric

# Silence all warnings for the rest of the notebook
import warnings
warnings.filterwarnings('ignore')

# Fix the random seed for torch and numpy
seed = 400
torch.manual_seed(seed)
np.random.seed(seed)

# If a GPU is available, seed the CUDA random generators as well
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [None]:
# Fetch the NLTK resources used by the tokenizer, stop-word filter and POS tagger.
for nltk_resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

In [None]:
df_train = pd.read_csv("train.csv") # training data
df_test = pd.read_csv("submission.csv") # test data (the rows of the submission file)

## data_preprocessing

In [None]:
# Remove exact duplicate rows from the training data, then show the new shape
df_train = df_train.drop_duplicates()
df_train.shape

In [None]:
# Display the de-duplicated training frame (index labels are the original row numbers)
df_train

In [None]:
# Index labels (as they appear after duplicate removal) of the rows to discard;
# the next cell removes them as outliers.
idx = [12764, 14640, 15041, 15229, 17580, 17582, 17884, 18943, 19077,
       19355, 34058, 34249, 34250, 36195, 36213, 36228, 39329, 40206,
       40826, 43245, 53956, 55462]

In [None]:
# Drop the rows that were judged to be outliers (index labels listed in `idx`)
df_train = df_train[~df_train.index.isin(idx)]
df_train.shape

In [None]:
def make_country_list(df1, df2):
    """Build the list of country names used to normalise ``customer_country``.

    The country of a row is taken to be the text after the last ``'/'`` of
    ``customer_country`` (lower-cased, stripped). The 83 most frequent values
    over both frames are kept, the empty string is discarded, and ``'india'``
    is moved to the end of the list so that it is tried last when the list is
    used for substring matching (``'indianapolis'`` is a US city and would
    otherwise match ``'india'`` first).

    Args:
        df1: Frame with a ``customer_country`` column (training data).
        df2: Frame with ``customer_country`` and ``id`` columns (test data);
            ``id`` is dropped before concatenation.

    Returns:
        list[str]: Country names ordered by frequency, with ``'india'`` last
        when present. Neither input frame is modified.
    """
    df_all = pd.concat([df1, df2.drop('id', axis=1)])

    # Missing countries are labelled 'unknown' so they are counted like any
    # other value.
    countries = (
        df_all['customer_country']
        .fillna('unknown')
        .str.lower()
        .str.split('/')
        .str[-1]        # text after the last '/' is treated as the country
        .str.strip()
    )

    # Keep the 83 most frequent values (in the original data: countries that
    # occur at least 20 times, 550 -> 83).
    country_list = list(countries.value_counts().head(83).index)

    # Guarded removals: the original unconditional list.remove() raised
    # ValueError whenever '' or 'india' was not among the top values.
    if '' in country_list:
        country_list.remove('')
    if 'india' in country_list:
        country_list.remove('india')
        country_list.append('india')
    return country_list

In [None]:
# Country vocabulary built from train + test; read as a global by preprocessing()
country_list = make_country_list(df_train, df_test)

In [None]:
def make_weight_dic(df):
    """Build the word -> weight table used to score ``expected_timeline`` text.

    Free-text timelines (everything except the six standard answers) are
    tokenised, stop words are removed and only nouns and verbs are kept. The
    ten most frequent words of non-converted and of converted leads are
    collected, hand-picked rank weights are written on top of them (negative
    for failures, positive for successes), and both tables are summed per word.

    Only the training frame can be used because ``is_converted`` is required.

    Args:
        df: Training frame with ``expected_timeline`` and a boolean
            ``is_converted`` column. Missing ``expected_timeline`` values are
            replaced by ``'unknown'`` in place (side effect of the original
            implementation, kept as is).

    Returns:
        dict[str, int]: Word weights.

    NOTE(review): words that are in the data-driven top 10 but not in the
    hard-coded lists below keep their raw frequency as weight; the original
    comment says 'unknown' should be excluded but nothing removes it - confirm
    whether that is intended.
    """
    df['expected_timeline'] = df['expected_timeline'].fillna('unknown')

    # Standard answers carry no free text and are skipped.
    delete_data = ['less than 3 months','3 months ~ 6 months','more than a year','9 months ~ 1 year',
               '6 months ~ 9 months','less than 6 months']

    # Loop-invariant: built once instead of once per row.
    stop_words = set(stopwords.words('english'))

    def text_clean(text):
        """Return the nouns followed by the verbs of ``text`` (stop words removed)."""
        tokens = [token for token in word_tokenize(text) if token not in stop_words]
        tagged = pos_tag(tokens)
        nouns = [word for word, pos in tagged if pos.startswith('N')]
        verbs = [word for word, pos in tagged if pos.startswith('V')]
        return nouns + verbs

    def collect_words(timelines):
        """Concatenate the cleaned words of every non-standard timeline."""
        words = []
        for timeline in timelines:
            if timeline not in delete_data:
                words.extend(text_clean(timeline))
        return words

    # Timelines of leads that did not convert / that converted.
    timeline_f = collect_words(df[df['is_converted'] == False]['expected_timeline'])
    timeline_t = collect_words(df[df['is_converted']]['expected_timeline'])

    # Ten most frequent words per outcome (word -> count).
    dic_ex_f = dict(Counter(timeline_f).most_common(10))
    dic_ex = dict(Counter(timeline_t).most_common(10))

    # Rank-based weights for the top-10 words: failures are negative ...
    dic_ex_f.update({
        'client': -10,
        'details': -9,
        'etc': -8,
        'followed': -7,
        'requirement': -6,
        'shared': -5,
        'system': -4,
        'customer': -3,
        'hence': -2,
        'call': -1,
    })

    # ... and successes are positive.
    dic_ex.update({
        'demo': 10,
        'client': 9,
        'customer': 8,
        'shared': 7,
        'details': 6,
        'call': 5,
        'send': 4,
        'discussed': 3,
        'quote': 2,
        'followed': 1,
    })

    # Sum both tables per word.
    dic_all = dict(dic_ex_f)
    for key, value in dic_ex.items():
        dic_all[key] = dic_all.get(key, 0) + value

    return dic_all

In [None]:
# Word weights learned from the training timelines; read as a global by preprocessing()
dic_all = make_weight_dic(df_train)

In [None]:
def preprocessing(df):
    """Clean the raw lead table ``df`` in place and derive helper columns.

    Steps, in order:
      1. Derive ``category`` from ``product_category``, then from
         ``product_subcategory`` and ``product_modelname`` for rows that are
         still unassigned; anything left becomes ``'other'``.
      2. Fill the three product columns with ``'unknown'`` and store how many
         of them were provided as ``product_count``.
      3. Score ``expected_timeline`` free text with the module-level
         ``dic_all`` word weights (``timeline_count``) and replace every
         non-standard timeline value with ``0``.
      4. Normalise ``customer_job``, ``customer_type``, the strategic-version
         flags (merged into ``idit_all``), ``customer_country`` (using the
         module-level ``country_list``) and ``inquiry_type``.

    Args:
        df: DataFrame to modify. It is mutated in place; the columns
            ``id_strategic_ver``, ``it_strategic_ver``, ``idit_strategic_ver``
            and ``customer_country.1`` are dropped.

    Returns:
        None.

    Note:
        Reads the globals ``dic_all`` and ``country_list``; both must exist
        before this function is called. Text patterns are applied with
        ``Series.str.contains`` and are therefore regular expressions.
    """
    # ------------------------------------------------------------------
    # product_category -> category
    # ------------------------------------------------------------------
    # Exact product names per top-level category. The key is 'hvac/ess' in
    # lower case so that it is the same label as the one written by the
    # substring rules below (the original 'HVAC/ESS' produced two distinct
    # labels for a single category).
    product_dict = {}
    product_dict['hvac/ess'] = ['control', 'ventilation', 'vrf', 'multi-split', 'single-split', 'chiller', 'heating']
    product_dict['commercial display'] = ['oled signage', 'led signage', 'video wall signage', 'interactive signage',
                                        'high brightness signage', 'special signage', 'standard signage', 'hotel tv', 'hospital tv', 'software solution',
                                        'signage care solution', 'webos', 'procentric', 'one quick', 'interactive digital board']
    product_dict['it products'] = ['monitor', 'laptop', 'projector', 'cloud device', 'medical display']

    def original_category(text):
        """Return the category whose product list contains ``text`` exactly, else None."""
        for key, names in product_dict.items():
            if text in names:
                return key
        return None

    # Category derived from the product names used on the web site.
    df['category'] = df['product_category'].apply(original_category)

    # Values containing ',' are integrated solutions -> 'solution'.
    df.loc[df['product_category'].str.contains(',', na = False), 'category'] = 'solution'

    # 'nan' is a temporary marker for "no category assigned yet".
    df['category'] = df['category'].fillna('nan')

    def category_input(text, value):
        """Assign ``value`` to still-unassigned rows whose product_category matches ``text``."""
        df.loc[(df['product_category'].str.contains(text, na = False))&(df['category'] == 'nan'), 'category'] = value

    def category_lang(text, value):
        """Replace every product_category matching ``text`` by its English form ``value``."""
        df.loc[df['product_category'].str.contains(text, na = False), 'product_category'] = value

    # Translations applied before the first keyword rules.
    # 'rac' is matched as a whole word: as a bare substring it also matched
    # 'interactive ...' and overwrote those product names.
    for pattern, english in [
        ('teto ou cassete inverter', 'ceiling or inverter cassette'),
        (r'\brac\b', 'residential air conditioner'),
        ('ar condicionado residencial', 'residential air conditioning'),
        ('outros', 'others'),
    ]:
        category_lang(pattern, english)

    # technical support, lg customer care program => support
    category_input('technical support', 'support')
    category_input('lg customer care program', 'support')

    # scroll compressor => compressor/motor
    category_input('scroll compressor', 'compressor/motor')

    # robots => robot
    category_input('robots', 'robot')

    # Remaining translations (Polish, Spanish, Chinese, Turkish, Indonesian,
    # Vietnamese, Arabic, Hebrew, Thai, French), applied in this exact order.
    for pattern, english in [
        ('ogrzewanie', 'heating(heat pumps)'),
        ('aire acondicionado residencial', 'residential air conditioning'),
        ('led 顯示屏', 'led display'),
        ('isıtma', 'heating'),
        ('lainnya', 'other'),
        ('calefacción', 'heating'),
        ('互動式顯示屏', 'interactive display'),
        ('標準顯示屏', 'standard display'),
        ('điều hòa trung tâm vrf', 'vrf central air conditioner'),
        ('soğutucu', 'cooler'),
        ('تكييف وتبريد', 'conditioning and cooling'),
        ('特別顯示屏', 'special display'),
        ('מזגנים למקום מגורים', 'residential air conditioner'),
        ('เครื่องปรับอากาศเผื่อที่อยู่อาศัย', 'residential air conditioner'),
        ('חימום', 'heating'),
        ('تكييفات', 'air conditioner'),
        ('điều hòa cục bộ', 'local air conditioning'),
        ('醫院電視', 'hospital tv'),
        ('高亮度顯示屏', 'high brightness display'),
        ('軟體', 'software'),
        ('פיצול מרובה', 'multi split'),
        ('酒店電視', 'hotel tv'),
        ('حلول التدفئة', 'heating solution'),
        ('אחר', 'other'),
        ('مبرد', 'chiller'),
        ('ฯลฯ', 'etc.'),
        ('điều hòa gia dụng', 'household air conditioner'),
        ('khác', 'other'),
        ('otros', 'others'),
        ('pendingin', 'cooler'),
        ('ac rumah', 'home air conditioning'),
        ('climatiseur résidentiel', 'residential air conditioner'),
    ]:
        category_lang(pattern, english)

    # Keyword rules on product_category; earlier rules win because each one
    # only touches rows that are still 'nan'.
    # it products
    it_products = ['pc', 'medical display', '28mq780', 'medical', 'monitor',
                'radiology displays', 'bu50nst', 'notebook']
    it_text = '|'.join(it_products)
    category_input(it_text, 'it products')

    # hvac/ess
    hvac_products = ['all lg vrf systems', 'multi', 'a thermodynamic water heater',
                    'residential', 'heating', 'chiller', 'condition', 'vrf',
                    'cooler', 'split','energy storage system', 'cac', 'single cac',
                    'system ac', 'ceiling or inverter cassette', 'residential air conditioner',
                    'multi inverter', 'residential air conditioning', 'ess', 'drv']
    hvac_text = '|'.join(hvac_products)
    category_input(hvac_text, 'hvac/ess')

    # commercial display
    display_products = ['ur640', 'signage', 'virtual production', 'commercial tv',
                        'videowall','43us660h0sd.awz','ledallinone','onequick',
                        'led display','education createboard', '.awz','allinone',
                        'leadallin','tv','fhd series', 'bwz', 'interactive display',
                        'one quick', 'series', 'aio', 'led','lsca039','43us660h',
                        '55vm5e', 'pro centric', 'gscd100','standard', 'lg magnit',
                        '86uh5f', '49vl5f','98uh5e', '55vm5j-h', '55tc3d', '49vl5g-m', '55svh7f-a', 'hospitality', 'laec15',
                        'retaildigital','gscd046', 'gsca046', 'collaboration displays', 'tr3', 'taa lcd lfd displays',
                        'window facing display', 'special display', 'hoteleria_us670h', 'software',
                        'laec015', 'high brightness display','videwall', 'idb', 'one:quick',
                        'high brightness', 'video wall', 'pro:centric', 'commercial display',
                        'lg paradise air solution'

                        ]
    display_text = '|'.join(display_products)
    category_input(display_text, 'commercial display')

    # ------------------------------------------------------------------
    # product_subcategory -> category (rows still unassigned)
    # ------------------------------------------------------------------
    def sub_input(text, value):
        """Assign ``value`` to still-unassigned rows whose product_subcategory matches ``text``."""
        df.loc[(df['product_subcategory'].str.contains(text, na = False))&(df['category'] == 'nan'), 'category'] = value

    # Rows that only carry a subcategory / model name are classified too.
    df['product_subcategory'] = df['product_subcategory'].str.lower()

    # 其他 => other (applied regardless of the current category)
    df.loc[df['product_subcategory'].str.contains('其他', na = False), 'category'] = 'other'

    # it products; the two Thai entries mean "monitor for clinical monitoring"
    # and "diagnostic monitor".
    it_sub = ['monitor', 'medical', 'จอภาพสำหรับการตรวจสอบทางคลินิก', 'จอภาพเพื่อการวินิจฉัย',
            'cloud device', 'digital x-ray detectors', 'thin clients',
            'all projectors', 'laptops', 'probeam', 'zero clients']
    it_text = '|'.join(it_sub)
    sub_input(it_text, 'it products')

    # hvac/ess
    hvac_sub = ['all lg vrf systems', 'multi', 'a thermodynamic water heater',
                    'residential', 'heating', 'chiller', 'condition', 'vrf',
                    'cooler', 'split','energy storage system', 'cac', 'single cac',
                    'system ac', 'ess', '3.0 tr -1 nos. cassette']
    hvac_text = '|'.join(hvac_sub)
    sub_input(hvac_text, 'hvac/ess')

    # commercial display
    display_sub = ['pro:centric', 'signage', 'one:quick' ,'one-quick', 'webos box',
                'interactive digital board', 'tr3dj series', 'tr3bg series',
                'lg ops player', '65tr3bf', 'idb', 'lg smart cam pro','65tr3dj', 'supersign cms']
    display_text = '|'.join(display_sub)
    sub_input(display_text, 'commercial display')

    # ------------------------------------------------------------------
    # product_modelname -> category (rows still unassigned)
    # ------------------------------------------------------------------
    def model_input(text, value):
        """Assign ``value`` to still-unassigned rows whose product_modelname matches ``text``."""
        df.loc[(df['product_modelname'].str.contains(text, na = False))&(df['category'] == 'nan'), 'category'] = value

    # it products
    it_model = ['UltraFine', '28MQ780', 'Ergo Dual', '21HQ513D', 'UltraWide', '32UN880',
                '31HN713D', '14HQ701G-BP', '38CL950P', 'Radiology']
    it_text = '|'.join(it_model)
    model_input(it_text, 'it products')

    # hvac/ess
    hvac_model = ['all lg vrf systems', 'multi', 'a thermodynamic water heater',
                    'residential', 'heating', 'chiller', 'condition', 'vrf',
                    'cooler', 'split','energy storage system', 'cac', 'single cac',
                    'system ac']
    hvac_text = '|'.join(hvac_model)
    model_input(hvac_text, 'hvac/ess')

    # commercial display
    display_model = ['43HT3WJ', '55CT5WJ', 'SC-00DA', 'LG SuperSign CMS', '65EP5G OLED Pro',
                    '34WN780', 'IDB', 'LSVP']
    display_text = '|'.join(display_model)
    model_input(display_text, 'commercial display')

    # Everything that is still unassigned becomes 'other'.
    df.loc[df['category'] == 'nan', 'category'] = 'other'

    # ------------------------------------------------------------------
    # product_count: how many of the three product fields were provided
    # ------------------------------------------------------------------
    df['product_category'] = df['product_category'].fillna('unknown')
    df['product_subcategory'] = df['product_subcategory'].fillna('unknown')
    df['product_modelname'] = df['product_modelname'].fillna('unknown')

    product_cols = ['product_category', 'product_subcategory', 'product_modelname']
    # Vectorised replacement of the original row-by-row iterrows() loop.
    df['product_count'] = (df[product_cols] != 'unknown').sum(axis=1)

    # ------------------------------------------------------------------
    # expected_timeline
    # ------------------------------------------------------------------
    # '_' -> space, missing -> 'unknown'
    df['expected_timeline'] = df['expected_timeline'].str.replace('_', ' ')
    df['expected_timeline'] = df['expected_timeline'].fillna('unknown')

    def timeline_score(text):
        """Sum the ``dic_all`` weights of every word that occurs in ``text``."""
        if text == 'unknown':
            return 0
        return sum(weight for word, weight in dic_all.items() if word in text)

    # Weighted score of the free-text timeline (replaces a second iterrows() loop).
    df['timeline_count'] = df['expected_timeline'].apply(timeline_score)

    # Only the six standard answers are kept as values; everything else is 0.
    delete_data = ['less than 3 months','3 months ~ 6 months','more than a year','9 months ~ 1 year',
                   '6 months ~ 9 months','less than 6 months']

    df.loc[~df['expected_timeline'].isin(delete_data), 'expected_timeline'] = 0

    df['historical_existing_cnt'] = df['historical_existing_cnt'].fillna(0)

    # ------------------------------------------------------------------
    # customer_job
    # ------------------------------------------------------------------
    # Unify every variant of 'other'
    df.loc[df['customer_job'].str.contains('other', na = False), 'customer_job'] = 'others'

    # '_' -> space
    df['customer_job'] = df['customer_job'].str.replace('_', ' ')

    # Missing values are treated as 'others'
    df['customer_job'] = df['customer_job'].fillna('others')

    # Known job families, tested in this order; the first one found wins.
    job_families = (
        'accounting', 'administrative', 'arts and design', 'business development',
        'community and social services', 'consulting', 'curation', 'education',
        'engineering', 'entrepreneurship', 'finance', 'healthcare services',
        'human resources', 'information technology', 'legal', 'marketing',
        'media and communication', 'military and protective services', 'operations',
        'product management', 'program and project management', 'purchasing',
        'quality assurance', 'real estate', 'research', 'sales', 'support',
    )

    def job_categorize(text):
        """Map a free-text job to the first job family it mentions, else 'other'."""
        for family in job_families:
            if family in text:
                return family
        return 'other'

    df['customer_job'] = df['customer_job'].apply(job_categorize)

    # ------------------------------------------------------------------
    # customer_type
    # ------------------------------------------------------------------
    # Remove '-' and turn '/' into ','
    df['customer_type'] = df['customer_type'].str.replace('-', ' ')
    df['customer_type'] = df['customer_type'].str.replace('/', ',')

    # Leave exactly one space after each ',' and none before it
    df['customer_type'] = df['customer_type'].apply(lambda x: re.sub(r'\s*,\s*', ', ', x) if isinstance(x, str) else x)

    # Unify 'Other...'
    df.loc[df['customer_type'].str.contains('Other', na = False), 'customer_type'] = 'other'

    # 'Etc.' is 'other' as well; regex=False so that '.' is a literal dot on
    # every pandas version.
    df['customer_type'] = df['customer_type'].str.replace('Etc.', 'other', regex=False)

    # Unify home owners
    df.loc[df['customer_type'].str.contains('Home', na = False), 'customer_type'] = 'Homeowner'

    # Missing values become 'other' ...
    df['customer_type'] = df['customer_type'].fillna('other')

    # ... and 'other' is encoded as 0
    df['customer_type'] = df['customer_type'].replace('other', 0)

    # installer -> installer, contractor
    df.loc[df['customer_type'].str.contains('Installer', na = False), 'customer_type'] = 'Installer, Contractor'

    # distributor -> dealer, distributor
    df.loc[df['customer_type'].str.contains('Distributor', na = False), 'customer_type'] = 'Dealer, Distributor'

    # consultant -> architect, consultant
    df.loc[df['customer_type'].str.contains('Consultant', na = False), 'customer_type'] = 'Architect, Consultant'

    # Types for which only non-converted leads were observed are grouped as 1
    df['customer_type'] = df['customer_type'].replace(['Corporate', 'Dealer, Distributor', 'System Integrator', 'Technician', 'Engineer', 'Manager, Director', 'Developer', 'End user', 'HVAC Engineer', 'Reseller', 'Software, Solution Provider', 'Technical Assistant', 'Commercial end user', 'Interior Designer', 'Administrator'], 1)

    # ------------------------------------------------------------------
    # strategic version flags -> idit_all (id = 1, it = 2, neither = 0)
    # ------------------------------------------------------------------
    columns = ['id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver']

    for col in columns:
        df[col] = df[col].fillna(0)

    df['idit_all'] = 0
    df.loc[df['id_strategic_ver'] == 1, 'idit_all'] = 1
    # Applied second, so 'it' wins when both flags are set.
    df.loc[df['it_strategic_ver'] == 1, 'idit_all'] = 2

    # The source flags are no longer needed.
    df.drop(columns=['id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver'], inplace=True)

    # ------------------------------------------------------------------
    # customer_country
    # ------------------------------------------------------------------
    def preprocess_country(text):
        """Return the first entry of ``country_list`` contained in ``text``, else 'other'."""
        for country in list(country_list):
            if country in text:
                return country
        return 'other'

    df['customer_country'] = df['customer_country'].fillna('unknown')
    df['customer_country'] = df['customer_country'].str.lower()
    df['customer_country'] = df['customer_country'].apply(preprocess_country)

    # Drop the duplicated country column
    df.drop(columns=['customer_country.1'], inplace=True)

    # ------------------------------------------------------------------
    # inquiry_type
    # ------------------------------------------------------------------
    def convert_to_string(value):
        """Return ``str(value)``, or '' for None."""
        return str(value) if value is not None else ''

    def preprocess_inquiry(text):
        """Collapse the spelling variants of an inquiry type into one label."""
        text = convert_to_string(text)
        text = text.lower()

        # Keep letters and whitespace only
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Merge variants of the same inquiry
        if 'technical' in text and text != 'technical support':
            return 'technical'
        if 'quotation' in text:
            return 'quotation'
        if 'sales' in text:
            return 'sales'
        if 'other' in text or 'etc' in text:
            return 'other'
        for inquiry in ['customer suggestions', 'nan', 'technical support', 'partnership', 'distributorship', 'demo', 'services', 'product information', 'trainings']:
            if inquiry in text:
                return text
        return 'others'

    df['inquiry_type'] = df['inquiry_type'].apply(preprocess_inquiry)

In [None]:
# Apply the in-place cleaning to both frames (the function returns None)
preprocessing(df_train)
preprocessing(df_test)

In [None]:
# Derived-feature builder
def generate_feature(df):
    """Add binary / derived feature columns to ``df`` in place.

    New columns:
        bant_submit_count: 1 where ``bant_submit`` equals 0, else 0.
        com_reg_count:     1 where ``com_reg_ver_win_rate`` > 0.04, else 0.
        idx_count:         1 where the row's ``customer_idx`` occurs more than
                           once in ``df``, else 0.
        lead_log:          natural log of ``lead_desc_length``.
        lead_count:        1 where ``lead_log`` > 3.367296, else 0.
        enterprise_count:  1 for 'Enterprise' rows with a non-zero
                           ``historical_existing_cnt``, else 0.
        enterprise_weight: 1.0 where the row's ``customer_idx`` appears with
                           two distinct ``enterprise`` values, else 0.0.

    Args:
        df: Frame to extend; ``lead_desc_length`` itself is kept.

    Returns:
        None.
    """
    # bant_submit
    df['bant_submit_count'] = (df['bant_submit'] == 0).astype(int)

    # com_reg_ver_win_rate (missing values compare False -> 0)
    df['com_reg_count'] = (df['com_reg_ver_win_rate'] > 0.04).astype(int)

    # customer_idx: the set of repeated ids is computed once; the original
    # lambda rebuilt the filtered index for every single row.
    idx_count = df['customer_idx'].value_counts()
    repeated_idx = idx_count[idx_count > 1].index
    df['idx_count'] = df['customer_idx'].isin(repeated_idx).astype(int)

    # lead_desc_length
    df['lead_log'] = np.log(df['lead_desc_length'])
    df['lead_count'] = (df['lead_log'] > 3.367296).astype(int)

    # historical_existing_cnt
    df['enterprise_count'] = 0
    df.loc[(df['enterprise'] == 'Enterprise')&(df['historical_existing_cnt']!=0), 'enterprise_count'] = 1

    # Customers that appear both as Enterprise and as SMB get a weight of 1.
    enterprise_kinds = df.groupby('customer_idx')['enterprise'].nunique()
    mixed_idx = enterprise_kinds[enterprise_kinds == 2].index
    df['enterprise_weight'] = df['customer_idx'].isin(mixed_idx).astype(float)


In [None]:
# Add the derived feature columns to both frames (in place)
generate_feature(df_train)
generate_feature(df_test)

In [None]:
# Preview the first rows of the processed training frame
df_train.head(5)

In [None]:
# Preview the first rows of the processed test frame
df_test.head(5)

In [None]:
# Save the preprocessed frames; context_data_load() reads these two files back
df_train.to_csv("train_last.csv", index=False)
df_test.to_csv("submission_last.csv", index=False)

## catboost

In [None]:
# Columns that are label-encoded by process_context_data() and passed to
# CatBoost as `cat_features`. The last five entries are the binary flags
# created by generate_feature().
cat_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "customer_idx",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    "category",
    "product_count",
    "timeline_count",
    "idit_all",
    "lead_owner",

    "com_reg_count",
    "idx_count",
    "lead_count",
    "enterprise_count",
    "enterprise_weight"
]

def index_processing(context_df, train, test, column_name):
    """Label-encode one column of ``train`` and ``test`` in place.

    Codes are assigned in order of first appearance in
    ``context_df[column_name]``.

    Args:
        context_df: Frame whose column defines the full set of values.
        train: Training frame; its column is replaced by the integer codes.
        test: Test frame; its column is replaced by the integer codes.
        column_name: Name of the column to encode.

    Returns:
        dict: Mapping from original value to integer code.
    """
    distinct_values = context_df[column_name].unique()
    value_to_code = dict(zip(distinct_values, range(len(distinct_values))))
    for frame in (train, test):
        frame[column_name] = frame[column_name].map(value_to_code)
    return value_to_code

def process_context_data(train_df, test_df):
    """Label-encode every column of the global ``cat_columns`` in both frames.

    The vocabulary of each column is taken from train and test together, so a
    value gets the same code in both frames. Both frames are modified in place.

    Returns:
        tuple: ``(idx, train_df, test_df)`` where ``idx`` maps
        ``'<column>2idx'`` to that column's value -> code dictionary.
    """
    stacked = [train_df[cat_columns], test_df[cat_columns]]
    context_df = pd.concat(stacked).reset_index(drop=True)
    idx = {
        col + '2idx': index_processing(context_df, train_df, test_df, col)
        for col in cat_columns
    }
    return idx, train_df, test_df

def context_data_load():
    """Load the preprocessed CSV files and label-encode the categorical columns.

    Reads ``train_last.csv`` and ``submission_last.csv`` from the working
    directory.

    Returns:
        dict with the keys
            'train':       encoded training frame, missing values filled with 0
            'test':        encoded test frame, missing values filled with 0
            'field_dims':  int32 array, number of distinct values per
                           categorical column (same order as ``cat_columns``)
            'cat_columns': the list of categorical column names
    """
    ######################## DATA LOAD
    train = pd.read_csv('train_last.csv', low_memory=False)
    test = pd.read_csv('submission_last.csv')

    idx, context_train, context_test = process_context_data(train, test)

    # Iterate over the value -> code dictionaries. Iterating over `idx` itself
    # yields its keys ('<column>2idx' strings), so the original stored the
    # length of each key string instead of the category count.
    field_dims = np.array([len(toidx) for toidx in idx.values()], dtype=np.int32)

    data = {
            'train':context_train.fillna(0),
            'test':context_test.fillna(0),
            'field_dims':field_dims,
            'cat_columns' : cat_columns,
            }

    return data

def context_data_split(data):
    """Create train/validation splits and SMOTE-oversampled training sets.

    The data is split FIRST and only the training part is oversampled, so the
    validation set contains real, unseen rows only. (The original oversampled
    before splitting, which puts synthetic neighbours of validation rows into
    the training set and inflates every validation metric.)

    Args:
        data: Dictionary returned by ``context_data_load``; ``data['train']``
            must contain the target column ``is_converted``.

    Returns:
        The same dictionary, extended with
            'X_train', 'y_train':         SMOTE-oversampled 80 % training part
            'X_valid', 'y_valid':         untouched stratified 20 % hold-out
            'X_resampled', 'y_resampled': SMOTE applied to the full training
                                          data (for a final fit on everything)
    """
    features = data['train'].drop(['is_converted'], axis=1)
    target = data['train']['is_converted']

    # Stratified hold-out taken from the original (non-synthetic) rows.
    X_train_raw, X_valid, y_train_raw, y_valid = train_test_split(features,
                                                                  target,
                                                                  test_size=0.2,
                                                                  random_state=42,
                                                                  stratify=target)

    # Oversample the minority class inside the training part only.
    X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_raw, y_train_raw)

    # Oversampled version of the complete training data, as before.
    X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(features, target)

    y_train = y_train.astype(np.int32)
    y_valid = y_valid.astype(np.int32)

    data['X_train'], data['X_valid'] = X_train, X_valid
    data['y_train'], data['y_valid'] = y_train, y_valid
    data['X_resampled'], data['y_resampled'] = X_resampled, y_resampled

    return data


In [None]:
# Load + encode the saved CSV files, then add the train/validation/resampled splits
data = context_data_load()
data = context_data_split(data)

In [None]:
# Short names for the training and validation splits
x_train, x_val, y_train, y_val = data['X_train'], data['X_valid'], data['y_train'], data['y_valid']

In [None]:
# Remove the two columns that are not used as model features from every
# feature frame (the local splits and the frames stored in `data`).
unused_columns = ['ver_pro', 'bant_submit_count']

x_train = x_train.drop(unused_columns, axis=1)
x_val = x_val.drop(unused_columns, axis=1)
for frame_key in ('X_resampled', 'test', 'X_valid'):
    data[frame_key] = data[frame_key].drop(unused_columns, axis=1)

In [None]:
# bool -> 1/0
# `y_val` is the name used by the fit cell, so it is the one that must be cast;
# the original only created a new `y_valid` and left `y_val` untouched.
y_train = y_train.astype(int)
y_val = y_val.astype(int)
y_valid = y_val  # alias kept for code that refers to `y_valid`

In [None]:
# CatBoost hyper-parameters: fixed seed, log-loss objective, the label-encoded
# categorical columns, and a set of tuned numeric values.
# NOTE(review): the numeric values look like the output of a hyper-parameter
# search (optuna is imported) - the search itself is not part of this section.
param = {"random_state":42,
            "objective": "Logloss",
            "cat_features" : data['cat_columns'],
         'learning_rate': 0.06979873507394162, 'bagging_temperature': 49.5227392420259, 'n_estimators': 1309, 'max_depth': 15, 'random_strength': 26, 'l2_leaf_reg': 1.987904330777592e-05, 'min_child_samples': 34, 'max_bin': 356, 'od_type': 'IncToDec'}

In [None]:
# Build the classifier from `param`.
# NOTE(review): `devices` selects GPU devices, but no `task_type` is set here -
# confirm whether GPU training was intended.
model = cb.CatBoostClassifier(**param, devices = '0')

In [None]:
# Fit on the training split only. The original fitted on data['X_resampled'],
# which contains the rows of the evaluation set, so early stopping was driven
# by a score measured on training data.
model.fit(
    x_train,
    y_train,
    eval_set=[(x_val, y_val)],
    early_stopping_rounds=50,
    verbose=10
)

In [None]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
    """
    label_order = [True, False]

    confusion = confusion_matrix(y_test, y_pred, labels=label_order)
    report_lines = (
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, labels=label_order)),
        ("재현율: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=label_order)),
    )

    print("오차행렬:\n", confusion)
    for template, value in report_lines:
        print(template.format(value))

In [None]:
# Predict on the validation split stored in `data` and print the metrics.
# The decision threshold is the median predicted probability, so roughly half
# of the rows are labelled positive.
pred_proba = model.predict_proba(data['X_valid'])[:,1]
threshold = np.median(pred_proba)
pred = pred_proba >= threshold
get_clf_eval(data['y_valid'], pred)

In [None]:
# Threshold used for the validation predictions above
print(threshold)

In [None]:
# Keep only the feature columns of the test frame and predict P(is_converted)
test_pred_proba = model.predict_proba(data['test'].drop(["is_converted", "id"], axis=1))[:,1]

In [None]:
# The threshold is recomputed as the median of the TEST probabilities, which
# labels about half of the test rows True regardless of the model's scores.
# NOTE(review): confirm this is intended rather than reusing the validation threshold.
threshold = np.median(test_pred_proba)
test_pred = (test_pred_proba >= threshold).astype(bool)

In [None]:
sum(test_pred) # number of rows predicted True

In [None]:
# Reload the raw submission file (df_test now holds the preprocessed data)
# and fill in the predicted labels.
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

In [None]:
# Write the submission file
df_sub.to_csv("submission_cat.csv", index=False)

## TabNet

In [None]:
# Categorical columns that are label-encoded for TabNet
label = ['customer_country', 'business_unit', 'customer_idx', 'customer_type',
         'enterprise', 'customer_job', 'inquiry_type', 'product_category', 'product_subcategory', 'product_modelname',
         'customer_position', 'response_corporate', 'expected_timeline',
         'business_area', 'business_subarea', 'lead_owner', 'category', 'idit_all']

In [None]:
# Custom evaluation metric: macro-averaged F1 score
class F1_Score(Metric):
    """TabNet metric that reports the macro F1 score (higher is better)."""

    def __init__(self):
        self._name = 'F1_score'
        self._maximize = True

    def __call__(self, y_true, y_score):
        """Score ``y_score`` (one column per class) against ``y_true``."""
        predicted_class = np.argmax(y_score, axis=1)
        return f1_score(y_true, predicted_class, average='macro')

In [None]:
def preprocessing(df_train, df_test, label):
    """Prepare train and test frames for TabNet.

    NOTE: this definition replaces the earlier one-argument ``preprocessing``
    defined in the data-preprocessing section; after this cell has run, the
    earlier function can no longer be called under that name.

    Train and test are stacked so that both are encoded with the same
    vocabulary, a few columns are null-filled, every column in ``label`` is
    label-encoded, and the frames are split apart again.

    Args:
        df_train: Training frame (with ``is_converted``).
        df_test: Test frame; its ``id`` column is dropped.
        label: Names of the categorical columns to label-encode.

    Returns:
        tuple: ``(df_train, df_test, df_all)`` - the encoded training frame
        with ``is_converted`` as 0/1, the encoded test frame, and the stacked
        encoded frame.
    """
    # LabelEncoder is not imported anywhere at file level; without this import
    # the loop below fails with NameError.
    from sklearn.preprocessing import LabelEncoder

    n_train = len(df_train)
    df_all = pd.concat([df_train, df_test.drop('id', axis = 1)] , ignore_index = True)

    # Missing inquiry type / business area / business subarea -> 'nan'
    df_all['inquiry_type'] = df_all['inquiry_type'].fillna('nan')
    df_all['business_area'] = df_all['business_area'].fillna('nan')
    df_all['business_subarea'] = df_all['business_subarea'].fillna('nan')

    # Missing win rates -> 0
    df_all['ver_win_rate_x'] = df_all['ver_win_rate_x'].fillna(0)
    df_all['ver_win_ratio_per_bu'] = df_all['ver_win_ratio_per_bu'].fillna(0)
    df_all['com_reg_ver_win_rate'] = df_all['com_reg_ver_win_rate'].fillna(0)

    # Label encoding. Object columns are cast to str first: LabelEncoder
    # rejects columns that mix numbers and strings (for example 0 / 1 next to
    # text values).
    for col in label:
        values = df_all[col]
        if values.dtype == object:
            values = values.astype(str)
        df_all[col] = LabelEncoder().fit_transform(values.values)

    # Split back into train and test; copies, so that the assignment below
    # does not write into a view of df_all.
    df_train = df_all.iloc[:n_train].copy()
    df_test = df_all.iloc[n_train:].copy()

    # Encode the target as 1/0
    df_train['is_converted'] = df_train['is_converted'].apply(lambda x: 1 if x == True else 0)

    return df_train, df_test, df_all

# smote
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

def smote(method, df):
    """Oversample ``df`` with the requested SMOTE variant.

    Parameters
    ----------
    method : str
        ``'smt'`` (SMOTE), ``'smoteto'`` (SMOTE + Tomek links on the majority
        class) or ``'bdsmt'`` (Borderline-SMOTE).
    df : pandas.DataFrame
        Frame containing the features plus the ``is_converted`` target.

    Returns
    -------
    (trsx, trsy)
        Resampled features and target, as returned by ``fit_resample``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported keys. (Previously the function
        printed a message and then failed with ``UnboundLocalError``.)
    """
    # Factories are lazy so only the requested sampler is ever constructed.
    samplers = {
        'smt': lambda: SMOTE(random_state = 400),
        'smoteto': lambda: SMOTETomek(tomek = TomekLinks(sampling_strategy = 'majority'),
                                      random_state = 400),
        'bdsmt': lambda: BorderlineSMOTE(random_state = 400),
    }

    if method not in samplers:
        raise ValueError(
            "method error: unknown smote method {!r}; expected one of {}".format(
                method, sorted(samplers)
            )
        )

    features = df.drop('is_converted', axis = 1)
    target = df['is_converted']
    trsx, trsy = samplers[method]().fit_resample(features, target)

    return trsx, trsy

def before_train(smote_method, df, label, encoded_all = None):
    """Oversample ``df`` and derive the TabNet categorical metadata.

    Parameters
    ----------
    smote_method : str
        Key understood by ``smote()``: ``'smt'`` (SMOTE), ``'smoteto'``
        (SMOTE + Tomek) or ``'bdsmt'`` (Borderline-SMOTE).
    df : pandas.DataFrame
        Training frame including ``is_converted``.
    label : iterable of str
        Names of the categorical columns; only those present in ``df`` are used.
    encoded_all : pandas.DataFrame, optional
        Frame used to count the distinct values of each categorical column.
        Defaults to the notebook-level ``df_all`` (the original behaviour), so
        existing three-argument calls are unaffected.

    Returns
    -------
    (trsx, trsy, cat_idxs, cat_dims)
        Resampled features / target, the positional indices of the categorical
        columns in ``trsx``, and the number of distinct values per column.
    """
    # Make the dependency on the stacked frame explicit, but keep the old
    # global lookup as the default for backward compatibility.
    reference = df_all if encoded_all is None else encoded_all

    # Oversampling
    trsx, trsy = smote(smote_method, df)

    # Categorical columns actually present, in df's column order
    use_label = [col for col in df.columns if col in label]

    cat_idxs = [trsx.columns.get_loc(col) for col in use_label]
    cat_dims = [reference[col].nunique() for col in use_label]

    return trsx, trsy, cat_idxs, cat_dims

def result_vis(model, df):
    """Plot training curves and feature importances of a fitted model.

    ``model`` must expose ``history`` (with ``'val_0_F1_score'`` and ``'loss'``)
    and ``feature_importances_``. The columns of ``df`` minus ``is_converted``
    label the importance bars.
    """
    # Validation F1 and loss, side by side
    _, axes = plt.subplots(1,2, figsize = (16,8))
    curves = (('val_0_F1_score', 'f1_score'), ('loss', 'loss'))
    for axis, (history_key, title) in zip(axes, curves):
        axis.plot(model.history[history_key])
        axis.set_title(title)

    # Horizontal bar chart of the feature importances
    importances = model.feature_importances_
    n_bars = len(importances)
    plt.figure(figsize = (10,6))
    plt.barh(range(n_bars), importances, align = 'center')
    plt.yticks(np.arange(n_bars), df.drop('is_converted', axis = 1).columns)
    plt.xlabel('feature importance')
    plt.title('tabnet feature importances')
    plt.show()

In [None]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 of ``y_pred`` vs ``y_test``."""
    label_order = [True, False]

    # Compute everything first so nothing is printed if a metric raises.
    cm = confusion_matrix(y_test, y_pred, labels=label_order)
    report = [
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, labels=label_order)),
        ("재현율: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=label_order)),
    ]

    print("오차행렬:\n", cm)
    for template, value in report:
        print(template.format(value))
In [None]:
# Data preprocessing.
# Note: df_train / df_test are rebound to the processed frames returned by preprocessing().
df_train, df_test, df_all = preprocessing(df_train, df_test, label)

In [None]:
# Drop weakly correlated variables: train and test keep the same column subset,
# so the list is written once and applied to both frames.
_cols_04 = ['customer_country', 'com_reg_ver_win_rate', 'enterprise',
       'historical_existing_cnt', 'customer_job', 'customer_position',
       'expected_timeline', 'ver_win_rate_x', 'business_subarea', 'lead_owner',
       'is_converted', 'product_count', 'timeline_count', 'bant_submit_count',
       'lead_count', 'enterprise_weight']

train_04 = df_train[_cols_04]
test_04 = df_test[_cols_04]

In [None]:
# Use Borderline-SMOTE.
# Train on the reduced feature set (train_04): the prediction cell builds x_test from
# test_04 and the importance plot labels its bars with train_04's columns, so the model
# must be fitted on exactly those columns.
trsx, trsy, cat_idxs, cat_dims = before_train('bdsmt', train_04, label)

In [None]:
# Replace the resampled features / target with their underlying value arrays.
# Not idempotent: the names are rebound, so re-running this cell without re-running
# the oversampling cell calls .values on the already converted objects.
trsx = trsx.values
trsy = trsy.values

In [None]:
# Stratified, shuffled train/validation split of the oversampled data (default split size).
# NOTE(review): the split happens after oversampling, so x_val also contains resampled rows.
x_train, x_val, y_train, y_val = train_test_split(
    trsx,
    trsy,
    stratify = trsy,
    shuffle = True,
    random_state = 400
)

In [None]:
# TabNet hyper-parameters (cat_idxs / cat_dims come from before_train()).
tabnet_params = dict(n_d = 8, n_a = 8, n_steps = 1,
                    cat_idxs = cat_idxs, cat_dims = cat_dims,
                    gamma = 2.0, lambda_sparse = 2.057796554216087e-05,
                    optimizer_fn = torch.optim.Adam,
                    optimizer_params = dict(lr = 2e-2, weight_decay = 1e-5),
                    mask_type = 'entmax', n_shared = 2, n_independent = 5,
                    cat_emb_dim = 9,
                    seed = 0,
                    # pytorch-tabnet steps ReduceLROnPlateau with the early-stopping
                    # metric, i.e. the last eval metric. The fit cell uses F1_score,
                    # which is maximised, so the scheduler must run in 'max' mode;
                    # with 'min' it would cut the learning rate while F1 improves.
                    scheduler_params = dict(mode = 'max',
                                            patience = 5,
                                            min_lr = 1e-5,
                                            factor = 0.5),
                    scheduler_fn = torch.optim.lr_scheduler.ReduceLROnPlateau,
                    verbose = 1)

In [None]:
# Build the TabNet classifier from tabnet_params and fit it on the training split,
# evaluating on (x_val, y_val) with the custom metric named 'F1_score' (see F1_Score).
clf = TabNetClassifier(**tabnet_params)
clf.fit(X_train = x_train, y_train = y_train,
        eval_set = [(x_val, y_val)],
        patience = 3, max_epochs = 52,
        virtual_batch_size = 256, batch_size = 2048,
        weights = 0,
        num_workers = 0,
        eval_metric = ['F1_score'])

In [None]:
# Find the best point on the PR curve (the threshold where precision and recall are both high).
# For reference only.
def find_optimal_threshold(y_true, y_scores):
    """Return the decision threshold that maximises F1 on the precision-recall curve.

    Parameters
    ----------
    y_true : array-like
        Binary ground-truth labels.
    y_scores : array-like
        Scores / probabilities of the positive class.

    Returns
    -------
    The element of ``thresholds`` (from ``precision_recall_curve``) with the
    highest F1; a score ``>=`` this value is a positive prediction.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)

    # precision/recall carry one extra trailing point (precision=1, recall=0)
    # that has no threshold; drop it so indices line up with `thresholds`.
    precision, recall = precision[:-1], recall[:-1]

    # F1 is defined as 0 where precision + recall == 0. Without this guard the
    # 0/0 division yields NaN, which np.argmax would select as the "maximum".
    denominator = precision + recall
    f1_scores = np.divide(
        2 * precision * recall,
        denominator,
        out = np.zeros_like(denominator, dtype = float),
        where = denominator > 0,
    )

    return thresholds[np.argmax(f1_scores)]

# Positive-class probabilities (column 1) on the validation split, and the
# threshold chosen from them by find_optimal_threshold(); the last line displays it.
y_scores = clf.predict_proba(x_val)[:,1]
threshold = find_optimal_threshold(y_val, y_scores)
threshold

In [None]:
# F1 on the validation split using clf.predict (no custom threshold); displayed as cell output.
y_pred = clf.predict(x_val)
f1 = f1_score(y_val, y_pred)
f1

In [None]:
# Recompute the validation probabilities and the threshold
# (same statements as the cell that defines find_optimal_threshold).
y_scores = clf.predict_proba(x_val)[:,1]
threshold = find_optimal_threshold(y_val, y_scores)
threshold

In [None]:
# Test feature matrix: the reduced column set without the target column.
x_test = test_04.drop('is_converted', axis = 1).values

In [None]:
# Positive-class probabilities on the test set, binarised with the threshold found on
# the validation split. precision_recall_curve defines its thresholds as
# "score >= threshold", so the same inclusive comparison is used here.
pred_proba = clf.predict_proba(x_test)[:,1]
pred_proba_ex = (pred_proba >= threshold).astype(int)
compare(pred_proba_ex)

In [None]:
# Class predictions straight from clf.predict (no custom threshold).
# Disabled alternative that would use every df_test column instead of test_04:
# x_test = df_test.drop('is_converted', axis = 1).values
pred = clf.predict(x_test)

In [None]:
# compare() is defined outside this section — TODO confirm what it reports.
compare(pred)

In [None]:
# Read the same submission template as every other model section. The previous
# 'submission (1).csv' is a browser-duplicate name that a fresh checkout does not contain.
df_sub = pd.read_csv('submission.csv')
df_sub['is_converted'] = pred_proba_ex # classify rows with the tuned threshold
# Keep the raw probability as well: the ensemble section reads 'is_converted_proba'.
df_sub['is_converted_proba'] = pred_proba
df_sub.to_csv('submission_tabnet.csv', index = False)

In [None]:
# Horizontal bar chart of the fitted TabNet feature importances.
# Bars are labelled with train_04's columns (minus the target), so clf must have
# been trained on exactly those features for the labels to line up.
feature_importances = clf.feature_importances_
num_features = len(feature_importances)
plt.figure(figsize = (10,6))
plt.barh(range(num_features), feature_importances, align = 'center')
plt.yticks(np.arange(num_features), train_04.drop('is_converted', axis = 1).columns)
plt.xlabel('feature importance')
plt.title('tabnet feature importances')
plt.show()

## LightGBM

In [None]:
# Load the data (context_data_load is defined outside this section).
data = context_data_load()

# Train/valid split; the returned object replaces `data` and is indexed by string
# keys below ('train', 'test', 'X_train', 'X_valid', ...).
data = context_data_split(data)

In [None]:
data['train'] # inspect the training data

In [None]:
# Resampled features / target, and the test features without target and id columns.
X, y = data['X_resampled'], data['y_resampled']
test = data['test'].drop(["is_converted", "id"], axis=1)

In [None]:
# Display the feature column names.
X.columns

In [None]:
# Categorical column list, passed to lgb.Dataset below.
categorical_feature = data['cat_columns']

In [None]:
# Unpack the train/validation split prepared by context_data_split().
X_train, X_valid, y_train, y_valid = data['X_train'], data['X_valid'], data['y_train'], data['y_valid']

In [None]:
# LightGBM datasets for training and validation; free_raw_data=False is set on both,
# and both are reused by every Optuna trial and by the final fit.
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature = categorical_feature, free_raw_data=False)
val_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature = categorical_feature, free_raw_data=False)

In [None]:
# Parameters shared by every Optuna trial AND by the final model, so the model
# that is finally trained is the same kind of model that was tuned.
LGBM_FIXED_PARAMS = {
    "objective": "binary",
    "metric": "binary_logloss",
    "verbosity": -1,
    "max_bin": 510,  # was "max_bin " (trailing space), i.e. an unknown key
    "boosting_type": 'gbdt',
}


def objective(trial):
    """Optuna objective: train LightGBM with sampled parameters, return validation F1.

    Samples ``learning_rate``, ``n_estimators``, ``max_depth`` and ``num_leaves``,
    trains on the notebook-level ``train_data`` and scores the rounded predictions
    on ``X_valid`` / ``y_valid``.
    """
    param = {
        **LGBM_FIXED_PARAMS,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 4, 20),
        "num_leaves": trial.suggest_int("num_leaves", 10, 100),
    }

    # Train the LightGBM model.
    # - The number of boosting rounds comes from `n_estimators` (an alias of
    #   num_iterations that takes precedence over `num_boost_round`), so the
    #   previous, ignored `num_boost_round=10` is not passed.
    # - Categorical features are already declared on the Dataset objects.
    lgb_model = lgb.train(param, train_data, valid_sets=[val_data])
    preds = lgb_model.predict(X_valid, num_iteration=lgb_model.best_iteration)
    pred_labels = np.rint(preds)
    return f1_score(y_valid, pred_labels)


sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    study_name = 'lgbm_parameter_optuna',
    direction = 'maximize',
    sampler = sampler,
)
study.optimize(objective, n_trials=30)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

# study.best_params holds only the sampled values; merge in the fixed ones so the
# final model keeps the binary objective (otherwise LightGBM falls back to its
# default objective) and the same max_bin as during tuning.
model = lgb.train({**LGBM_FIXED_PARAMS, **study.best_params}, train_data,
                  valid_sets=[val_data])

In [None]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 of ``y_pred`` vs ``y_test``."""
    label_order = [True, False]

    # Compute everything first so nothing is printed if a metric raises.
    cm = confusion_matrix(y_test, y_pred, labels=label_order)
    report = [
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, labels=label_order)),
        ("재현율: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=label_order)),
    ]

    print("오차행렬:\n", cm)
    for template, value in report:
        print(template.format(value))

In [None]:
# Classifier evaluation on the validation split.
# The cut-off is the median of the validation scores, so at least half of the
# validation rows are labelled positive by construction.
pred = model.predict(X_valid)
threshold = np.median(pred)
pred = pred >= threshold
get_clf_eval(y_valid, pred)

In [None]:
# Predict on the test features and binarise with the threshold taken from the validation scores.
test_pred = model.predict(test)
test_pred = test_pred >= threshold
# classifier
sum(test_pred) # number of rows predicted True

In [None]:
# Load the submission data again (df_test holds the preprocessed data, so the raw file is re-read)
# and store the predictions as booleans.
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred
df_sub["is_converted"] = df_sub["is_converted"].astype(bool)

In [None]:
# Save the submission file (read back later by the ensemble section as "submission_lgbm.csv")
df_sub.to_csv("submission_lgbm.csv", index=False)

## randomforest

In [None]:
# Load the data again for the random-forest section (context_data_load is defined outside this section).
data = context_data_load()

# Train/valid split; the returned object replaces `data`.
data = context_data_split(data)

In [None]:
data['train'] # inspect the training data

In [None]:
# Unpack the train/validation split. Note: this rebinds x_train / x_val / y_train / y_val,
# which the TabNet section above also assigns.
x_train, x_val, y_train, y_val = data['X_train'], data['X_valid'], data['y_train'], data['y_valid']

### 3. 모델

In [None]:
# Keyword arguments for the RandomForestRegressor built in the next cell.
param = {"random_state":42,
        'n_estimators': 105, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 4, 'bootstrap': True}

In [None]:
# A regressor is used on the target; its continuous output is thresholded further below.
# Note: this rebinds `model`, which the LightGBM section also assigns.
model = RandomForestRegressor(**param, verbose=False)

In [None]:
# Fit on the training split with missing values replaced by 0.
model.fit(x_train.fillna(0), y_train)

In [None]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 of ``y_pred`` vs ``y_test``."""
    label_order = [True, False]

    # Compute everything first so nothing is printed if a metric raises.
    cm = confusion_matrix(y_test, y_pred, labels=label_order)
    report = [
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, labels=label_order)),
        ("재현율: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=label_order)),
    ]

    print("오차행렬:\n", cm)
    for template, value in report:
        print(template.format(value))

In [None]:
# Regressor evaluation on the validation split (missing values replaced by 0).
# The cut-off is the median of the validation predictions, so at least half of the
# validation rows are labelled positive by construction.
pred = model.predict(x_val.fillna(0))
threshold = np.median(pred)
pred = pred >= threshold
get_clf_eval(y_val, pred)

In [None]:
# Predict on the test features (target and id columns removed).
# Missing values are filled with 0 exactly as for the training and validation inputs,
# so the model sees test data prepared the same way it was fitted.
test_pred = model.predict(data['test'].drop(["is_converted", "id"], axis=1).fillna(0))

In [None]:
# Regressor: binarise the test predictions with the threshold taken from the validation predictions.
test_pred = test_pred >= threshold
sum(test_pred) # number of rows predicted True

In [None]:
# Load the submission data again (df_test holds the preprocessed data, so the raw file is re-read),
# attach the predictions and display the frame.
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred
df_sub

In [None]:
# Save the submission file (regressor); read back later by the ensemble section.
df_sub.to_csv("submission_rfr.csv", index=False)

## 앙상블

In [None]:
# Load the per-model submission files written by the sections above.
test_tab = pd.read_csv("submission_tabnet.csv")
test_cat = pd.read_csv("submission_cat.csv")
test_lgbm = pd.read_csv("submission_lgbm.csv")
test_rfr = pd.read_csv("submission_rfr.csv")

In [None]:
# Per-model contributions to the ensemble score:
#   CatBoost  -> its is_converted value scaled by 1.5
#   TabNet    -> its positive-class probability scaled by 1.3
#   LightGBM / RandomForest -> 1 where is_converted equals True, else 0
pred_cat = test_cat['is_converted'] * 1.5
pred_lgbm = test_lgbm['is_converted'].eq(True).astype('int64')
pred_tab = test_tab['is_converted_proba'] * 1.3
pred_rfr = test_rfr['is_converted'].eq(True).astype('int64')

In [None]:
# Collect the per-model contributions, add them up and mark a row as converted
# when the total exceeds 2.525.
df_ensemble = pd.DataFrame()
df_ensemble['cat'] = pred_cat
df_ensemble['lgbm'] = pred_lgbm
df_ensemble['tabnet'] = pred_tab
df_ensemble['rfr'] = pred_rfr
df_ensemble['sum'] = (
    df_ensemble['cat'] + df_ensemble['lgbm'] + df_ensemble['tabnet'] + df_ensemble['rfr']
)
df_ensemble['is_converted'] = df_ensemble['sum'] > 2.525
test_pred = df_ensemble['is_converted']
print(test_pred.sum())  # number of rows predicted True
df_ensemble

In [None]:
# Load the submission data again (df_test holds the preprocessed data, so the raw file is re-read),
# store the ensemble decision as booleans and display the frame.
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred
df_sub["is_converted"] = df_sub["is_converted"].astype(bool)
df_sub

In [None]:
# Save the submission file. 0.707742639040349 is presumably the score obtained with
# this ensemble — TODO confirm.
# Note: this overwrites "submission.csv", the same file the cells above read as input.
df_sub.to_csv("submission.csv", index=False)