# 0. Package Import

In [311]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
import seaborn as sns
from collections import Counter
from sklearn.metrics import *
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import warnings
import datetime
from math import isnan
import os
warnings.filterwarnings(action='ignore')

# 필요한 함수 정의
def make_datetime(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    hour  = int(x[8:10])
    #mim  = int(x[10:12])
    #sec  = int(x[12:])
    return dt.datetime(year, month, day, hour)

PATH = './data/'

plt.rcParams["figure.figsize"] = (10,4)

# ------------------ 전처리된 데이터로 실행 시 2-11부터 실행바랍니다. ------------------

# 1. Raw Data Import

1. quality_data = 시스템 작동 중 문제 발생 시 측정한 지표들 (2시간동안 12회 수집)
2. error_data =  시스템 작동시 시스템 로그 중 상태와 관련있는 로그 수집 (시스템 연결상태 및 강제 리붓 등)
3. problem_data = 고객이 실제 불만을 제기한 시점

In [312]:
PATH = "./data/"

train_quality = pd.read_csv(PATH + 'train_quality_data.csv')
train_error = pd.read_csv(PATH + 'train_err_data.csv')
train_problem = pd.read_csv(PATH + 'train_problem_data.csv')

test_quality = pd.read_csv(PATH + "test_quality_data.csv")
test_error = pd.read_csv(PATH + "test_err_data.csv")
submission = pd.read_csv(PATH + "sample_submission.csv")

# 2. Data Preprocessing

## 2-1. Data shape

In [313]:
print("train_quality shape :", train_quality.shape)
print("train_error shape :", train_error.shape)
print("train_problem shape :", train_problem.shape)
print()
print("test_quality shape :", test_quality.shape)
print("test_error shape :", test_error.shape)

train_quality shape : (828624, 16)
train_error shape : (16554663, 6)
train_problem shape : (5429, 2)

test_quality shape : (747972, 16)
test_error shape : (16532648, 6)


## 2-2. Time processing

* test_error의 time 데이터 중 11월을 벗어나는 데이터가 소량 존재함
* train_error에는 존재하지 않는 시점이므로 시점통일을 위해 제거해줌

In [314]:
def time_processing(raw_df):
    df = raw_df.copy()
    start_date = datetime.datetime(2020,11,1)
    end_date = datetime.datetime(2020,12,1)
    
    # 1. datetime 형식 변환
    df["time"] = pd.to_datetime(df["time"], format = "%Y%m%d%H%M%S")
    
    # 2. 2020.11.1 ~ 2020.11.30까지의 데이터만 추출
    df = df[(df.time >= start_date) & (df.time < end_date)]
    
    
    # 3. 일자 데이터 추출
    df["day"] = list(map(lambda x: x.day, df.time))
    return df

In [315]:
train_quality = time_processing(train_quality)
train_error = time_processing(train_error)
train_problem = time_processing(train_problem)

test_quality = time_processing(test_quality)
test_error = time_processing(test_error)

In [316]:
# 시간 오래걸림
train_quality_period = (min(train_quality.time), max(train_quality.time))
train_error_period = (min(train_error.time), max(train_error.time))
problem_period = (min(train_problem.time), max(train_problem.time))
test_quality_period = (min(test_quality.time), max(test_quality.time))
test_error_period = (min(test_error.time), max(test_error.time))


print("train_quality's time period :", train_quality_period[0],"~",train_quality_period[1]) # quality's period : 2020-10-31 23:50:00 ~ 2020-11-30 23:40:00
print("train_error's time period :", train_error_period[0],"~",train_error_period[1])       # error's period : 2020-10-31 23:59:00 ~ 2020-12-02 18:51:00
print("train_problem's time period :", problem_period[0],"~",problem_period[1])             # problem's period : 2020-11-01 00:00:00 ~ 2020-11-30 23:00:00
print("test_quality's time period :", test_quality_period[0],"~",test_quality_period[1])    # quality's period : 2020-10-31 23:50:00 ~ 2020-11-30 23:40:00
print("test_error's time period :", test_error_period[0],"~",test_error_period[1])          # error's period : 2020-10-31 23:59:00 ~ 2020-12-02 18:51:00

train_quality's time period : 2020-11-01 00:00:00 ~ 2020-11-30 23:40:00
train_error's time period : 2020-11-01 00:00:00 ~ 2020-11-30 23:59:59
train_problem's time period : 2020-11-01 00:00:00 ~ 2020-11-30 23:00:00
test_quality's time period : 2020-11-01 00:00:00 ~ 2020-11-30 23:40:00
test_error's time period : 2020-11-01 00:00:00 ~ 2020-11-30 23:59:59


## 2-3. Data types

In [317]:
train_quality.dtypes

time          datetime64[ns]
user_id                int64
fwver                 object
quality_0            float64
quality_1              int64
quality_2            float64
quality_3              int64
quality_4              int64
quality_5             object
quality_6              int64
quality_7             object
quality_8             object
quality_9             object
quality_10            object
quality_11             int64
quality_12             int64
day                    int64
dtype: object

In [318]:
train_error.dtypes

user_id              int64
time        datetime64[ns]
model_nm            object
fwver               object
errtype              int64
errcode             object
day                  int64
dtype: object

In [319]:
train_problem.dtypes

user_id             int64
time       datetime64[ns]
day                 int64
dtype: object

In [320]:
test_quality.dtypes

time          datetime64[ns]
user_id                int64
fwver                 object
quality_0            float64
quality_1             object
quality_2            float64
quality_3              int64
quality_4              int64
quality_5             object
quality_6              int64
quality_7             object
quality_8             object
quality_9             object
quality_10            object
quality_11             int64
quality_12             int64
day                    int64
dtype: object

In [321]:
test_error.dtypes

user_id              int64
time        datetime64[ns]
model_nm            object
fwver               object
errtype              int64
errcode             object
day                  int64
dtype: object

## 2-4. Quality data type reform

In [322]:
def string2num(x):
    # (,)( )과 같은 불필요한 데이터 정제
    if str(x) == "nan":
        return None
    
    x = re.sub(r"[^0-9]+", '', str(x))
    if x =='':
        return 0
    else:
        return int(x)

In [323]:
def quality_data_reform(df):
    tmp_df = df.copy()
    qual_cols = train_quality.columns[train_quality.columns.str.contains("quality")]
    for col in qual_cols:
        tmp_df[col] = tmp_df[col].apply(string2num)
    return tmp_df

In [324]:
train_quality = quality_data_reform(train_quality)
test_quality = quality_data_reform(test_quality)

* 데이터 확인

In [325]:
train_quality.dtypes

time          datetime64[ns]
user_id                int64
fwver                 object
quality_0            float64
quality_1              int64
quality_2            float64
quality_3              int64
quality_4              int64
quality_5            float64
quality_6              int64
quality_7              int64
quality_8              int64
quality_9              int64
quality_10             int64
quality_11             int64
quality_12             int64
day                    int64
dtype: object

In [326]:
test_quality.dtypes

time          datetime64[ns]
user_id                int64
fwver                 object
quality_0            float64
quality_1            float64
quality_2            float64
quality_3              int64
quality_4              int64
quality_5            float64
quality_6              int64
quality_7              int64
quality_8              int64
quality_9              int64
quality_10             int64
quality_11             int64
quality_12             int64
day                    int64
dtype: object

In [327]:
train_quality.isna().sum()

time               0
user_id            0
fwver          40068
quality_0     144420
quality_1          0
quality_2      40101
quality_3          0
quality_4          0
quality_5         20
quality_6          0
quality_7          0
quality_8          0
quality_9          0
quality_10         0
quality_11         0
quality_12         0
day                0
dtype: int64

In [328]:
test_quality.isna().sum()

time               0
user_id            0
fwver          22764
quality_0     106584
quality_1         11
quality_2      21115
quality_3          0
quality_4          0
quality_5         44
quality_6          0
quality_7          0
quality_8          0
quality_9          0
quality_10         0
quality_11         0
quality_12         0
day                0
dtype: int64

## 2-5. Null check

In [329]:
train_quality.isnull().sum()

time               0
user_id            0
fwver          40068
quality_0     144420
quality_1          0
quality_2      40101
quality_3          0
quality_4          0
quality_5         20
quality_6          0
quality_7          0
quality_8          0
quality_9          0
quality_10         0
quality_11         0
quality_12         0
day                0
dtype: int64

In [330]:
test_quality.isnull().sum()

time               0
user_id            0
fwver          22764
quality_0     106584
quality_1         11
quality_2      21115
quality_3          0
quality_4          0
quality_5         44
quality_6          0
quality_7          0
quality_8          0
quality_9          0
quality_10         0
quality_11         0
quality_12         0
day                0
dtype: int64

In [331]:
train_error.isnull().sum()

user_id     0
time        0
model_nm    0
fwver       0
errtype     0
errcode     1
day         0
dtype: int64

In [332]:
test_error.isnull().sum()

user_id     0
time        0
model_nm    0
fwver       0
errtype     0
errcode     4
day         0
dtype: int64

In [333]:
train_problem.isnull().sum()

user_id    0
time       0
day        0
dtype: int64

## 2-6. Null Preprocessing

### 1) train_quality 내 fwver null 채우기 
* Main IDEA : (quality.user_id -> error.user_id)를 추적해 error데이터에서 사용중인 fwver를 가져온다.
    1. quality.fwver에서 null값을 가지는 user_id 리스트 생성
    2. error.user_id가 가지는 각 fwver 빈도수 체크
    3. 가장 빈도수가 많은 fwver로 quality.fwver에 대체

In [334]:
def get_null_index_by_user(df, uid, col_name):
    df = df[df.user_id == uid]
    return df[df[col_name].isnull()].index

In [335]:
def quality_fwver_null_processing(raw_quality, raw_error):
    df = raw_quality.copy()
    fwver_null_uids = df[df.fwver.isnull()].user_id.unique()
    tmp_err = raw_error[["user_id","fwver"]]
    
    for uid in fwver_null_uids:
        grouped_err = tmp_err[tmp_err.user_id == uid].groupby("fwver").count()
        try:
            fw_ver = grouped_err.sort_values("user_id", ascending = False).iloc[0].name
        except:
            continue
        
        null_idx = get_null_index_by_user(df, uid, "fwver")
        df.loc[null_idx,"fwver"] = fw_ver
    return df

In [336]:
train_quality = quality_fwver_null_processing(train_quality, train_error)
test_quality = quality_fwver_null_processing(test_quality, test_error)

In [337]:
train_quality.isnull().sum()

time               0
user_id            0
fwver              0
quality_0     144420
quality_1          0
quality_2      40101
quality_3          0
quality_4          0
quality_5         20
quality_6          0
quality_7          0
quality_8          0
quality_9          0
quality_10         0
quality_11         0
quality_12         0
day                0
dtype: int64

In [338]:
test_quality.isnull().sum()

time               0
user_id            0
fwver            444
quality_0     106584
quality_1         11
quality_2      21115
quality_3          0
quality_4          0
quality_5         44
quality_6          0
quality_7          0
quality_8          0
quality_9          0
quality_10         0
quality_11         0
quality_12         0
day                0
dtype: int64

### 2) train_quality 내 각종 quality변수 null 채우기
>* Main Idea : fwver에 따라 quality의 분포가 대략적으로 결정된다. 
>* 따라서 동 fwver의 다른 quality 데이터에서 샘플을 추출하는 방식으로 기존의 데이터 분포를 모방 할 계획  
>
>
>* EDA를 통해 데이터를 훑어본 결과, quality데이터가 null인 경우는 두가지가 존재한다.
>     1. 특정 quality가 특정 fwver에 대해 통째로 null인 경우
>     2. 특정 fwver에 대해 특정 quality가 통째로 null이 이난 경우
>     
>     
>* 이에 따라 위 조건에 따라 다른 처리가 필요하다. 가령 1의 경우 전부 0으로 만들어주는 식.

In [339]:
np.random.seed(0) # 과정을 반복해도 동일한 결과가 나오도록하기 위해 seed 0으로 초기화

In [340]:
def quality_null_processing(raw_df):
    df = raw_df.copy()
    null_exist_cols = df.columns[df.isnull().sum() > 0].values
    
    for col in null_exist_cols:
        fw_vers = df[df[col].isnull()].fwver.unique()
        
        for fw_ver in fw_vers:
            table = df[(df.fwver == fw_ver) & (df[col].notnull())]
            choice_pool = table[col].values
            null_idxs = df[(df.fwver == fw_ver) & (df[col].isnull())].index
            
            choice_size = len(null_idxs)
            
            if len(choice_pool) == 0:
                choice_pool = [0]
            
            df.loc[null_idxs, col] = np.random.choice(choice_pool, choice_size)

    return df

In [341]:
train_quality = quality_null_processing(train_quality)

In [342]:
test_quality = quality_null_processing(test_quality)

In [343]:
train_quality.isnull().sum()

time          0
user_id       0
fwver         0
quality_0     0
quality_1     0
quality_2     0
quality_3     0
quality_4     0
quality_5     0
quality_6     0
quality_7     0
quality_8     0
quality_9     0
quality_10    0
quality_11    0
quality_12    0
day           0
dtype: int64

In [344]:
test_quality.isnull().sum()

time            0
user_id         0
fwver         444
quality_0     444
quality_1       0
quality_2     444
quality_3       0
quality_4       0
quality_5       0
quality_6       0
quality_7       0
quality_8       0
quality_9       0
quality_10      0
quality_11      0
quality_12      0
day             0
dtype: int64

* test_quality에는 error에도 존재하지 않는 fwver이 있었다.
* 따라서 해당 fwver에 대한 결측값 대체는 불가능하므로 삭제해준다.

In [345]:
test_quality = test_quality.dropna()

In [346]:
test_quality.isnull().sum()

time          0
user_id       0
fwver         0
quality_0     0
quality_1     0
quality_2     0
quality_3     0
quality_4     0
quality_5     0
quality_6     0
quality_7     0
quality_8     0
quality_9     0
quality_10    0
quality_11    0
quality_12    0
day           0
dtype: int64

## 2-7. Target data generate

In [347]:
problem = np.zeros(15000, dtype = int)
# person_idx의 problem이 한 번이라도 발생했다면 1
# 없다면 0
min_user_id = 10000

problem[train_problem.user_id.unique() - min_user_id] = 1 
problem.shape

(15000,)

In [348]:
problem.dtype

dtype('int32')

## 2-8. Qaulity data describe

In [349]:
train_quality.describe()

Unnamed: 0,user_id,quality_0,quality_1,quality_2,quality_3,quality_4,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,day
count,828408.0,828408.0,828408.0,828408.0,828408.0,828408.0,828408.0,828408.0,828408.0,828408.0,828408.0,828408.0,828408.0,828408.0,828408.0
mean,17574.629357,37.43176,0.199083,48.78499,0.0,0.0,74.911366,2.414512,26.750572,0.163716,56.358925,896.7144,0.189299,0.04589,15.308109
std,4374.1354,4356.004,0.685093,5719.742,0.0,0.0,2278.917033,32.674342,317.915952,5.154911,3280.777658,16523.17,0.394198,0.30249,8.762233
min,10000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,13685.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,8.0
50%,17597.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,15.0
75%,21424.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,39.0,0.0,0.0,23.0
max,24997.0,1576670.0,171.0,1918590.0,0.0,0.0,637385.0,600.0,7200.0,1317.0,397424.0,1910175.0,14.0,14.0,30.0


In [350]:
test_quality.describe()

Unnamed: 0,user_id,quality_0,quality_1,quality_2,quality_3,quality_4,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,day
count,747396.0,747396.0,747396.0,747396.0,747396.0,747396.0,747396.0,747396.0,747396.0,747396.0,747396.0,747396.0,747396.0,747396.0,747396.0
mean,37414.627603,21.425924,0.223025,64.80222,0.0,0.0,65.846492,2.499269,27.716359,0.400751,75.56544,787.881,0.193025,0.040589,15.260328
std,4259.202732,2027.988912,12.137486,11826.68,0.0,0.0,1879.186953,33.6384,324.811143,43.424185,8766.098,17579.52,0.398132,0.337904,8.748296
min,30000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,33783.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,8.0
50%,37282.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,15.0
75%,41064.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,25.0,0.0,0.0,23.0
max,44997.0,930380.0,10452.0,6366190.0,0.0,0.0,156398.0,600.0,7200.0,10452.0,1954316.0,1172849.0,17.0,19.0,30.0


* quality_3, quality_4는 모든 데이터가 0임을 확인
    * 분포확인이 불가능해 problem예측에 불필요한 데이터 -> **제거**

In [351]:
train_quality = train_quality.drop(["quality_3","quality_4"], axis = 1)
test_quality = test_quality.drop(["quality_3","quality_4"], axis = 1)

## 2-9. Feature engineering (user별 요약)
> ### 1. Quality data **(93 columns)**
>> 1. quality_3, quality_4 제거
>>> * quality_3, quality_4는 모든 데이터가 0임을 확인
>>> * 분포확인이 불가능해 problem예측에 불필요한 데이터 -> **제거**
>> 1. 중복행 제거
>>> * 동일 user, 동일 시간, quality_0 ~ quality_12 까지 모두 일치하는 행이 약 55만개 (전체행 82만개)
>>> * 모델의 편향을 줄 수 있다 생각하여 중복되는 행을 제거
>> 2. 유저별 quality 지표 변경횟수 집계 **(quality_change_ : 5열)**
>>> * (1)에서 중복행을 제거해준것을 토대로 유저별로 1일간 quality가 변경되었던 횟수를 count하여 나열 ([유저, 일자]로 groupby)
>>> * 유저별로 quality 변경횟수를 집계(합계, 평균값, 분산, 최대값, count)
>> 3. 유저별 quality 지표 (평균, Q1, 중앙값, Q3, 분산, 최대값, 최소값, 범위) 추출 **(quality_N_ : 88열)**
>>> * 각 quality 데이터를 유저별로 요약하여 지도학습을 위함
>>> * quality 데이터가 좌측으로 심하게 치우친 분포이므로 mean 대신 median 사용

> ### 2. Error data **(79 columns)**
>> 1. 일별 error 발생횟수 열 생성 **(err_at_N : 30열)**
>>> * ex) 11월 N일에 user_id가 10000인 user에게 발생된 error의 횟수
>> 2. 유저별 errtype 발생횟수 **(errtype_N : 42열)**
>>> * error type 갯수가 42개인것이 확인됨
>>> * 각 error type 별 갯수로 고객의 불만을 예측 하고자 함
>> 3. fwver 변경 로그 추출 **(fwver_N : 4열)**
>>> * 유저별 fwver변경은 최대 3회가 일어난다.
>>> * fwver 변경시 고객의 문제가 제기 되는것을 확인 할 수 있었음
>> 4. model 변경 로그 추출 **(model_nm_N : 3열)**
>>> * model 변경시 고객의 문제가 제기 되는것을 확인 할 수 있었음

### 1. Qaulity data

In [352]:
def Q1(x):
    return np.percentile(x,25)

def Q3(x):
    return np.percentile(x,75)

def data_range(x):
    return max(x) - min(x)

def make_quality(quality_data):
    # quality 3,4 제거
    if "quality_3" in quality_data.columns:
        quality_data = quality_data.drop(["quality_3","quality_4"], axis = 1)
    qual_cols = quality_data.columns[quality_data.columns.str.contains("quality")]
    
    # 1. 중복제거
    unique_train_quality = quality_data.drop_duplicates()

    # 2. 일별 quality 변화 횟수 집계
    quality_change_in_day = unique_train_quality[["user_id","day","time"]].groupby(["user_id","day"]).count()
    quality_change_in_day.columns = ["quality_change"]
    quality_change_in_day = quality_change_in_day.reset_index()

    summary_quality_count_by_day = quality_change_in_day[["user_id","quality_change"]].groupby("user_id").agg(["sum","mean","std","max","count"])
    summary_quality_count_by_day = summary_quality_count_by_day.fillna(0)
    new_cols = ["_".join(col) for col in summary_quality_count_by_day.columns]
    summary_quality_count_by_day.columns = new_cols
    
    # 3. user별 quality 특성 집계
    idx_col = ["user_id"] + list(qual_cols)
    
    quality_summary = unique_train_quality[idx_col].groupby("user_id").agg(["mean",Q1,"median",Q3,"std","max","min",data_range])
    new_cols = ["_".join(col) for col in quality_summary.columns]
    quality_summary.columns = new_cols
    
    # 4. 생성된 데이터셋 병합
    final_df = summary_quality_count_by_day
    final_df = final_df.merge(quality_summary, left_index = True, right_index = True)
    
    return final_df

In [353]:
preprocessed_train_quality = make_quality(train_quality).reset_index()
preprocessed_test_quality = make_quality(test_quality).reset_index()

### 2. Error data

In [354]:
def make_change_log(tmp_df, min_user, max_user, min_types):
    tmp_df = tmp_df.sort_values("user_id")
    user_len = max_user - min_user
    tmp_arr = np.zeros((user_len,min_types), dtype = int).astype("str")
    
    before_uid = None
    i = 0
    for user_id, value in tmp_df.values:
        if user_id == before_uid:
            i += 1
            if i >= min_types: continue
            tmp_arr[user_id - min_user, i] = value
        else:
            i = 0
            tmp_arr[user_id - min_user, i] = value
        before_uid = user_id
    
    tmp_df = pd.DataFrame(tmp_arr,index = np.arange(min_user,max_user))
    return tmp_df

def error_preprocess_X(error_data, train = "train"):
    min_user = 30000
    max_user = 44999
    if train == "train":
        min_user = 10000
        max_user = 25000
    
    # 1. user별, 일별 error 발생 횟수
    user_err_count = error_data[["user_id","day","time"]].groupby(["user_id","day"]).count()
    user_err_count.columns = ["err_at"]
    user_err_count = user_err_count.unstack(level = -1, fill_value = 0)

    err_count_cols = user_err_count.columns.values
    err_count_cols = list(map(lambda x: x[0] + "_" + str(x[1]), err_count_cols))
    user_err_count.columns = err_count_cols
    
    # 2. groupby 연산을 통해 user, errtype별 횟수 생성
    user_day_errtype = error_data[["user_id","errtype","time"]].groupby(["user_id","errtype"]).count()
    user_day_errtype.columns = ["errtype"]
    user_day_errtype = user_day_errtype.unstack(level = -1, fill_value = 0)

    errtype_cols = user_day_errtype.columns.values
    errtype_cols = list(map(lambda x: x[0] + "_" + str(x[1]), errtype_cols))

    user_day_errtype.columns = errtype_cols

    # 3. fwver 변화과정 데이터 생성
    tmp_df = error_data[["user_id","fwver"]].drop_duplicates()
    fwver_change_df = make_change_log(tmp_df, min_user, max_user, 4).reset_index()
    
    fwver_change_df.columns = ["user_id","fwver_1","fwver_2","fwver_3","fwver_4"]
    
    # 4. model 변화과정 데이터 생성
    tmp_df = error_data[["user_id","model_nm"]].drop_duplicates()
    model_change_df = make_change_log(tmp_df, min_user, max_user, 3).reset_index()
    
    model_change_df.columns = ["user_id","model_nm_1","model_nm_2","model_nm_3"]
    
    # 5. 데이터 병합
    user_errtype = user_day_errtype.copy()
    
    merged_1 = pd.merge(user_err_count, user_errtype, left_index = True, right_index = True).reset_index()
    merged_2 = pd.merge(fwver_change_df, model_change_df, on = "user_id")
    
    fin_data = pd.merge(merged_1, merged_2, on = "user_id")
    return fin_data

In [355]:
def object_processing(raw_df):
    df = raw_df.copy()
    label_enc = LabelEncoder()
    object_cols = df.columns[df.dtypes == "object"]

    for col in object_cols:
        df[col] = label_enc.fit_transform(df[col])
    return df

In [356]:
preprocessed_train_error = error_preprocess_X(train_error, "train")
preprocessed_test_error = error_preprocess_X(test_error,"test")

In [357]:
preprocessed_train_error = object_processing(preprocessed_train_error)
preprocessed_test_error = object_processing(preprocessed_test_error)

### 3. Problem data

In [358]:
problem = np.zeros(15000)
# person_idx의 problem이 한 번이라도 발생했다면 1
# 없다면 0
problem[train_problem.user_id.unique()-10000] = 1 
problem.shape

(15000,)

### 3. merge (quality & error & problem) data

In [359]:
train_data = pd.merge(preprocessed_train_error, preprocessed_train_quality, how = "left", on = "user_id",)
test_data = pd.merge(preprocessed_test_error, preprocessed_test_quality, how = "left", on = "user_id")

train_data["target"] = problem

## 2-9. Sample data

In [360]:
train_data.head()

Unnamed: 0,user_id,err_at_1,err_at_2,err_at_3,err_at_4,err_at_5,err_at_6,err_at_7,err_at_8,err_at_9,...,quality_11_data_range,quality_12_mean,quality_12_Q1,quality_12_median,quality_12_Q3,quality_12_std,quality_12_max,quality_12_min,quality_12_data_range,target
0,10000,11,9,18,5,10,9,20,7,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10001,11,50,29,48,42,29,13,24,53,...,,,,,,,,,,1.0
2,10002,10,13,13,15,9,8,17,11,12,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10003,9,14,10,5,16,5,2,11,5,...,,,,,,,,,,0.0
4,10004,25,21,49,28,11,28,18,29,20,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [361]:
test_data.head()

Unnamed: 0,user_id,err_at_1,err_at_2,err_at_3,err_at_4,err_at_5,err_at_6,err_at_7,err_at_8,err_at_9,...,quality_11_min,quality_11_data_range,quality_12_mean,quality_12_Q1,quality_12_median,quality_12_Q3,quality_12_std,quality_12_max,quality_12_min,quality_12_data_range
0,30000,76,2,7,0,24,82,808,95,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30001,13,18,5,17,4,2,22,16,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,30002,29,18,53,20,30,14,41,17,12,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,30003,6,11,10,6,4,10,0,35,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,30004,39,28,21,48,8,20,19,37,12,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2-10. Save data

In [362]:
SAVE_PATH = "./preprocessed/"

if "preprocessed" not in os.listdir():
    os.makedirs(save_PATH)
    print("`preprocessed` directory is generated!")
    
train_data.to_csv(SAVE_PATH + "train_data.csv")
test_data.to_csv(SAVE_PATH + "test_data.csv")

# ------------------ 전처리된 데이터로 실행 시 여기부터 실행바랍니다. ------------------

## 2-11. Reload data

In [363]:
SAVE_PATH = "./preprocessed/"

train_data = pd.read_csv(SAVE_PATH + 'train_data.csv', index_col= 0)
test_data = pd.read_csv(SAVE_PATH + 'test_data.csv', index_col= 0)

# 3. 모델링

In [364]:
train_x = np.array(train_data.drop('target',axis=1))
train_y = np.array(train_data['target'])
# del error, problem
print(train_x.shape)
print(train_y.shape)

(15000, 172)
(15000,)


In [365]:
# Train
#-------------------------------------------------------------------------------------
# validation auc score를 확인하기 위해 정의
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score=auc(r,p) 
    return "pr_auc", score, True
#-------------------------------------------------------------------------------------
models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.4
# 파라미터 설정
params =      {
                'bagging_fraction' : '0.8',
                'boosting_type' : 'gbdt',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'max_depth' : 20,
                'num_leaves' : 20,
                'seed': 111
    
                }
#-------------------------------------------------------------------------------------
# 5 Kfold cross validation
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x):

    # split train, validation set
    X = train_x[train_idx]
    y = train_y[train_idx]
    valid_x = train_x[val_idx]
    valid_y = train_y[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)
    
    #run traning
    model = lgb.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 1000,
                        valid_sets      = d_val,
                        feval           = f_pr_auc,
                        verbose_eval    = 20, 
                        early_stopping_rounds = 3
                       )
    
    # cal valid prediction
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    # cal scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # append scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

[LightGBM] [Info] Number of positive: 3633, number of negative: 8367
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23015
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 171
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302750 -> initscore=-0.834237
[LightGBM] [Info] Start training from score -0.834237
Training until validation scores don't improve for 3 rounds
[20]	valid_0's auc: 0.817092	valid_0's pr_auc: 0.818193
Early stopping, best iteration is:
[20]	valid_0's auc: 0.817092	valid_0's pr_auc: 0.818193
[LightGBM] [Info] Number of positive: 4828, number of negative: 7172
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23230
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 171
[LightGBM] [Info] [

In [366]:
print(np.mean(auc_scores))

0.7990120248430435


In [367]:
pred_y_list = []
for model in models:
    pred_y = model.predict(test_data)
    pred_y_list.append(pred_y.reshape(-1,1))
    
pred_ensemble = np.mean(pred_y_list, axis = 0)

In [368]:
pred_ensemble

array([[0.55789716],
       [0.30712053],
       [0.31579017],
       ...,
       [0.34880219],
       [0.56035812],
       [0.3168421 ]])

# 4. 제출파일 생성

In [369]:
sample_submssion = pd.read_csv('./data/sample_submission.csv')
sample_submssion = sample_submssion[sample_submssion['user_id']!=43262]

sample_submssion['problem'] = pred_ensemble.reshape(-1)
sample_submssion =  sample_submssion.append({'user_id' : 43262, 'problem' : 0.5},ignore_index=True)
sample_submssion['user_id'] = sample_submssion['user_id'].astype(int)
sample_submssion = sample_submssion.sort_values('user_id')

sample_submssion.to_csv("sub.csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.557897
1,30001,0.307121
2,30002,0.315790
3,30003,0.517555
4,30004,0.573848
...,...,...
14993,44994,0.307766
14994,44995,0.330342
14995,44996,0.348802
14996,44997,0.560358
