## Feature Engineering 전략
#### 엔지니어링 적으로 단일 변수별로 중요 카테고리가 있을 경우 이를 상기시켜준다
#### 특정 카테고리가 상대적으로 Positive value 가 많다 또는 전체 분포와의 차이가 많다
##### 1.1 True Value 상위에 있는 단순비율 -> 클 수록 좋음
##### 1.2 True value 분포 (비율) - 실제 분포 (비율?) 가 큰 경우
##### 1.3 True value row의 수 / 전체 value row 수 -> 클 수록 좋음
##### 1.4 True value 분포 비율 / 전체 분포에서 비율 -> 클 수록 좋음

##### 2. 위 생성된 수치 데이터들에 대해 clustering!

##### 3. 다변량으로 처리할 수 있는 방법 생각 ex) 교집합의 비율, 커널 참고

#### Base Line 모델 설정 후 전 후 비교를 통해 엔지니어링 효과 점검

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import time

In [3]:
root = r'D:/data/2_FraudDetection/preprocessed/'
train_fname ='train_parsed_date.csv'
test_fname = 'test_parsed_date.csv'
train_is_attributed_all = 'train_is_attributed_all.csv'

In [4]:
df_is_attributed_all = pd.read_csv(os.path.join(root,train_is_attributed_all))
df_train =  pd.read_csv(os.path.join(root,train_fname))
df_test = pd.read_csv(os.path.join(root,test_fname))

In [5]:
df_is_attributed_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456846 entries, 0 to 456845
Data columns (total 11 columns):
ip                 456846 non-null int64
app                456846 non-null int64
device             456846 non-null int64
os                 456846 non-null int64
channel            456846 non-null int64
click_time         456846 non-null object
attributed_time    456846 non-null object
is_attributed      456846 non-null int64
year               456846 non-null int64
weekday            456846 non-null int64
hour               456846 non-null int64
dtypes: int64(9), object(2)
memory usage: 38.3+ MB


In [6]:
df_train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,year,weekday,hour
0,134575,3,1,13,379,2017-11-06 14:43:10,,0,2017,0,14
1,73503,3,1,18,379,2017-11-06 14:49:43,,0,2017,0,14
2,74715,3,1,19,379,2017-11-06 14:55:25,,0,2017,0,14
3,153492,3,1,17,379,2017-11-06 15:03:11,,0,2017,0,15
4,192796,3,1,19,379,2017-11-06 15:07:24,,0,2017,0,15


In [7]:
df_test.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time,year,weekday,hour
0,0,5744,9,1,3,107,2017-11-10 04:00:00,2017,4,4
1,1,119901,9,1,3,466,2017-11-10 04:00:00,2017,4,4
2,2,72287,21,1,19,128,2017-11-10 04:00:00,2017,4,4
3,3,78477,15,1,13,111,2017-11-10 04:00:00,2017,4,4
4,4,123080,12,1,13,328,2017-11-10 04:00:00,2017,4,4


In [8]:
df_train.columns

Index(['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time',
       'is_attributed', 'year', 'weekday', 'hour'],
      dtype='object')

In [9]:
## Label Encoding using Dictionary
# Need process for unseen data (label)

categorical_cols = ['ip', 'app', 'device', 'os',
                    'channel']
total_label_dict = {}
for col in categorical_cols:    
    set1 = set(df_is_attributed_all[col].unique())
    set2 = set(df_train[col].unique())
    total_labels = list(set1.union(set2))
    label_dict = dict(zip(total_labels,range(len(total_labels))))
    total_label_dict[col] = label_dict
    df_train[col] = df_train[col].map(total_label_dict[col])
    df_is_attributed_all[col] = df_is_attributed_all[col].map(total_label_dict[col])


##### 1.1 True positive에서 비율
* 각각 column에서 label마다 비율 구해서 dictionary에 담은 후 
* 새로운 칼럼으로 추가

In [10]:
### value마다 비율을 넣어주는 dictionary 생성
### 만약 여기에 없는 value (not in keys) 일 경우 0 대입

from sklearn.preprocessing import StandardScaler 

## 프로토 타입 완성
ip_ratio_dict_total = {}
ratios = pd.value_counts(df_is_attributed_all['app'], normalize=True)

## Scaler
scaler_dict = {}
sc = StandardScaler()
ratios_scaled = sc.fit_transform(ratios.values.reshape(-1,1))
scaler_dict['app'] =  sc
ip_ratio_dict_total['app'] = dict(zip(ratios.index,ratios_scaled.T[0]))
df_train['app_true_ratio'] = df_train.app.map(lambda x : ip_ratio_dict_total['app'][x] if  x in  ip_ratio_dict_total['app'].keys() else 0)

## For 문으로 다 돌리자

In [11]:
fe_cols = ['app','device','os','channel','hour']

f1_dict_total = {}
scaler_dict = {}

for col in fe_cols:
    
    print('Start  Engineering Feature 1',col)
    ratios_true = pd.value_counts(df_is_attributed_all[col], normalize=True)
    ratio_real = pd.value_counts(df_train[col], normalize=True)
    sc = StandardScaler()    
    ratios_scaled = sc.fit_transform(ratios.values.reshape(-1,1))
    f1_dict_total[col] = {'dict':dict(zip(ratios.index,ratios_scaled.T[0])), 'scaler' : sc}
    df_train['f1_true_ratio_'+col] = df_train[col].map(lambda x : f1_dict_total[col]['dict'][x] if  x in  f1_dict_total[col]['dict'].keys() else 0)
    #df_train['f1_true_ratio_'+col] = df_train[col].map(lambda x : f1_dict_total[col]['dict'][x] if  x in  f1_dict_total[col].keys() else 0)


Start  Engineering Feature 1 app
Start  Engineering Feature 1 device
Start  Engineering Feature 1 os
Start  Engineering Feature 1 channel
Start  Engineering Feature 1 hour


In [12]:
df_train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,year,weekday,hour,app_true_ratio,f1_true_ratio_app,f1_true_ratio_device,f1_true_ratio_os,f1_true_ratio_channel,f1_true_ratio_hour
0,46881,3,1,13,124,2017-11-06 14:43:10,,0,2017,0,14,0.77648,0.77648,-0.074584,-0.122551,0.0,0.04557
1,22688,3,1,18,124,2017-11-06 14:49:43,,0,2017,0,14,0.77648,0.77648,-0.074584,0.555775,0.0,0.04557
2,23006,3,1,19,124,2017-11-06 14:55:25,,0,2017,0,14,0.77648,0.77648,-0.074584,12.285683,0.0,0.04557
3,65267,3,1,17,124,2017-11-06 15:03:11,,0,2017,0,15,0.77648,0.77648,-0.074584,-0.142435,0.0,0.131609
4,103558,3,1,19,124,2017-11-06 15:07:24,,0,2017,0,15,0.77648,0.77648,-0.074584,12.285683,0.0,0.131609


##### 1.2 True value 분포 (비율) - 실제 분포 (비율) 
* df_is_attributed_all -> true value
* df_train -> 실제 분포
* point : 서로 key가 다를 수 있다 어떻게 통일할 지가 중요
* -1 < new_feature < 1

In [13]:
f2_dict_total = {}

for col in fe_cols :
    print('Start  Engineering Feature 2',col)
    
    ## 각각 비율 구하기
    ratio_true = pd.value_counts(df_is_attributed_all[col], normalize=True)
    ratio_real = pd.value_counts(df_train[col], normalize=True)

    # 서로 서로 없는 건 0으로 채워야

    real_set = set(ratio_real.index)
    true_set = set(ratio_true.index)
    total_index = real_set.union(true_set)

    ratio_real_filled = ratio_real[total_index].fillna(value = 0)
    ratio_true_filled = ratio_true[total_index].fillna(value = 0)

    ratio_sub = ratio_true_filled - ratio_real_filled
    f2_dict_total[col] = dict(zip(ratio_sub.index,ratio_sub))

    ## index가 없을 경우 -1로 채우기 (최소값)
    df_train['f2_ratio_sub'+col] = df_train[col].map(lambda x : f2_dict_total[col][x] if  x in  f2_dict_total[col].keys() else -1)


Start  Engineering Feature 2 app


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


Start  Engineering Feature 2 device
Start  Engineering Feature 2 os
Start  Engineering Feature 2 channel
Start  Engineering Feature 2 hour


In [14]:
df_train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,year,weekday,...,f1_true_ratio_app,f1_true_ratio_device,f1_true_ratio_os,f1_true_ratio_channel,f1_true_ratio_hour,f2_ratio_subapp,f2_ratio_subdevice,f2_ratio_subos,f2_ratio_subchannel,f2_ratio_subhour
0,46881,3,1,13,124,2017-11-06 14:43:10,,0,2017,0,...,0.77648,-0.074584,-0.122551,0.0,0.04557,-0.161019,-0.271914,-0.080215,-0.015897,-0.008167
1,22688,3,1,18,124,2017-11-06 14:49:43,,0,2017,0,...,0.77648,-0.074584,0.555775,0.0,0.04557,-0.161019,-0.271914,-0.022995,-0.015897,-0.008167
2,23006,3,1,19,124,2017-11-06 14:55:25,,0,2017,0,...,0.77648,-0.074584,12.285683,0.0,0.04557,-0.161019,-0.271914,-0.064535,-0.015897,-0.008167
3,65267,3,1,17,124,2017-11-06 15:03:11,,0,2017,0,...,0.77648,-0.074584,-0.142435,0.0,0.131609,-0.161019,-0.271914,-0.027231,-0.015897,-0.021051
4,103558,3,1,19,124,2017-11-06 15:07:24,,0,2017,0,...,0.77648,-0.074584,12.285683,0.0,0.131609,-0.161019,-0.271914,-0.064535,-0.015897,-0.021051


In [15]:
### Categorical Variable One hot
## factorize를 통해 one hot encoding
## changing Data type
### 굳이 안해도 베이스라인 차이 없음


# categorical_cols = ['ip', 'app', 'device', 'os',
#                     'channel']
# for col in categorical_cols:
#     df_train[col] = df_train[col].astype('category')
#     df_is_attributed_all[col]= df_is_attributed_all[col].astype('category')

In [16]:
df_train['is_attributed'].value_counts()

0    18443421
1       45468
Name: is_attributed, dtype: int64

In [17]:
## Down Sampling
df_minority = df_train[df_train.is_attributed == 1]
df_majority = df_train[df_train.is_attributed == 0]

df_majority_resample = resample(df_majority, replace = False,
                               n_samples=45468,
                               random_state=123)

df_downsampled = pd.concat([df_majority_resample,df_minority])

In [18]:
df_downsampled.is_attributed.value_counts()

1    45468
0    45468
Name: is_attributed, dtype: int64

### Base Line 모델
Random forest

In [226]:
## Due to Class imbalance Need to down sample

## int to category dtype

to_train_columns = ['ip','app','device','os','channel','weekday','hour']
target_column = ['is_attributed']

x_train, x_test , y_train, y_test =  train_test_split(df_downsampled[to_train_columns],df_downsampled[target_column], 
                                                     test_size = 0.3 ,
                                                     random_state=123)

In [227]:
start_time = time.time()

hyper_parmas = {'n_estimators': 100,
               'max_features': 'auto',
               'max_depth': 50,
               'min_samples_split': 2,
               'min_samples_leaf': 4,
               'bootstrap': True}

rf = RandomForestClassifier(n_estimators=hyper_parmas['n_estimators'],
                            max_features=hyper_parmas['max_features'],
                            max_depth=hyper_parmas["max_depth"],
                            min_samples_leaf=hyper_parmas["min_samples_leaf"],
                            min_samples_split=hyper_parmas['min_samples_split'],
                            random_state=123456,
                            n_jobs=8)
rf.fit(x_train,y_train)
end_time = time.time()

print("Elapse Time", end_time - start_time)



Elapse Time 0.7461671829223633


In [228]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = rf.predict(x_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred, target_names = ['class 0','class 1']))

[[13202   532]
 [ 1707 11840]]
              precision    recall  f1-score   support

     class 0       0.89      0.96      0.92     13734
     class 1       0.96      0.87      0.91     13547

   micro avg       0.92      0.92      0.92     27281
   macro avg       0.92      0.92      0.92     27281
weighted avg       0.92      0.92      0.92     27281



#### Baseline model F1 score : 0.92

In [229]:
rf.predict_proba(x_test)

array([[6.78478711e-04, 9.99321521e-01],
       [9.06259744e-01, 9.37402555e-02],
       [9.34821764e-01, 6.51782356e-02],
       ...,
       [4.49367577e-02, 9.55063242e-01],
       [9.96032371e-01, 3.96762905e-03],
       [8.71537026e-01, 1.28462974e-01]])

In [230]:
df_train.columns

Index(['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time',
       'is_attributed', 'year', 'weekday', 'hour', 'app_true_ratio',
       'f1_true_ratio_app', 'f1_true_ratio_device', 'f1_true_ratio_os',
       'f1_true_ratio_channel', 'f1_true_ratio_hour', 'f2_ratio_subapp',
       'f2_ratio_subdevice', 'f2_ratio_subos', 'f2_ratio_subchannel',
       'f2_ratio_subhour'],
      dtype='object')

### F1 추가

In [231]:
to_train_columns =['ip', 'app', 'device', 'os', 'channel'
       ,  'weekday', 'hour',
       'f1_true_ratio_app', 'f1_true_ratio_device', 'f1_true_ratio_os',
       'f1_true_ratio_channel', 'f1_true_ratio_hour']
target_column = ['is_attributed']

x_train, x_test , y_train, y_test =  train_test_split(df_downsampled[to_train_columns],df_downsampled[target_column], 
                                                     test_size = 0.3 ,
                                                     random_state=123)

start_time = time.time()

hyper_parmas = {'n_estimators': 100,
               'max_features': 'auto',
               'max_depth': 50,
               'min_samples_split': 2,
               'min_samples_leaf': 4,
               'bootstrap': True}
rf = RandomForestClassifier(n_estimators=hyper_parmas['n_estimators'],
                            max_features=hyper_parmas['max_features'],
                            max_depth=hyper_parmas["max_depth"],
                            min_samples_leaf=hyper_parmas["min_samples_leaf"],
                            min_samples_split=hyper_parmas['min_samples_split'],
                            random_state=123456,
                            n_jobs=8)

rf.fit(x_train,y_train)
end_time = time.time()

print("Elapse Time", end_time - start_time)

y_pred = rf.predict(x_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred, target_names = ['class 0','class 1']))



Elapse Time 0.9482133388519287
[[13209   525]
 [ 1670 11877]]
              precision    recall  f1-score   support

     class 0       0.89      0.96      0.92     13734
     class 1       0.96      0.88      0.92     13547

   micro avg       0.92      0.92      0.92     27281
   macro avg       0.92      0.92      0.92     27281
weighted avg       0.92      0.92      0.92     27281



* 전체적으로 에러의 개수는 매우 살짝 줄어 들었다
* class 1을 정확히 맞힌 비율이 늘어남
* 성과 아주 살짝 있음

### F2 추가

In [237]:
to_train_columns =['ip', 'app', 'device', 'os', 'channel'
       ,  'weekday', 'hour', 'f2_ratio_subapp',
       'f2_ratio_subdevice', 'f2_ratio_subos', 'f2_ratio_subchannel',
       'f2_ratio_subhour']
target_column = ['is_attributed']

x_train, x_test , y_train, y_test =  train_test_split(df_downsampled[to_train_columns],df_downsampled[target_column], 
                                                     test_size = 0.3 ,
                                                     random_state=123)

start_time = time.time()

rf = RandomForestClassifier(n_estimators=hyper_parmas['n_estimators'],
                            max_features=hyper_parmas['max_features'],
                            max_depth=hyper_parmas["max_depth"],
                            min_samples_leaf=hyper_parmas["min_samples_leaf"],
                            min_samples_split=hyper_parmas['min_samples_split'],
                            random_state=123456,
                            n_jobs=8)

rf = RandomForestClassifier(n_estimators=300, random_state=123456
                           , n_jobs=8)
rf.fit(x_train,y_train)
end_time = time.time()

print("Elapse Time", end_time - start_time)

y_pred = rf.predict(x_test)

print(confusion_matrix(c,y_pred))
print(classification_report(y_test,y_pred, target_names = ['class 0','class 1']))



Elapse Time 2.839637041091919
[[12996   738]
 [ 1542 12005]]
              precision    recall  f1-score   support

     class 0       0.89      0.95      0.92     13734
     class 1       0.94      0.89      0.91     13547

   micro avg       0.92      0.92      0.92     27281
   macro avg       0.92      0.92      0.92     27281
weighted avg       0.92      0.92      0.92     27281



* class 1을 정확히 맞힌 비율이 늘어남 (TN)
* 재현율은 올랐지만 precision은 떨어짐
