In [None]:


from tqdm import tqdm
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
import time
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

plt.style.use(['seaborn-darkgrid'])

In [None]:
def reduce_mem_usage(df,verbose=True):
    numerics = ['int16','int32','int64' ,'float16','float32','float64']

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                if c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                if c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int8)
                if c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int8)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)

                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        end_mem = df.memory_usage().sum() / 1024 **2
        if verbose: print("Mem , usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(end_mem, 100 * (start_mem - end_mem) /start_mem))
        return df

In [None]:
import pandas as pd
train = pd.read_csv('./JobCare_data/train.csv')
test = pd.read_csv('./JobCare_data/test.csv')

In [9]:
train = train.drop(['id', 'contents_open_dt'], axis=1)
test = test.drop(['id', 'contents_open_dt'], axis=1)

In [None]:
# 무지성randomforest 하지말고 시각화하고 데이터 살펴보고 뭘 적용할지결정

train = train.astype(np.int)

In [None]:
train.head()

In [None]:
train.info()

In [None]:
train.describe()

In [None]:
x = train.iloc[:,:-1]
y = train['target']

In [None]:
x = x.astype(np.int)

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(x,y , test_size=0.3 , shuffle=True, stratify=y, random_state=34)

x_train = x_train.astype(np.int)

# randomforest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(x_train.values,y_train.values)
y_pred = rf.predict(x_valid)
rf.score(x_train,y_train)
round(rf.score(x_train.values,y_train.values)*100,2)

In [None]:
plt.plot(y_valid.values[:100], label="answer")
plt.plot(y_pred[:100], label="predict")
plt.legend()

In [None]:
#GridSearch

rf_param_grid = {
    'n_estimators' :  [100,200,300],
    'max_depth' : [6,8,10,12,20,30,50,70,],
    'min_samples_leaf' : [3,5,7,10],
    'min_samples_split' : [2,3,5,10],
    'n_jobs': [-1]
}

x_train_encoding = x_train.iloc[:,:-2].astype(np.int)

In [None]:
rf_grid = GridSearchCV(rf, param_grid = rf_param_grid, scoring='accuracy', n_jobs=-1 , verbose =1)
rf_grid.fit(x_train_encoding.values , y_train.values)

rf_grid.best_params_

In [None]:
# valid set으로 예측을 하고 score 확인 ?

In [None]:
# LogisticRegression
logreg = LogisticRegression()
logreg.fit(x_train.values,y_train.values)
y_pred = logreg.predict(x_valid)
acc_log = round(logreg.score(x_train.values,y_train.values)* 100,2)
# SelectFromModel

In [None]:
print("Number of features before selection : {}".format(x_train.shape[1]))
sfm = SelectFromModel(rf , threshold='median', prefit=True)
n_features = sfm.transform(x_train).shape[1]
print('Number of features after seletion : {}'.format(n_features))
selected_vars  = list(x_train.columns[sfm.get_support()])
preds = rf.predict(test.values)
test = test[selected_vars]
preds = rf.predict(test[selected_vars].values)

In [None]:
train = train[selected_vars + ['target']]

# 버릴 feature 는 버린다 많을 수록 좋은 피쳐가 아니기 때문에
# 제출 submission

In [None]:
submission = pd.read_csv('./job_care/sample_submission.csv')

submission['target'] = preds
test.columns
submission.to_csv('./baseline.csv', index=False)
baseline = pd.read_csv('./baseline.csv')
baseline['target'].value_counts()

In [None]:
# f1 score로 train에서 feature 를 가지고 학습한 모델을 test 모델에 적용해서
# 유의미하면 ?


In [None]:
#PermutationImportance
perm = PermutationImportance(rf, scoring='accuracy', random_state=22).fit(x_valid,y_valid)

In [None]:
start_time = time.time()
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances : {elapsed_time:.3f} seconds")

In [None]:
#feature importance
eli5.show_weights(perm , top =20  ,feature_names = x_valid.columns.tolist())
# tree 기반 이므로 얼마나 트리 분할과 밀접한 관련이 있는 지를 본다

# feature 하나하나 마다 shuffle하여 성능 변화 지켜보기 중요한 역할의 feature 라면 모델 서능 떨어질

# weight가 양수인갑들은 중요한 값 모델에 큰 영향을 끼친다
# contents attribute d가 중요한 featrue
n_features = len(x_train.columns)

x_train.columns

In [None]:
#LGBMClassifier
model = lgb.LGBMClassifier(n_estimators=100, objective='binary', class_weight='balanced'
                           ,learning_rate= 0.05, reg_alpha=0.1, reg_lambda=0.1,
                           subsample=0.8 , n_jobs=-1 , random_state=50)

model.feature_importances_

In [None]:
import matplotlib.pyplot as plt

plt.barh(np.arange(n_features), sorted(model.feature_importances_), align='center')
plt.yticks(np.arange(n_features) , x_train.columns)
plt.xlabel('random forest feature importance')
plt.ylabel("Feature")
plt.tight_layout()

In [None]:
import numpy as np


# 사용자 번호와 컨텐츠 번호는 관련이 없을 듯 한데 제거

# 신경망에 리스트를 주입할 수 없으니 텐서로 변환

# feature importance
feature_names= test.columns
forest_importances = pd.Series(importances, index=feature_names )

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title('Feature importances using MDI')
ax.set_ylabel("Mena decrease in impurity")
fig.tight_layout()

In [None]:
# feature importance 가 높은 값에 가중치를 줘서 더 높은 정확도 필요 어떤 콘텐츠 열람하고 시청을 했느냐가 타겟
train
#회원 선호속성과 컨텐츠 속성과의 연관관계
# 같은 사용자
# 데이터 시각화 부터 하자
train = train.astype(np.int)
test = test.astype(np.int)

# 각각 feature 간의 상관관계
# 대분류중분류소분류가 그래도 타겟과의 상관관계가 그나마 높은 것이 보인다
import seaborn as sns
train

In [None]:
sns.heatmap(train.corr())

In [None]:
# 변수 나누기
train['d_l_match_yn']
train = reduce_mem_usage(train)

In [68]:
meta_data = []
ordinal_col = ['person_attribute_a_1','person_attribute_b','person_prefer_e','contents_attribute_e']
for col_name in train.columns:
    if 'yn' in col_name:
        level = 'binary'
    elif col_name in ordinal_col:
        level = 'ordinal'
    elif 'attribute' in col_name:
        level = 'nominal'
    elif 'prefer' in col_name:
        level = 'nominal'

    f_dict = {
        'feature_name' : col_name,
        'level' : level
    }

    meta_data.append(f_dict)

In [69]:
meta_df = pd.DataFrame(meta_data, columns=['feature_name', 'level'])
meta_df

Unnamed: 0,feature_name,level
0,d_l_match_yn,binary
1,d_m_match_yn,binary
2,d_s_match_yn,binary
3,h_l_match_yn,binary
4,h_m_match_yn,binary
5,h_s_match_yn,binary
6,person_attribute_a,nominal
7,person_attribute_a_1,ordinal
8,person_attribute_b,ordinal
9,person_prefer_c,nominal


In [None]:
# 둘다 오버샘플링 할 필요는 없다 적절한 분포
train['target'].value_counts()

In [None]:
#####missing_data########
vars_with_missing =  []

for f in train.columns:
    missings = train[train[f] ==-1][f].count()
    if missings > 0:
        vars_with_missing.append(f)
        missings_perc = missings/train.shape[0]

        print('Variable {} has {} records : ({:.2f}) with missing values'.format(f , missings , missings_perc))
print('In total , there are {} variables with missing values'.format(len(vars_with_missing)))
# check cardinality

In [None]:
v = meta_df[(meta_df.level == 'nominal')].index

In [None]:
for col in train.columns:
    dist_values = train[col].value_counts().shape[0]
    print('Variable {} has {} distinct values'.format(col, dist_values))

In [None]:
# 변수 시각화
train.dtypes

In [None]:
# target 이 1인 categorical value 에 대한 percentage

for col in train.columns:
    plt.figure()
    fig , ax = plt.subplots(figsize=(20,10))
    # Calculate the percnetage of target=1 per category value
    cat_perc = train[[col,'target']].groupby([col], as_index=False).mean()
    cat_perc.sort_values(by='target', ascending=False, inplace=True)
    # Bar plot
    # Order the bars decending on target mean
    sns.barplot(ax=ax, x= col, y='target', data=cat_perc, order=cat_perc[col])
    plt.ylabel("target percentage", fontsize=18)
    plt.xlabel(col,fontsize=18)
    plt.tick_params(axis='both', which='major' , labelsize=10)
    plt.show()

In [None]:
col = meta_df[meta_df.level == 'nominal']['feature_name'].values.tolist()
train = train.astype(np.int)

In [None]:
# dummification 하면 값의 수 만큼 컬럼이 늘어난다
#pd.get_dummies(train , columns = ['person_attribute_a_1'], drop_first=True)
# PolynomialFeature
# 각 특성의 제곱 혹은 그 이상을 추가

In [None]:
print('Before dummification we have {} variables in train'.format(train.shape[1]))
train = pd.get_dummies(train, columns=col, drop_first=True)
print('After dummification we have {} variables in train'.format(train.shape[1]))

In [None]:
# 분산이 너무 낮으면 제거한다
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=.01)


train

# 이진 변수 이고 nomial변수인데 분산이 필요한가?
selector.fit(train.drop(['target','person_rn','contents_rn']))

train_select = train.select_dtypes(include=['int'])

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
mutual_info_classif(train_select.values , train.target.values, n_neighbors=3, random_state=17)

In [None]:
train_select.columns

In [None]:
####randomforest 2
import plotly.graph_objs as go
import plotly.offline as py
from sklearn.ensemble import RandomForestClassifier
rf= RandomForestClassifier(n_estimators=150 , max_depth=8 , min_samples_split=4, max_features=0.2, n_jobs=-1 ,random_state=0)
rf.fit(train,train.target)

features = train.columns[:-1].values

trace = go.Scatter(
    y=rf.feature_importances_,
    x= features,
    mode='markers',
    marker=dict(
        sizemode='diameter',
        sizeref =1 ,
        size= 13,
        color= rf.feature_importances_,
        colorscale='Portland',
        showscale=True
    ),
    text= features
)
data = [trace]

layout = go.Layout(
    autosize=True,
    title='Random Forest Feature Importance',
    hovermode='closest',
    xaxis = dict(
        ticklen=5,
        showgrid=False ,
        zeroline=False,
        showline=False
    ),
    yaxis = dict(
        title = 'Feature Importance',
        showgrid =False,
        zeroline=False,
        ticklen=5,
        gridwidth=2
    ),
    showlegend=False
)
fig = go.Figure(data=data,layout=layout)
py.iplot(fig,filename='scatter2010')


x, y =list((x) for x in zip(*sorted(zip(rf.feature_importances_,features), reverse=False)))

In [None]:
trace2 = go.Bar(
    x=x ,
    y=y,
    marker=dict(
        color=x ,
        colorscale = 'Viridis',
        reversescale=True
    ),
    name='Random Forest Feature Importance',
    orientation='h',
)
layout= dict(
    title='Barplot of Feature importances',
    width= 900, height = 2000,
    yaxis =dict(
        showgrid=False ,
        showline=False,
        showticklabels=True,
    )
)

In [None]:
# barplot 이 훨씬 시각적으로 와닿는다

fig1 = go.Figure(data=[trace2])
fig1['layout'].update(layout)
py.iplot(fig1, filename='plots')

In [None]:
from sklearn import tree
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image , ImageDraw , ImageFont
import re

In [None]:
train.iloc[:,:-1].astype(np.int).drop(['person_rn','contents_rn'],axis=1)

In [None]:
target = train['target']

In [None]:
x = train.iloc[:,:-1].astype(np.int)

In [None]:
target.values

In [None]:
x

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(x,y , test_size=0.3 , shuffle=True, stratify=y, random_state=34)

In [None]:
x_train

In [None]:
y_train

In [None]:
#desiciontreeregressor  를 하면 안되지 .. logisticregresoor도 아니고

In [None]:
### DecisionTreeClassifier #####
decision_tree = tree.DecisionTreeClassifier(max_depth=3)
decision_tree.fit(x_train, y_train.values)

In [None]:
y_pred = decision_tree.predict(x_valid)

import sklearn.metrics as mt

In [None]:
# accuracy = TP + TN / TP + TN + FP + FN (전체)
accuracy = mt.accuracy_score(y_valid,y_pred)

In [None]:
# 어느 피쳐가 들어갈 때 성능이 많이 떨어질까?
# 모든 피쳐 조합 ?

# 여부 feature 로만 모델 테스트 해보기

In [None]:
yn_col = []
for col in cols:
    if 'match_yn' in col:
        yn_col.append(col)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
### GradientBoostingClassifier
#split마다 고려되는 features의 수 float이면 int(max_features * n_features)값이다
yn_train = train.loc[:,yn_col].astype(np.int)

x_train, x_valid, y_train, y_valid = train_test_split(yn_train,target,stratify=target)

In [None]:
gb = GradientBoostingClassifier(n_estimators=100, max_depth=3, min_samples_leaf=4, max_features=0.2, random_state=0)
gb.fit(x_train,y_train)

In [None]:

(y_valid == gb.predict(x_valid)).mean()

In [None]:
gb_param_grid = {
    'n_estimators' :  [100,200],
    'max_depth' : [6,8],
    'min_samples_leaf' : [3,5],
    'min_samples_split' : [2,3]
}

In [None]:
rf_grid = GridSearchCV(gb, param_grid = gb_param_grid, scoring='accuracy', n_jobs=-1 , verbose =1)
rf_grid.fit(x_train,y_train)

rf_grid.best_params_


In [None]:
attribute_col = []
for col in cols:
    if 'attribute' in col:
        attribute_col.append(col)
attribute_train = train.loc[:,attribute_col].astype(np.int)

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(attribute_train,target,stratify=target)



prefer_col = []
for col in cols:
    if 'prefer' in col:
        prefer_col.append(col)

In [None]:
prefer_train = train.loc[:, prefer_col].astype(np.int)

In [None]:
# 여부, 선호 , 속성 각각을 트레이닝 해보고 조합도 트레이닝 해본다 ? 별로 좋은 것 같진 않지만

In [None]:
# 선호는 확실히 모델 성능이 떨어지기는 한다

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(prefer_train,target,stratify=target)


target = train['target']

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(yn_train,target,stratify=target)

In [None]:
x_train = x_train.astype(np.int)

In [None]:
import lightgbm as lgb

In [None]:
gb.fit(x_train,y_train)

In [None]:
# 0.569면 비슷하다 그냥 이것도
(y_valid == gb.predict(x_valid)).mean()

In [None]:
model.fit(x_train, y_train , eval_metric='auc' , eval_set=[(x_valid,y_valid) , (x_train,y_train)],
         eval_names=['valid','train'],early_stopping_rounds=100, verbose=200
          )



x_test = test.loc[: ,yn_col].astype(np.int)

In [None]:
(gb.predict(x_test) == model.predict(x_test)).mean()

In [None]:
##### decision tree 시각화 #####
# 덮어쓰기 안되므로 트레이닝 후 수동으로 삭제 후 다시 저자
# Export our trained model as a .dot file
with open("tree1.dot", 'w') as f:
    f = tree.export_graphviz(decision_tree,
                            out_file=f,
                            max_depth=4,
                            impurity= False,
                            feature_names = train.iloc[:,:-1].columns.values.tolist(),
                            class_names =['No' , 'Yes'],
                            rounded =True,
                            filled=True)

In [None]:
from subprocess import check_call

# 정확도가 많이 낮네
(y_valid == decision_tree.predict(x_valid)).mean()

In [None]:
# Convert .dot to .png to allow display in web notebook
check_call(['dot', '-Tpng', 'tree1.dot' , '-o','tree1.png'])

In [None]:
# Annotating chart with PIL
img = Image.open("tree1.png")
draw = ImageDraw.Draw(img)
img.save("sample-out.png")
PImage("sample-out.png")


import os
os.getcwd()

In [None]:
print(mf)

In [None]:
v = train

In [None]:
# feature engineering
print('Before dummification we have {} variables in train')

In [None]:
# category 2 is not twice the value of category 1

In [None]:
train['person_prefer_d_1']

In [None]:
import seaborn as sns

In [None]:
# 같은 컨텐츠
train['contents_rn'].value_counts()

In [None]:

import numpy as np
import pandas as pd
import torch

In [108]:
########## TabNet #############
from torch import nn
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

In [27]:
train_df = pd.read_csv('./JobCare_data/train.csv')
test_df = pd.read_csv('./JobCare_data/test.csv')

In [98]:
train = train_df[train_df['contents_open_dt'].apply(lambda x: pd.Timestamp(x).month)<11].copy()
val = train_df[train_df['contents_open_dt'].apply(lambda x: pd.Timestamp(x).month)==11].copy()

Unnamed: 0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,...,contents_attribute_j_1_woe_encode,contents_attribute_j_woe_encode,contents_attribute_c_woe_encode,contents_attribute_k_woe_encode,contents_attribute_l_woe_encode,contents_attribute_d_woe_encode,contents_attribute_m_woe_encode,contents_attribute_e_woe_encode,contents_attribute_h_woe_encode,target_woe_encode
0,True,True,True,False,False,False,1,4,3,5,...,-0.110406,-0.131229,0.050647,0.008244,0.004922,-0.331832,0.032566,0.016421,-0.147974,13.815511
1,False,False,False,True,True,False,1,3,4,1,...,0.110187,0.039526,0.050647,0.008244,0.004922,-0.331832,0.032566,0.016421,0.174218,-inf
2,False,False,False,True,False,False,2,0,3,5,...,-0.110406,-0.131229,0.050647,-0.245994,0.304402,-0.061951,0.032566,0.016421,-0.159230,-inf
3,False,False,False,True,False,False,2,0,2,5,...,0.110187,0.039526,0.050647,0.008244,0.004922,-0.331832,0.048236,-0.005555,-0.001587,-inf
4,True,True,True,False,False,False,1,3,4,5,...,-0.110406,-0.131229,0.050647,0.008244,0.004922,-0.331832,0.032566,0.016421,-0.001587,-inf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501946,False,False,False,True,False,False,1,1,2,2,...,0.110187,0.039526,0.050647,0.008244,0.137717,0.218158,0.032566,0.024290,0.142156,13.815511
501947,True,True,False,True,False,False,1,6,2,1,...,-0.110406,-0.131229,0.050647,0.008244,-0.037998,0.377821,0.032566,0.016421,-0.248896,13.815511
501948,True,True,True,True,False,False,1,7,4,1,...,0.110187,0.039526,0.050647,0.008244,-0.043894,0.218158,0.000910,0.113906,0.142156,13.815511
501949,True,False,False,True,False,False,1,1,2,1,...,0.110187,0.039526,0.050647,0.008244,0.448950,0.218158,-0.028532,0.016421,-0.081789,13.815511


In [None]:
#preprocessing
test= test_df.copy()

In [113]:
for df in [train,val,test]:
    if 'contents_open_dt' in df.columns:
        df.drop(['contents_open_dt'],axis=1,inplace=True)
    if 'contents_rn' in df.columns:
        df.drop(['contents_rn'],axis=1 , inplace=True)
    if 'id' in df.columns:
        df.drop(['id'],axis=1 , inplace=True)
    if 'person_rn' in df.columns:
        df.drop(['person_rn'],axis=1 , inplace=True)
    if 'person_prefer_f' in df.columns:
        df.drop(['person_prefer_f'],axis=1 , inplace=True)
    if 'person_prefer_g' in df.columns:
        df.drop(['person_prefer_g'],axis=1 , inplace=True)

In [None]:
columns = sorted(test.columns)
train = train[columns+ ['target']] *1
val = val[columns+['target']]*1
test = test[columns] * 1

In [105]:
cat_idxs = []
cat_dims = []

[]

In [None]:
train_df[col].values

In [103]:
train_df[col].value_counts()

0    251106
1    250845
Name: target, dtype: int64

In [None]:
train

In [None]:
train_df['contents_attribute_h'].value_counts()

In [None]:
train[col]

In [None]:
# le_dict 가 2~313 을 key로 가지고 있다면 컬럼에서 2~313을 가지고 있는 컬럼에서 매핑을 한것이다
# labelencoder 는 말 그대로 2~313

In [110]:
train.columns

Index(['d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn',
       'h_m_match_yn', 'h_s_match_yn', 'person_attribute_a',
       'person_attribute_a_1', 'person_attribute_b', 'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_e', 'person_prefer_f', 'person_prefer_g',
       'person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3',
       'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e',
       'contents_attribute_h', 'contents_open_dt', 'target',
       'd_l_match_yn_woe_encode', 'd_m_match_yn_woe_encode',
       'd_s_match_yn_woe_encode', 'h_l_match_yn_woe_encode',
       'h_m_match_yn_woe_encode', 'h_s_match_yn_woe_encode',
       'person_attribute_a_woe_encode', 'person_attribute_a_1_woe_encode',
   

Index(['d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn',
       'h_m_match_yn', 'h_s_match_yn', 'person_attribute_a',
       'person_attribute_a_1', 'person_attribute_b', 'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_e', 'person_prefer_f', 'person_prefer_g',
       'person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3',
       'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e',
       'contents_attribute_h', 'contents_open_dt', 'target',
       'd_l_match_yn_woe_encode', 'd_m_match_yn_woe_encode',
       'd_s_match_yn_woe_encode', 'h_l_match_yn_woe_encode',
       'h_m_match_yn_woe_encode', 'h_s_match_yn_woe_encode',
       'person_attribute_a_woe_encode', 'person_attribute_a_1_woe_encode',
   

In [None]:
for idx, col in enumerate(train.columns):
    if 'match' not in col and col != 'target':
        le = LabelEncoder()
        le.fit(train_df[col].values)
        le_dict = dict(zip(le.classes_, le.transform(le.classes_)))
        train[col] = train[col].apply(lambda x: le_dict.get(x,len(le_dict)))
        val[col] = val[col].apply(lambda x: le_dict.get(x, len(le_dict)))
        test[col] = test[col].apply(lambda x: le_dict.get(x, len(le_dict)))
        cat_idxs.append(idx)
        cat_dims.append(len(le_dict)+1)

In [None]:
X_train = train.drop('target',axis=1).values
y_train = train['target'].values
X_val = val.drop('target', axis=1).values
y_val = val['target'].values
X_test = test.values
eval_set = (X_val, y_val)

In [None]:
cat_idxs = []
cat_dims = []

In [114]:
for idx, col in enumerate(train.columns):
    if 'match' not in col and col != 'target':
        le = LabelEncoder()
        le.fit(train_df[col].values)
        le_dict = dict(zip(le.classes_, le.transform(le.classes_)))
        train[col] = train[col].apply(lambda x: le_dict.get(x,len(le_dict)))
        val[col] = val[col].apply(lambda x: le_dict.get(x, len(le_dict)))
        test[col] = test[col].apply(lambda x: le_dict.get(x, len(le_dict)))
        cat_idxs.append(idx)
        cat_dims.append(len(le_dict)+1)


KeyError: 'person_prefer_f_woe_encode'

In [None]:
clf = TabNetClassifier(cat_idxs=cat_idxs,
                       cat_dims=cat_dims,
                       cat_emb_dim=5,
                       optimizer_fn=torch.optim.AdamW,# Any optimizer work here
                       mask_type='entmax',#"spaesemax")
                       )

In [None]:
class F1_Score(Metric):
    def __init__(self):
        self._name = "f1"
        self._maximize = True
    def __call__(self, y_true, y_score):
        score = f1_score(y_true, (y_score[:,1]>0.5)* 1)
        return score

In [None]:
y_valid.values

In [None]:
#이건 어떻게 하길래 더 높은 정확도가 나올까
clf.fit(
    X_train= x_train.values, y_train=y_train.values,
    eval_set=[(x_train.values, y_train.values), (x_valid.values, y_valid.values)],
    eval_name=['train', 'val'],
    eval_metric=['logloss','f1'],
    max_epochs=100, patience=2,
    batch_size=1024,
    virtual_batch_size=256,
    num_workers=1,
    drop_last=False,
)


In [None]:
preds = clf.predict_proba(x_test)
preds = (preds[:,1]>0.5)*1

In [None]:
submission = pd.read_csv('./job_care/sample_submission.csv')
submission['target'] = preds

In [158]:

 #%%
train = pd.read_csv('./JobCare_data/train.csv')

In [None]:
train[['person_attribute_a_1','person_attribute_b','person_attribute_b','contents_attribute_e']]

In [None]:
#### feature encoding #####
attr_a_1_mean_encode = train.groupby('person_attribute_a_1')["target"].mean()

In [None]:

if 'attribute' in col:
    name = 'person_attribute_a_1'['person_attribute_a_1'.index('attribute') + len('attribute')+1:]
elif 'prefer' in col:
    name ='person_attribute_a_1'['person_attribute_a_1'.index('attribute') + len('attribute')+1:]

In [None]:
nominal_cols[0]

In [156]:
nominal_cols = ['person_attribute_a_1','person_attribute_b','person_prefer_e','contents_attribute_e']

In [None]:
# 오버피팅이 자주 발생하는 mean encoding 이므로 cross validation 과 정규화 같이 사용한다=
# 변환 하고자 하는 범주형 변수 선택
# 범주형 변수 그룹화 -> 타깃 변수 총합 합계
# 범주형 변수 그룹화 타깃 빈도수 합계
# 총합을 카운트로 나누고 본래 범주 값에 업데이트
# 여러가지 방법으로 적용 가능하다
# 비슷한 범주 사이에 있는 관계 표현 특징, 범주와 타깃사이에만 국한된다
# 범주가 많은 경우 이 방법은 데이터를 훨씬 더 단순화 한다

In [None]:
target = train.target

In [159]:
for col in nominal_cols:
    if 'attribute' in col:
        name = col[col.index('attribute') + len('attribute')+1:]
        var_name = 'attr_{}_mean_encode'.format(name)
        locals()[var_name] = train.groupby(col)["target"].mean()
    elif 'prefer' in col:
        name = col[col.index('prefer') + len('prefer')+1:]
        var_name = 'prefer_{}_mean_encode'.format(name)
        locals()[var_name] = train.groupby(col)["target"].mean()

    train.loc[:,var_name] = train[col].map(locals()[var_name])

In [160]:
train = train.drop('target',axis=1)

In [161]:
x_train, x_valid, y_train, y_valid = train_test_split(train,target , test_size=0.3 , shuffle=True, stratify=target, random_state=34)


In [162]:
rf= RandomForestClassifier(n_estimators=150 , max_depth=8 , min_samples_split=4, max_features=0.2, n_jobs=-1 ,random_state=0)

rf.fit(x_train,y_train)

ValueError: could not convert string to float: '2020-06-28 23:27:49'

In [145]:
# 과적합일진 모르지만 그래도많이올랐따 만족할 수 없음
(y_valid == rf.predict(x_valid)).mean()

NameError: name 'y_valid' is not defined

In [143]:
train.columns

Index(['d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn',
       'h_m_match_yn', 'h_s_match_yn', 'person_attribute_a',
       'person_attribute_a_1', 'person_attribute_b', 'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_e', 'person_prefer_h_1', 'person_prefer_h_2',
       'person_prefer_h_3', 'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e',
       'contents_attribute_h', 'target', 'd_l_match_yn_woe_encode',
       'd_m_match_yn_woe_encode', 'd_s_match_yn_woe_encode',
       'h_l_match_yn_woe_encode', 'h_m_match_yn_woe_encode',
       'h_s_match_yn_woe_encode', 'person_attribute_a_woe_encode',
       'person_attribute_a_1_woe_encode', 'person_attribute_b_woe_encode',
       'person_prefer_c_woe_encode'

In [None]:

###### Smooth Encoding ##########

# 위와 다르게 smooth한 평균을 계산하고 적용하는 방법
# 1. 평균을 계산
Mean = train['target'].mean()
weight = 100
for col in nominal_cols:

    if 'attribute' in col:
        name = col[col.index('attribute') + len('attribute')+1:]
        var_name = 'attr_{}_mean_encode'.format(name)

    elif 'prefer' in col:
        name = col[col.index('prefer') + len('prefer')+1:]
        var_name = 'prefer_{}_mean_encode'.format(name)

    # 2. 각 그룹에 대한 값들의 빈도와 평균을 계산
    Agg = train.groupby(col)['target'].agg(['count','mean'])
    counts = Agg['count']
    #%%
    means = Agg['mean']

    # 3. 'smooth'한 평균을 계산
    smooth = (counts * means + weight * means) / (counts+weight)
    print(smooth)

    # smooth한 평균에 따라 각 값을 대체하는 것
    train.loc[:,'smooth_'+var_name] = train[col].map(smooth)

In [None]:
train=train.drop(['smoothattr_a_1_mean_encode',	'smoothattr_b_mean_encode'	,'smoothprefer_e_mean_encode'	,'smoothattr_e_mean_encode'],axis=1)

In [None]:
train  =train.drop(['attr_a_1_mean_encode','attr_b_mean_encode','prefer_e_mean_encode','attr_e_mean_encode','target'],axis=1)

In [None]:
from tqdm import tqdm
import numpy as np
from sklearn.ensemble import RandomForestClassifier

import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')

from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import GridSearchCV
import time
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

plt.style.use(['seaborn-darkgrid'])

In [None]:
train.info()
train.describe()

In [None]:
x = train.iloc[:,:-1]
y = train['target']

In [None]:
x = x.astype(np.int)

x_train = x_train.astype(np.int)

In [None]:
# valid set으로 예측을 하고 score 확인 ?

In [None]:
# LogisticRegression
logreg = LogisticRegression()
logreg.fit(x_train.values,y_train.values)
y_pred = logreg.predict(x_valid)


acc_log = round(logreg.score(x_train.values,y_train.values)* 100,2)


# 버릴 feature 는 버린다 많을 수록 좋은 피쳐가 아니기 때문에
# 제출 submission

In [None]:
###### 제출 코드 #####
submission = pd.read_csv('./job_care/sample_submission.csv')

submission['target'] = preds
test.columns
submission.to_csv('./baseline.csv', index=False)
baseline = pd.read_csv('./baseline.csv')
baseline['target'].value_counts()

# f1 score로 train에서 feature 를 가지고 학습한 모델을 test 모델에 적용해서
# 유의미하면 ?

In [None]:


# tree 기반 이므로 얼마나 트리 분할과 밀접한 관련이 있는 지를 본다


# feature 하나하나 마다 shuffle하여 성능 변화 지켜보기 중요한 역할의 feature 라면 모델 서능 떨어질 것
# weight가 양수인갑들은 중요한 값 모델에 큰 영향을 끼친다
# contents attribute d가 중요한 featrue
n_features = len(x_train.columns)

In [None]:
### 시각화 코드 ###
model.feature_importances_

import matplotlib.pyplot as plt

plt.barh(np.arange(n_features), sorted(model.feature_importances_), align='center')
plt.yticks(np.arange(n_features) , x_train.columns)
plt.xlabel('random forest feature importance')
plt.ylabel("Feature")
plt.tight_layout()

In [None]:
import numpy as np

# 사용자 번호와 컨텐츠 번호는 관련이 없을 듯 한데 제거

# 신경망에 리스트를 주입할 수 없으니 텐서로 변환


In [None]:
# feature importance
feature_names= test.columns
forest_importances = pd.Series(importances, index=feature_names )

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title('Feature importances using MDI')
ax.set_ylabel("Mena decrease in impurity")
fig.tight_layout()

In [None]:
# feature importance 가 높은 값에 가중치를 줘서 더 높은 정확도 필요 어떤 콘텐츠 열람하고 시청을 했느냐가 타겟
train

In [None]:
#회원 선호속성과 컨텐츠 속성과의 연관관계

In [None]:
# 같은 사용자
# 데이터 시각화 부터 하자
train = train.astype(np.int)

In [None]:
test = test.astype(np.int)

In [None]:
# 각각 feature 간의 상관관계
# 대분류중분류소분류가 그래도 타겟과의 상관관계가 그나마 높은 것이 보인다
import seaborn as sns
train

In [None]:
sns.heatmap(train.corr())

In [None]:
# 변수 나누기

#
train['d_l_match_yn']

In [61]:
meta_data = []
for col_name in train.columns:
    if 'yn' in col_name:
        level = 'binary'
    elif 'attribute' in col_name:
        level = 'nominal'
    elif 'prefer' in col_name:
        level = 'nominal'

    f_dict = {
        'feature_name' : col_name,
        'level' : level
    }

    meta_data.append(f_dict)

meta_df = pd.DataFrame(meta_data, columns=['feature_name', 'level'])

Unnamed: 0,feature_name,level
0,d_l_match_yn,binary
1,d_m_match_yn,binary
2,d_s_match_yn,binary
3,h_l_match_yn,binary
4,h_m_match_yn,binary
5,h_s_match_yn,binary
6,person_attribute_a,nominal
7,person_attribute_a_1,nominal
8,person_attribute_b,nominal
9,person_prefer_c,nominal


In [None]:
# 둘다 오버샘플링 할 필요는 없다 적절한 분포
train['target'].value_counts()

In [None]:
vars_with_missing =  []

for f in train.columns:
    missings = train[train[f] ==-1][f].count()
    if missings > 0:
        vars_with_missing.append(f)
        missings_perc = missings/train.shape[0]

        print('Variable {} has {} records : ({:.2f}) with missing values'.format(f , missings , missings_perc))
print('In total , there are {} variables with missing values'.format(len(vars_with_missing)))


# check cardinality

In [None]:
v = meta_df[(meta_df.level == 'nominal')].index

In [None]:
for col in train.columns:
    dist_values = train[col].value_counts().shape[0]
    print('Variable {} has {} distinct values'.format(col, dist_values))

In [None]:
# 변수 시각화

train.dtypes

In [None]:
# target 이 1인 categorical value 에 대한 percentage

for col in train.columns:
    plt.figure()
    fig , ax = plt.subplots(figsize=(20,10))
    # Calculate the percnetage of target=1 per category value
    cat_perc = train[[col,'target']].groupby([col], as_index=False).mean()
    cat_perc.sort_values(by='target', ascending=False, inplace=True)
    # Bar plot
    # Order the bars decending on target mean
    sns.barplot(ax=ax, x= col, y='target', data=cat_perc, order=cat_perc[col])
    plt.ylabel("target percentage", fontsize=18)
    plt.xlabel(col,fontsize=18)
    plt.tick_params(axis='both', which='major' , labelsize=10)
    plt.show()

In [None]:
col = meta_df[meta_df.level == 'nominal']['feature_name'].values.tolist()

In [None]:

train = train.astype(np.int)

In [None]:
# dummification 하면 값의 수 만큼 컬럼이 늘어난다



pd.get_dummies(train , columns = ['person_attribute_a_1'], drop_first=True)

In [None]:
# PolynomialFeature
# 각 특서으이 제곱 혹은 그 이상을 추가
print('Before dummification we have {} variables in train'.format(train.shape[1]))
train = pd.get_dummies(train, columns=col, drop_first=True)
print('After dummification we have {} variables in train'.format(train.shape[1]))
# 분산이 너무 낮으면 제거한다
from sklearn.feature_selection import VarianceThreshold


selector = VarianceThreshold(threshold=.01)

In [None]:
train

In [None]:
# 이진 변수 이고 nomial변수인데 분산이 필요한가?

In [None]:
selector.fit(train.drop(['target','person_rn','contents_rn']))

In [None]:
train_select = train.select_dtypes(include=['int'])

#%

# barplot 이 훨씬 시각적으로 와닿는다

fig1 = go.Figure(data=[trace2])
fig1['layout'].update(layout)
py.iplot(fig1, filename='plots')

In [None]:
from sklearn import tree
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image , ImageDraw , ImageFont
import re


import sklearn.metrics as mt

In [None]:
# accuracy = TP + TN / TP + TN + FP + FN (전체)
accuracy = mt.accuracy_score(y_valid,y_pred)

In [None]:
# 어느 피쳐가 들어갈 때 성능이 많이 떨어질까?
# 모든 피쳐 조합 ?

# 여부 feature 로만 모델 테스트 해보기


cols = train.columns.tolist()

In [None]:
yn_col = []
for col in cols:
    if 'match_yn' in col:
        yn_col.append(col)



#split마다 고려되는 features의 수 float이면 int(max_features * n_features)값이다


yn_train = train.loc[:,yn_col].astype(np.int)

(y_valid == gb.predict(x_valid)).mean()

In [None]:
attribute_col = []
for col in cols:
    if 'attribute' in col:
        attribute_col.append(col)

In [None]:
#
attribute_train = train.loc[:,attribute_col].astype(np.int)

In [None]:
prefer_col = []
for col in cols:
    if 'prefer' in col:
        prefer_col.append(col)

prefer_train = train.loc[:, prefer_col].astype(np.int)

# 여부, 선호 , 속성 각각을 트레이닝 해보고 조합도 트레이닝 해본다 ? 별로 좋은 것 같진 않지만

# 선호는 확실히 모델 성능이 떨어지기는 한다


x_train, x_valid, y_train, y_valid = train_test_split(prefer_train,target,stratify=target)



# 0.569면 비슷하다 그냥 이것도

In [None]:



train

In [None]:
### labelencoder
for idx, col in enumerate(train.columns):
    if 'match' not in col and col != 'target':

        le = LabelEncoder()
        le.fit(train_df[col].values)
        le_dict = dict(zip(le.classes_, le.transform(le.classes_)))
        train[col] = train[col].apply(lambda x: le_dict.get(x,len(le_dict)))
        val[col] = val[col].apply(lambda x: le_dict.get(x, len(le_dict)))
        test[col] = test[col].apply(lambda x: le_dict.get(x, len(le_dict)))
        cat_idxs.append(idx)
        cat_dims.append(len(le_dict)+1)

In [None]:
X_train = train.drop('target',axis=1).values
y_train = train['target'].values
X_val = val.drop('target', axis=1).values
y_val = val['target'].values
X_test = test.values
eval_set = (X_val, y_val)

In [None]:
cat_idxs = []

cat_dims = []

In [None]:
train_df['person_prefer_f']

In [None]:
train.drop(['person_prefer_f','person_prefer_g'],axis=1,inplace=True)

In [None]:
train_temp = pd.read_csv('./JobCare_data/train.csv')

In [None]:
train['contents_open_dt'] = train_temp['contents_open_dt']
val = train[train['contents_open_dt'].apply(lambda x: pd.Timestamp(x).month)<11 ].copy()

In [None]:
test

In [None]:

attr_a_1_mean_encode = train.groupby('person_attribute_a_1')["target"].mean()

In [None]:

if 'attribute' in col:
    name = 'person_attribute_a_1'['person_attribute_a_1'.index('attribute') + len('attribute')+1:]
elif 'prefer' in col:
    name =

In [None]:
nominal_cols[0]

In [None]:
nominal_cols = ['person_attribute_a_1','person_attribute_b','person_prefer_e','contents_attribute_e']

In [None]:
# 오버피팅이 자주 발생하는 mean encoding 이므로 cross validation 과 정규화 같이 사용한다=
# 변환 하고자 하는 범주형 변수 선택
# 범주형 변수 그룹화 -> 타깃 변수 총합 합계
# 범주형 변수 그룹화 타깃 빈도수 합계
# 총합을 카운트로 나누고 본래 범주 값에 업데이트
# 여러가지 방법으로 적용 가능하다
# 비슷한 범주 사이에 있는 관계 표현 특징, 범주와 타깃사이에만 국한된다
# 범주가 많은 경우 이 방법은 데이터를 훨씬 더 단순화 한다

for col in nominal_cols:
    if 'attribute' in col:
        name = col[col.index('attribute') + len('attribute')+1:]
        var_name = 'attr_{}_mean_encode'.format(name)
        locals()[var_name] = train.groupby(col)["target"].mean()
    elif 'prefer' in col:
        name = col[col.index('prefer') + len('prefer')+1:]
        var_name = 'prefer_{}_mean_encode'.format(name)
        locals()[var_name] = train.groupby(col)["target"].mean()

    train.loc[:,var_name] = train[col].map(locals()[var_name])

AttributeError: 'DataFrame' object has no attribute 'target'

In [None]:
target = train.target
train = train.drop('target',axis=1)

In [169]:
train = train.drop(['contents_open_dt','id'],axis=1)

In [170]:
x_train, x_valid, y_train, y_valid = train_test_split(train,target , test_size=0.3 , shuffle=True, stratify=target, random_state=34)

rf= RandomForestClassifier(n_estimators=150 , max_depth=8 , min_samples_split=4, max_features=0.2, n_jobs=-1 ,random_state=0)

rf.fit(x_train,y_train)

RandomForestClassifier(max_depth=8, max_features=0.2, min_samples_split=4,
                       n_estimators=150, n_jobs=-1, random_state=0)

In [171]:
# 과적합일진 모르지만 그래도많이올랐따 만족할 수 없음
# 0.57정도에서 0.59 정도로 mean encoding 으로 상승

(y_valid == rf.predict(x_valid)).mean()

0.5970076899578978

In [None]:
# 위와 다르게 smooth한 평균을 계산하고 적용하는 방법
# 1. 평균을 계산
Mean = train['target'].mean()
weight = 100
for col in nominal_cols:

    if 'attribute' in col:
        name = col[col.index('attribute') + len('attribute')+1:]
        var_name = 'attr_{}_mean_encode'.format(name)

    elif 'prefer' in col:
        name = col[col.index('prefer') + len('prefer')+1:]
        var_name = 'prefer_{}_mean_encode'.format(name)

    # 2. 각 그룹에 대한 값들의 빈도와 평균을 계산
    Agg = train.groupby(col)['target'].agg(['count','mean'])
    counts = Agg['count']
    #%%
    means = Agg['mean']

    # 3. 'smooth'한 평균을 계산
    smooth = (counts * means + weight * means) / (counts+weight)
    print(smooth)

    # smooth한 평균에 따라 각 값을 대체하는 것
    train.loc[:,'smooth_'+var_name] = train[col].map(smooth)

In [None]:
train=train.drop(['smoothattr_a_1_mean_encode',	'smoothattr_b_mean_encode'	,'smoothprefer_e_mean_encode'	,'smoothattr_e_mean_encode'],axis=1)

In [None]:
train  =train.drop(['attr_a_1_mean_encode','attr_b_mean_encode','prefer_e_mean_encode','attr_e_mean_encode','target'],axis=1)

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(train,target , test_size=0.3 , shuffle=True, stratify=target, random_state=34)

In [141]:
rf= RandomForestClassifier(n_estimators=150 , max_depth=8 , min_samples_split=4, max_features=0.2, n_jobs=-1 ,random_state=0)

rf.fit(x_train,y_train)
# 과적합일진 모르지만 그래도많이올랐따 만족할 수 없음
(y_valid == rf.predict(x_valid)).mean()
train = train.drop(['id','contents_open_dt','person_rn','contents_rn'],axis=1)
target = train.target
train = train.drop('target',axis=1)
x_train, x_valid, y_train, y_valid = train_test_split(train,target , test_size=0.3 , shuffle=True, stratify=y, random_state=34)
rf= RandomForestClassifier(n_estimators=150 , max_depth=8 , min_samples_split=4, max_features=0.2, n_jobs=-1 ,random_state=0)
rf.fit(x_train,y_train)

# 과적합일진 모르지만 그래도많이올랐따 만족할 수 없음 정확도 향상
(y_valid == rf.predict(x_valid)).mean()
####### Weight of Evidence Encoding (WoE)

NameError: name 'x_train' is not defined

In [None]:
#%#%
# 각 범주가 target = 1일 확률 (좋은(good) = 1 일 확률) 을 계산한다.

train = train.drop(['person_rn'],axis=1)
cols.pop('person_rn')
cols.remove('person_rn')
cols.remove('contents_rn')
cols.remove('target')
#### WoE Encoding #######

In [83]:
cols

['d_l_match_yn',
 'd_m_match_yn',
 'd_s_match_yn',
 'h_l_match_yn',
 'h_m_match_yn',
 'h_s_match_yn',
 'person_attribute_a',
 'person_attribute_a_1',
 'person_attribute_b',
 'person_prefer_c',
 'person_prefer_d_1',
 'person_prefer_d_2',
 'person_prefer_d_3',
 'person_prefer_e',
 'person_prefer_f',
 'person_prefer_g',
 'person_prefer_h_1',
 'person_prefer_h_2',
 'person_prefer_h_3',
 'contents_attribute_i',
 'contents_attribute_a',
 'contents_attribute_j_1',
 'contents_attribute_j',
 'contents_attribute_c',
 'contents_attribute_k',
 'contents_attribute_l',
 'contents_attribute_d',
 'contents_attribute_m',
 'contents_attribute_e',
 'contents_attribute_h',
 'person_rn',
 'contents_rn',
 'target']

Index(['d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn',
       'h_m_match_yn', 'h_s_match_yn', 'person_attribute_a',
       'person_attribute_a_1', 'person_attribute_b', 'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_e', 'person_prefer_f', 'person_prefer_g',
       'person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3',
       'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e',
       'contents_attribute_h', 'contents_open_dt', 'target'],
      dtype='object')

In [132]:
def woe_encoding(df):
    for col in cols:
        if col in df.columns:
            woe_df = df.groupby(col)['target'].mean()
        # if col in test.columns:
        #     test['{}_woe_encode'.format(col)] = test[col].map(woe_df)
        woe_df = pd.DataFrame(woe_df)
        woe_df = woe_df.rename(columns = {'target':'good'})
        woe_df['bad'] = 1-woe_df.good
        woe_df['bad'] = np.where(woe_df['bad'] == 0 , 1e-6,woe_df['bad'])
        woe_df['WoE'] = np.log(woe_df.good/woe_df.bad)
        if col in df.columns:
            df.loc[:,'{}_woe_encode'.format(col)] = df[col].map(woe_df['WoE'])
            # test데이터에는 train 데이터 트레이닝 시 같이 매핑해준다 . target값이 없으므로
            test.loc[:,'{}_woe_encode',format(col)] = test[col].map(woe_df['WoE'])
    return df

In [130]:
test.loc[:,'{}_woe_encode',format(col)] = test[col].map(woe_df['WoE'])

KeyError: 'Column not found: target'

In [140]:
set(train.columns) - set(test.columns)

{'person_prefer_f_woe_encode',
 'person_prefer_g_woe_encode',
 'target',
 'target_woe_encode'}

KeyError: 'person_prefer_f'

In [None]:
train['ta"rget']= target
woe_df = train.groupby('person_attribute_a_1')['target'].mean()
woe_df = pd.DataFrame(woe_df)

# 칼럼의 이름을 "good" 으로 바꾸어 좀 더 이해하기 쉽게 한다
woe_df = woe_df.rename(columns = {'target':'good'})

woe_df['bad'] = 1 -woe_df.good
#분모에 최소한의 값을 더하여 0으로 나뉘는 일을 막는다
woe_df['bad'] = np.where(woe_df['bad']  == 0, 1e-6,woe_df['bad'])
#WoE를 계산한다
woe_df['WoE'] = np.log(woe_df.good/woe_df.bad)

In [None]:
# 가중치 인코딩이니 좀 더 내가 원했듯이 별 차이 없는 (0이어도 타겟값이 1:1비율로 퍼져있는)
# 그런 변수는 woe값이 아주 낮게 나오는게 아주맘에든다 바로이거다 전체구조를 파악하고 기억할 생각을 안해서 그렇지
# 하나씩 기억을 하고 이루다 개발하면서 로그시스템도 척척 restfulapi 스프링 서비스 구조를 알다보니
# 분명히 이건 이걸껀데 하면서 하나의 웹사이트에서 본 지식이 이해가 안가면 다른 웹사이트에서 본 지식과 내가 알고 있는것을 합쳐서
# 보니 이해할 수 있었다는 것을 얻었다는게 아주중요하다 오늘 WoE 값이 내가 원하던 값
woe_df

In [None]:
locals()['attr_a_1_mean_encode']
locals()['attr_b_mean_encode']
locals()['prefer_e_mean_encode']
locals()['attr_e_mean_encode']

In [None]:
train

In [None]:
attr_a_1_mean_encode

In [None]:
# 각 범주에 대해 , 타깃 = 1 인 확률 (좋은(Good) = 1 일 확률을 찾는다)을 찾는다


#### PR Encoding

In [None]:
pr_df = train.groupby("person_attribute_a_1")['target'].mean()

In [None]:
pr_df

In [None]:
pr_df = pd.DataFrame(pr_df)

In [None]:
pr_df = pr_df.rename(columns = {'target' : 'good'})

In [None]:
pr_df['bad'] = 1 - pr_df.good


In [None]:
pr_df

In [None]:
# 분모에 최소한의 값을 더하여 0으로 나뉘는 일을 막는다

pr_df['bad'] = np.where(pr_df['bad'] == 0 , 1e-6 ,  pr_df['bad'])

In [None]:
#확률 비율을 계산한다

pr_df['PR'] = pr_df.good/pr_df.bad

In [None]:
pr_df

In [None]:
train.loc[:,'PR_Encode'] = train['person_attribute_a_1'].map(pr_df['PR'])

In [None]:
import pandas as pd
import numpy as np

In [None]:
# 훈련 시에 생성된 mapping 값들을 타깃 데이터가주어지지 않는 kaggle competition에 사용함
# 타겟 기반으로 생성된 훈련 타임의 피쳐들을 테스트 데이터에 적용할 수 있도록 . <- 설명

In [None]:
# 이 mean_encode 를 test data 에 활용

#mean_encode

In [None]:
# test 데이터에 그냥 쓰는 것이 아니라 인코딩시켜서 적용한다 바로

test['person_attr_a_1_encode'] =test['person_attribute_a_1'].map(mean_encode)

In [None]:
cols

In [None]:
train.groupby('person_attribute_a_1')['target'].mean()

In [None]:
for col in cols:
    print(train.groupby(col)['target'].mean())

In [None]:
#1227 이 뭔지는 모르지만 많은 값이면서도 target 수치가 높다 어떻게 활용하지?
# d1과 d3의 타겟 값이 많으면서도 높은 값을 차지하니 높게 가중치를 쳐야하는데
train['person_prefer_d_1'].value_counts()

In [None]:


group_data = train.groupby('person_prefer_d_1')['target'].mean()

In [None]:
group_data[group_data.index ==114]

In [None]:
group_data[group_data.index ==102]


In [None]:
# 이렇게 count 많으면서 반반인 feature는 버릴 수 없나
# feature_seletion 을 했다고 하면 되잖아
# 순서형에 대해서는 mean encode를 하고 이런 반반인 값들은
# feature selection 을 통해 버렸다고
# 정확도가 높지만 count 낮은 것은 버린다
# 그래 전체를 기억하자 feature selection 의 개념도 잘못됬었다 그건 피쳐 자체를 선택하냐 마냐의 문제고
# 그렇다면 인코딩을 통해 의미없는 밸류는 의미없도록 하는게 나을 지도
group_data[group_data.index ==1227]
train[train.person_prefer_d_2 == 4]

In [None]:
train['person_prefer_d_2'].value_counts()
train.columns
meta_df
# d_l_match_yn WoE encoding
d_l_match_woe_df = train.groupby("d_l_match_yn")['target'].mean()
d_l_match_woe_df = pd.DataFrame(d_l_match_woe_df)
d_l_match_woe_df
d_l_match_woe_df = d_l_match_woe_df.rename(columns = {'target' : 'good'})
d_l_match_woe_df['bad'] = 1- d_l_match_woe_df.good
d_l_match_woe_df['bad'] = np.where(d_l_match_woe_df['bad'] == 0 , 1e-6 , d_l_match_woe_df['bad'])
d_l_match_woe_df['WoE'] = np.log(d_l_match_woe_df.good/woe_df.bad)

d_l_match_woe_df

# person_attribute_a_1 PR Encoding

pr_df = train.groupby('person_attribute_a_1')['target'].mean()
pr_df = pd.DataFrame(pr_df)
pr_df = pr_df.rename(columns = {'target' : 'good'} )
pr_df['bad'] = 1-pr_df.good
pr_df['bad']  = np.where(pr_df['bad'] ==0 , 1e-6, pr_df['bad'])
# 모델의 가중치가 update 될 때 이렇게 비슷한 값을 커지게 한 것의 변형은
# 의미가 적을 것 같고 log를 씌워 음수 양수 나눈 것은 좀 더 달라지기는 했으므로 의미가 있으려나 오히려 더 극단적으로 될 수도
#
pr_df['PR'] = pr_df.good/pr_df.bad
pr_df

##### all WoE Encoding ######
for col in cols:
    if col in train.columns:
        woe_df = train.groupby(col)['target'].mean()
    if col in test.columns:
        test['{}_woe_encode'.format(col)] = test[col].map(woe_df)
    woe_df = pd.DataFrame(woe_df)
    woe_df = woe_df.rename(columns = {'target':'good'})
    woe_df['bad'] = 1-woe_df.good
    woe_df['bad'] = np.where(woe_df['bad'] == 0 , 1e-6,woe_df['bad'])
    woe_df['WoE'] = np.log(woe_df.good/woe_df.bad)
    if col in train.columns:
        train.loc[:,'{}_woe_encode'.format(col)] = train[col].map(woe_df['WoE'])
train['target']= target
woe_df = train.groupby('person_attribute_a_1')['target'].mean()
woe_df = pd.DataFrame(woe_df)

In [None]:
# 칼럼의 이름을 "good" 으로 바꾸어 좀 더 이해하기 쉽게 한다
woe_df = woe_df.rename(columns = {'target':'good'})
woe_df['bad'] = 1 -woe_df.good
#분모에 최소한의 값을 더하여 0으로 나뉘는 일을 막는다
woe_df['bad'] = np.where(woe_df['bad']  == 0, 1e-6,woe_df['bad'])
#WoE를 계산한다
woe_df['WoE'] = np.log(woe_df.good/woe_df.bad)
# 가중치 인코딩이니 좀 더 내가 원했듯이 별 차이 없는 (0이어도 타겟값이 1:1비율로 퍼져있는)
# 그런 변수는 woe값이 아주 낮게 나오는게 아주맘에든다 바로이거다 전체구조를 파악하고 기억할 생각을 안해서 그렇지
# 하나씩 기억을 하고 이루다 개발하면서 로그시스템도 척척 restfulapi 스프링 서비스 구조를 알다보니
# 분명히 이건 이걸껀데 하면서 하나의 웹사이트에서 본 지식이 이해가 안가면 다른 웹사이트에서 본 지식과 내가 알고 있는것을 합쳐서
# 보니 이해할 수 있었다는 것을 얻었다는게 아주중요하다 오늘 WoE 값이 내가 원하던 값
woe_df

locals()['attr_a_1_mean_encode']
locals()['attr_b_mean_encode']
locals()['prefer_e_mean_encode']
locals()['attr_e_mean_encode']
train


In [None]:
attr_a_1_mean_encode
# 각 범주에 대해 , 타깃 = 1 인 확률 (좋은(Good) = 1 일 확률을 찾는다)을 찾는다
pr_df = train.groupby("person_attribute_a_1")['target'].mean()
pr_df
pr_df = pd.DataFrame(pr_df)
pr_df = pr_df.rename(columns = {'target' : 'good'})
pr_df['bad'] = 1 - pr_df.good
pr_df
# 분모에 최소한의 값을 더하여 0으로 나뉘는 일을 막는다
pr_df['bad'] = np.where(pr_df['bad'] == 0 , 1e-6 ,  pr_df['bad'])
#확률 비율을 계산한다
pr_df['PR'] = pr_df.good/pr_df.bad
pr_df
train.loc[:,'PR_Encode'] = train['person_attribute_a_1'].map(pr_df['PR'])

In [None]:

# 훈련 시에 생성된 mapping 값들을 타깃 데이터가주어지지 않는 kaggle competition에 사용함
# 타겟 기반으로 생성된 훈련 타임의 피쳐들을 테스트 데이터에 적용할 수 있도록 . <- 설명

In [None]:
# 이 mean_encode 를 test data 에 활용

#mean_encode

In [None]:
# test 데이터에 그냥 쓰는 것이 아니라 인코딩시켜서 적용한다 바로


for col in cols:
    print(train.groupby(col)['target'].mean())

#1227 이 뭔지는 모르지만 많은 값이면서도 target 수치가 높다 어떻게 활용하지?
# d1과 d3의 타겟 값이 많으면서도 높은 값을 차지하니 높게 가중치를 쳐야하는데
train['person_prefer_d_1'].value_counts()

In [None]:
group_data = train.groupby('person_prefer_d_1')['target'].mean()

In [None]:
# 이렇게 count 많으면서 반반인 feature는 버릴 수 없나
# feature_seletion 을 했다고 하면 되잖아
# 순서형에 대해서는 mean encode를 하고 이런 반반인 값들은
# feature selection 을 통해 버렸다고
# 정확도가 높지만 count 낮은 것은 버린다
# 그래 전체를 기억하자 feature selection 의 개념도 잘못됬었다 그건 피쳐 자체를 선택하냐 마냐의 문제고
# 그렇다면 인코딩을 통해 의미없는 밸류는 의미없도록 하는게 나을 지도
group_data[group_data.index ==1227]

train[train.person_prefer_d_2 == 4]
train['person_prefer_d_2'].value_counts()


train.columns


In [None]:
meta_df

In [None]:
d_l_match_woe_df = train.groupby("d_l_match_yn")['target'].mean()
d_l_match_woe_df = pd.DataFrame(d_l_match_woe_df)

d_l_match_woe_df
d_l_match_woe_df = d_l_match_woe_df.rename(columns = {'target' : 'good'})
d_l_match_woe_df['bad'] = 1- d_l_match_woe_df.good

In [6]:
d_l_match_woe_df['bad'] = np.where(d_l_match_woe_df['bad'] == 0 , 1e-6 , d_l_match_woe_df['bad'])
d_l_match_woe_df['WoE'] = np.log(d_l_match_woe_df.good/woe_df.bad)
train.groupby("d_l_match_yn")['target'].mean()
d_l_match_woe_df
pr_df = train.groupby('person_attribute_a_1')['target'].mean()
pr_df = pd.DataFrame(pr_df)
pr_df = pr_df.rename(columns = {'target' : 'good'} )
pr_df['bad'] = 1-pr_df.good
pr_df['bad']  = np.where(pr_df['bad'] ==0 , 1e-6, pr_df['bad'])
# 모델의 가중치가 update 될 때 이렇게 비슷한 값을 커지게 한 것의 변형은
# 의미가 적을 것 같고 log를 씌워 음수 양수 나눈 것은 좀 더 달라지기는 했으므로 의미가 있으려나 오히려 더 극단적으로 될 수도
#
pr_df['PR'] = pr_df.good/pr_df.bad
pr_df

NameError: name 'd_l_match_woe_df' is not defined