In [None]:
import warnings
import pandas as pd, numpy as np, re, json
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings("ignore")

with open("./results/C_편의점커피라떼_시뮬레이션.json", "r", encoding="utf-8") as f:
    sim_result = json.load(f)

In [54]:
result_df = pd.json_normalize(sim_result['results'])

In [55]:
result_df.columns

Index(['persona_id', 'current_product', 'target_product_info',
       'current_product_info', 'conversion_decision', 'reasoning',
       'raw_response', 'response_time', 'success', 'error', 'timestamp',
       'persona.uuid', 'persona.segment_key_input', 'persona.reasoning',
       'persona.persona_info', 'persona.raw_data.uuid',
       'persona.raw_data.segment_key_input', 'persona.raw_data.reasoning',
       'persona.raw_data.기존사용제품', 'persona.raw_data.가구소득',
       'persona.raw_data.연령대', 'persona.raw_data.가구원수', 'persona.raw_data.성별',
       'persona.raw_data.지역', 'persona.raw_data.교육수준', 'persona.raw_data.직업',
       'persona.raw_data.건강관심도', 'persona.raw_data.가구요리빈도',
       'persona.raw_data.주거형태', 'persona.raw_data.건강투자정도',
       'persona.raw_data.운동여부', 'persona.raw_data.sns사용빈도',
       'persona.raw_data.식료품구입빈도', 'persona.raw_data.1회평균식료품구입금액',
       'persona.raw_data.우유구입기준'],
      dtype='object')

In [56]:
cols = ['persona_id',
        'persona.raw_data.가구소득','persona.raw_data.연령대', 'persona.raw_data.가구원수', 'persona.raw_data.성별', 'persona.raw_data.지역',
        'persona.raw_data.교육수준', 'persona.raw_data.직업', 'persona.raw_data.건강관심도', 'persona.raw_data.가구요리빈도', 'persona.raw_data.주거형태',
        'persona.raw_data.건강투자정도', 'persona.raw_data.운동여부', 'persona.raw_data.sns사용빈도', 'persona.raw_data.식료품구입빈도', 'persona.raw_data.1회평균식료품구입금액', 
        # 'persona.raw_data.가공식품구입빈도', 'persona.raw_data.가공식품구입기준',
        'persona.raw_data.우유구입기준',
        # 'selected_product', 
        'conversion_decision']

In [60]:
cols += ['dd']
cols

['persona_id',
 'persona.raw_data.가구소득',
 'persona.raw_data.연령대',
 'persona.raw_data.가구원수',
 'persona.raw_data.성별',
 'persona.raw_data.지역',
 'persona.raw_data.교육수준',
 'persona.raw_data.직업',
 'persona.raw_data.건강관심도',
 'persona.raw_data.가구요리빈도',
 'persona.raw_data.주거형태',
 'persona.raw_data.건강투자정도',
 'persona.raw_data.운동여부',
 'persona.raw_data.sns사용빈도',
 'persona.raw_data.식료품구입빈도',
 'persona.raw_data.1회평균식료품구입금액',
 'persona.raw_data.우유구입기준',
 'conversion_decision',
 'dd',
 'dd',
 'dd']

In [30]:
data = result_df[cols]

In [31]:
# data['selected_product'].value_counts()
data['conversion_decision'].value_counts()

conversion_decision
0    663
1    337
Name: count, dtype: int64

In [91]:
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

def rf_analysis(sim_path, conversion_sim, random_state = 42, product_cat = None, agg = 'avg',
                grid = {"n_estimators": [50, 100, 200], "max_depth": [None, 4, 8], "min_samples_split": [2, 4, 8]}):
    with open(sim_path, "r", encoding="utf-8") as f:
        sim_result = json.load(f)
        
    result_df = pd.json_normalize(sim_result['results'])
    cols = ['persona_id',
        'persona.raw_data.가구소득','persona.raw_data.연령대', 'persona.raw_data.가구원수', 'persona.raw_data.성별', 'persona.raw_data.지역',
        'persona.raw_data.교육수준', 'persona.raw_data.직업', 'persona.raw_data.건강관심도', 'persona.raw_data.가구요리빈도', 'persona.raw_data.주거형태',
        'persona.raw_data.건강투자정도', 'persona.raw_data.운동여부', 'persona.raw_data.sns사용빈도', 'persona.raw_data.식료품구입빈도', 'persona.raw_data.1회평균식료품구입금액']
    
    if product_cat == '유제품': # 그릭, 라떼
        cols += ['persona.raw_data.우유구입기준']

    elif product_cat == '가공식품': # 맛참, 햄
        cols += ['persona.raw_data.가공식품구입빈도', 'persona.raw_data.가공식품구입기준']
    
    else:
        pass

    if conversion_sim: # 그릭, 라떼, 햄
        cols += ['conversion_decision']
        data = result_df[cols]
    
    else: # 맛참, 참치액
        cols += ['selected_product']
        data = result_df[cols]
        target_map = {'사조 고추할라피뇨참치': 0, '사조참치액': 0, '한라참치액': 0, '동원참치액': 1, '동원맛참 매콤참기름': 1, }
        data["selected_product"] = data["selected_product"].map(target_map)
    
    data.columns = [re.sub('persona.raw_data.', '', col) for col in data.columns]

    X = data.iloc[:,1:-1]
    y = data.iloc[:,-1]
    num_cols = ['1회평균식료품구입금액'] 
    cat_cols = [c for c in X.columns if c not in num_cols]

    ohe = OneHotEncoder(sparse_output = False, drop = 'first')
    nominal_val = ohe.fit_transform(X[cat_cols]).astype('int32')
    X.drop(columns=cat_cols,axis=1,inplace=True)
    nominal_df = pd.DataFrame(nominal_val, columns=ohe.get_feature_names_out(cat_cols))

    X = pd.concat([nominal_df,X],axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2, random_state=42, stratify=y)

    rf_base = RandomForestClassifier(random_state=random_state, n_jobs=-1, class_weight="balanced_subsample")

    if grid is not None:
        print("Grid Searching...")
        gs = GridSearchCV(rf_base, param_grid=grid, cv=4, scoring='accuracy', n_jobs=-1, verbose=1)
        gs.fit(X_train, y_train)
        rf = gs.best_estimator_
        print("Best Params:", gs.best_params_)
    else:
        rf = rf_base.fit(X_train, y_train)

    y_pred_tr_proba = rf.predict_proba(X_train)
    y_pred_tr = rf.predict(X_train)
    y_pred_te_proba = rf.predict_proba(X_test)
    y_pred_te = rf.predict(X_test)

    ce_train = log_loss(y_train,y_pred_tr_proba)
    ce_test = log_loss(y_test, y_pred_te_proba)
    acc_train = accuracy_score(y_train, y_pred_tr)
    acc_test = accuracy_score(y_test, y_pred_te)
    print("Train Cross-Entropy:", ce_train)
    print("Train Acc:",acc_train)
    print("Test Cross-Entropy:", ce_test)
    print("Test Acc:",acc_test)

    rf.fit(X,y)
    importance = pd.DataFrame({"feature": rf.feature_names_in_, "importance": rf.feature_importances_})

    group_map = {c:[c] for c in num_cols}
    for c in cat_cols:
        group_map[c] = [f for f in rf.feature_names_in_ if f.startswith(c + "_")]

    group_imp_agg = []
    for g, cols in group_map.items():
        if agg == 'sum':
            aggs = importance.set_index("feature").reindex(cols)["importance"].fillna(0).sum()
        elif agg == 'avg':
            aggs = importance.set_index("feature").reindex(cols)["importance"].fillna(0).mean()

        group_imp_agg.append({"group": g, "k_features": len(cols), "rf_importance_agg": aggs})

    group_importance = pd.DataFrame(group_imp_agg).sort_values("rf_importance_agg", ascending=False)
    group_importance['rf_importance_agg'] = group_importance['rf_importance_agg']/group_importance['rf_importance_agg'].sum()

    return group_importance


In [None]:
rf_analysis(sim_path = './results/C_편의점커피라떼_시뮬레이션.json', conversion_sim=True, product_cat='유제품', agg='sum')

Grid Searching...
Fitting 4 folds for each of 27 candidates, totalling 108 fits
Best Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Train Cross-Entropy: 0.1633545504502022
Train Acc: 0.99125
Test Cross-Entropy: 0.7632409879494675
Test Acc: 0.715


Unnamed: 0,group,k_features,rf_importance_agg
5,지역,13,0.130389
0,1회평균식료품구입금액,1,0.128317
7,직업,11,0.092497
2,연령대,5,0.090674
15,우유구입기준,6,0.07814
11,건강투자정도,5,0.062149
14,식료품구입빈도,3,0.057585
13,sns사용빈도,2,0.055226
3,가구원수,2,0.054964
8,건강관심도,5,0.046906


In [None]:
rf_analysis(sim_path = './results/C_그릭요거트_시뮬레이션.json', conversion_sim=True, product_cat='유제품', agg='sum')

Grid Searching...
Fitting 4 folds for each of 27 candidates, totalling 108 fits
Best Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Train Cross-Entropy: 0.06289018319527682
Train Acc: 0.9925
Test Cross-Entropy: 0.26655638540752447
Test Acc: 0.905


Unnamed: 0,group,k_features,rf_importance_agg
0,1회평균식료품구입금액,1,0.123229
2,연령대,5,0.101524
13,sns사용빈도,2,0.096785
7,직업,12,0.096473
5,지역,12,0.092001
15,우유구입기준,6,0.067227
14,식료품구입빈도,3,0.062967
9,가구요리빈도,2,0.057464
3,가구원수,2,0.049832
10,주거형태,4,0.048984


In [None]:
rf_analysis(sim_path = './results/B_참치캔_시뮬레이션.json', conversion_sim=False, product_cat='가공식품', agg='sum')

Grid Searching...
Fitting 4 folds for each of 27 candidates, totalling 108 fits
Best Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Train Cross-Entropy: 0.15591542907218017
Train Acc: 0.995
Test Cross-Entropy: 0.6125800792322924
Test Acc: 0.715


Unnamed: 0,group,k_features,rf_importance_agg
5,지역,14,0.148173
0,1회평균식료품구입금액,1,0.108414
7,직업,13,0.098474
2,연령대,5,0.087465
16,가공식품구입기준,8,0.061142
15,가공식품구입빈도,5,0.058573
11,건강투자정도,5,0.057822
14,식료품구입빈도,5,0.056907
10,주거형태,3,0.055878
3,가구원수,2,0.053585


In [None]:
rf_analysis(sim_path = './results/C_스팸_시뮬레이션.json', conversion_sim=True, product_cat='햄', agg='sum')

Grid Searching...
Fitting 4 folds for each of 27 candidates, totalling 108 fits
Best Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Train Cross-Entropy: 0.02202639784004516
Train Acc: 1.0
Test Cross-Entropy: 0.44678453325558365
Test Acc: 0.955


Unnamed: 0,group,k_features,rf_importance_agg
13,sns사용빈도,2,0.141872
2,연령대,5,0.135413
7,직업,13,0.12216
5,지역,14,0.09015
0,1회평균식료품구입금액,1,0.080706
10,주거형태,3,0.075643
11,건강투자정도,5,0.058752
8,건강관심도,4,0.0506
14,식료품구입빈도,5,0.048805
3,가구원수,2,0.041447


In [None]:
rf_analysis(sim_path = './results/B_참치액_시뮬레이션.json', conversion_sim=False, product_cat=None, agg='sum')

Grid Searching...
Fitting 4 folds for each of 27 candidates, totalling 108 fits
Best Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Train Cross-Entropy: 0.1700232402609808
Train Acc: 0.98625
Test Cross-Entropy: 0.5668345069224335
Test Acc: 0.755


Unnamed: 0,group,k_features,rf_importance_agg
5,지역,13,0.153507
0,1회평균식료품구입금액,1,0.150128
2,연령대,5,0.110291
7,직업,13,0.105697
14,식료품구입빈도,4,0.065252
11,건강투자정도,5,0.063677
3,가구원수,2,0.059243
10,주거형태,3,0.059225
13,sns사용빈도,2,0.040772
8,건강관심도,4,0.037817
