In [2]:
import pandas as pd
import numpy as np

from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from skopt import BayesSearchCV
from skopt.space import Real, Integer

from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import LabelEncoder


In [3]:
pd.options.display.max_rows = 1000

In [33]:
df0 = pd.read_excel('train_data_LGBM.xlsx')

In [34]:
# 고객별 구매 빈도 확인 위한 pivoting
df_pivot = df0.pivot_table(index = 'Customer_ID', values = 'Product_No', aggfunc = 'count')

In [35]:
# IQR 기준 이상치 판단(Q3 + (Q3 - Q1)*1.5)
df_pivot['Product_No'].describe()

count    34595.000000
mean         2.312473
std          3.523270
min          1.000000
25%          1.000000
50%          2.000000
75%          2.000000
max        337.000000
Name: Product_No, dtype: float64

In [36]:
# 2번 이상 구매 고객 sorting, 이상치 제거
member_index = df_pivot[(df_pivot['Product_No'] > 1)&(df_pivot['Product_No'] < 5)].index
df = df0[df0['Customer_ID'].isin(member_index)]

In [37]:
# CTG, Sub_CTG 인코딩 및 datetime date numeric으로 변환
encoder1 = LabelEncoder()
encoder2 = LabelEncoder()

df.loc[:, 'CTG'] = encoder1.fit_transform(df['CTG'])
df.loc[:, 'Sub_CTG'] = encoder2.fit_transform(df['Sub_CTG'])

df.loc[:, 'Order_Date'] = pd.to_numeric(df['Order_Date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'CTG'] = encoder1.fit_transform(df['CTG'])
  df.loc[:, 'CTG'] = encoder1.fit_transform(df['CTG'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'Sub_CTG'] = encoder2.fit_transform(df['Sub_CTG'])
  df.loc[:, 'Sub_CTG'] = encoder2.fit_transform(df['Sub_CTG'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

In [38]:
# 각 고객의 구매 이력을 시계열 데이터로 변환
X = []
y = []
member_ids = []
maxlen = 0
for member_id in df['Customer_ID'].unique():
    member_df = df[df['Customer_ID'] == member_id].sort_values('Order_Date')
    for i in range(len(member_df) - 1):
        X.append(member_df.iloc[:i+1][['Product_No', 'CTG', 'Sub_CTG' ]].values)
        y.append(member_df.iloc[i+1]['CTG'])
        member_ids.append(member_id)
        maxlen = max(maxlen, i+1)

# 시계열 데이터를 동일한 길이로 패딩
X = pad_sequences(X, maxlen=maxlen, dtype='float32')
X[np.isnan(X)] = 0

In [39]:
# 데이터를 학습/테스트 세트로 분리
X_train, X_test, y_train, y_test, member_ids_train, member_ids_test = train_test_split(X, y, member_ids, test_size=0.2, random_state=0)

# LightGBM 모델 학습
model = LGBMClassifier(num_leaves=56,
                       min_data_in_leaf=30,
                       max_depth=7,
                       bagging_fraction=0.6,
                       feature_fraction=0.75,
                       random_state=0)
model.fit(X_train.reshape(X_train.shape[0], -1), y_train)



In [40]:
# 모델 평가
y_pred = model.predict(X_test.reshape(X_test.shape[0], -1))
accuracy_score(y_pred,y_test)

0.8045405405405406

In [41]:
# 결과값 데이터 프레임 형태로 정리
results_df = pd.DataFrame(columns=['Customer_ID', 'Predicted_CTG'])

for member_id in df['Customer_ID'].unique():
    member_df = df[df['Customer_ID'] == member_id].sort_values('Order_Date')
    
    X2 = member_df[['Product_No','CTG', 'Sub_CTG']].values
    X2 = pad_sequences(X2[np.newaxis,:,:], maxlen=maxlen, dtype='float32')
    X2[np.isnan(X2)] = 0
    X2 = X2.reshape(1, -1)
    y_pred2 = model.predict(X2)
    
    results_df = pd.concat([results_df, pd.DataFrame({'Customer_ID': [member_id], 'Predicted_CTG': [y_pred2[0]]})], ignore_index=True)

In [42]:
# Sector별 구매빈도 데이터프레임 생성
Sector_df = pd.DataFrame(columns=['Customer_ID', 'A', 'B', 'C', 'D', 'E'])

for member_id in df['Customer_ID'].unique():
    member_df2 = df[df['Customer_ID'] == member_id].sort_values('Order_Date')
    
    sector_counts = member_df2['Sector'].value_counts()
    
    Sector_df = pd.concat([Sector_df, pd.DataFrame({
        'Customer_ID': [member_id],
        'A': [sector_counts.get('A', 0)],
        'B': [sector_counts.get('B', 0)],
        'C': [sector_counts.get('C', 0)],
        'D': [sector_counts.get('D', 0)],
        'E': [sector_counts.get('E', 0)]
    })], ignore_index=True)

In [43]:
summary_df = pd.merge(Sector_df,results_df, on = 'Customer_ID', how = 'inner')

In [44]:
summary_df['Predicted_CTG'] = summary_df['Predicted_CTG'].astype(int)
summary_df['Predicted_CTG'] = encoder1.inverse_transform(summary_df['Predicted_CTG'])

In [45]:
summary_df

Unnamed: 0,Customer_ID,A,B,C,D,E,Predicted_CTG
0,972206,1,2,0,0,0,BA
1,180769,0,0,0,2,0,DA
2,263139,0,0,2,1,0,DA
3,117392,1,1,1,0,0,AA
4,534956,1,0,0,1,1,CC
...,...,...,...,...,...,...,...
15174,518530,3,0,0,0,0,AA
15175,830934,0,0,0,4,0,DA
15176,444315,0,0,2,0,0,CA
15177,906368,0,0,0,4,0,DA


In [None]:
# Bayes Search 통한 Hyper Parameter Fine Tuning

from skopt import BayesSearchCV
from skopt.space import Real, Integer

param_grid = {
    'num_leaves': Integer(7, 63),
    'min_data_in_leaf': Integer(1, 30),
    'max_depth': Integer(-1, 7),
    'bagging_fraction': Real(0.6, 0.9),
    'feature_fraction': Real(0.6, 0.9)
}

# 베이지안 최적화
model = LGBMClassifier(random_state=0)
bayes_search = BayesSearchCV(model, param_grid, cv=5)
bayes_search.fit(X_train.reshape(X_train.shape[0], -1), y_train)

best_params = bayes_search.best_params_
print('Best parameters:', best_params)

model = LGBMClassifier(**best_params)
model.fit(X_train.reshape(X_train.shape[0], -1), y_train)

y_pred = model.predict(X_test.reshape(X_test.shape[0], -1))
print('Accuracy:', accuracy_score(y_test, y_pred))