In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# 데이터 불러오기
data = pd.read_csv('./data_in/total_df.csv')

# 날짜 컬럼을 datetime 형식으로 변환
date_columns = ['Order_purchase_timestamp', 'Order_delivered_carrier_date', 'Order_delivered_customer_date',
                'Order_estimated_delivery_date', 'Review_creation_date', 'Review_answer_timestamp']
for col in date_columns:
    data[col] = pd.to_datetime(data[col])

# 1. 판매하는 품목의 평균 가격대
category_price_avg = data.groupby('Product_category_name')['Price'].mean().rename('Avg_category_price')
data = data.join(category_price_avg, on='Product_category_name')

# 2. 동일한 상품에 대한 가격 경쟁력
data['Price_competitiveness'] = data['Price'] - data['Avg_category_price']

# 3. 판매자의 평균 리뷰 점수
seller_review_avg = data.groupby('Seller_id')['Review_score'].mean().rename('Avg_seller_review')
data = data.join(seller_review_avg, on='Seller_id')

# 4. 상품의 평균 리뷰 점수
product_review_avg = data.groupby('Product_id')['Review_score'].mean().rename('Avg_product_review')
data = data.join(product_review_avg, on='Product_id')

# 5. 셀러에 대한 고객 충성도 (고유 고객 수 대비 반복 구매 비율)
repeat_customers = data.groupby('Seller_id')['Customer_id'].value_counts().gt(1).groupby('Seller_id').mean().rename('Customer_loyalty')
data = data.join(repeat_customers, on='Seller_id')

# 6. 한 주문에 포함된 상품의 개수
items_per_order = data.groupby('Order_id')['Order_item_id'].max().rename('Items_per_order')
data = data.join(items_per_order, on='Order_id')

# 7. 한 주문 당 결제한 가격
total_payment_per_order = data.groupby('Order_id')['Payment_value'].sum().rename('Total_payment_per_order')
data = data.join(total_payment_per_order, on='Order_id')

# 8. 배송 속도 (주문 날짜부터 고객이 상품을 받은 날짜까지의 일수)
data['Delivery_speed'] = (data['Order_delivered_customer_date'] - data['Order_purchase_timestamp']).dt.days

# 9. 리뷰 응답 속도 (리뷰 작성일부터 판매자 응답일까지의 시간)
data['Review_response_time'] = (data['Review_answer_timestamp'] - data['Review_creation_date']).dt.days


  data = pd.read_csv('./data_in/total_df.csv')


In [5]:
### 2. 랜덤포레스트!!


    # 고객 당 총 결제 금액 계산
    #customer_total_spending = data.groupby('Customer_id')['Payment_value'].sum().rename('Total_spending')

    # 데이터 프레임에 결제 금액 추가
    #data = data.join(customer_total_spending, on='Customer_id')

# 필요한 특성 선택
features = [
    'Avg_category_price', 'Price_competitiveness', 'Avg_seller_review', 'Avg_product_review',
    'Customer_loyalty', 'Items_per_order', 'Delivery_speed', 'Review_response_time'
]
X = data[features]
y = data['Total_payment_per_order']

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 생성 및 훈련
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 예측 및 성능 평가
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print('Mean Squared Error:', mse)

# 변수 중요도
feature_importances = model.feature_importances_
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
importance_df.sort_values(by='Importance', ascending=False, inplace=True)

Mean Squared Error: 11010.86690148411 r2:  [  30.9981       73.6159      304.13692571 ...   49.21449286 1649.983
   79.17814476]


#### LTV에 영향을 주는 요인

In [4]:
importance_df

Unnamed: 0,Feature,Importance
5,Items_per_order,0.51039
1,Price_competitiveness,0.194242
2,Avg_seller_review,0.154949
0,Avg_category_price,0.054474
7,Review_response_time,0.036879
3,Avg_product_review,0.034587
4,Customer_loyalty,0.008927
6,Delivery_speed,0.005552


#### corr() 메서드로 계산한 LTV와 매출간의 상관계수

In [6]:
ltv_to_revenue = data[['Total_payment_per_order', 'Revenue']].corr()

print(ltv_to_revenue)

                         Total_payment_per_order   Revenue
Total_payment_per_order                 1.000000  0.479854
Revenue                                 0.479854  1.000000


#### 선형 회귀 분석으로 계산한 LTV와 매출간의 연관성

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import random

In [10]:
# 'Total_payment_per_order'과 'Revenue' 컬럼 확인 및 결측치 제거
data = data[['Total_payment_per_order', 'Revenue']].dropna()

# 데이터 분할
X = data[['Total_payment_per_order']]
y = data['Revenue']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 생성 및 훈련
model = LinearRegression()
model.fit(X_train, y_train)

# 예측 및 성능 평가
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print('Mean Squared Error:', mse)
print('R2 Score:', r2)

Mean Squared Error: 37532.3893728115
R2 Score: 0.1353071718360095
