<a href="https://colab.research.google.com/github/chaeun6-cmd/New-repository/blob/main/%EB%8D%B0%EC%9D%B4%ED%84%B0%ED%86%A4_02_20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.formula.api as smf
import statsmodels.api as sm

from scipy.stats import chi2_contingency
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, roc_auc_score
)
from sklearn.model_selection import cross_val_score, KFold
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.inspection import permutation_importance, partial_dependence

import warnings
warnings.filterwarnings('ignore')


In [None]:
data = pd.read_excel("/content/Online_Retail.xlsx")

data.info()
data.isnull().any()
data.head()
data.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


Unnamed: 0,Quantity,InvoiceDate,UnitPrice,CustomerID
count,541909.0,541909,541909.0,406829.0
mean,9.55225,2011-07-04 13:34:57.156386048,4.611114,15287.69057
min,-80995.0,2010-12-01 08:26:00,-11062.06,12346.0
25%,1.0,2011-03-28 11:34:00,1.25,13953.0
50%,3.0,2011-07-19 17:17:00,2.08,15152.0
75%,10.0,2011-10-19 11:27:00,4.13,16791.0
max,80995.0,2011-12-09 12:50:00,38970.0,18287.0
std,218.081158,,96.759853,1713.600303


In [None]:
# 고객 결측 제거
data = data[data["CustomerID"].notna()]

# 수량/가격 필터링
data = data[(data["Quantity"] > 0) & (data["UnitPrice"] > 0)]

# 문자열 변환 (에러 방지)
data["StockCode"] = data["StockCode"].astype(str)
data["Description"] = data["Description"].astype(str)

# 비상품(서비스 항목-배송비, 운송비, 은행 수수료, 온라인 배송비, 샘플 상품) 키워드 정의
service_keywords = ["POST", "CARRIAGE", "BANK", "DOTCOM", "SAMPLE"]

pattern = "|".join(service_keywords)

# 비상품 제거
df = data[
    ~data["StockCode"].str.contains(pattern, case=False, na=False) &
    ~data["Description"].str.contains(pattern, case=False, na=False)
]

In [None]:
# 매출 컬럼  생성
data["TotalAmount"] = data["Quantity"] * data["UnitPrice"]

In [None]:
# 상품 데이터 정형화(문자열 변환)
data["Description"] = data["Description"].astype(str)

# 대문자 통일
data["Description"] = data["Description"].str.upper()

# 앞뒤 공백 제거
data["Description"] = data["Description"].str.strip()

# 여러 공백을 하나로
data["Description"] = data["Description"].str.replace(r"\s+", " ", regex=True)

In [None]:
# 매출 컬럼 확인
data["TotalAmount"].describe()

Unnamed: 0,TotalAmount
count,397884.0
mean,22.397
std,309.071041
min,0.001
25%,4.68
50%,11.8
75%,19.8
max,168469.6


In [None]:
# 중복행 제거 (매출 왜곡 방지)
data.duplicated().sum()

np.int64(5192)

In [None]:
# 기준일 생성(최근성을 계산하기 위해)
reference_date = data["InvoiceDate"].max() + pd.Timedelta(days=1)

In [None]:
# 가장 최근 구매 날짜/몇 번 주문했는지/총 구매 금액
rfm = data.groupby("CustomerID").agg({
    "InvoiceDate": "max",
    "InvoiceNo": "nunique",
    "TotalAmount": "sum"
}).reset_index()

In [None]:
# 생성된 기준일을 기준으로 계산
rfm["Recency"] = (reference_date - rfm["InvoiceDate"]).dt.days

In [None]:
# 컬럼 이름 변환
rfm = rfm.rename(columns={
    "InvoiceNo": "Frequency",
    "TotalAmount": "Monetary"
})

In [None]:
rfm.head()

Unnamed: 0,CustomerID,InvoiceDate,Frequency,Monetary,Recency
0,12346.0,2011-01-18 10:01:00,1,77183.6,326
1,12347.0,2011-12-07 15:52:00,7,4310.0,2
2,12348.0,2011-09-25 13:13:00,4,1797.24,75
3,12349.0,2011-11-21 09:51:00,1,1757.55,19
4,12350.0,2011-02-02 16:01:00,1,334.4,310


In [None]:
rfm.tail()

Unnamed: 0,CustomerID,InvoiceDate,Frequency,Monetary,Recency
4333,18280.0,2011-03-07 09:52:00,1,180.6,278
4334,18281.0,2011-06-12 10:53:00,1,80.82,181
4335,18282.0,2011-12-02 11:43:00,2,178.05,8
4336,18283.0,2011-12-06 12:02:00,16,2094.88,4
4337,18287.0,2011-10-28 09:29:00,3,1837.28,43
