In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


In [None]:
# Read the source CSV (path is relative to the notebook's working directory) into `df`.
df = pd.read_csv(filepath_or_buffer='./data_in/total_df.csv')

In [None]:
# Columns retained for the analysis; any other column in the frame is dropped.
column_filter = [
    'Order_id',
    'Product_id',
    'Seller_id',
    'Price',
    'Freight_value',
    'Customer_id',
    'Product_category_name',
    'Revenue',
    'Customer_city',
    'Order_purchase_timestamp',
    'Payment_sequential',
    'Payment_type',
    'Payment_installments',
    'Payment_value',
    'Review_score',
]

# Narrow the frame to the selected columns, in the order listed above.
df = df.loc[:, column_filter]

### 도시별 리뷰 점수 확인

In [None]:
# Number of rows whose review score is missing.
df['Review_score'].isnull().sum()

In [None]:
# Mean review score per customer city, highest-rated cities first.
city_review = pd.pivot_table(
    df,
    index='Customer_city',
    values='Review_score',
    aggfunc='mean',
)
city_review = city_review.dropna()
city_review = city_review.sort_values(by='Review_score', ascending=False)

In [None]:
# Number of rows per customer city, as a one-column frame indexed by city.
# The index and column names are set explicitly: before pandas 2.0 `value_counts()`
# left the index unnamed and named the values after the source column, so the merge
# on 'Customer_city' and the sort on 'count' further down only worked on pandas >= 2.0.
freq = (
    df['Customer_city']
    .value_counts()
    .rename_axis('Customer_city')
    .to_frame(name='count')
)

In [None]:
# Attach each city's row count to its mean review score.
# A left join keeps every city present in city_review.
city_review_total = city_review.merge(freq, how='left', on='Customer_city')

In [None]:
# The 20 cities with the most rows, shown with their mean review score.
city_review_total.sort_values(by='count', ascending=False).iloc[:20]

### 매출 구간별 리뷰 점수 확인

In [None]:
# The 15 largest Revenue values.
df['Revenue'].sort_values(ascending=False).iloc[:15]

In [None]:
# Split revenue into equal-width bins.
# 1000 evenly spaced edges between the min and max Revenue give 999 bins.
bins = np.linspace(df['Revenue'].min(), df['Revenue'].max(), 1000)

# include_lowest=True closes the first interval on the left. With pd.cut's default
# (right-closed, left-open) intervals, rows holding the minimum Revenue fall outside
# every bin and are assigned NaN.
df['Revenue_bin'] = pd.cut(df['Revenue'], bins, include_lowest=True)

df['Revenue_bin']

In [None]:
# Count of non-null review scores per revenue bin; show the first 31 rows of the result.
pd.pivot_table(
    df,
    index='Revenue_bin',
    values='Review_score',
    aggfunc='count',
).iloc[:31]

### 월, 일별 매출 데이터

In [None]:
from datetime import datetime
import seaborn as sns

In [None]:
# Inspect the Python type of the purchase timestamp stored at row label 0.
type(df.loc[0, 'Order_purchase_timestamp'])

In [None]:
# Purchase date formatted as 'YYYYMMDD'. Despite the column name, the value keeps the
# day component, which the per-day revenue aggregations later in the notebook rely on.
# The column is parsed in one vectorized call instead of calling pd.to_datetime once
# per row inside a lambda.
df['YearMonth'] = pd.to_datetime(df['Order_purchase_timestamp']).dt.strftime('%Y%m%d')

In [None]:
# Print a concise summary of the derived date column.
df.loc[:, 'YearMonth'].info()

In [None]:
# Boolean row masks, one per month, matched on the 'YYYYMM' prefix of the date string.
_201810, _201811, _201812 = (
    df['YearMonth'].str.startswith(month_prefix)
    for month_prefix in ('201810', '201811', '201812')
)

In [None]:
# Rows whose date string starts with 201810, 201811 or 201812.
df_target_2018 = df.loc[_201810 | _201811 | _201812]

In [None]:
# Rows whose date string starts with 201811 only.
df_target_2018_1 = df.loc[_201811]

In [None]:
# Total revenue per day for the 201811 rows, ordered from the lowest-revenue day upward.
pd.pivot_table(
    df.loc[_201811],
    index='YearMonth',
    values='Revenue',
    aggfunc='sum',
).sort_values(by='Revenue')

In [None]:
# Korean text support for plots.
# NOTE(review): `font` looks like a project-local module imported only for its side
# effects (presumably registering a Korean-capable matplotlib font) — confirm.

import font

In [None]:
# Revenue summed per day for the 201811 rows, as a two-column frame ready for plotting.
daily_revenue = df_target_2018_1.groupby('YearMonth', as_index=False)['Revenue'].sum()


In [None]:

# Line chart of daily revenue, with a marker on every day.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=daily_revenue, x='YearMonth', y='Revenue', marker='o', ax=ax)
ax.set_title('일별 매출 추이')
ax.set_xlabel('년월일')
ax.set_ylabel('매출')
ax.tick_params(axis='x', labelrotation=45)  # tilt the date labels
ax.grid(True)
plt.show()

In [None]:
# Boolean row masks, one per month, matched on the 'YYYYMM' prefix of the date string.
_201905, _201906, _201907 = (
    df['YearMonth'].str.startswith(month_prefix)
    for month_prefix in ('201905', '201906', '201907')
)

In [None]:
# Rows whose date string starts with 201905, 201906 or 201907.
df_target_2019 = df.loc[_201905 | _201906 | _201907]

In [None]:
# Revenue summed per day across the 201905–201907 rows, as a two-column frame for plotting.
# Note: this rebinds `daily_revenue`, replacing the frame built earlier in the notebook.
daily_revenue = df_target_2019.groupby('YearMonth', as_index=False)['Revenue'].sum()

In [None]:

# Line chart of daily revenue, with a marker on every day (wider canvas for the longer range).
fig, ax = plt.subplots(figsize=(15, 8))
sns.lineplot(data=daily_revenue, x='YearMonth', y='Revenue', marker='o', ax=ax)
ax.set_title('일별 매출 추이')
ax.set_xlabel('년월일')
ax.set_ylabel('매출')
ax.tick_params(axis='x', labelrotation=45)  # tilt the date labels
ax.grid(True)
plt.show()

In [None]:
# Preview the Customer_id column of df.
df.loc[:, 'Customer_id']

In [None]:
# Read the second CSV (path is relative to the notebook's working directory) into `prep`.
prep = pd.read_csv(filepath_or_buffer='./data_in/prep.csv')

In [None]:
# Summary statistics of the Customer_unique_id column in prep.
prep.loc[:, 'Customer_unique_id'].describe()

In [None]:
# Summary statistics of the Customer_id column in df.
df.loc[:, 'Customer_id'].describe()