# IMPORT LIBRARY

In [1]:
import faulthandler 
from pyspark.sql import SparkSession 
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DateType, FloatType, TimestampType
from pyspark.sql.functions import col, array_contains, isnan, when, count
from pyspark.sql.functions import lit, concat_ws, concat, collect_list, udf
from pyspark.sql.functions import countDistinct
import plotly.express as px
import os
import seaborn as sns

## spark 접속

In [2]:

# Have the interpreter dump Python tracebacks if the process hits a fatal
# signal, which helps when a crash happens outside Python-level code.
faulthandler.enable()

# Create (or reuse) the Spark session for this notebook, running on the
# 'local' master.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Python Spark SQL Practice')
    .getOrCreate()
)

# DATA LOAD

In [3]:
def search(dirname):
    """Return the joined path of every entry directly inside ``dirname``.

    Each path is ``os.path.join(dirname, entry)`` for the entries reported by
    ``os.listdir`` (in the order it returns them). Every path is also printed,
    one per line. No filtering is applied, so sub-directories and non-CSV
    files are included as well.
    """
    paths = [os.path.join(dirname, entry) for entry in os.listdir(dirname)]
    for path in paths:
        print(path)
    return paths
# Collect the path of every entry in the hard-coded local data directory.
data_path_list = search("D:/workspace/data/")

D:/workspace/data/2019-Dec.csv
D:/workspace/data/2019-Nov.csv
D:/workspace/data/2019-Oct.csv
D:/workspace/data/2020-Apr.csv
D:/workspace/data/2020-Feb.csv
D:/workspace/data/2020-Jan.csv
D:/workspace/data/2020-Mar.csv


## 스키마 타입

In [4]:
# Explicit schema applied when reading the event CSV files.
# Every column is nullable; the ID columns are declared as strings and only
# event_time (timestamp) and price (double) use non-string types.
schema = StructType([
    StructField("event_time", TimestampType(), True),
    StructField("event_type", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("category_id", StringType(), True),
    StructField("category_code", StringType(), True),
    StructField("brand", StringType(), True),
    StructField("price", DoubleType(), True),
    StructField("user_id", StringType(), True),
    StructField("user_session", StringType(), True),
])

## data merge
    - 2019년 10월 ~ 2020년 4월

In [5]:
# Load every listed CSV file into one DataFrame using the explicit schema.
if not data_path_list:
    # Fail early with a clear message: with no input files merged_df would
    # never be bound and later cells would fail with a NameError.
    raise ValueError("data_path_list is empty: no input files to load")

# DataFrameReader.load() accepts a list of paths, so all files are read by a
# single reader instead of building a chain of per-file union() calls.
merged_df = (
    spark.read.format("csv")
    .option("header", True)
    .option("delimiter", ",")
    .schema(schema)
    .load(data_path_list)
)

In [6]:
# Print the first rows of the merged DataFrame as a quick sanity check.
merged_df.show()

+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|  brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+
|2019-12-01 09:00:00|      view|   1005105|2232732093077520756|construction.tool...|  apple|1302.48|556695836|ca5eefc5-11f9-450...|
|2019-12-01 09:00:00|      view|  22700068|2232732091643068746|                NULL|  force| 102.96|577702456|de33debe-c7bf-44e...|
|2019-12-01 09:00:01|      view|   2402273|2232732100769874463|appliances.person...|  bosch| 313.52|539453785|5ee185a7-0689-4a3...|
|2019-12-01 09:00:02|  purchase|  26400248|2053013553056579841|computers.periphe...|   NULL| 132.31|535135317|61792a26-672f-4e6...|
|2019-12-01 09:00:02|      view|  20100164|2232732110089618156|    apparel.t

## spark table 생성

In [7]:
# Register the merged DataFrame as the temp view "ecommerce" so it can be
# queried with spark.sql().
merged_df.createOrReplaceTempView("ecommerce")

# 1. ACQUISITION(고객 유치) 

## 1) DAU (Daily Active User)

### DAU 쿼리

In [8]:
# DAU: number of distinct user_id values per calendar date of event_time,
# ordered by date.
dau = spark.sql("""
SELECT DATE(event_time) AS event_date, COUNT(DISTINCT user_id) AS DAU
FROM ecommerce
GROUP BY event_date
ORDER BY event_date
""")

In [None]:
# Print the first rows of the DAU result.
dau.show()

### DAU 그래프

## 2) MAU(Monthly Active User)

### MAU 쿼리

In [None]:
# MAU: number of distinct user_id values per calendar month of event_time,
# ordered by month.
# The format pattern has to be 'yyyy-MM': in Spark datetime patterns 'MM' is
# month-of-year, whereas lowercase 'mm' is minute-of-hour and would group the
# events by minute instead of by month.
# The count column is labelled MAU to match what it actually measures.
mau = spark.sql("""
SELECT DATE_FORMAT(event_time, 'yyyy-MM') AS event_month, COUNT(DISTINCT user_id) AS MAU
FROM ecommerce
GROUP BY event_month
ORDER BY event_month
""")

In [None]:
# Print the first rows of the MAU result.
mau.show()

### MAU 그래프

# 2. ACTIVATION(활성화)

## 1) DT(Duration Time, 체류 시간)

### DT 쿼리

In [None]:
# Per-session duration: for each user_session, the difference between its
# latest and earliest event_time, together with the month number and the date
# of the session's latest event.
duration = spark.sql('''SELECT user_session, 
                                MONTH(MAX(event_time)) AS DURATION_MONTH,
                                DATE(MAX(event_time)) AS DURATION_DATE,
                               MAX(event_time) - MIN(event_time) AS duration
                  FROM ecommerce 
                  GROUP BY user_session
                  '''
          )

In [None]:
# Print the first rows of the per-session duration result.
duration.show()

## 2) 구매까지 걸리는 DT

### duration 테이블 만들기

In [None]:
# Expose the per-session durations to SQL as the temp view "duration_table".
duration.createOrReplaceTempView('duration_table')

### user_session 별로 view, purchase, cart의 수

In [None]:
# Per-session event counts, one row per user_session with view / cart /
# purchase columns.
# The CTE counts events per (user_session, event_type); the outer query then
# sums those counts into one column per event type, using 0 when a session has
# no events of that type. The result is ordered by user_session.
pivot_session = spark.sql('''WITH source AS (
                                                SELECT user_session, event_type, COUNT(event_time) AS event_count
                                                FROM ecommerce
                                                GROUP BY user_session, event_type
                                                )
                            SELECT 
                                user_session,
                                SUM(CASE WHEN event_type = 'view' THEN event_count ELSE 0 END) AS view,
                                SUM(CASE WHEN event_type = 'cart' THEN event_count ELSE 0 END) AS cart,
                                SUM(CASE WHEN event_type = 'purchase' THEN event_count ELSE 0 END) AS purchase
                            FROM source
                            GROUP BY user_session
                            ORDER BY user_session
                        ''')

In [None]:
# Expose the per-session event counts to SQL as the temp view
# "session_pivot_table".
pivot_session.createOrReplaceTempView('session_pivot_table')

### 구매 이력이 있는 세션의 DT

In [None]:
# Mean session duration restricted to sessions with at least one purchase.
# The subquery keeps sessions whose purchase count is > 0 and is matched to
# duration_table on user_session through the WHERE clause (implicit join).
purchase_session_avg_duration = spark.sql('''
                                                SELECT MEAN(d.duration)
                                                FROM duration_table AS d, 
                                                    (SELECT user_session, purchase
                                                    FROM session_pivot_table
                                                    WHERE purchase > 0)   AS p
                                                WHERE d.user_session = p.user_session
                                        ''')

In [None]:
# Print the mean duration of purchasing sessions.
purchase_session_avg_duration.show()

## 3) 월간 DT

### 월간 DT 쿼리

In [None]:
# Average session duration, grouped by the date of each session's last event.
# The CTE computes one row per user_session (month and date of the last event
# plus last-minus-first event_time); the outer query averages the durations
# per DURATION_DATE and orders the result by that date.
# The select-list items are comma-separated and the final clause is spelled
# ORDER BY; without both the statement cannot be parsed.
# NOTE(review): the variable name says "month", yet the aggregation key is
# DURATION_DATE (daily) and DURATION_MONTH is computed but unused - confirm
# which granularity is intended.
duration_month = spark.sql("""
WITH temp AS (
    SELECT user_session,
    MONTH(MAX(event_time)) AS DURATION_MONTH,
    DATE(MAX(event_time)) AS DURATION_DATE,
    MAX(event_time) - MIN(event_time) AS duration
    FROM ecommerce
    GROUP BY user_session
)
SELECT DURATION_DATE, AVG(duration) AS AVG_DURATION
FROM temp
GROUP BY DURATION_DATE
ORDER BY DURATION_DATE
""")

In [None]:
# Print the first rows of the averaged duration result.
duration_month.show()

## 4) FUNNEL (EVENT_TYPE 별 COUNT)

### FUNNEL 쿼리

In [None]:
# Funnel input: number of events per event_type, largest count first.
funnel = spark.sql("""
                   SELECT 
                        event_type, COUNT(*) AS CNT
                   FROM 
                        ecommerce
                   GROUP BY 
                        event_type
                   ORDER BY
                        CNT DESC
""")

In [None]:
# Print the per-event-type counts.
funnel.show()

In [None]:
# Collect the funnel counts to the driver as a pandas DataFrame for plotting.
df_funnel = funnel.toPandas()

In [None]:
# Bare expression: shows the pandas DataFrame as the notebook cell output.
df_funnel

### FUNNEL 그래프

In [None]:
# Funnel chart with event_type on x and the event count on y; each segment is
# labelled with its value formatted as a thousands-separated integer.
fig = px.funnel(df_funnel, x="event_type", y="CNT").update_traces(
    texttemplate="%{value:,.0f}"  # number format of the segment labels
)
fig.show()

# 3. RETENTION(리텐션)

## 1) Monthly Retention
    - 접속 했는지 여부

### 월간 리텐션 쿼리

In [None]:
# Monthly retention counts.
# CTE "first": earliest event_time per user_id.
# Main query: every event is joined to its user's first interaction; rows are
# grouped by the month number of that first interaction (MONTH) and by the
# floored MONTHS_BETWEEN(event_time, first_interaction_time) (diff_month), and
# distinct users are counted per group.
# NOTE(review): MONTH() returns only the month number, so first interactions
# in the same month of different years would share one group - confirm the
# data never spans more than 12 months.
retention_range_30 = spark.sql("""
                               WITH first as (
                                    SELECT user_id, MIN(event_time) AS first_interaction_time
                                    FROM ecommerce
                                    GROUP BY user_id
                                )

                                SELECT MONTH(first_interaction_time) AS MONTH, FLOOR(MONTHS_BETWEEN(event_time, first_interaction_time)) AS diff_month,
                                COUNT(DISTINCT s.user_id) as user_cnt
                                FROM ecommerce s LEFT JOIN first f
                                ON s.user_id = f.user_id
                                GROUP BY MONTH, FLOOR(MONTHS_BETWEEN(event_time, first_interaction_time))
                                ORDER BY diff_month

                          """)


In [None]:
# Print the first rows of the monthly retention counts.
retention_range_30.show()

### pandas, pivot 전환

In [None]:
# Collect the monthly retention counts to the driver as a pandas DataFrame.
df_retention_30 = retention_range_30.toPandas()

In [None]:
# Reshape to one row per MONTH and one column per diff_month, holding user_cnt.
# reindex() fixes the row order to 10, 11, 12, 1, 2, 3, 4 (presumably the
# chronological order of the Oct 2019 - Apr 2020 data; months absent from the
# data would become all-NaN rows).
df_retention_30_pivot = df_retention_30.pivot(index='MONTH',columns='diff_month',values='user_cnt').reindex([10,11,12,1,2,3,4])

In [None]:
# Bare expression: shows the pivoted count table as the notebook cell output.
df_retention_30_pivot

### value 비율로 변환

In [None]:
# Turn user counts into retention ratios: every row is divided by its own
# first column (presumably diff_month 0, i.e. the cohort size) and rounded to
# 4 decimals. Missing cells stay NaN.
# The division is done on the whole frame at once; a cell-by-cell loop would
# recompute isnull() over the full table for every cell and would have to
# write floats into what may be integer-typed columns.
df_retention_30_pivot = (
    df_retention_30_pivot
    .div(df_retention_30_pivot.iloc[:, 0], axis=0)  # row-wise: align on index
    .round(4)
)


In [None]:
# Save the monthly retention ratio table as a CSV file in the working directory.
# NOTE(review): index=False leaves the pivot's MONTH index out of the file, so
# the rows are written without their cohort label - confirm this is intended.
df_retention_30_pivot.to_csv("월간간리텐션.csv", index=False)

### 월간 리텐션 그래프

In [None]:

# Draw the monthly retention table as a heatmap, annotating each cell with its
# value formatted to 3 decimals.
sns.heatmap(df_retention_30_pivot, annot=True, fmt=".3f")

[문제를 찾자]
전반적으로 10, 11월 retention 좋음

1~3월 폭망

그 이유는 무엇일까???

Q1.전체적인 건수가 줄어서???
그럼 왜 줄어 들었을까????

단순히 연초라서????
Q2.연말에 폭풍 소비 후 연초에 살 필요성이 없음????

아니면 제품군?????
EX. 컴퓨터처럼 장기간 사용하는 물건을 구매해서 다시 방문할 일이 없었던 것은 아닐까???
--> 블랙프라이데이를 이용해 이런 제품들을 구매한 뒤 재방문으로 이어지지 않은 것은 아닐까???




## 2) Weekly Retention

### 주간 리텐션 쿼리

In [None]:
# Weekly retention counts.
# CTE "first": earliest event_time per user_id.
# Main query: every event is joined to its user's first interaction; WEEK is
# the date of that first interaction truncated to the week, diff_week is the
# day difference between the event and the first interaction divided by 7 and
# cast to INT. Distinct users are counted per (WEEK, diff_week).
retention_range_7 = spark.sql("""
                               WITH first as (
                                    SELECT user_id, MIN(event_time) AS first_interaction_time
                                    FROM ecommerce
                                    GROUP BY user_id
                                )
                                SELECT DATE(DATE_TRUNC('week',first_interaction_time)) AS WEEK, 
                                CAST(DATEDIFF(s.event_time, f.first_interaction_time) / 7 AS INT) AS diff_week,
                                COUNT(DISTINCT s.user_id) as user_cnt
                                FROM ecommerce s LEFT JOIN first f
                                ON s.user_id = f.user_id
                                GROUP BY WEEK, diff_week
                                ORDER BY diff_week

                          """)


In [None]:
# Print the first rows of the weekly retention counts.
retention_range_7.show()

### pandas, pivot 전환

In [None]:
# Collect the weekly retention counts to the driver as a pandas DataFrame.
df_retention_7 = retention_range_7.toPandas()

In [None]:
# Show the rows ordered by cohort week, then by week offset. The sorted copy is
# not assigned, so df_retention_7 itself keeps its original order.
df_retention_7.sort_values(by=['WEEK','diff_week'])

In [None]:
# Reshape to one row per WEEK and one column per diff_week, holding user_cnt.
df_retention_7_pivot = df_retention_7.pivot(index='WEEK',columns='diff_week',values='user_cnt')

In [None]:
# Bare expression: shows the pivoted count table as the notebook cell output.
df_retention_7_pivot

### value 비율로 전환

In [None]:
# Turn user counts into retention ratios: every row is divided by its own
# first column (presumably diff_week 0, i.e. the cohort size) and rounded to
# 4 decimals. Missing cells stay NaN.
# The division is done on the whole frame at once; a cell-by-cell loop would
# recompute isnull() over the full table for every cell and would have to
# write floats into what may be integer-typed columns.
df_retention_7_pivot = (
    df_retention_7_pivot
    .div(df_retention_7_pivot.iloc[:, 0], axis=0)  # row-wise: align on index
    .round(4)
)


In [None]:
# Save the weekly retention ratio table as a CSV file in the working directory.
# NOTE(review): index=False leaves the pivot's WEEK index out of the file, so
# the rows are written without their cohort label - confirm this is intended.
df_retention_7_pivot.to_csv("주간간리텐션.csv", index=False)

### 주간 리텐션 그래프

In [None]:

# Draw the weekly retention table as a heatmap, annotating each cell with its
# value formatted to 3 decimals.
sns.heatmap(df_retention_7_pivot, annot=True, fmt=".3f")

# SPARK STOP

In [None]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()