# IMPORT LIBRARY

In [11]:
import faulthandler 
from pyspark.sql import SparkSession 
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DateType, FloatType, TimestampType
from pyspark.sql.functions import col, array_contains, isnan, when, count
from pyspark.sql.functions import lit, concat_ws, concat, collect_list, udf
from pyspark.sql.functions import countDistinct
import plotly.express as px
import os

## spark 접속

In [12]:

# Enable faulthandler before the session is created so that a hard crash
# still produces a Python traceback.
faulthandler.enable()

# Reuse an existing session if there is one, otherwise start a local one.
# NOTE(review): 'local' runs Spark with a single worker thread; 'local[*]'
# would use every core — confirm which is intended for this data volume.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Python Spark SQL Practice')
    .getOrCreate()
)

# DATA LOAD

In [13]:
def search(dirname):
    """Return the full paths of all entries in ``dirname``, sorted by name.

    Every path is printed as it is collected so the notebook shows which
    inputs were found. Entries are not filtered: sub-directories and
    non-CSV files are returned as well.

    Args:
        dirname: Directory to list.

    Returns:
        A list of ``os.path.join(dirname, entry)`` strings in sorted order.

    Raises:
        OSError: If ``dirname`` does not exist or cannot be read.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # result (and everything built from it) reproducible across platforms.
    file_list = [
        os.path.join(dirname, filename)
        for filename in sorted(os.listdir(dirname))
    ]
    for full_filename in file_list:
        print(full_filename)
    return file_list
# Collect the paths of the input files in the local data directory.
data_path_list = search("D:/workspace/data/")

D:/workspace/data/2019-Dec.csv
D:/workspace/data/2019-Nov.csv
D:/workspace/data/2019-Oct.csv
D:/workspace/data/2020-Apr.csv
D:/workspace/data/2020-Feb.csv
D:/workspace/data/2020-Jan.csv
D:/workspace/data/2020-Mar.csv


## 스키마 타입

In [14]:
# Explicit schema for the event CSVs. Every column is nullable, and the
# identifier columns (product, category, user, session) are kept as strings.
schema = StructType([
    StructField("event_time", TimestampType(), True),
    StructField("event_type", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("category_id", StringType(), True),
    StructField("category_code", StringType(), True),
    StructField("brand", StringType(), True),
    StructField("price", DoubleType(), True),
    StructField("user_id", StringType(), True),
    StructField("user_session", StringType(), True),
])

## DATA MERGE
    - 2019년 10월 ~ 2020년 4월

In [15]:
# Load all input CSV files into a single DataFrame.
# Fail early with a clear message: without any input file there is nothing
# to load and every later cell would break on a missing/empty merged_df.
if not data_path_list:
    raise ValueError("data_path_list is empty: no CSV files to load")

# DataFrameReader.load accepts a list of paths, so all files are read in one
# pass with the same options and schema instead of reading each file
# separately and chaining union() calls.
merged_df = (
    spark.read.format("csv")
    .option("header", True)
    .option('delimiter', ',')
    .schema(schema)
    .load(data_path_list)
)

In [16]:
# Preview the merged DataFrame to sanity-check the load.
merged_df.show()

+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|  brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+
|2019-12-01 09:00:00|      view|   1005105|2232732093077520756|construction.tool...|  apple|1302.48|556695836|ca5eefc5-11f9-450...|
|2019-12-01 09:00:00|      view|  22700068|2232732091643068746|                NULL|  force| 102.96|577702456|de33debe-c7bf-44e...|
|2019-12-01 09:00:01|      view|   2402273|2232732100769874463|appliances.person...|  bosch| 313.52|539453785|5ee185a7-0689-4a3...|
|2019-12-01 09:00:02|  purchase|  26400248|2053013553056579841|computers.periphe...|   NULL| 132.31|535135317|61792a26-672f-4e6...|
|2019-12-01 09:00:02|      view|  20100164|2232732110089618156|    apparel.t

## spark table 생성

In [17]:
# Register the merged data as the temp view "ecommerce" so the queries below
# can reference it from Spark SQL.
merged_df.createOrReplaceTempView("ecommerce")

# 1. 카테고리 id가 NULL 값인 것들

In [18]:
# Count the rows whose category_id is NULL.
eda1 = spark.sql("""SELECT COUNT(*) AS CNT
FROM ecommerce
WHERE category_id is NULL""")

In [19]:
# Display the NULL category_id count.
eda1.show()

+---+
|CNT|
+---+
|  0|
+---+



# 2. remove_from_cart 항목이 있는지 여부

In [20]:
# Count the 'remove_from_cart' events; a result of 0 means that event type
# does not occur in the data.
eda2 = spark.sql("""SELECT COUNT(*) AS CNT
FROM ecommerce
WHERE event_type = 'remove_from_cart'""")

In [None]:
# Display the 'remove_from_cart' event count.
eda2.show()

# 3. 카테고리 아이디 별 평균 물품 금액

In [None]:
# Average price over all events (every event type) for each
# (category_code, category_id) pair.
eda3 = spark.sql("""SELECT category_id, category_code, AVG(price) AS AVG_PRICE_PER_CATEGORY
FROM ecommerce
GROUP BY category_code, category_id""")

In [None]:
# Display the average price per category.
eda3.show()

# 4. 카테고리 아이디 별 평균 구매 물품 금액

In [None]:
# Average price of purchased items ('purchase' events only) for each
# (category_code, category_id) pair.
# category_code is selected without being aggregated, so it has to be part
# of the GROUP BY as well; otherwise Spark rejects the query at analysis
# time. The grouping matches the all-events query (eda3).
eda4 = spark.sql("""SELECT category_id, category_code, AVG(price) AS AVG_PRICE_PER_CATEGORY
FROM ecommerce
WHERE event_type = 'purchase'
GROUP BY category_code, category_id""")

In [None]:
# Display the average purchase price per category.
eda4.show()

# 5. 월별 USER 수

In [None]:
# Number of distinct users per calendar month.
# NOTE(review): the grouping key is the month number only (no year), so the
# same month from two different years would be merged into one row — confirm
# the data never spans more than 12 months.
eda5 = spark.sql("""
                               SELECT MONTH(event_time) AS MONTH, COUNT(DISTINCT user_id) AS CNT
                              FROM ecommerce
                              GROUP BY MONTH
                          """)



In [None]:
# Display the monthly distinct-user counts.
eda5.show()

# 6. EVENT_TYPE 비율

In [None]:
# Number of events per event_type; used below to plot the event-type shares.
eda6 = spark.sql("SELECT event_type, count(*) AS count FROM ecommerce GROUP BY event_type")

In [None]:
# Display the event counts per event type.
eda6.show()

In [None]:
import matplotlib.pyplot as plt

# Convert the aggregated Spark DataFrame (one row per event type) to pandas
# so it can be plotted with matplotlib.
pandas_df = eda6.toPandas()

# Order the event types along the funnel: view -> cart -> purchase.
# Only types that actually occur are kept (reindexing on a missing label
# would insert a NaN count), and any other type found in the data is
# appended instead of being silently dropped from the charts.
funnel_order = ['view', 'cart', 'purchase']
observed_types = set(pandas_df['event_type'].dropna())
ordered_types = [t for t in funnel_order if t in observed_types]
ordered_types += sorted(observed_types - set(funnel_order))
pandas_df = pandas_df.set_index('event_type').reindex(ordered_types).reset_index()

# Pie chart (left panel): share of each event type
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.pie(pandas_df['count'], labels=pandas_df['event_type'], autopct='%1.1f%%', startangle=140)
plt.title('Event Type Distribution (Pie Chart)')

# Bar chart (right panel): absolute count of each event type
plt.subplot(1, 2, 2)
plt.bar(pandas_df['event_type'], pandas_df['count'], color=['skyblue', 'lightgreen', 'salmon'])
plt.title('Event Type Counts (Bar Chart)')
plt.xlabel('Event Type')
plt.ylabel('Count')
plt.xticks(rotation=45)

# Render both panels
plt.tight_layout()
plt.show()


# 7. user_session 별 view, purchase, cart의 수

In [13]:
# Per-session counts of view / cart / purchase events.
# The CTE counts events (rows with a non-NULL event_time) for each
# (user_session, event_type) pair; the outer query pivots those counts into
# one row per session with a column per event type, sorted by session id.
eda7 = spark.sql('''WITH source AS (
                                                SELECT user_session, event_type, COUNT(event_time) AS event_count
                                                FROM ecommerce
                                                GROUP BY user_session, event_type
                                                )
                            SELECT 
                                user_session,
                                SUM(CASE WHEN event_type = 'view' THEN event_count ELSE 0 END) AS view,
                                SUM(CASE WHEN event_type = 'cart' THEN event_count ELSE 0 END) AS cart,
                                SUM(CASE WHEN event_type = 'purchase' THEN event_count ELSE 0 END) AS purchase
                            FROM source
                            GROUP BY user_session
                            ORDER BY user_session
                        ''')

In [None]:
# Display the per-session event counts.
eda7.show()

# 8. user_session의 NULL 여부 및 수

In [None]:
# Count the rows whose user_session is NULL.
eda8 = spark.sql("""
SELECT COUNT(*) AS CNT
FROM ecommerce
WHERE user_session is NULL
""")

In [None]:
# Display the NULL user_session count.
eda8.show()

# 9. 카테고리 개수

## 카테고리 대분류

In [None]:
# Distinct top-level categories: the part of category_code before the first '.'.
# NOTE(review): the result column has no alias, and rows with a NULL
# category_code presumably show up as a NULL entry — confirm if that matters.
eda9_1 = spark.sql("""SELECT DISTINCT (SUBSTRING_INDEX(category_code, '.', 1)) 
                   FROM ecommerce""")

In [None]:
# Display the distinct top-level categories.
eda9_1.show()

## 카테고리 대분류 개수

In [None]:
# Number of distinct top-level categories (category_code up to the first '.').
eda9_2 = spark.sql("""SELECT COUNT(DISTINCT (SUBSTRING_INDEX(category_code, '.', 1))) AS CNT 
                   FROM ecommerce""")

In [None]:
# Display the number of top-level categories.
eda9_2.show()

## 카테고리 중분류

In [None]:
# Distinct mid-level categories: category_code up to the second '.'.
# NOTE(review): the column is aliased "cnt" although it holds category names,
# not a count — consider a more descriptive alias.
eda9_3= spark.sql("""SELECT DISTINCT (SUBSTRING_INDEX(category_code, '.', 2)) AS cnt 
          FROM ecommerce""")

In [None]:
# Display the distinct mid-level categories.
eda9_3.show()

## 카테고리 중분류 개수

In [None]:
# Number of distinct mid-level categories (category_code up to the second '.').
eda9_4 = spark.sql("""SELECT COUNT(DISTINCT (SUBSTRING_INDEX(category_code, '.', 2))) AS cnt 
          FROM ecommerce""")

In [None]:
# Display the number of mid-level categories.
eda9_4.show()

# 10. 요일별 판매량, 판매액, paying User
DAYOFWEEK => 일요일 1, 월요일 2, 화요일 3, 수요일 4, 목요일 5, 금요일 6, 토요일 7

In [None]:
# Purchases per day of the week: number of purchase events, total revenue
# (rounded down) and number of distinct paying users, busiest day first.
# DAYOFWEEK returns an integer (1 = Sunday ... 7 = Saturday), so the CASE
# branches compare against integer literals instead of relying on an
# implicit string-to-integer cast.
eda10 = spark.sql("""
    SELECT CASE DAYOFWEEK(event_time)
               WHEN 1 THEN 'SUN'
               WHEN 2 THEN 'MON'
               WHEN 3 THEN 'TUE'
               WHEN 4 THEN 'WED'
               WHEN 5 THEN 'THU'
               WHEN 6 THEN 'FRI'
               WHEN 7 THEN 'SAT'
           END AS dayOfweek,
           COUNT(*) AS total_sales_amount,
           FLOOR(SUM(price)) AS total_sales_price,
           COUNT(DISTINCT user_id) AS paying_user_cnt
    FROM ecommerce
    WHERE event_type = 'purchase'
    GROUP BY dayOfweek
    ORDER BY total_sales_amount DESC
""")

In [None]:
# Display the per-weekday purchase statistics.
eda10.show()

# 11. 월별 판매량, 판매액, paying User

In [None]:
# Purchases per calendar month: number of purchase events, total revenue
# (rounded down) and number of distinct paying users, busiest month first.
# NOTE(review): the grouping key is the month number only (no year), so the
# same month from two different years would be merged into one row — confirm
# the data never spans more than 12 months.
eda11 = spark.sql("""
                                SELECT MONTH(event_time) AS MONTH,
                                        COUNT(*) AS total_sales_amount,
                                        FLOOR(SUM(price)) AS total_sales_price,
                                        COUNT(DISTINCT user_id) AS paying_user_cnt
                                FROM ecommerce
                                WHERE event_type = 'purchase'
                                GROUP BY MONTH
                                ORDER BY total_sales_amount DESC
                             """)

In [None]:
# Display the per-month purchase statistics.
eda11.show()

# SPARK STOP

In [None]:
# Stop the Spark session now that the analysis is finished.
spark.stop()