## Import thư viện

In [1]:
import pyspark
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, count, when, to_timestamp, split, regexp_replace, row_number, sum, count_distinct
from functools import reduce
import pyspark.pandas as ps



## Khởi tạo Spark Session

In [2]:
spark = SparkSession.builder \
    .appName("Analysis") \
    .getOrCreate()

## Đọc file data

In [3]:
raw_df = spark.read.csv("./data/raw_data.csv", header=True, inferSchema=True)

In [4]:
raw_df.show()

+-----------+-------------+--------------------+--------------------+---------------+--------------------+--------------------+--------+-------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|    category_id|        publish_time|                tags|   views|  likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+--------------------+---------------+--------------------+--------------------+--------+-------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|Jw1Y-zhQURU|     17.14.11|John Lewis Christ...|          John Lewis|  Howto & Style|2017-11-10T07:38:...|"christmas|""john...| 7224515|  55681|   10247|         9479|https://i.ytimg.c...|        

In [5]:
raw_df.describe().show()

+-------+-----------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+-----------------+----------------+----------------------+--------------------+
|summary|   video_id|trending_date|               title|       channel_title|         category_id|        publish_time|                tags|               views|               likes|            dislikes|    comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-------+-----------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+-----------------+----------------+----------------------+--------------------+
|  count|      43185|        39540

In [None]:
def dataframe_info(df):
    print(f"{'-'*40}")
    print(f"Thông tin DataFrame:")
    print(f"Số dòng: {df.count()}")
    print(f"Số cột: {len(df.columns)}")
    print(f"{'-'*40}")
    print("Schema:")
    df.printSchema()
    print(f"{'-'*40}")
    print("Số giá trị null trong mỗi cột:")
    null_counts = df.select([
        count(when(col(c).isNull(), c)).alias(c) for c in df.columns
    ])
    null_counts.show()

In [7]:
dataframe_info(raw_df)

----------------------------------------
DataFrame thông tin:
Số dòng: 43295
Số cột: 16
----------------------------------------
Schema:
root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)

----------------------------------------
Số giá trị null trong mỗi cột:
+--------+-------------+-----+-------------+-----------+------------+----+-----+-----+--------+-

## Tiền xử lí dữ liệu

### Xóa các cột không cần thiết

In [8]:
category_df = raw_df.drop(*['video_id', 'thumbnail_link', 'comments_disabled', 'video_error_or_removed', 'ratings_disabled'])
dataframe_info(category_df)

----------------------------------------
DataFrame thông tin:
Số dòng: 43295
Số cột: 11
----------------------------------------
Schema:
root
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- description: string (nullable = true)

----------------------------------------
Số giá trị null trong mỗi cột:
+-------------+-----+-------------+-----------+------------+----+-----+-----+--------+-------------+-----------+
|trending_date|title|channel_title|category_id|publish_time|tags|views|likes|dislikes|comment_count|description|
+-------------+-----+-------------+-----------+------------+----+-----+-----+--------+-------------+-------

### Xóa các hàng có tất cả các giá trị là Null

In [9]:
category_df = category_df.filter(
    reduce(lambda a, b: a | b, (col(c).isNotNull() for c in category_df.columns))
)
category_df.show()

+-------------+--------------------+--------------------+---------------+--------------------+--------------------+--------+-------+--------+-------------+--------------------+
|trending_date|               title|       channel_title|    category_id|        publish_time|                tags|   views|  likes|dislikes|comment_count|         description|
+-------------+--------------------+--------------------+---------------+--------------------+--------------------+--------+-------+--------+-------------+--------------------+
|     17.14.11|John Lewis Christ...|          John Lewis|  Howto & Style|2017-11-10T07:38:...|"christmas|""john...| 7224515|  55681|   10247|         9479|Click here to con...|
|     17.14.11|Taylor Swift: …Re...| Saturday Night Live|  Entertainment|2017-11-12T06:24:...|"SNL|""Saturday N...| 1053632|  25561|    2294|         2757|Musical guest Tay...|
|     17.14.11|Eminem - Walk On ...|          EminemVEVO|          Music|2017-11-10T17:00:...|"Eminem|""Walk""|...|

In [10]:
dataframe_info(category_df)

----------------------------------------
DataFrame thông tin:
Số dòng: 39540
Số cột: 11
----------------------------------------
Schema:
root
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- description: string (nullable = true)

----------------------------------------
Số giá trị null trong mỗi cột:
+-------------+-----+-------------+-----------+------------+----+-----+-----+--------+-------------+-----------+
|trending_date|title|channel_title|category_id|publish_time|tags|views|likes|dislikes|comment_count|description|
+-------------+-----+-------------+-----------+------------+----+-----+-----+--------+-------------+-------

### Xóa các hàng có trending_date sai định dạng (lỗi data => các giá trị khác trong hàng có nhiều giá trị Null)

In [11]:
category_df = category_df.filter(
    col("trending_date").rlike(r"^\d{2}\.\d{2}\.\d{2}$")
)
dataframe_info(category_df)

----------------------------------------
DataFrame thông tin:
Số dòng: 38806
Số cột: 11
----------------------------------------
Schema:
root
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- description: string (nullable = true)

----------------------------------------
Số giá trị null trong mỗi cột:
+-------------+-----+-------------+-----------+------------+----+-----+-----+--------+-------------+-----------+
|trending_date|title|channel_title|category_id|publish_time|tags|views|likes|dislikes|comment_count|description|
+-------------+-----+-------------+-----------+------------+----+-----+-----+--------+-------------+-------

### Thêm giá trị cho các hàng có cột description có giá trị bằng Null

In [12]:
category_df = category_df.fillna({"description": "No description"})
dataframe_info(category_df)

----------------------------------------
DataFrame thông tin:
Số dòng: 38806
Số cột: 11
----------------------------------------
Schema:
root
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- description: string (nullable = false)

----------------------------------------
Số giá trị null trong mỗi cột:
+-------------+-----+-------------+-----------+------------+----+-----+-----+--------+-------------+-----------+
|trending_date|title|channel_title|category_id|publish_time|tags|views|likes|dislikes|comment_count|description|
+-------------+-----+-------------+-----------+------------+----+-----+-----+--------+-------------+------

### Chuẩn hóa dữ liệu

In [13]:
category_df = category_df.withColumn('trending_date', to_timestamp('trending_date', 'yy.dd.MM'))
category_df = category_df.withColumn('publish_time', to_timestamp('publish_time', "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))
category_df = category_df.withColumn('tags', when(category_df['tags'] == '[none]', '').otherwise(category_df['tags']))
category_df = category_df.withColumn('tags', split(regexp_replace("tags", '"', ""), "\\|"))
category_df.show()

+-------------------+--------------------+--------------------+---------------+-------------------+--------------------+--------+-------+--------+-------------+--------------------+
|      trending_date|               title|       channel_title|    category_id|       publish_time|                tags|   views|  likes|dislikes|comment_count|         description|
+-------------------+--------------------+--------------------+---------------+-------------------+--------------------+--------+-------+--------+-------------+--------------------+
|2017-11-14 00:00:00|John Lewis Christ...|          John Lewis|  Howto & Style|2017-11-10 07:38:29|[christmas, john ...| 7224515|  55681|   10247|         9479|Click here to con...|
|2017-11-14 00:00:00|Taylor Swift: …Re...| Saturday Night Live|  Entertainment|2017-11-12 06:24:44|[SNL, Saturday Ni...| 1053632|  25561|    2294|         2757|Musical guest Tay...|
|2017-11-14 00:00:00|Eminem - Walk On ...|          EminemVEVO|          Music|2017-11-10 

In [14]:
dataframe_info(category_df)

----------------------------------------
DataFrame thông tin:
Số dòng: 38806
Số cột: 11
----------------------------------------
Schema:
root
 |-- trending_date: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: timestamp (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- description: string (nullable = false)

----------------------------------------
Số giá trị null trong mỗi cột:
+-------------+-----+-------------+-----------+------------+----+-----+-----+--------+-------------+-----------+
|trending_date|title|channel_title|category_id|publish_time|tags|views|likes|dislikes|comment_count|description|
+-------------+-----+-------------+-----------+-----