# EDA and Feature Engineering Using Spark

In [1]:
# Built-in libraries
import collections
import itertools
import re
from typing import Any
import json

import numpy as np
import pandas as pd


# PySpark Modules
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import (
    ArrayType,
    DoubleType,
    StringType,
    IntegerType,
    FloatType,
    StructType,
    StructField,
    LongType,
)

# Black formatter (optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
# Create Spark Session
spark = SparkSession.builder.appName("EDA").getOrCreate()

23/07/09 13:51:35 WARN Utils: Your hostname, Chinedus-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.151 instead (on interface en0)
23/07/09 13:51:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/09 13:51:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load data
fp = "../data/udemy_data.csv"
raw_data = spark.read.option("header", "true").option("inferSchema", "true").csv(fp)

raw_data.printSchema()

root
 |-- course_name: string (nullable = true)
 |-- instructor: string (nullable = true)
 |-- course url: string (nullable = true)
 |-- course image: string (nullable = true)
 |-- course description: string (nullable = true)
 |-- reviews_avg: string (nullable = true)
 |-- reviews_count: string (nullable = true)
 |-- course_duration: string (nullable = true)
 |-- lectures_count: string (nullable = true)
 |-- level: string (nullable = true)
 |-- price_after_discount: string (nullable = true)
 |-- main_price: string (nullable = true)
 |-- course_flag: string (nullable = true)
 |-- students_count: string (nullable = true)
 |-- _c14: string (nullable = true)
 |-- _c15: string (nullable = true)
 |-- _c16: string (nullable = true)
 |-- _c17: string (nullable = true)



In [4]:
# Data size (number of rows)
raw_data.count()

5027

### To Do

```text
- Drop irrelevant columns.
- Extract the programming languages from the course name.
- Clean the following columns
    - `reviews avg` 
    - `reviews_count`.
    - `course_duration`.
    - `lectures_count` column .
    - `level`.
    - `price_after_discount` and `main_price`.
    - `students_count`.
```

In [5]:
# Drop irrelevant columns
data = raw_data.drop("_c14", "_c15", "_c16", "_c17")

data.show(5, truncate=60)

+---------------------------------------------------------+------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+----------------------------------------------------+--------------------+----------------+----------------+------------+-----------------------+--------------------------+--------------------------+------------------+
|                                              course_name|                                            instructor|                                                  course url|                                                course image|                                          course description|                                         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|   price_after_discount|                main_price|               

### Extract the programming languages.

In [6]:
tools = [
    r"\br\b",
    "abap",
    "ada",
    r"\ba[\.\s]?i\b",
    "android",
    "angular",
    r"artificial\s?intelligence",
    r"asp\.?\s?net\s?[m]?[v]?[c]?",
    r"\bar\b",
    "assembly",
    r"augmented\s?reality",
    "aws",
    "azure",
    "bash",
    "c#",
    r"c\+\+",
    r"caffe\s?[2]?",
    "cakephp",
    "chainer",
    r"c\.?i[\s\-\/]?[\s\-]?c\.?d",
    "confluence",
    "clojure",
    "cobol",
    "codeigniter",
    "corenlp",
    "crystal",
    "css",
    r"d3[\s\.\-]?[j]?[s]?",
    r"database[\s\-]?[d]?[e]?[s]?[i]?[g]?[n]?",
    r"data\s?science",
    r"data[\s\-]?vi[sz]",
    "dart",
    r"deep\s?learning",
    "delphi",
    r"dev[\s\-]?ops",
    "django",
    "docker",
    "dockerfile",
    r"elastic[\s\-]?search",
    "elixir",
    "elm",
    r"ember[\.\s]?[j]?[s]?",
    "erlang",
    r"express[\.\s]?[j]?[s]?",
    "f#",
    r"fast\s?[a]?[p]?[i]?",
    "flask",
    "firebase",
    "fortran",
    r"game[\s\-]?\w{0,10}\s?dev?",
    "gcp",
    "git",
    "gluon",
    "go",
    "golang",
    r"graph[\s\-]?ql",
    "groovy",
    "h2o",
    "hack",
    "haskell",
    "haxe",
    "html",
    r"hugging\s?face\s?transformers",
    "java",
    "javascript",
    "jax",
    r"jenkin[s]?",
    "jira",
    r"j[\s\-]?query",
    "julia",
    "k8s",
    "kafka",
    "keras",
    "kibana",
    "kotlin",
    "kubernetes",
    "laravel",
    "lisp",
    "lua",
    "matlab",
    "meteor",
    "mxnet",
    "nltk",
    r"node[\.\s]?\s?[j]?[s]?",
    "numpy",
    r"objective\-?\s?[c]?",
    "ocaml",
    "pandas",
    "perl",
    "php",
    r"pl\s?\/?sql",
    "play",
    "powershell",
    r"program[m]?[i]?[n]?[g]?",
    "prolog",
    "puppet",
    "python",
    "pytorch",
    r"rabbit[\-\s]?[m]?[q]?",
    "racket",
    "react",
    "redshift",
    r"[r]?[e]?[s]?[t]?[\-\s]?api",
    "ruby",
    r"ruby\s?on\s?rails",
    "rust",
    "scala",
    r"scikit\-?\s?learn",
    "scipy",
    "scratch",
    "shell",
    r"software\s?arch[i]?[t]?[e]?[c]?[t]?[u]?[r]?[e]?",
    "spacy",
    "spark",
    r"spring\s?[b][o]?[o]?[t]?",
    "sql",
    "struts",
    "swift",
    "symfony",
    "tcl",
    "tensorflow",
    "terraform",
    "theano",
    "torch",
    "typescript",
    "vb.net",
    "verilog",
    r"vue[\.\s]?[j]?[s]?",
    r"web\s?[d]?[e]?[v]?",
    r"word[\s\-]?[p][r]?[e]?[s]?[s]?",
]

# tools = [var.lower() for var in tools]

PATTERN = "(" + "|".join(sorted(tools)) + ")"
PATTERN

'([r]?[e]?[s]?[t]?[\\-\\s]?api|\\ba[\\.\\s]?i\\b|\\bar\\b|\\br\\b|abap|ada|android|angular|artificial\\s?intelligence|asp\\.?\\s?net\\s?[m]?[v]?[c]?|assembly|augmented\\s?reality|aws|azure|bash|c#|c\\+\\+|c\\.?i[\\s\\-\\/]?[\\s\\-]?c\\.?d|caffe\\s?[2]?|cakephp|chainer|clojure|cobol|codeigniter|confluence|corenlp|crystal|css|d3[\\s\\.\\-]?[j]?[s]?|dart|data[\\s\\-]?vi[sz]|data\\s?science|database[\\s\\-]?[d]?[e]?[s]?[i]?[g]?[n]?|deep\\s?learning|delphi|dev[\\s\\-]?ops|django|docker|dockerfile|elastic[\\s\\-]?search|elixir|elm|ember[\\.\\s]?[j]?[s]?|erlang|express[\\.\\s]?[j]?[s]?|f#|fast\\s?[a]?[p]?[i]?|firebase|flask|fortran|game[\\s\\-]?\\w{0,10}\\s?dev?|gcp|git|gluon|go|golang|graph[\\s\\-]?ql|groovy|h2o|hack|haskell|haxe|html|hugging\\s?face\\s?transformers|j[\\s\\-]?query|java|javascript|jax|jenkin[s]?|jira|julia|k8s|kafka|keras|kibana|kotlin|kubernetes|laravel|lisp|lua|matlab|meteor|mxnet|nltk|node[\\.\\s]?\\s?[j]?[s]?|numpy|objective\\-?\\s?[c]?|ocaml|pandas|perl|php|pl\\s?\\/?sq

In [7]:
@F.udf(returnType=StringType())
def extract_prog_language(input_: str, pattern: str) -> Any:
    """This returns a list containing the matched pattern."""
    result = re.compile(pattern=pattern, flags=re.I).findall(string=str(input_))
    result = [var.strip() for var in set(result)]
    return "|".join(result)

In [8]:
data = data.withColumn(
    "prog_languages_n_tools",
    extract_prog_language(F.lower(F.col("course_name")), F.lit(PATTERN)),
)

data.show(10, truncate=50)

[Stage 6:>                                                          (0 + 1) / 1]

+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------+----------------+----------------+------------+-----------------------+--------------------------+--------------------------+------------------+----------------------+
|                                       course_name|                                        instructor|                                        course url|                                      course image|                                course description|                                       reviews_avg|       reviews_count| course_duration|  lectures_count|       level|   price_after_discount|                main_price|               course_flag|    students_count|prog_languages_n_tools|
+-------

                                                                                

### - Clean the following columns

```text
    - `reviews_avg` 
    - `reviews_count`
```

In [9]:
data.show(5, truncate=False)

+---------------------------------------------------------+------------------------------------------------------+---------------------------------------------------------------------+-------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+----------------------------------------------------+--------------------+----------------+----------------+------------+-----------------------+--------------------------+--------------------------+------------------+----------------------+
|course_name                                              |instructor                                            |course url                                                           |course image                                                 |course description                                                                                                    |reviews_avg                 

In [10]:
# Clean the reviews_avg
REVIEWS_PATTERN_1 = r"\d{1}\.\d{1}"

data = data.withColumn(
    "reviews",
    F.regexp_extract(F.col("reviews_avg"), pattern=REVIEWS_PATTERN_1, idx=0),
)

data.show(10)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+
|         course_name|          instructor|          course url|        course image|  course description|         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|price_after_discount|          main_price|         course_flag|    students_count|prog_languages_n_tools|reviews|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+
|2022 Complete Pyt...|       Jose Portilla|https://www.udemy...|https://i

                                                                                

In [11]:
# Replace null/invalid values
data.select("reviews").distinct().show(50)

print(data.select("reviews").distinct().toPandas()["reviews"].values)

+-------+
|reviews|
+-------+
|    2.6|
|    3.1|
|    4.2|
|    4.4|
|    3.8|
|    2.7|
|    1.7|
|    2.9|
|    4.5|
|   null|
|    2.5|
|    2.4|
|    4.9|
|    3.4|
|    3.3|
|    4.3|
|    3.5|
|    4.8|
|    4.1|
|    4.6|
|    5.0|
|    9.9|
|    4.0|
|    1.9|
|    3.6|
|    2.8|
|    3.2|
|    3.7|
|    4.7|
|       |
|    3.0|
|    2.1|
|    3.9|
+-------+

['2.6' '3.1' '4.2' '4.4' '3.8' '2.7' '1.7' '2.9' '4.5' None '2.5' '2.4'
 '4.9' '3.4' '3.3' '4.3' '3.5' '4.8' '4.1' '4.6' '5.0' '9.9' '4.0' '1.9'
 '3.6' '2.8' '3.2' '3.7' '4.7' '' '3.0' '2.1' '3.9']


In [12]:
# Calculate the percentile w/o the invalid values
df = data.alias("df")

df = df.select("reviews").filter(~((df["reviews"].isNull()) | (df["reviews"] == r"")))
df.show()

+-------+
|reviews|
+-------+
|    4.6|
|    4.7|
|    4.7|
|    4.5|
|    4.6|
|    4.5|
|    4.7|
|    4.7|
|    4.6|
|    4.6|
|    4.7|
|    4.7|
|    4.6|
|    4.6|
|    4.8|
|    4.8|
|    4.7|
|    4.6|
|    4.6|
|    4.6|
+-------+
only showing top 20 rows



In [13]:
# 50th percentile. i.e 0.5%
REPL_VALUE = df.selectExpr("percentile(reviews, 0.5)").first()[0]
REPL_VALUE

4.4

In [14]:
# Replace null/invalid values
data = data.withColumn(
    "reviews",
    F.when(F.col("reviews").isNull(), REPL_VALUE)
    .when(F.col("reviews") == "", REPL_VALUE)
    .otherwise(F.col("reviews"))
    .cast("double"),
)

data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+
|         course_name|          instructor|          course url|        course image|  course description|         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|price_after_discount|          main_price|         course_flag|    students_count|prog_languages_n_tools|reviews|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+
|2022 Complete Pyt...|       Jose Portilla|https://www.udemy...|https://i

                                                                                

In [15]:
# Verify!
data.select("reviews").filter(data.reviews.isNull()).show()

+-------+
|reviews|
+-------+
+-------+



In [16]:
# Extract the patterns.
REVIEWS_COUNT_PATTERN = r"\d{4,10}"

data = data.withColumn(
    "num_reviews",
    F.regexp_extract(str=F.col("reviews_count"), pattern=REVIEWS_COUNT_PATTERN, idx=0),
)


# 90th percentile. i.e 0.9%
REPL_VALUE = data.selectExpr("round(percentile(num_reviews, 0.9), 0)").first()[0]
print(REPL_VALUE)

# Replace null/invalid values
data = data.withColumn(
    "num_reviews",
    F.when(F.col("num_reviews").isNull(), REPL_VALUE)
    .when(F.col("num_reviews") == "", REPL_VALUE)
    .otherwise(F.col("num_reviews"))
    .cast("int"),
)

data.show(10)

13779.0
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+-----------+
|         course_name|          instructor|          course url|        course image|  course description|         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|price_after_discount|          main_price|         course_flag|    students_count|prog_languages_n_tools|reviews|num_reviews|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+-----------+
|2022 Complete Pyt...|       

In [17]:
# Verify!
data.select("num_reviews").filter(data["num_reviews"].isNull()).show()

+-----------+
|num_reviews|
+-----------+
+-----------+



### To Do

```text
- Clean the following columns:
    - `course_duration`.
    - `lectures_count`.
```

In [18]:
data.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+-----------+
|         course_name|          instructor|          course url|        course image|  course description|         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|price_after_discount|          main_price|         course_flag|    students_count|prog_languages_n_tools|reviews|num_reviews|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+-----------+
|2022 Complete Pyt...|       Jose Por

In [19]:
# Course duration and lectures count
DURATION_THRESH = 100  # hours
DURATION_REPL_VALUE = 15  # hours
LECTURES_COUNT_VALUE = 20

# Extract the patterns and replace invalid values
DURATION_PATTERN = r"\d{1,3}\.?\d{1,2}"

data = data.withColumn(
    "course_duration_hrs",
    F.regexp_extract(F.col("course_duration"), pattern=DURATION_PATTERN, idx=0),
).withColumn(
    "course_duration_hrs",
    F.when(F.col("course_duration_hrs") > DURATION_THRESH, DURATION_REPL_VALUE)
    .otherwise(F.col("course_duration_hrs"))
    .cast("double"),
)

data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+-----------+-------------------+
|         course_name|          instructor|          course url|        course image|  course description|         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|price_after_discount|          main_price|         course_flag|    students_count|prog_languages_n_tools|reviews|num_reviews|course_duration_hrs|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+----------

In [20]:
# Extract the patterns and replace invalid values
LECTURES_PATTERN_1 = r"\d{1,3}\s?\w{8}"
LECTURES_PATTERN_2 = r"\d{1,3}"

data = (
    data.withColumn(
        "num_lectures",
        F.regexp_extract(F.col("lectures_count"), pattern=LECTURES_PATTERN_1, idx=0),
    )
    .withColumn(
        "num_lectures",
        F.regexp_extract(str=F.col("num_lectures"), pattern=LECTURES_PATTERN_2, idx=0),
    )
    .withColumn(
        "num_lectures",
        F.when(F.col("num_lectures").isNull(), LECTURES_COUNT_VALUE)
        .when(F.col("num_lectures") == "", LECTURES_COUNT_VALUE)
        .otherwise(F.col("num_lectures"))
        .cast("int"),
    )
)

data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+-----------+-------------------+------------+
|         course_name|          instructor|          course url|        course image|  course description|         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|price_after_discount|          main_price|         course_flag|    students_count|prog_languages_n_tools|reviews|num_reviews|course_duration_hrs|num_lectures|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+---------------

### To Do

```text
- Clean the following columns:
    - `level`.
    - `price_after_discount` and `main_price`.
    - `students_count`.
```

In [21]:
data.select("level").distinct().toPandas()["level"].values.tolist()

['Expert',
 '13 lectures',
 'Current price: E£199.99',
 '3.5 total hours',
 '74 lectures',
 None,
 '68 lectures',
 '642 lectures',
 '229 reviews',
 '22 lectures',
 'Rating: 4.5 out of 5',
 '85 lectures',
 'Current price: E£269.99',
 '211 lectures',
 'All Levels',
 'Intermediate',
 '472 lectures',
 'Interviews"',
 'Current price: E£229.99',
 'Rating: 4.4 out of 5',
 'Beginner']

In [22]:
# Extract the patterns
COURSE_PATTERN = r"(\s*all\s?levels\s*|\s*beginner\s*|\s*expert\s*|\s*intermediate\s*|\s*interviews\s*)"
COURSE_VALUE = "not provided"

data = data.withColumn("course_level", F.lower(F.col("level"))).withColumn(
    "course_level",
    F.regexp_extract(
        str=F.col("course_level"),
        pattern=COURSE_PATTERN,
        idx=0,
    ),
)

# Replace invalid values
data = data.withColumn(
    "course_level",
    F.when(F.col("course_level").isNull(), COURSE_VALUE)
    .when(F.col("course_level") == "", COURSE_VALUE)
    .otherwise(F.col("course_level")),
)

data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+-----------+-------------------+------------+------------+
|         course_name|          instructor|          course url|        course image|  course description|         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|price_after_discount|          main_price|         course_flag|    students_count|prog_languages_n_tools|reviews|num_reviews|course_duration_hrs|num_lectures|course_level|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+--------

In [23]:
# Drop irrelevant columns
data = data.drop(
    "course_name",
    "reviews_avg",
    "reviews_count",
    "course_duration",
    "lectures_count",
    "level",
)

data.show(truncate=50)

+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+-----------------------+--------------------------+--------------------------+------------------+----------------------+-------+-----------+-------------------+------------+------------+
|                                        instructor|                                        course url|                                      course image|                                course description|   price_after_discount|                main_price|               course_flag|    students_count|prog_languages_n_tools|reviews|num_reviews|course_duration_hrs|num_lectures|course_level|
+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+-----------

In [24]:
DISCOUNT_PRICE_PATTERN = r"\d{1,3}\.\d{0,2}"
DISCOUNT_PRICE_VALUE = 199.99

# Extract the patterns and replace invalid values
data = data.withColumn(
    "price_wf_discount",
    F.regexp_extract(
        str=F.col("price_after_discount"),
        pattern=DISCOUNT_PRICE_PATTERN,
        idx=0,
    ),
).withColumn(
    "price_wf_discount",
    F.when(F.col("price_wf_discount").isNull(), DISCOUNT_PRICE_VALUE)
    .when(F.col("price_wf_discount") == "", DISCOUNT_PRICE_VALUE)
    .otherwise(F.col("price_wf_discount")),
)

data.show(truncate=50)

+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+-----------------------+--------------------------+--------------------------+------------------+----------------------+-------+-----------+-------------------+------------+------------+-----------------+
|                                        instructor|                                        course url|                                      course image|                                course description|   price_after_discount|                main_price|               course_flag|    students_count|prog_languages_n_tools|reviews|num_reviews|course_duration_hrs|num_lectures|course_level|price_wf_discount|
+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------

In [25]:
# Extract the patterns and replace invalid values
MAIN_PRICE_PATTERN_1 = r"Original\s?price\s?\:?\s?\w{1}\W?\d{1,3}[\.\,]\d{0,3}\.\d{0,2}"
MAIN_PRICE_PATTERN_2 = r"\d{1,3}[\.\,]\d{1,3}[\.\,]?\d{0,2}"  # 856.99, 1,234.55
MAIN_PRICE_PATTERN_3 = r"\,"
MAIN_PRICE_REPL_VALUE = r""
MAIN_PRICE_VALUE = 1_200.00

data = (
    data.withColumn(
        "original_price",
        F.regexp_extract(
            str=F.col("main_price"),
            pattern=MAIN_PRICE_PATTERN_1,
            idx=0,
        ),
    )
    .withColumn(
        "original_price",
        F.regexp_extract(
            str=F.col("original_price"), pattern=MAIN_PRICE_PATTERN_2, idx=0
        ),
    )
    .withColumn(
        "original_price",
        F.regexp_replace(
            str=F.col("original_price"),
            pattern=MAIN_PRICE_PATTERN_3,
            replacement=MAIN_PRICE_REPL_VALUE,
        ),
    )
    .withColumn(
        "original_price",
        F.when(F.col("original_price").isNull(), MAIN_PRICE_VALUE)
        .when(F.col("original_price") == "", MAIN_PRICE_VALUE)
        .otherwise(F.col("original_price")),
    )
)


data.show(truncate=40)

+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+-----------------------+--------------------------+--------------------------+------------------+----------------------+-------+-----------+-------------------+------------+------------+-----------------+--------------+
|                              instructor|                              course url|                            course image|                      course description|   price_after_discount|                main_price|               course_flag|    students_count|prog_languages_n_tools|reviews|num_reviews|course_duration_hrs|num_lectures|course_level|price_wf_discount|original_price|
+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+-----------------------+--------------------------+--------------

In [26]:
data.select("course_flag").toPandas()["course_flag"].value_counts()

Bestseller                    513
Original price: E£1,299.99      3
Original price: E£229.99        2
Original price: E£1,599.99      1
44 total hours                  1
Original price: E£1,199.99      1
Rahul Shetty                    1
E£269.99                        1
Intermediate                    1
30 lectures                     1
Name: course_flag, dtype: int64

In [27]:
data.groupBy("course_flag").agg(F.count("course_flag").alias("count")).show(
    truncate=False
)

+--------------------------+-----+
|course_flag               |count|
+--------------------------+-----+
|Bestseller                |513  |
|30 lectures               |1    |
|null                      |0    |
|E£269.99                  |1    |
|Original price: E£229.99  |2    |
|Original price: E£1,599.99|1    |
|44 total hours            |1    |
|Rahul Shetty              |1    |
|Intermediate              |1    |
|Original price: E£1,199.99|1    |
|Original price: E£1,299.99|3    |
+--------------------------+-----+



In [28]:
# Extract the patterns and replace invalid values
COURSE_FLAG_PATTERN = r"\s*bestseller\s*"
COURSE_FLAG_REPL_VALUE = "n/a"

data = data.withColumn(
    "flag_bool",
    F.regexp_extract(
        str=F.lower(F.col("course_flag")), pattern=COURSE_FLAG_PATTERN, idx=0
    ),
).withColumn(
    "flag_bool",
    F.when(F.col("flag_bool").isNull(), COURSE_FLAG_REPL_VALUE)
    .when(F.col("flag_bool") == "", COURSE_FLAG_REPL_VALUE)
    .otherwise(F.col("flag_bool")),
)

data.show(truncate=40)

+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+-----------------------+--------------------------+--------------------------+------------------+----------------------+-------+-----------+-------------------+------------+------------+-----------------+--------------+----------+
|                              instructor|                              course url|                            course image|                      course description|   price_after_discount|                main_price|               course_flag|    students_count|prog_languages_n_tools|reviews|num_reviews|course_duration_hrs|num_lectures|course_level|price_wf_discount|original_price| flag_bool|
+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+-----------------------+-------------------

                                                                                

In [29]:
# Extract the patterns and replace invalid values
NUM_STUDENTS_PATTERN_1 = r"\d{1,3}\,\d{1,3}\,?\d{0,3}"
NUM_STUDENTS_PATTERN_2 = r"\,"
NUM_STUDENTS_REPL_VALUE = 99_999


data = (
    data.withColumn(
        "num_students",
        F.regexp_extract(
            str=F.col("students_count"), pattern=NUM_STUDENTS_PATTERN_1, idx=0
        ),
    )
    .withColumn(
        "num_students",
        F.regexp_replace(
            str=F.col("num_students"), pattern=NUM_STUDENTS_PATTERN_2, replacement=""
        ),
    )
    .withColumn(
        "num_students",
        F.when(F.col("num_students").isNull(), NUM_STUDENTS_REPL_VALUE)
        .when(F.col("num_students") == "", NUM_STUDENTS_REPL_VALUE)
        .otherwise(F.col("num_students")),
    )
)

data.show(truncate=40)

+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+-----------------------+--------------------------+--------------------------+------------------+----------------------+-------+-----------+-------------------+------------+------------+-----------------+--------------+----------+------------+
|                              instructor|                              course url|                            course image|                      course description|   price_after_discount|                main_price|               course_flag|    students_count|prog_languages_n_tools|reviews|num_reviews|course_duration_hrs|num_lectures|course_level|price_wf_discount|original_price| flag_bool|num_students|
+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+-----------------

In [30]:
# Drop irrelevant columns
data = data.drop("price_after_discount", "main_price", "course_flag", "students_count")

data.show(truncate=50)

+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+----------------------+-------+-----------+-------------------+------------+------------+-----------------+--------------+----------+------------+
|                                        instructor|                                        course url|                                      course image|                                course description|prog_languages_n_tools|reviews|num_reviews|course_duration_hrs|num_lectures|course_level|price_wf_discount|original_price| flag_bool|num_students|
+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+----------------------+-------+-----------+-------------------+------------+------------+--

In [31]:
# Summary stats of course rating
data.agg(
    F.min("reviews").alias("min_review"),
    F.max("reviews").alias("max_review"),
    F.round(F.avg("reviews"), 2).alias("avg_review"),
    F.expr("percentile(reviews, 0.5)").alias("median_review"),
    F.round(F.stddev("reviews"), 2).alias("std_review"),
).show()

+----------+----------+----------+-------------+----------+
|min_review|max_review|avg_review|median_review|std_review|
+----------+----------+----------+-------------+----------+
|       1.7|       9.9|      4.31|          4.4|       0.4|
+----------+----------+----------+-------------+----------+



In [32]:
data.toPandas()

                                                                                

Unnamed: 0,instructor,course url,course image,course description,prog_languages_n_tools,reviews,num_reviews,course_duration_hrs,num_lectures,course_level,price_wf_discount,original_price,flag_bool,num_students
0,Jose Portilla,https://www.udemy.com/course/complete-python-b...,https://img-b.udemycdn.com/course/240x135/5678...,Learn Python like a Professional Start from t...,python,4.6,440383,22.0,155,all levels,319.99,1399.99,,1629692
1,Colt Steele,https://www.udemy.com/course/the-web-developer...,https://img-b.udemycdn.com/course/240x135/6252...,COMPLETELY REDONE - The only course you need t...,web dev,4.7,248508,64.0,615,all levels,269.99,1399.99,,830559
2,Dr. Angela Yu,https://www.udemy.com/course/the-complete-web-...,https://img-b.udemycdn.com/course/240x135/1565...,Become a Full-Stack Web Developer with just ON...,web dev,4.7,234837,65.5,490,all levels,349.99,1699.99,bestseller,794897
3,Maximilian Schwarzmüller,https://www.udemy.com/course/the-complete-guid...,https://img-b.udemycdn.com/course/240x135/7561...,"""Master Angular 14 (formerly """"Angular 2"""") an...",angular,4.4,13779,15.0,20,not provided,199.99,1200.0,,99999
4,"Tim Buchalka, Tim Buchalka's Learn Programming...",https://www.udemy.com/course/java-the-complete...,https://img-b.udemycdn.com/course/240x135/5336...,Learn Java In This Course And Become a Compute...,programming|java,4.5,171838,80.5,401,all levels,349.99,1200.0,bestseller,727934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5022,Serkan Büyüktopçu,https://www.udemy.com/course/siemens-wincc-scada/,https://img-b.udemycdn.com/course/240x135/2858...,This course is a great push for any one who wa...,programming|ada,3.6,13779,2.5,17,expert,519.99,1200.0,,99999
5023,Andy Bek,https://www.udemy.com/course/object-oriented-p...,https://img-b.udemycdn.com/course/240x135/4450...,Deep OOP Foundations From Absolute Scratch,python|programming,4.5,13779,27.0,231,all levels,199.99,1200.0,,1102
5024,Learn Tech Plus,https://www.udemy.com/course/learn-basic-jquery/,https://img-b.udemycdn.com/course/240x135/2554...,Everything you need to know to Build a Retirem...,jquery,3.8,13779,,102,beginner,299.99,1200.0,,45245
5025,Bluelime Learning Solutions,https://www.udemy.com/course/introduction-to-a...,https://img-b.udemycdn.com/course/240x135/1389...,Create an Android App step by step from scratch,android,3.6,13779,,71,beginner,749.99,1200.0,,4653
