# EDA and Feature Engineering Using Spark

In [1]:
# Built-in libraries
import collections
import itertools
import re
from typing import Any
import json

import numpy as np
import pandas as pd


# PySpark Modules
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import (
    ArrayType,
    DoubleType,
    StringType,
    IntegerType,
    FloatType,
    StructType,
    StructField,
    LongType,
)

# Black formatter (optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
# Start (or reuse) the Spark session, registered under the application name "EDA"
spark = (
    SparkSession.builder
    .appName("EDA")
    .getOrCreate()
)

23/07/07 17:45:56 WARN Utils: Your hostname, Chinedus-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.151 instead (on interface en0)
23/07/07 17:45:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/07 17:45:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the raw CSV: the first row is used as the header and Spark is asked to
# infer the column types from the data.
fp = "../data/udemy_data.csv"
raw_data = spark.read.csv(fp, header=True, inferSchema=True)

raw_data.printSchema()

root
 |-- course_name: string (nullable = true)
 |-- instructor: string (nullable = true)
 |-- course url: string (nullable = true)
 |-- course image: string (nullable = true)
 |-- course description: string (nullable = true)
 |-- reviews_avg: string (nullable = true)
 |-- reviews_count: string (nullable = true)
 |-- course_duration: string (nullable = true)
 |-- lectures_count: string (nullable = true)
 |-- level: string (nullable = true)
 |-- price_after_discount: string (nullable = true)
 |-- main_price: string (nullable = true)
 |-- course_flag: string (nullable = true)
 |-- students_count: string (nullable = true)
 |-- _c14: string (nullable = true)
 |-- _c15: string (nullable = true)
 |-- _c16: string (nullable = true)
 |-- _c17: string (nullable = true)



In [4]:
# Data size: number of rows in the raw DataFrame, before any column is dropped
# or cleaned
raw_data.count()

5027

### To Do

```text
- Drop irrelevant columns.
- Extract the programming languages from the course name.
- Clean the following columns
    - `reviews_avg`.
    - `reviews_count`.
    - `course_duration`.
    - `lectures_count`.
    - `level`.
    - `price_after_discount` and `main_price`.
    - `students_count`.
```

In [5]:
# Remove the trailing `_c14`–`_c17` columns, which are not needed for the analysis
unnamed_cols = ["_c14", "_c15", "_c16", "_c17"]
data = raw_data.drop(*unnamed_cols)

data.show(n=5, truncate=60)

+---------------------------------------------------------+------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+----------------------------------------------------+--------------------+----------------+----------------+------------+-----------------------+--------------------------+--------------------------+------------------+
|                                              course_name|                                            instructor|                                                  course url|                                                course image|                                          course description|                                         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|   price_after_discount|                main_price|               

### Extract the programming languages.

In [6]:
tools = [
    "\\br\\b",
    "abap",
    "ada",
    "angular",
    "asp.net",
    "asp.net mvc",
    "assembly",
    "bash",
    "c#",
    "c\\+\\+",
    "caffe",
    "caffe2",
    "cakephp",
    "chainer",
    "clojure",
    "cobol",
    "codeigniter",
    "corenlp",
    "crystal",
    "css",
    "dart",
    "deeplearning4j",
    "delphi",
    "django",
    "docker",
    "dockerfile",
    "elixir",
    "elm",
    "ember.js",
    "ember[\\.\\s]?[j]?[s]?",
    "erlang",
    "express[\\.\\s]?[j]?[s]?",
    "f#",
    "flask",
    "fortran",
    "gluon",
    "go",
    "golang",
    "groovy",
    "h2o",
    "hack",
    "haskell",
    "haxe",
    "html",
    r"hugging\s?face\s?transformers",
    "java",
    "javascript",
    "julia",
    "k8s",
    "keras",
    "kotlin",
    "kubernetes",
    "laravel",
    "lisp",
    "lua",
    "matlab",
    "meteor",
    "mxnet",
    "nltk",
    "objective-c",
    "ocaml",
    "perl",
    "php",
    "pl/sql",
    "play",
    "powershell",
    "prolog",
    "puppet",
    "python",
    "pytorch",
    "racket",
    "react",
    "ruby",
    "ruby on rails",
    "rust",
    "scala",
    r"scikit\-?\s?learn",
    "scratch",
    "shell",
    "spacy",
    "spring boot",
    "sql",
    "struts",
    "swift",
    "symfony",
    "tcl",
    "tensorflow",
    "theano",
    "torch",
    "typescript",
    "vb.net",
    "verilog",
    "vue[\\.\\s]?[j]?[s]?",
]

# tools = [var.lower() for var in tools]

PATTERN = "(" + "|".join(sorted(tools)) + ")"
PATTERN

'(\\br\\b|abap|ada|angular|asp.net|asp.net mvc|assembly|bash|c#|c\\+\\+|caffe|caffe2|cakephp|chainer|clojure|cobol|codeigniter|corenlp|crystal|css|dart|deeplearning4j|delphi|django|docker|dockerfile|elixir|elm|ember.js|ember[\\.\\s]?[j]?[s]?|erlang|express[\\.\\s]?[j]?[s]?|f#|flask|fortran|gluon|go|golang|groovy|h2o|hack|haskell|haxe|html|hugging\\s?face\\s?transformers|java|javascript|julia|k8s|keras|kotlin|kubernetes|laravel|lisp|lua|matlab|meteor|mxnet|nltk|objective-c|ocaml|perl|php|pl/sql|play|powershell|prolog|puppet|python|pytorch|racket|react|ruby|ruby on rails|rust|scala|scikit\\-?\\s?learn|scratch|shell|spacy|spring boot|sql|struts|swift|symfony|tcl|tensorflow|theano|torch|typescript|vb.net|verilog|vue[\\.\\s]?[j]?[s]?)'

In [7]:
@F.udf(returnType=StringType())
def extract_prog_language(input_: str, pattern: str) -> Any:
    """Collect the distinct matches of `pattern` found in `input_`.

    Args:
        input_: Text to search (e.g. a course name). May be None for null rows.
        pattern: Regular expression to search for; matching is case-insensitive.

    Returns:
        A sorted list of the unique, whitespace-stripped matches. The list is
        empty when nothing matches or when either argument is None.

    NOTE(review): the declared Spark return type is StringType although a list
    is returned, so the resulting column presumably holds the list's string
    form rather than a real array. Confirm, and switch to
    ArrayType(StringType()) if an array column is wanted downstream.
    """
    # Null cells reach the UDF as None; re.findall would raise TypeError on them.
    if input_ is None or pattern is None:
        return []
    matches = re.compile(pattern=pattern, flags=re.I).findall(string=input_)
    # Strip BEFORE de-duplicating so "vue " and "vue" collapse into one entry,
    # and sort so the output does not depend on set iteration order.
    return sorted({match.strip() for match in matches})

In [8]:
# Tag every course with the languages/tools found in its lower-cased name
lowered_course_name = F.lower(data["course_name"])
data = data.withColumn(
    "prog_languages_n_tools",
    extract_prog_language(lowered_course_name, F.lit(PATTERN)),
)

data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+
|         course_name|          instructor|          course url|        course image|  course description|         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|price_after_discount|          main_price|         course_flag|    students_count|prog_languages_n_tools|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+
|2022 Complete Pyt...|       Jose Portilla|https://www.udemy...|https://img-b.ude...|Learn Python

                                                                                

### Clean the following columns

```text
    - `reviews_avg` 
    - `reviews_count`
```

In [9]:
# Preview the first five rows with full (untruncated) cell contents
data.show(n=5, truncate=False)

+---------------------------------------------------------+------------------------------------------------------+---------------------------------------------------------------------+-------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+----------------------------------------------------+--------------------+----------------+----------------+------------+-----------------------+--------------------------+--------------------------+------------------+----------------------+
|course_name                                              |instructor                                            |course url                                                           |course image                                                 |course description                                                                                                    |reviews_avg                 

In [10]:
# Extract the first "<digit>.<digit>" token from `reviews_avg` into a new
# `reviews` column (still a string at this point)
rating_token = F.regexp_extract(data["reviews_avg"], r"\d{1}\.\d{1}", 0)
data = data.withColumn("reviews", rating_token)

data.show(n=10)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+
|         course_name|          instructor|          course url|        course image|  course description|         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|price_after_discount|          main_price|         course_flag|    students_count|prog_languages_n_tools|reviews|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+
|2022 Complete Pyt...|       Jose Portilla|https://www.udemy...|https://i

In [11]:
# Inspect the distinct extracted ratings to spot null/invalid values
distinct_reviews = data.select("reviews").distinct()
distinct_reviews.show(n=50)

print(distinct_reviews.toPandas()["reviews"].values)

+-------+
|reviews|
+-------+
|    2.6|
|    3.1|
|    4.2|
|    4.4|
|    3.8|
|    2.7|
|    1.7|
|    2.9|
|    4.5|
|   null|
|    2.5|
|    2.4|
|    4.9|
|    3.4|
|    3.3|
|    4.3|
|    3.5|
|    4.8|
|    4.1|
|    4.6|
|    5.0|
|    9.9|
|    4.0|
|    1.9|
|    3.6|
|    2.8|
|    3.2|
|    3.7|
|    4.7|
|       |
|    3.0|
|    2.1|
|    3.9|
+-------+

['2.6' '3.1' '4.2' '4.4' '3.8' '2.7' '1.7' '2.9' '4.5' None '2.5' '2.4'
 '4.9' '3.4' '3.3' '4.3' '3.5' '4.8' '4.1' '4.6' '5.0' '9.9' '4.0' '1.9'
 '3.6' '2.8' '3.2' '3.7' '4.7' '' '3.0' '2.1' '3.9']


In [12]:
# Build the sample used for the median rating, leaving out every invalid value:
# nulls, empty strings (no rating token was found) and out-of-range ratings.
# NOTE(review): assumes ratings are on a 0–5 scale, as all distinct values other
# than the stray 9.9 suggest — confirm.
MAX_RATING = 5.0

# With Spark's default (non-ANSI) casting, "" casts to null, so one null check
# covers both the null and the empty-string case.
rating = F.col("reviews").cast("double")

df = data.select("reviews").filter(rating.isNotNull() & (rating <= MAX_RATING))
df.show()

+-------+
|reviews|
+-------+
|    4.6|
|    4.7|
|    4.7|
|    4.5|
|    4.6|
|    4.5|
|    4.7|
|    4.7|
|    4.6|
|    4.6|
|    4.7|
|    4.7|
|    4.6|
|    4.6|
|    4.8|
|    4.8|
|    4.7|
|    4.6|
|    4.6|
|    4.6|
+-------+
only showing top 20 rows



In [13]:
# 50th percentile (the median), i.e. a fraction of 0.5 — not 0.5%.
# `reviews` is read from `df`, the sample built in the previous cell.
REPL_VALUE = df.selectExpr("percentile(reviews, 0.5)").first()[0]
REPL_VALUE

4.4

In [14]:
# Replace null/invalid ratings with the median (REPL_VALUE) and store the column
# as double. "Invalid" covers empty strings and out-of-range values such as 9.9.
# NOTE(review): assumes ratings are on a 0–5 scale — confirm.
MAX_RATING = 5.0

# With Spark's default (non-ANSI) casting, "" casts to null, so the null check
# below also catches rows where no rating token was extracted.
rating = F.col("reviews").cast("double")

data = data.withColumn(
    "reviews",
    F.when(rating.isNull() | (rating > MAX_RATING), F.lit(REPL_VALUE))
    .otherwise(rating)
    .cast("double"),
)

data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+
|         course_name|          instructor|          course url|        course image|  course description|         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|price_after_discount|          main_price|         course_flag|    students_count|prog_languages_n_tools|reviews|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+
|2022 Complete Pyt...|       Jose Portilla|https://www.udemy...|https://i

In [15]:
# Sanity check: list any rows still holding a null rating (an empty table is expected)
data.where(F.col("reviews").isNull()).select("reviews").show()

+-------+
|reviews|
+-------+
+-------+



In [16]:
# Clean reviews_count: keep the first run of 4–10 consecutive digits.
# NOTE(review): counts with fewer than 4 digits do not match this pattern and
# are therefore imputed below — confirm that this is intended.
data = data.withColumn(
    "num_reviews",
    F.regexp_extract(F.col("reviews_count"), pattern=r"\d{4,10}", idx=0),
)

# 90th percentile (a fraction of 0.9, not 0.9%) of the extracted counts.
# A dedicated constant is used instead of re-assigning REPL_VALUE, which holds
# the median rating: re-running the rating cell afterwards would otherwise fill
# ratings with a review count.
p90_num_reviews = data.selectExpr("round(percentile(num_reviews, 0.9), 0)").first()[0]
# percentile() returns a float (e.g. 13779.0); the column is an int count.
NUM_REVIEWS_REPL_VALUE = None if p90_num_reviews is None else int(p90_num_reviews)
print(NUM_REVIEWS_REPL_VALUE)

# Replace null/invalid values. Casting first turns "" (no match) into null under
# Spark's default (non-ANSI) casting, so a single null check is enough.
num_reviews = F.col("num_reviews").cast("int")
data = data.withColumn(
    "num_reviews",
    F.when(num_reviews.isNull(), F.lit(NUM_REVIEWS_REPL_VALUE))
    .otherwise(num_reviews)
    .cast("int"),
)

data.show(10)

13779.0
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+-----------+
|         course_name|          instructor|          course url|        course image|  course description|         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|price_after_discount|          main_price|         course_flag|    students_count|prog_languages_n_tools|reviews|num_reviews|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+-----------+
|2022 Complete Pyt...|       

In [17]:
# Verify: list any rows still holding a null `num_reviews` (an empty table is
# expected). The check must look at `num_reviews` itself, not at `reviews`.
data.filter(F.col("num_reviews").isNull()).select("num_reviews").show()

+-----------+
|num_reviews|
+-----------+
+-----------+



### To Do

```text
- Clean the following columns:
    - `course_duration`.
    - `lectures_count`.
```

In [18]:
# Preview the first five rows before cleaning the duration/lecture columns
data.show(n=5)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+-----------+
|         course_name|          instructor|          course url|        course image|  course description|         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|price_after_discount|          main_price|         course_flag|    students_count|prog_languages_n_tools|reviews|num_reviews|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+-----------+
|2022 Complete Pyt...|       Jose Por

In [19]:
# Course duration and lectures count
DURATION_THRESH = 100  # hours; larger values are treated as invalid
DURATION_REPL_VALUE = 15  # hours; fill value for missing/invalid durations
LECTURES_COUNT_VALUE = 20

# Extract the first number (integer or decimal) from `course_duration`.
# The fractional part is optional so single-digit values such as "5" are kept,
# and the whole digit run is captured so large numbers are judged against the
# threshold as a whole. With Spark's default (non-ANSI) casting, "" (no number
# found) casts to null.
# NOTE(review): the number is read as hours regardless of any unit text in the
# column — confirm the column never reports minutes.
duration_hrs = F.regexp_extract(
    F.col("course_duration"), pattern=r"\d+(?:\.\d+)?", idx=0
).cast("double")

# Impute missing values and values above the threshold, comparing numerically
# rather than string-vs-int.
data = data.withColumn(
    "course_duration_hrs",
    F.when(
        duration_hrs.isNull() | (duration_hrs > DURATION_THRESH),
        F.lit(DURATION_REPL_VALUE),
    )
    .otherwise(duration_hrs)
    .cast("double"),
)

data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+-----------+-------------------+
|         course_name|          instructor|          course url|        course image|  course description|         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|price_after_discount|          main_price|         course_flag|    students_count|prog_languages_n_tools|reviews|num_reviews|course_duration_hrs|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+----------

In [20]:
# Lecture count, derived in three steps:
#   1. grab "<1-3 digits><optional space><8 word characters>" from `lectures_count`
#   2. keep only the leading 1-3 digits of that phrase
#   3. fall back to LECTURES_COUNT_VALUE when nothing was extracted, then cast to int
# NOTE(review): step 1 accepts any 8 word characters after the number, not only
# the word "lectures" — confirm against the raw column.
lectures_phrase = F.regexp_extract(F.col("lectures_count"), r"\d{1,3}\s?\w{8}", 0)
lectures_digits = F.regexp_extract(lectures_phrase, r"\d{1,3}", 0)
lectures_filled = (
    F.when(lectures_digits.isNull(), LECTURES_COUNT_VALUE)
    .when(lectures_digits == "", LECTURES_COUNT_VALUE)
    .otherwise(lectures_digits)
)

data = data.withColumn("num_lectures", lectures_filled.cast("int"))

data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------------+-------+-----------+-------------------+------------+
|         course_name|          instructor|          course url|        course image|  course description|         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|price_after_discount|          main_price|         course_flag|    students_count|prog_languages_n_tools|reviews|num_reviews|course_duration_hrs|num_lectures|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+---------------

### To Do

```text
- Clean the following columns:
    - `level`.
    - `price_after_discount` and `main_price`.
    - `students_count`.
```

### To Do

```text
- Drop irrelevant columns.
- Extract the programming languages from the course name.
- Clean the following columns
    - `reviews_avg`.
    - `reviews_count`.
    - `course_duration`.
    - `lectures_count`.
    - `level`.
    - `price_after_discount` and `main_price`.
    - `students_count`.
```