# EDA and Feature Engineering Using Spark

In [37]:
# Built-in libraries
import collections
import itertools
import re
from typing import Any
import json

import numpy as np
import pandas as pd
import pyspark.sql.functions as fn


# PySpark Modules
from pyspark.sql import SparkSession
import pyspark.sql.functions as func
from pyspark.sql.types import (
    ArrayType,
    StringType,
    IntegerType,
    FloatType,
    StructType,
    StructField,
    LongType,
)


# Black formatter (optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Create Spark Session
spark = SparkSession.builder.appName("EDA").getOrCreate()

23/07/06 20:46:52 WARN Utils: Your hostname, Chinedus-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.151 instead (on interface en0)
23/07/06 20:46:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/06 20:46:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [16]:
# Load data
fp = "../data/udemy_data.csv"
data = spark.read.option("header", "true").option("inferSchema", "true").csv(fp)

data.printSchema()


root
 |-- course_name: string (nullable = true)
 |-- instructor: string (nullable = true)
 |-- course url: string (nullable = true)
 |-- course image: string (nullable = true)
 |-- course description: string (nullable = true)
 |-- reviews_avg: string (nullable = true)
 |-- reviews_count: string (nullable = true)
 |-- course_duration: string (nullable = true)
 |-- lectures_count: string (nullable = true)
 |-- level: string (nullable = true)
 |-- price_after_discount: string (nullable = true)
 |-- main_price: string (nullable = true)
 |-- course_flag: string (nullable = true)
 |-- students_count: string (nullable = true)
 |-- _c14: string (nullable = true)
 |-- _c15: string (nullable = true)
 |-- _c16: string (nullable = true)
 |-- _c17: string (nullable = true)



In [17]:
# Data size (number of rows)
data.count()


5027

In [18]:
data.show(5, truncate=60)


+---------------------------------------------------------+------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+----------------------------------------------------+--------------------+----------------+----------------+------------+-----------------------+--------------------------+--------------------------+------------------+----------------+----+----+----+
|                                              course_name|                                            instructor|                                                  course url|                                                course image|                                          course description|                                         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|   price_after_discount|          

23/07/06 21:25:38 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: course_name, instructor, course url, course image, course description, reviews_avg, reviews_count, course_duration, lectures_count, level, price_after_discount, main_price, course_flag, students_count, , , , 
 Schema: course_name, instructor, course url, course image, course description, reviews_avg, reviews_count, course_duration, lectures_count, level, price_after_discount, main_price, course_flag, students_count, _c14, _c15, _c16, _c17
Expected: _c14 but found: 
CSV file: file:///Users/neidu/Desktop/Projects/Personal/My_Projects/MLOps_Tutorials/data/udemy_data.csv


In [20]:
# Qs 1.) Check the number of courses that have Python in the course name
COURSE = "python"

data = data.withColumn("to_lower", func.lower(func.col("course_name"))).withColumn(
    "is_python",
    func.when(func.col("to_lower").like(r"%python%"), True).otherwise(False),
)
data.show(5)


+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------+----+----+----+--------------------+---------+
|         course_name|          instructor|          course url|        course image|  course description|         reviews_avg|       reviews_count| course_duration|  lectures_count|       level|price_after_discount|          main_price|         course_flag|    students_count|            _c14|_c15|_c16|_c17|            to_lower|is_python|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+----------------+------------+--------------------+--------------------+--------------------+------------------+----------------+----+----+----+---------

23/07/06 21:26:25 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: course_name, instructor, course url, course image, course description, reviews_avg, reviews_count, course_duration, lectures_count, level, price_after_discount, main_price, course_flag, students_count, , , , 
 Schema: course_name, instructor, course url, course image, course description, reviews_avg, reviews_count, course_duration, lectures_count, level, price_after_discount, main_price, course_flag, students_count, _c14, _c15, _c16, _c17
Expected: _c14 but found: 
CSV file: file:///Users/neidu/Desktop/Projects/Personal/My_Projects/MLOps_Tutorials/data/udemy_data.csv


In [21]:
data.filter(func.col("is_python") == True).count()


643

In [22]:
# OR
data.select(["to_lower", "instructor"]).show(10, truncate=False)


+------------------------------------------------------------+------------------------------------------------------------------+
|to_lower                                                    |instructor                                                        |
+------------------------------------------------------------+------------------------------------------------------------------+
|2022 complete python bootcamp from zero to hero in python   |Jose Portilla                                                     |
|the web developer bootcamp 2022                             |Colt Steele                                                       |
|the complete 2022 web development bootcamp                  |Dr. Angela Yu                                                     |
|angular - the complete guide (2023 edition)                 |Maximilian Schwarzmüller                                          |
|java programming masterclass covering java 11 & java 17     |Tim Buchalka, Tim Buchalka's

In [88]:
tools = [
    "python",
    "java",
    "javascript",
    "c#",
    "c\+\+",
    "ruby",
    "go",
    "swift",
    "objective-c",
    "rust",
    "kotlin",
    "typescript",
    "scala",
    "perl",
    "haskell",
    "lua",
    "dart",
    "julia",
    "elixir",
    "shell",
    "php",
    "ruby",
    "sql",
    "html",
    "css",
    "assembly",
    "groovy",
    "matlab",
    "powershell",
    "vb.net",
    "f#",
    "clojure",
    "erlang",
    "ocaml",
    "bash",
    "pl/sql",
    "delphi",
    "ada",
    "lisp",
    "fortran",
    "prolog",
    "cobol",
    "scratch",
    "abap",
    "tcl",
    "racket",
    "golang",
    "verilog",
    "dockerfile",
    "puppet",
    "hack",
    "rust",
    "groovy",
    "crystal",
    "ocaml",
    "racket",
    "elm",
    "haxe",
    "typescript",
    "lua",
    "perl",
    "elixir",
    "powershell",
    "scala",
    "julia",
    "ruby",
    "go",
    "erlang",
    "swift",
    r"\br\b",
    "haskell",
    "kotlin",
    "f#",
    "rust",
    "dart",
    "clojure",
    "objective-c",
    "java",
    "javascript",
    "python",
    "django",
    "ruby on rails",
    "express.js",
    "asp.net",
    "laravel",
    "flask",
    "spring boot",
    "vue.js",
    "angular",
    "react",
    "ember.js",
    "meteor",
    "symfony",
    "codeigniter",
    "cakephp",
    "asp.net mvc",
    "play",
    "struts",
    "ruby on rails",
    "flask",
    "laravel",
    "spring boot",
    "django",
    "express.js",
    "codeigniter",
    "vue.js",
    "react",
    "ember.js",
    "meteor",
    "angular",
    "symfony",
    "cakephp",
    "play",
    "asp.net mvc",
    "struts",
    "tensorflow",
    "pytorch",
    "keras",
    "scikit-learn",
    "caffe",
    "theano",
    "torch",
    "mxnet",
    "caffe2",
    "h2o",
    "deeplearning4j",
    "gluon",
    "chainer",
    "tensorflow",
    "pytorch",
    "keras",
    "scikit-learn",
    "caffe",
    "theano",
    "torch",
    "mxnet",
    "caffe2",
    "h2o",
    "deeplearning4j",
    "gluon",
    "chainer",
    "spacy",
    "nltk",
    "corenlp",
    "hugging face transformers",
]

# tools = [var.lower() for var in tools]

PATTERN = "(" + "|".join(sorted(set(tools))) + ")"
PATTERN

'(\\br\\b|abap|ada|angular|asp.net|asp.net mvc|assembly|bash|c#|c\\+\\+|caffe|caffe2|cakephp|chainer|clojure|cobol|codeigniter|corenlp|crystal|css|dart|deeplearning4j|delphi|django|dockerfile|elixir|elm|ember.js|erlang|express.js|f#|flask|fortran|gluon|go|golang|groovy|h2o|hack|haskell|haxe|html|hugging face transformers|java|javascript|julia|keras|kotlin|laravel|lisp|lua|matlab|meteor|mxnet|nltk|objective-c|ocaml|perl|php|pl/sql|play|powershell|prolog|puppet|python|pytorch|racket|react|ruby|ruby on rails|rust|scala|scikit-learn|scratch|shell|spacy|spring boot|sql|struts|swift|symfony|tcl|tensorflow|theano|torch|typescript|vb.net|verilog|vue.js)'

In [99]:
@func.udf(returnType=StringType())
def extract_prog_language(input_: str) -> Any:
    """This returns a list containing the matched pattern."""
    pattern = PATTERN
    result = re.compile(pattern=pattern, flags=re.I).findall(string=input_)
    return list(set(result))

In [100]:
text = "machine learning a-z™: hands-on python & r in data science  "

extract_prog_language(input_=text)

TypeError: wrapper() got an unexpected keyword argument 'input_'

In [101]:
data.withColumn("languages", extract_prog_language(func.col("to_lower"))).show(
    30, truncate=False
)

+------------------------------------------------------------+-------------------------------------------------------------------------+----------------------------------------------------------------------------------------+-------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------+--------------------+----------------+----------------+------------+-----------------------+--------------------------+--------------------------+------------------+----------------+----+----+----+------------------------------------------------------------+---------+-------------+
|course_name                                                 |instructor                                                               |course url                                                                              |course image               

23/07/06 22:02:05 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: course_name, instructor, course url, course image, course description, reviews_avg, reviews_count, course_duration, lectures_count, level, price_after_discount, main_price, course_flag, students_count, , , , 
 Schema: course_name, instructor, course url, course image, course description, reviews_avg, reviews_count, course_duration, lectures_count, level, price_after_discount, main_price, course_flag, students_count, _c14, _c15, _c16, _c17
Expected: _c14 but found: 
CSV file: file:///Users/neidu/Desktop/Projects/Personal/My_Projects/MLOps_Tutorials/data/udemy_data.csv
