## Basic Structured Operations


### Step 1: Initialize PySpark Session


In [93]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, lit , avg
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType



# Create a Spark session
spark = SparkSession.builder.appName("day2").getOrCreate()


23/08/30 14:39:44 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


### Step 2: Load the Dataset


In [72]:
# Load the Chipotle dataset into a Spark DataFrame
data_path = "./occupation.csv"  # Replace with the actual path
occupation = spark.read.csv(data_path, header=True, inferSchema=True)



In [73]:
occupation.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- zip_code: string (nullable = true)



### Problem 1: Selecting Specific Columns
Problem: Select the "user_id," "age," and "occupation" columns from the occupation DataFrame.

In [74]:
selected_columns_df = occupation.select("user_id", "age", "occupation")

selected_columns_df.show()


+-------+---+-------------+
|user_id|age|   occupation|
+-------+---+-------------+
|      1| 24|   technician|
|      2| 53|        other|
|      3| 23|       writer|
|      4| 24|   technician|
|      5| 33|        other|
|      6| 42|    executive|
|      7| 57|administrator|
|      8| 36|administrator|
|      9| 29|      student|
|     10| 53|       lawyer|
|     11| 39|        other|
|     12| 28|        other|
|     13| 47|     educator|
|     14| 45|    scientist|
|     15| 49|     educator|
|     16| 21|entertainment|
|     17| 30|   programmer|
|     18| 35|        other|
|     19| 40|    librarian|
|     20| 42|    homemaker|
+-------+---+-------------+
only showing top 20 rows



### Problem 2: Filtering Rows based on Condition
Problem: Find the users who are older than 30 years from the occupation DataFrame.

In [75]:
filtered_occupation = occupation.filter(occupation.age > 30)

filtered_occupation.show()

+-------+---+------+-------------+--------+
|user_id|age|gender|   occupation|zip_code|
+-------+---+------+-------------+--------+
|      2| 53|     F|        other|   94043|
|      5| 33|     F|        other|   15213|
|      6| 42|     M|    executive|   98101|
|      7| 57|     M|administrator|   91344|
|      8| 36|     M|administrator|   05201|
|     10| 53|     M|       lawyer|   90703|
|     11| 39|     F|        other|   30329|
|     13| 47|     M|     educator|   29206|
|     14| 45|     M|    scientist|   55106|
|     15| 49|     F|     educator|   97301|
|     18| 35|     F|        other|   37212|
|     19| 40|     M|    librarian|   02138|
|     20| 42|     F|    homemaker|   95660|
|     25| 39|     M|     engineer|   55107|
|     26| 49|     M|     engineer|   21044|
|     27| 40|     F|    librarian|   30030|
|     28| 32|     M|       writer|   55369|
|     29| 41|     M|   programmer|   94043|
|     34| 38|     F|administrator|   42141|
|     39| 41|     M|entertainmen

### Problem 3: Counting and Grouping
Problem: Count the number of users in each occupation from the occupation DataFrame.

In [76]:
occupation_counts = occupation.groupBy("occupation").agg(count("*").alias("user_count"))

occupation_counts.show()

+-------------+----------+
|   occupation|user_count|
+-------------+----------+
|    librarian|        51|
|      retired|        14|
|       lawyer|        12|
|         none|         9|
|       writer|        45|
|   programmer|        66|
|    marketing|        26|
|        other|       105|
|    executive|        32|
|    scientist|        31|
|      student|       196|
|     salesman|        12|
|       artist|        28|
|   technician|        27|
|administrator|        79|
|     engineer|        67|
|   healthcare|        16|
|     educator|        95|
|entertainment|        18|
|    homemaker|         7|
+-------------+----------+
only showing top 20 rows



### Problem 4: Adding a New Column
Problem: Add a new column "age_group" to the occupation DataFrame based on the age of the users. Divide users into age groups: "18-25", "26-35", "36-50", and "51+".

In [77]:
occupation_with_age_group = occupation.withColumn(
    "age_group",
    when(col("age").between(18, 25), "18-25")
    .when(col("age").between(26, 35), "26-35")
    .when(col("age").between(36, 50), "36-50")
    .otherwise("51+")
)


occupation_with_age_group.show()

+-------+---+------+-------------+--------+---------+
|user_id|age|gender|   occupation|zip_code|age_group|
+-------+---+------+-------------+--------+---------+
|      1| 24|     M|   technician|   85711|    18-25|
|      2| 53|     F|        other|   94043|      51+|
|      3| 23|     M|       writer|   32067|    18-25|
|      4| 24|     M|   technician|   43537|    18-25|
|      5| 33|     F|        other|   15213|    26-35|
|      6| 42|     M|    executive|   98101|    36-50|
|      7| 57|     M|administrator|   91344|      51+|
|      8| 36|     M|administrator|   05201|    36-50|
|      9| 29|     M|      student|   01002|    26-35|
|     10| 53|     M|       lawyer|   90703|      51+|
|     11| 39|     F|        other|   30329|    36-50|
|     12| 28|     F|        other|   06405|    26-35|
|     13| 47|     M|     educator|   29206|    36-50|
|     14| 45|     M|    scientist|   55106|    36-50|
|     15| 49|     F|     educator|   97301|    36-50|
|     16| 21|     M|entertai

### Problem 5: Creating DataFrames and Converting to Spark Types
Problem: Given the provided code snippet, create a DataFrame df using the given data and schema. The schema includes columns for firstname, middlename, lastname, id, gender, and salary. After creating the DataFrame, print its schema and display its content without truncation.

In [78]:
#Defining Schema for our dataframe

schema = StructType([
    StructField("firstname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("id", IntegerType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", FloatType(), True)
])


#Inserting data into the newly made schema

data = [
    ("James", None, "Smith", 36636, "M", 3000.0),
    ("Michael", "Rose", None, 40288, "M", 4000.0),
    ("Robert", None, "Williams", 42114, "M", 4000.0),
    ("Maria", "Anne", "Jones", 39192, "F", 4000.0),
    ("Jen", "Mary", "Brown", None, "F", -1.0)
]

# Create a DataFrame using the schema and data

df = spark.createDataFrame(data, schema)


# Show the schema

df.printSchema()

# Display the content without truncation

df.show(truncate=False)



root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: float (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |null      |Smith   |36636|M     |3000.0|
|Michael  |Rose      |null    |40288|M     |4000.0|
|Robert   |null      |Williams|42114|M     |4000.0|
|Maria    |Anne      |Jones   |39192|F     |4000.0|
|Jen      |Mary      |Brown   |null |F     |-1.0  |
+---------+----------+--------+-----+------+------+



### Problem 6: Adding and Renaming Columns
Problem: Add a new column "gender" to the existing DataFrame and rename the "Age" column to "Years".

In [79]:
# Add a new column "gender" with a constant value
occupation_with_gender = occupation.withColumn("gender", lit("Unknown"))

# Rename the "age" column to "years"
occupation_final = occupation_with_gender.withColumnRenamed("age", "years")

# Show the schema and content
occupation_final.printSchema()

occupation_final.show(truncate=False)

root
 |-- user_id: integer (nullable = true)
 |-- years: integer (nullable = true)
 |-- gender: string (nullable = false)
 |-- occupation: string (nullable = true)
 |-- zip_code: string (nullable = true)

+-------+-----+-------+-------------+--------+
|user_id|years|gender |occupation   |zip_code|
+-------+-----+-------+-------------+--------+
|1      |24   |Unknown|technician   |85711   |
|2      |53   |Unknown|other        |94043   |
|3      |23   |Unknown|writer       |32067   |
|4      |24   |Unknown|technician   |43537   |
|5      |33   |Unknown|other        |15213   |
|6      |42   |Unknown|executive    |98101   |
|7      |57   |Unknown|administrator|91344   |
|8      |36   |Unknown|administrator|05201   |
|9      |29   |Unknown|student      |01002   |
|10     |53   |Unknown|lawyer       |90703   |
|11     |39   |Unknown|other        |30329   |
|12     |28   |Unknown|other        |06405   |
|13     |47   |Unknown|educator     |29206   |
|14     |45   |Unknown|scientist    |55106 

### Problem 7: Filtering Rows and Sorting
Problem: Filter out users who are younger than 30 years and sort the DataFrame by age in descending order.

In [88]:
# Filter out users younger than 30 years
filtered_occupation = occupation_final.filter(occupation.age >= 30)

# Sort the DataFrame by age in descending order
sorted_occupation = filtered_occupation.orderBy(occupation.age.desc())

# Show the resulting DataFrame
sorted_occupation.show()

+-------+-----+-------+-------------+--------+
|user_id|years| gender|   occupation|zip_code|
+-------+-----+-------+-------------+--------+
|    481|   73|Unknown|      retired|   37771|
|    767|   70|Unknown|     engineer|   00000|
|    803|   70|Unknown|administrator|   78212|
|    860|   70|Unknown|      retired|   48322|
|    559|   69|Unknown|    executive|   10022|
|    585|   69|Unknown|    librarian|   98501|
|    349|   68|Unknown|      retired|   61455|
|    573|   68|Unknown|      retired|   48911|
|    211|   66|Unknown|     salesman|   32605|
|    318|   65|Unknown|      retired|   06518|
|    564|   65|Unknown|      retired|   94591|
|    651|   65|Unknown|      retired|   02903|
|    423|   64|Unknown|        other|   91606|
|    845|   64|Unknown|       doctor|   97405|
|    364|   63|Unknown|     engineer|   01810|
|    777|   63|Unknown|   programmer|   01810|
|    858|   63|Unknown|     educator|   09645|
|    266|   62|Unknown|administrator|   78756|
|    520|   6

### Problem 8: Repartitioning and Collecting Rows
Problem: Repartition the DataFrame into 2 partitions without shuffling the data, then collect and display all rows in the driver and print number of partitions

In [81]:
# Repartition the DataFrame into 2 partitions without shuffling
repartitioned_df = df.coalesce(2)

# Collect and display all rows in the driver
all_rows = repartitioned_df.collect()
for row in all_rows:
    print(row)

# Get the number of partitions
num_partitions = repartitioned_df.rdd.getNumPartitions()


[Stage 64:>                                                         (0 + 2) / 2]

Row(firstname='James', middlename=None, lastname='Smith', id=36636, gender='M', salary=3000.0)
Row(firstname='Michael', middlename='Rose', lastname=None, id=40288, gender='M', salary=4000.0)
Row(firstname='Robert', middlename=None, lastname='Williams', id=42114, gender='M', salary=4000.0)
Row(firstname='Maria', middlename='Anne', lastname='Jones', id=39192, gender='F', salary=4000.0)
Row(firstname='Jen', middlename='Mary', lastname='Brown', id=None, gender='F', salary=-1.0)


                                                                                

In [82]:
print("Number of partitions:", num_partitions)

Number of partitions: 2


#### Filter out rows where the age is greater than 30 and create a new DataFrame. Then, add a new column named "is_elderly" with a value of "True" for these rows and "False" otherwise.Rename the "gender" column to "sex".

In [86]:
# Create a temporary view to use Spark SQL
sorted_occupation.createOrReplaceTempView("occupation_view")

print(sorted_occupation)
## Filter out rows where age is greater than 30
filtered_rows_sql = spark.sql("SELECT user_id, age, gender AS sex, occupation, zip_code FROM occupation_view WHERE age > 30")

# Add a new column "is_elderly" with a value of True or False
filtered_rows_sql = filtered_rows_sql.withColumn("is_elderly", lit(True))

# Show the resulting DataFrame without truncation
filtered_rows_sql.show(100, False)


# Show the resulting DataFrame
filtered_rows_sql.show()

DataFrame[user_id: int, age: int, gender: string, occupation: string, zip_code: string]
+-------+---+---+-------------+--------+----------+
|user_id|age|sex|occupation   |zip_code|is_elderly|
+-------+---+---+-------------+--------+----------+
|2      |53 |F  |other        |94043   |true      |
|5      |33 |F  |other        |15213   |true      |
|6      |42 |M  |executive    |98101   |true      |
|7      |57 |M  |administrator|91344   |true      |
|8      |36 |M  |administrator|05201   |true      |
|10     |53 |M  |lawyer       |90703   |true      |
|11     |39 |F  |other        |30329   |true      |
|13     |47 |M  |educator     |29206   |true      |
|14     |45 |M  |scientist    |55106   |true      |
|15     |49 |F  |educator     |97301   |true      |
|18     |35 |F  |other        |37212   |true      |
|19     |40 |M  |librarian    |02138   |true      |
|20     |42 |F  |homemaker    |95660   |true      |
|25     |39 |M  |engineer     |55107   |true      |
|26     |49 |M  |engineer   

In [89]:

# Filter out rows where age is greater than 30
filtered_rows = sorted_occupation.filter(col("age") > 30)

# Add a new column "is_elderly" with a value of True or False
filtered_rows = filtered_rows.withColumn("is_elderly", lit(True))

# Rename the "gender" column to "sex"
filtered_rows = filtered_rows.withColumnRenamed("gender", "sex")

# Show the resulting DataFrame
filtered_rows.show()

+-------+-----+-------+-------------+--------+----------+
|user_id|years|    sex|   occupation|zip_code|is_elderly|
+-------+-----+-------+-------------+--------+----------+
|    481|   73|Unknown|      retired|   37771|      true|
|    767|   70|Unknown|     engineer|   00000|      true|
|    803|   70|Unknown|administrator|   78212|      true|
|    860|   70|Unknown|      retired|   48322|      true|
|    559|   69|Unknown|    executive|   10022|      true|
|    585|   69|Unknown|    librarian|   98501|      true|
|    349|   68|Unknown|      retired|   61455|      true|
|    573|   68|Unknown|      retired|   48911|      true|
|    211|   66|Unknown|     salesman|   32605|      true|
|    318|   65|Unknown|      retired|   06518|      true|
|    564|   65|Unknown|      retired|   94591|      true|
|    651|   65|Unknown|      retired|   02903|      true|
|    423|   64|Unknown|        other|   91606|      true|
|    845|   64|Unknown|       doctor|   97405|      true|
|    364|   63

In [94]:
# Filter out rows with unknown gender
filtered_occupation = sorted_occupation.filter(col("gender") != "Unknown")

# Calculate the average age of male and female users separately
avg_age_per_gender = filtered_occupation.groupBy("gender").agg(avg("age").alias("avg_age"))

# Show the resulting DataFrame
avg_age_per_gender.show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `age` cannot be resolved. Did you mean one of the following? [`years`, `gender`, `user_id`, `zip_code`, `occupation`].;
'Aggregate [gender#1056], [gender#1056, avg('age) AS avg_age#1392]
+- Project [user_id#905, years#1062, gender#1056, occupation#908, zip_code#909]
   +- Sort [age#906 DESC NULLS LAST], true
      +- Project [user_id#905, years#1062, gender#1056, occupation#908, zip_code#909, age#906]
         +- Filter (age#906 >= 30)
            +- Project [user_id#905, age#906 AS years#1062, gender#1056, occupation#908, zip_code#909, age#906]
               +- Project [user_id#905, age#906, Unknown AS gender#1056, occupation#908, zip_code#909]
                  +- Relation [user_id#905,age#906,gender#907,occupation#908,zip_code#909] csv
