In [None]:
# Parameter cell. The PySpark_ETL_Demo notebook calls this notebook to showcase
# invoking one notebook from another and passing in a parameter.
# The parameter is only echoed back in this notebook's exit value, but it could be
# used for more practical purposes (for example, choosing where to write output).
# The empty string is the default used when no value is supplied.
# NOTE(review): presumably the caller's value replaces this assignment only if the
# cell is marked as a parameter cell - confirm in the notebook settings.
destination_file = ''

In [1]:
##How did I create the Weird Al Data?
from pyspark.sql.functions import lit, when, col, expr, rand, floor
import random

# Demo data. Let's create a bunch of possible values for every column.
# Each list is a pool of candidate values; the data-generation code further down
# draws from these pools for the rows that are not forced to be Weird Al.

# Full names for the "Name" column ("Weird Al Yankovic" is deliberately not in this pool).
names = ["John Doe", "Jane Smith", "Alice Johnson", "Bob Brown", "Charlie Davis", "Diana Evans", "Eve Foster", "Frank Green", "Grace Harris"]
# Pools for the "FirstName" and "LastName" columns. They are sampled independently
# of "Name", so the three columns of a row need not agree with each other.
first_names = ["John", "Jane", "Alice", "Bob", "Charlie", "Diana", "Eve", "Frank", "Grace", "Smith", "Bill", "Jon", "Vivian", "Stacy", "Heidi", "Karen", "Otto", "Belinda"]
last_names = ["Doe", "Smith", "Johnson", "Brown", "Davis", "Evans", "Foster", "Green", "Harris", "Johann", "Pingel", "Kelbert", "Hiddleston", "Windsor", "Workmann", "Drews"]
# Pools for the "Genre", "Address", "Email" and "PhoneNumber" columns.
genres = ["Rock", "Pop", "Jazz", "Classical", "Hip Hop", "Country", "Electronic", "Reggae", "Blues", "Metal"]
addresses = ["123 Main St", "456 Elm St", "789 Oak St", "101 Maple Ave", "202 Pine Rd", "303 Cedar Blvd", "404 Birch Ln", "505 Spruce Dr", "606 Willow Ct", "707 Aspen Pl"]
emails = ["example1@example.com", "example2@example.com", "example3@example.com", "example4@example.com", "example5@example.com", "example6@example.com", "example7@example.com", "example8@example.com", "example9@example.com", "example10@example.com"]
phone_numbers = ["555-1234", "555-5678", "555-8765", "555-4321", "555-6789", "555-9876", "555-3456", "555-6543", "555-7890", "555-0987"]

# Add in some environment info for where we're going to save our results.
# lakehouse_address is a hard-coded ABFSS URI; it is not referenced by the active
# code visible in this notebook, which writes to the relative paths built below.
lakehouse_address = 'abfss://84e6d815-34b7-49bb-a433-ebec208e5cdb@onelake.dfs.fabric.microsoft.com/b389d6fd-e091-479b-ac43-6640a58407bd'
file_name = 'music_industry_data.csv'
updated_file_name = 'updated_music_industry_data.csv'

# Create some derived variables based on the environment info.
# NOTE(review): these are relative "Files/..." paths; presumably they resolve against
# the lakehouse attached to the notebook - confirm before running elsewhere.
# file_address is only used by a commented-out write; updated_file_address is the
# destination of the final CSV write.
file_address = f"Files/PerformanceTestLarge/{file_name}"
updated_file_address = f"Files/PerformanceTestLarge/{updated_file_name}"

# Create a function to generate random age as a string (whole number or decimal)
def generate_random_age():
    """Return a random age as a numeric string.

    Roughly half of the results are whole numbers (e.g. "42") and the rest carry
    a single decimal digit (e.g. "42.7"), so the value always parses with
    float(). The whole-year part is drawn from 18..70 inclusive, so results
    range from "18" to "70.9".

    The previous implementation interpolated two random.uniform() floats around
    a literal ".", which produced strings with two decimal points (for example
    "45.1234.3.5678") that no numeric parser accepts.

    Returns:
        str: the age formatted as a whole number or one-decimal number.
    """
    whole_years = random.randint(18, 70)
    # Coin flip between the "whole number" and "decimal" representations.
    if random.choice([True, False]):
        return str(whole_years)
    return f"{whole_years}.{random.randint(0, 9)}"

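# # Earlier attempt, kept for reference only. random.choice() and generate_random_age()
# # run once on the driver, so each lit(...) below stamps the same value on every row.
# # Generate 100,000,000 records in a Spark dataframe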
# data = spark.range(0, 100000000).withColumn("Name", when((col("id") < 20) | (col("id") % 5 == 0), lit("Weird Al Yankovic"))
#                                             .otherwise(lit(random.choice(names)))) \
#                                 .withColumn("FirstName", when((col("id") < 20) | (col("id") % 5 == 0), lit("Weird Al"))
#                                             .otherwise(lit(random.choice(first_names)))) \
#                                 .withColumn("LastName", when((col("id") < 20) | (col("id") % 5 == 0), lit("Yankovic"))
#                                             .otherwise(lit(random.choice(last_names)))) \
#                                 .withColumn("Age", lit(generate_random_age())) \
#                                 .withColumn("Address", lit(random.choice(addresses))) \
#                                 .withColumn("Email", lit(random.choice(emails))) \
#                                 .withColumn("PhoneNumber", lit(random.choice(phone_numbers))) \
#                                 .withColumn("Genre", lit(random.choice(genres))) \
#                                 .withColumn("AgeBucket", lit("Unknown"))


# Number of synthetic rows to generate.
row_count = 100000000

# The first 20 ids, plus every id divisible by 5, are forced to be Weird Al.
is_weird_al_row = (col("id") < 20) | (col("id") % 5 == 0)


def _random_pick(values):
    """Build a Column that picks one entry of `values` at random for every row.

    The pick is done inside Spark: rand() is in [0, 1), so
    cast(rand() * N + 1 as int) yields an index in 1..N, which is what the
    1-based element_at() expects.

    Args:
        values: non-empty list of strings. They are interpolated into SQL text
            as single-quoted literals without escaping.

    Returns:
        A Column expression evaluating to one of `values`.

    Raises:
        ValueError: if `values` is empty or an entry contains a single quote,
            either of which would otherwise produce a broken SQL expression.
    """
    if not values:
        raise ValueError("values must contain at least one entry")
    if any("'" in value for value in values):
        raise ValueError("values must not contain single quotes")
    quoted = ", ".join(f"'{value}'" for value in values)
    return expr(f"element_at(array({quoted}), cast(rand() * {len(values)} + 1 as int))")


data = (
    spark.range(0, row_count)
    .withColumn("Name", when(is_weird_al_row, lit("Weird Al Yankovic")).otherwise(_random_pick(names)))
    .withColumn("FirstName", when(is_weird_al_row, lit("Weird Al")).otherwise(_random_pick(first_names)))
    .withColumn("LastName", when(is_weird_al_row, lit("Yankovic")).otherwise(_random_pick(last_names)))
    # Random age as a double in [18.0, 71.0), i.e. ages 18 through 70 plus a fraction.
    .withColumn("Age", (rand() * 53.0) + 18.0)
    .withColumn("Genre", _random_pick(genres))
    .withColumn("Address", _random_pick(addresses))
    .withColumn("Email", _random_pick(emails))
    .withColumn("PhoneNumber", _random_pick(phone_numbers))
    # Placeholder value; every row starts out as "Unknown".
    .withColumn("AgeBucket", lit("Unknown"))
    # We don't want the ID anymore, so we'll remove that here
    .drop("id")
)


# Display the DataFrame
#print(data)

# Save DataFrame to a CSV file
#data.write.mode("overwrite").format("csv").option("header", "true").save(file_address)

# Now, let's mess with the data.
# Obviously this isn't something you'd do in a production setting, this is purely for dramatic effect.
# Every 'Weird Al Yankovic' row is overwritten with the values of one sample Weird Al row,
# which turns all of those rows into exact duplicates of each other.
name_is_weird_al = col("Name") == "Weird Al Yankovic"

# Bring a single Weird Al row back to the driver to act as the template.
weird_al_rows = data.filter(name_is_weird_al).take(1)
first_weird_al_record = weird_al_rows[0]

# Replace each listed column on Weird Al rows with the template's value;
# all other rows keep their own value.
data_updated = data
for column_name in ("Age", "Address", "Email", "PhoneNumber", "Genre", "AgeBucket"):
    template_value = lit(first_weird_al_record[column_name])
    data_updated = data_updated.withColumn(
        column_name,
        when(name_is_weird_al, template_value).otherwise(col(column_name)),
    )


# # Display the updated DataFrame
# print(data_updated)

# Save the updated DataFrame as CSV with a header row, replacing any output
# already present at the destination path.
data_updated.write.csv(updated_file_address, mode="overwrite", header=True)

StatementMeta(, 4b5c5fbe-f924-4241-881f-ef844c66c061, 3, Finished, Available, Finished)

In [None]:
# Code generated by Data Wrangler for PySpark DataFrame

from pyspark.sql import functions as F
from pyspark.sql import types as T

def clean_data(df):
    """Deduplicate rows and reduce 'Age' to a whole-number float.

    Steps, in order:
      1. Drop rows that are duplicates across all columns.
      2. Cast 'Age' to FloatType. Data Wrangler was asked for float16 but
         reported it could not convert to that width, so FloatType is used.
      3. Round 'Age' to 0 decimal places.

    Args:
        df: PySpark DataFrame containing an 'Age' column.

    Returns:
        A new DataFrame with the steps above applied.
    """
    deduplicated = df.dropDuplicates()
    age_as_float = deduplicated['Age'].cast(T.FloatType())
    with_float_age = deduplicated.withColumn('Age', age_as_float)
    return with_float_age.withColumn('Age', F.round(F.col('Age'), 0))

# Run the cleaning steps on the frame whose Weird Al rows were made identical,
# then render the result in the notebook.
# NOTE(review): the saved status for this cell reads "Cancelled"; presumably the
# full-width dropDuplicates over the whole dataset is expensive - confirm.
df_clean = clean_data(data_updated)
display(df_clean)

StatementMeta(, b1044047-4ede-468f-b4c4-11b9ac8cbb1f, -1, Cancelled, , Cancelled)

In [None]:
# End the notebook run and hand an exit value back to the caller.
# The exit value is a message echoing the destination_file parameter;
# the cleaned DataFrame itself is not returned.
import notebookutils

notebookutils.notebook.exit(f"The passed-in parameter was {destination_file}")

In [1]:
# This is a previous version of the script, kept for reference only.
# Every line in this cell is commented out, so nothing here runs.
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import lit, when, col
# import random

# # Initialize Spark session
# spark = SparkSession.builder.appName("PerformanceOptimization").getOrCreate()

# # Define lists of sample data
# names = ["Weird Al Yankovic", "John Doe", "Jane Smith", "Alice Johnson", "Bob Brown", "Charlie Davis", "Diana Evans", "Eve Foster", "Frank Green", "Grace Harris"]
# genres = ["Rock", "Pop", "Jazz", "Classical", "Hip Hop", "Country", "Electronic", "Reggae", "Blues", "Metal"]
# addresses = ["123 Main St", "456 Elm St", "789 Oak St", "101 Maple Ave", "202 Pine Rd", "303 Cedar Blvd", "404 Birch Ln", "505 Spruce Dr", "606 Willow Ct", "707 Aspen Pl"]
# emails = ["example1@example.com", "example2@example.com", "example3@example.com", "example4@example.com", "example5@example.com", "example6@example.com", "example7@example.com", "example8@example.com", "example9@example.com", "example10@example.com"]
# phone_numbers = ["555-1234", "555-5678", "555-8765", "555-4321", "555-6789", "555-9876", "555-3456", "555-6543", "555-7890", "555-0987"]

# # Function to generate random age
# def generate_random_age():
#     return random.randint(18, 70) if random.choice([True, False]) else random.uniform(18, 70)

# # Generate data directly in Spark
# data = spark.range(0, 100000000).withColumn("Name", when((col("id") < 20) | (col("id") % 5 == 0), lit("Weird Al Yankovic"))
#                                             .otherwise(lit(random.choice(names)))) \
#                                 .withColumn("Age", lit(generate_random_age())) \
#                                 .withColumn("Address", lit(random.choice(addresses))) \
#                                 .withColumn("Email", lit(random.choice(emails))) \
#                                 .withColumn("PhoneNumber", lit(random.choice(phone_numbers))) \
#                                 .withColumn("Genre", lit(random.choice(genres))) \
#                                 .withColumn("AgeBucket", lit("Unknown"))

# # Find the first record with the name 'Weird Al Yankovic'
# first_weird_al_record = data.filter(col("Name") == "Weird Al Yankovic").limit(1).collect()[0]

# # Update all 'Weird Al Yankovic' records to match the first one
# data_updated = data.withColumn("Age", when(col("Name") == "Weird Al Yankovic", lit(first_weird_al_record["Age"])).otherwise(col("Age"))) \
#                    .withColumn("Address", when(col("Name") == "Weird Al Yankovic", lit(first_weird_al_record["Address"])).otherwise(col("Address"))) \
#                    .withColumn("Email", when(col("Name") == "Weird Al Yankovic", lit(first_weird_al_record["Email"])).otherwise(col("Email"))) \
#                    .withColumn("PhoneNumber", when(col("Name") == "Weird Al Yankovic", lit(first_weird_al_record["PhoneNumber"])).otherwise(col("PhoneNumber"))) \
#                    .withColumn("Genre", when(col("Name") == "Weird Al Yankovic", lit(first_weird_al_record["Genre"])).otherwise(col("Genre"))) \
#                    .withColumn("AgeBucket", when(col("Name") == "Weird Al Yankovic", lit(first_weird_al_record["AgeBucket"])).otherwise(col("AgeBucket")))

# # Save the updated DataFrame to a CSV file
# data_updated.write.format("csv").mode("overwrite").save("abfss://84e6d815-34b7-49bb-a433-ebec208e5cdb@onelake.dfs.fabric.microsoft.com/b389d6fd-e091-479b-ac43-6640a58407bd/Files/PerformanceTest/Spark/updated_music_industry_data_spark.csv")

StatementMeta(, d913ee9d-33ad-4ace-b707-baaad22238c8, 3, Finished, Available, Finished)