In [6]:
%idle_timeout 120
%glue_version 4.0
%worker_type G.1X
%number_of_workers 2

import sys

import boto3
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.context import SparkContext
from pyspark.sql.functions import *
from pyspark.sql import functions as F
# Explicit imports placed after the star imports so these names always resolve
# to the PySpark classes used in annotations and column typing.
from pyspark.sql import DataFrame
from pyspark.sql.types import IntegerType


In [8]:
# Reuse the SparkContext of the running session if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Glue wrapper around the SparkContext.
glueContext = GlueContext(sc)
# SparkSession handle used below for configuration and for reading the event log.
spark = glueContext.spark_session
# Glue Job object bound to this context; it is not referenced again in the cells shown here.
job = Job(glueContext)

In [9]:
def handle_missing_users(sdf: DataFrame, unique_users: DataFrame) -> DataFrame:
    """
    Add a zero-filled row for every user that has no row in a feature DataFrame.

    Parameters:
    - sdf (DataFrame): PySpark feature DataFrame. Must contain a 'userId' column;
      every other column is treated as a feature column and is filled with 0
      (cast to that column's own type) for the users that are added.
    - unique_users (DataFrame): PySpark DataFrame with a 'userId' column listing
      every user that must be present in the result.

    Returns:
    - DataFrame: `sdf` itself when no user is missing, otherwise `sdf` plus one
      zero-filled row per missing user, with the same columns and column order.
    """
    # Compare the user ids themselves rather than total row counts: row counts
    # can be equal (or differ in the wrong direction) when `sdf` holds duplicate
    # rows, which would hide genuinely missing users.
    missing_users = unique_users.select("userId").subtract(sdf.select("userId"))

    missing_count = missing_users.count()
    if missing_count == 0:
        return sdf

    print(f"Missing Values: {missing_count}")

    # Build the filler rows column by column from the schema of `sdf`, so the
    # result does not depend on 'userId' being the first column and the fill
    # value has exactly the feature column's type (no implicit widening).
    filler = missing_users
    for field in sdf.schema.fields:
        if field.name != "userId":
            filler = filler.withColumn(field.name, lit(0).cast(field.dataType))

    # Resolve columns by name instead of by position.
    return sdf.unionByName(filler.select(*sdf.columns))

In [4]:
# Enable Arrow so that conversions between pandas and Spark DataFrames are faster.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Read the event log as a distributed Spark DataFrame so that large data can be processed.
# The read takes a while because no schema is supplied and Spark has to infer it from the JSON;
# an alternative is to load every field as a string and handle the schema manually.
sdf = spark.read.json("s3a://udacity-dsnd/sparkify/sparkify_event_data.json")




In [7]:
# The features are built per user, so rows without a userId are discarded.
clean_psdf = sdf.na.drop(subset=["userId"])
clean_psdf.createOrReplaceTempView("cleaned_user_log")


# One row per distinct userId found in the cleaned log.
unique_users = clean_psdf.select("userId").distinct()




## Build Features

### Song Counts

In [8]:
# Number of rows with a non-null artist per user, exposed as "song_counts".
song_counts = (
    clean_psdf.select("userId", "artist")
    .filter(F.col("artist").isNotNull())
    .groupBy("userId")
    .count()
    .withColumnRenamed("count", "song_counts")
)

# Users without any such row get a zero-filled entry.
song_counts = handle_missing_users(song_counts, unique_users)

AssertionError: 


### Distinct Artist


In [None]:
# Number of different artists per user, ignoring rows where artist is null.
distinct_artist = (
    clean_psdf.where(F.col("artist").isNotNull())
    .groupBy("userId")
    .agg(F.countDistinct(F.col("artist")).alias("distinct_artist"))
)

# Users without any artist row get a zero-filled entry.
distinct_artist = handle_missing_users(distinct_artist, unique_users)

### User Level

In [None]:
# Most recent subscription level per user, encoded as a binary flag
# (1 when the latest level is "paid", 0 otherwise).
#
# The latest row is selected with max(struct(ts, level)): structs compare field
# by field, so the maximum is the row with the largest `ts`. Sorting and then
# calling dropDuplicates() does not guarantee which duplicate row survives, so
# it cannot be relied on to keep the newest event.
#
# The flag is computed with a native column expression rather than a Python
# UDF, which keeps the comparison inside Spark and needs no return-type object.
user_level = (
    clean_psdf.select("userId", "ts", "level")
    .groupBy("userId")
    .agg(F.max(F.struct("ts", "level")).alias("latest"))
    .select(
        "userId",
        F.when(F.col("latest.level") == "paid", 1).otherwise(0).alias("level_flag"),
    )
)

# Users without any row get a zero-filled entry.
user_level = handle_missing_users(user_level, unique_users)

### Positive App Usage

In [None]:
positive_usage_list = ["Thumbs Up", "Thumbs Down", "Add Friend", "Add to playlist"]

# Count, per user, the events whose page is one of the pages listed above.
# `isin` compares strings exactly, so an entry whose capitalisation differs from
# the value stored in the log would silently match nothing. Both sides are
# lower-cased to make the match case-insensitive.
# NOTE(review): confirm the exact page names present in the data
# (e.g. whether "Add to playlist" is stored with a capital "P").
positive_usage = (
    clean_psdf.select("userId", "page")
    .filter(
        F.lower(F.col("page")).isin([name.lower() for name in positive_usage_list])
    )
    .groupBy("userId")
    .count()
    .withColumnRenamed("count", "pos_interactions")
)

# Users without any matching event get a zero-filled entry.
positive_usage = handle_missing_users(positive_usage, unique_users)

### Negative Interactions

In [None]:
neg_interactions_list = ["Error", "Help"]

# Count, per user, the events whose page is one of the pages listed above.
neg_interactions = (
    clean_psdf.select("userId", "page")
    .where(F.col("page").isin(neg_interactions_list))
    .groupBy("userId")
    .count()
    .withColumnRenamed("count", "neg_interactions")
)

# Users without any matching event get a zero-filled entry.
neg_interactions = handle_missing_users(neg_interactions, unique_users)

### Unique Locations

In [None]:
# Number of different locations per user, ignoring rows where location is null.
unique_locations = (
    clean_psdf.where(F.col("location").isNotNull())
    .groupBy("userId")
    .agg(F.countDistinct(F.col("location")).alias("unique_locations"))
)

# Users without any location row get a zero-filled entry.
unique_locations = handle_missing_users(unique_locations, unique_users)

### Page Counts

In [None]:
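# NOTE: this whole cell is disabled and kept for reference only. It refers to
# `clean_sdf` and `sns`, neither of which is defined or imported in the cells
# shown in this notebook, and its result (`page_count`) is also commented out of
# the feature list in the "Combine Features" cell.
# page_filter = ["Cancel", "Cancellation Confirmation", "NextSong"]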
# page_count_df = (
#     clean_sdf[["userId", "page"]]
#     .filter(~col("page").isin(page_filter))
#     .toPandas()
#     .groupby("userId")
#     .value_counts()
#     .reset_index()
#     .pivot(columns="page", values="count", index="userId")
#     .fillna(int(0))
# )


# page_count_corr = page_count_df.corr()

# sns.heatmap(
#     page_count_corr,
#     annot=False,
#     cmap="coolwarm",
#     fmt=".2f",
#     linewidths=0.5,
# )

# page_count = spark.createDataFrame(page_count_df.reset_index())

### Label

In [None]:
# One row per user that has at least one "Cancellation Confirmation" or
# "Cancel" page event, tagged with label = 1.
labels = (
    clean_psdf.select("userId", "page")
    .where(F.col("page").isin(["Cancellation Confirmation", "Cancel"]))
    .dropDuplicates(["userId"])
    .withColumn("label", F.lit(1))
)

# Keep only userId and label; every other user gets a zero-filled label row.
labels_df = handle_missing_users(labels.drop("page"), unique_users)

### Combine Features

In [None]:
# Feature frames to merge into the labels, one join per entry.
dfs = [
    song_counts,
    # NOTE(review): `song_listened_mean` is not defined in any cell shown in this
    # notebook, so this list raises NameError on a fresh kernel unless it is
    # defined elsewhere; confirm where it comes from or remove the entry.
    song_listened_mean,
    user_level,
    positive_usage,
    neg_interactions,
    unique_locations,
    distinct_artist,
    # page_count,
]

# Start from the labels and fold in each feature frame with a full outer join on "userId".
joined_features = labels_df
for df in dfs:
    joined_features = joined_features.join(df, "userId", "outer")

### Export Features For Model Testing

In [None]:

# Drop exact duplicate rows, remove the userId column, and append the result
# as CSV files under the S3 prefix below.
(
    joined_features.dropDuplicates()
    .drop("userId")
    .write.format("csv")
    .option("quote", None)
    .mode("append")
    .save("s3://spark-spotify-data/cleaned_features")
)