# Feature Engineering

### Approach
<b>Features To Build </b>
- Total Songs Listened To
- App Page Interactions
    - Thumbs Up
    - Thumbs Down
    - Add Friend
    - Add to playlist
- Help Page Interactions/Error Page Interactions


<b> Key Points </b>
- The Feature dataframe will be created based on the userId
- Transformations should use native PySpark operations wherever possible

In [None]:
# Discard events that carry no userId; every feature below is keyed by user.
clean_psdf = psdf.dropna(subset=["userId"])

# Expose the filtered log to Spark SQL. The view is registered before the
# timestamp conversion below, so it still holds the original `ts` values and
# has no `date` column.
clean_psdf.createOrReplaceTempView("cleaned_user_log")

# One row per user; used later to back-fill users missing from a feature.
unique_users = clean_psdf.select("userId").distinct()

# `ts` is divided by 1000 before the cast (presumably epoch milliseconds --
# TODO confirm), then a `yyyy-MM-dd` string column is derived from the result.
event_time = (col("ts") / 1000).cast("timestamp")
clean_psdf = clean_psdf.withColumn("ts", event_time).withColumn(
    "date", date_format(col("ts"), "yyyy-MM-dd")
)

In [None]:
def handle_missing_users(
    sdf: DataFrame, unique_users: DataFrame, fill_value=lit(0)
) -> DataFrame:
    """
    Add a default-valued row for every user that is absent from a feature table.

    Parameters:
    - sdf (DataFrame): PySpark DataFrame with exactly two columns, 'userId'
      first and the feature column second.
    - unique_users (DataFrame): PySpark DataFrame with a 'userId' column
      listing every user that must appear in the result.
    - fill_value (Column): Value written to the feature column for users that
      are missing from `sdf`. Defaults to `lit(0)`.

    Returns:
    - DataFrame: `sdf` unchanged when no user is missing, otherwise `sdf`
      plus one row per missing user carrying `fill_value`.
    """
    # The feature column is whatever sits next to userId in the two-column input.
    feature_column = sdf.columns[1]

    # Look for absent users directly rather than comparing row counts: equal
    # counts do not prove that every user is present (e.g. duplicate userIds
    # in `sdf` would mask a missing user).
    missing_users = unique_users.select("userId").subtract(sdf.select("userId"))
    missing_count = missing_users.count()

    if missing_count == 0:
        return sdf

    print(f"Missing Values: {missing_count}")
    missing_users_sdf = missing_users.withColumn(feature_column, fill_value)

    # Match columns by name so the result does not depend on column position.
    return sdf.unionByName(missing_users_sdf)

## Build Features

### Song Counts

In [None]:
# Rows with a non-null artist are counted per user as that user's song total.
song_counts = (
    clean_psdf.select("userId", "artist")
    .filter(col("artist").isNotNull())
    .groupBy("userId")
    .count()
    .withColumnRenamed("count", "song_counts")
)

# Users with no such rows are added with the helper's default fill value.
song_counts = handle_missing_users(song_counts, unique_users)

### Distinct Artist

In [None]:
# Number of different artists each user has a row for.
rows_with_artist = clean_psdf.dropna(subset=["artist"])
distinct_artist = rows_with_artist.groupBy("userId").agg(
    F.countDistinct("artist").alias("distinct_artist")
)

# Users with no artist rows are added with the helper's default fill value.
distinct_artist = handle_missing_users(distinct_artist, unique_users)

### User Level

In [None]:
# Most recent subscription level per user.
# Sorting and then calling dropDuplicates does not guarantee that the newest
# row is the one kept, so the latest level is selected explicitly: the max of
# a (ts, level) struct is the struct with the greatest ts, and its `level`
# field is that event's level.
user_level = (
    clean_psdf.select("userId", "level", "ts")
    .groupBy("userId")
    .agg(F.max(F.struct("ts", "level")).alias("latest_event"))
    .select("userId", F.col("latest_event.level").alias("level"))
)

# 1 when the level is "paid", 0 for anything else (including null).
level_flag_udf = udf(lambda x: 1 if x == "paid" else 0, IntegerType())

# Binary-encode the level and keep only the flag.
user_level = user_level.withColumn(
    "level_flag", level_flag_udf(user_level["level"])
).select("userId", "level_flag")

# Users without a row are added with the helper's default fill value.
user_level = handle_missing_users(user_level, unique_users)

### Positive App Usage

In [None]:
positive_usage_list = ["Thumbs Up", "Thumbs Down", "Add Friend", "Add to playlist"]

# `isin` is case-sensitive, so a page logged with different capitalisation
# (e.g. "Add to Playlist" -- NOTE(review): confirm against the distinct `page`
# values in the log) would be silently excluded. Compare lower-cased names so
# the count does not depend on the exact casing used in the list.
positive_pages_lower = [page.lower() for page in positive_usage_list]

# Number of events per user on any of the listed pages.
positive_usage = (
    clean_psdf.select("userId", "page")
    .filter(F.lower(col("page")).isin(positive_pages_lower))
    .groupBy("userId")
    .count()
    .withColumnRenamed("count", "pos_interactions")
)

# Users with no such events are added with the helper's default fill value.
positive_usage = handle_missing_users(positive_usage, unique_users)

### Negative Interactions

In [None]:
neg_interactions_list = ["Error", "Help"]

# Number of events per user whose page is one of the listed pages.
on_negative_page = col("page").isin(neg_interactions_list)
neg_interactions = (
    clean_psdf.select("userId", "page")
    .where(on_negative_page)
    .groupBy("userId")
    .count()
    .withColumnRenamed("count", "neg_interactions")
)

# Users with no such events are added with the helper's default fill value.
neg_interactions = handle_missing_users(neg_interactions, unique_users)

### Unique Locations

In [None]:
# Number of different non-null locations recorded for each user.
rows_with_location = clean_psdf.where(col("location").isNotNull())
unique_locations = rows_with_location.groupBy("userId").agg(
    F.countDistinct("location").alias("unique_locations")
)

# Users with no located rows are added with the helper's default fill value.
unique_locations = handle_missing_users(unique_locations, unique_users)

### Avg Daily Listens

In [None]:
# Step 1: listens per user per calendar day (rows with a non-null artist).
daily_listens_per_user = (
    clean_psdf.filter(col("artist").isNotNull())
    .groupBy("userId", "date")
    .agg(count("artist").alias("daily_listens"))
)

# Step 2: average those daily totals per user. The resulting column keeps
# Spark's generated name, "avg(daily_listens)".
average_daily_listens = daily_listens_per_user.groupBy("userId").agg(
    F.avg("daily_listens")
)

# Users with no listens are added with the helper's default fill value.
average_daily_listens = handle_missing_users(average_daily_listens, unique_users)

### Page Counts

In [None]:
# Pages excluded from the per-page counts.
page_filter = ["Cancel", "Cancellation Confirmation", "NextSong"]

# Per-user visit counts with one column per remaining page.
# The pivot runs in Spark so only the small per-user result reaches the
# driver, instead of collecting the whole filtered event log into pandas.
# Counts are integer-typed, and user/page pairs that never occur are 0.
page_count = (
    clean_psdf.select("userId", "page")
    .filter(~col("page").isin(page_filter))
    .groupBy("userId")
    .pivot("page")
    .count()
    .fillna(0)
)

# pandas copy (indexed by userId, one column per page) for the correlation plot.
page_count_df = page_count.toPandas().set_index("userId")

page_count_corr = page_count_df.corr()

sns.heatmap(
    page_count_corr,
    annot=False,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
)

### Sessions

In [None]:
from pyspark.sql.functions import countDistinct

# Display the number of distinct sessionId values per user.
sessions_per_user = countDistinct("sessionId").alias("sessionCount")
clean_psdf.groupBy("userId").agg(sessions_per_user).show()

### Label

In [None]:
# Display the distinct (userId, page) rows whose page is
# "Cancellation Confirmation".
(
    clean_psdf.select("userId", "page")
    .where(col("page").isin(["Cancellation Confirmation"]))
    .distinct()
    .show()
)

In [None]:
# A user is labelled 1 when they have at least one event on either page.
churn_pages = ["Cancellation Confirmation", "Cancel"]

labels = (
    clean_psdf.select("userId", "page")
    .where(col("page").isin(churn_pages))
    .dropDuplicates(["userId"])  # one row per matching user
    .withColumn("label", lit(1))
)

# Keep only userId and label.
labels_df = labels.drop("page")

# Every other user is added with the helper's default fill value.
labels_df = handle_missing_users(labels_df, unique_users)

In [None]:
# Build the label table (1 = user has a "Cancel" or "Cancellation
# Confirmation" event) and display it.
is_churn_event = col("page").isin(["Cancellation Confirmation", "Cancel"])
churn_rows = clean_psdf.select("userId", "page").where(is_churn_event)

# One row per matching user, flagged with label 1.
labels = churn_rows.dropDuplicates(["userId"]).withColumn("label", lit(1))
labels_df = labels.drop("page")

# Every other user is added with the helper's default fill value.
labels_df = handle_missing_users(labels_df, unique_users)

labels_df.show()

In [None]:
# Feature tables to attach to the labels. song_counts and song_listened_mean
# are currently excluded from the list.
dfs = [
    average_daily_listens,
    user_level,
    positive_usage,
    neg_interactions,
    unique_locations,
    distinct_artist,
    page_count,
]

# Start from the labels and outer-join each feature table on userId, so a user
# present in any table appears in the result.
joined_features = labels_df
for df in dfs:
    joined_features = joined_features.join(df, on="userId", how="outer")

### Export Features For Model Testing

In [None]:
# Remove exact duplicate rows, drop the identifier, and write the feature
# table to CSV through pandas (the pandas index is written as well).
features_without_id = joined_features.dropDuplicates().drop("userId")
features_without_id.toPandas().to_csv("mini_selected_features.csv")

### Feature Correlations

In [None]:
# Collect the joined features to pandas and drop the identifier so that only
# the label and feature columns take part in the correlation.
joined_features_df = joined_features.toPandas().drop(columns="userId")
# Optionally exclude features before correlating, e.g.:
# joined_features_df = joined_features_df.drop(
#     labels=["song_counts", "unique_locations"], axis=1
# )

# Pairwise correlation between all remaining columns.
correlation_matrix = joined_features_df.corr()

plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=False, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Extracted Feature Correlations")
plt.show()

In [None]:
from itertools import combinations

# Collect every unordered pair of columns whose correlation exceeds 0.8
# (adjust the threshold as needed). Only positive correlations are reported.
highly_correlated_pairs = []

for col1, col2 in combinations(correlation_matrix.columns, 2):
    # Single .loc lookup (row col2, column col1) instead of chained indexing.
    pair_correlation = correlation_matrix.loc[col2, col1]
    if pair_correlation > 0.8:
        highly_correlated_pairs.append((col1, col2, pair_correlation))

print("Highly Correlated Pairs:")

# Naming the columns keeps the sort valid when no pair passes the threshold;
# an empty frame without columns cannot be sorted by position 0.
print(
    pd.DataFrame(
        data=highly_correlated_pairs,
        columns=["feature_1", "feature_2", "correlation"],
    ).sort_values(by="feature_1")
)

In [None]:
pandas_df = joined_features.toPandas()

# One boxplot per column. userId is an identifier rather than a feature, so it
# is skipped; `pandas_df` itself still contains it.
for column in pandas_df.columns.drop("userId", errors="ignore"):
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=pandas_df[column])
    plt.title(f"Distribution of {column}")
    plt.xlabel(column)
    plt.show()

In [None]:
from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler

# Work on a de-duplicated copy with missing feature values replaced by 0.
df = joined_features_df.drop_duplicates().fillna(0).copy()

# Target and feature matrix. The label is removed by name rather than by
# position, so it cannot end up in X if the column order ever changes.
Y = df["label"]
X = df.drop(columns="label")

In [None]:
# Standardise the features, then project them onto their principal components.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

pca = PCA()
X = pca.fit_transform(X)

# The covariance estimate is computed here, but its value is not stored.
pca.get_covariance()

# Fraction of total variance explained by each component.
explained_variance = pca.explained_variance_ratio_

In [None]:
# Bar chart of the explained-variance ratio for every principal component.
with plt.style.context("dark_background"):
    plt.figure(figsize=(6, 4))
    component_positions = range(len(explained_variance))
    bar_options = {
        "alpha": 0.5,
        "align": "center",
        "label": "individual explained variance",
    }
    plt.bar(component_positions, explained_variance, **bar_options)
    plt.xlabel("Principal components")
    plt.ylabel("Explained variance ratio")
    plt.legend()
    plt.tight_layout()

In [None]:
# Bar chart of the explained-variance ratio for the first (up to) six components.
# The x positions are sized from the slice so the chart still draws when fewer
# than six components exist.
top_explained_variance = explained_variance[:6]

with plt.style.context("dark_background"):
    plt.figure(figsize=(6, 4))
    plt.bar(
        range(len(top_explained_variance)),
        top_explained_variance,
        alpha=0.5,
        align="center",
        # The legend needs a labelled artist; without one it is empty and
        # matplotlib emits a warning.
        label="individual explained variance",
    )
    plt.xlabel("Principal components")
    plt.ylabel("Explained variance ratio")
    plt.legend()
    plt.tight_layout()

In [None]:
# Projection of the cleaned log onto its location and userId columns.
clean_psdf.select("location", "userId")

In [None]:
# Scratch cell: format an integer followed by the suffix "g".
ex = 12
f"{ex}g"