# For this user prediction model we will use a linear regression with some extra stuff
Let`s define the variables we will use as predictors and targets
-Age|, usage_of_platform|, popular_genre|, money_spent on platform|, number_of_subsriptions|
-usage_of_platform - defined on a 3 point scale low, medium, high. The usage is counted as the number of liked songs, playlists, users following|, artists following|, playlists_following|, 

In [31]:
# import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from datetime import datetime, date

In [109]:
# Load the data to be used 
df_users = pd.read_csv(r"/home/jirka/Documents/spotify/project/spotify/analysis/analysis/data/Users_data.csv")
df_songs = pd.read_csv(r"/home/jirka/Documents/spotify/project/spotify/analysis/analysis/data/Songs_data.csv")
df_artists = pd.read_csv(r"/home/jirka/Documents/spotify/project/spotify/analysis/analysis/data/Artists_data.csv")
df_playlists = pd.read_csv(r"/home/jirka/Documents/spotify/project/spotify/analysis/analysis/data/Playlists_data.csv")
df_playlists_following = pd.read_csv(r"/home/jirka/Documents/spotify/project/spotify/analysis/analysis/data/Playlists_users_data.csv")
df_likes = pd.read_csv(r"/home/jirka/Documents/spotify/project/spotify/analysis/analysis/data/Likes_data.csv")
df_artists_following = pd.read_csv(r"/home/jirka/Documents/spotify/project/spotify/analysis/analysis/data/Artists_followers_data.csv")
df_users_following = pd.read_csv(r"/home/jirka/Documents/spotify/project/spotify/analysis/analysis/data/Followers_users_data.csv")
df_users_subscriptions = pd.read_csv(r"/home/jirka/Documents/spotify/project/spotify/analysis/analysis/data/User_subscriptions_data.csv")
df_subscription_info = pd.read_csv(r"/home/jirka/Documents/spotify/project/spotify/analysis/analysis/data/Subscription_plan_info_data.csv")

In [33]:
# Define helper functions
def change_users_age(birthday_str):
    today_date = date.today()
    date_converted = datetime.strptime(birthday_str, "%Y-%m-%d").date()

    age = today_date.year - date_converted.year
    
    return age

In [None]:
# Data manipulation for usage_of_platform column
usage_of_platform = df_playlists_following.groupby(by='user_id')["playlists_id"].count().reset_index()
usage_of_platform.rename(columns={'playlists_id': 'engagement_count'}, inplace=True)

# Add engagement count from artists following
df_artists_following_count = df_artists_following.groupby(by='user_id')['artist_id'].count().reset_index()
usage_of_platform = usage_of_platform.merge(
    df_artists_following_count, 
    on='user_id', 
    how='left', 
    suffixes=('', '_artists')
)
usage_of_platform['engagement_count'] += usage_of_platform['artist_id'].fillna(0)

# Add engagement count from users following
users_following_count = df_users_following.groupby(by='user_id1')['user_id2'].count().reset_index()
usage_of_platform = usage_of_platform.merge(
    users_following_count.rename(columns={'user_id1': 'user_id'}), 
    on='user_id', 
    how='left', 
    suffixes=('', '_users')
)
usage_of_platform['engagement_count'] += usage_of_platform['user_id2'].fillna(0)

# Add engagement count from likes
likes_count = df_likes.groupby(by='user_id')['song_id'].count().reset_index()
usage_of_platform = usage_of_platform.merge(
    likes_count, 
    on='user_id', 
    how='left', 
    suffixes=('', '_likes')
)
usage_of_platform['engagement_count'] += usage_of_platform['song_id'].fillna(0)

# Add engagement count from playlists created
playlists_count = df_playlists.groupby(by='creator_id')['id'].count().reset_index()
usage_of_platform = usage_of_platform.merge(
    playlists_count.rename(columns={'creator_id': 'user_id'}), 
    on='user_id', 
    how='left', 
    suffixes=('', '_playlists')
)
usage_of_platform['engagement_count'] += usage_of_platform['id'].fillna(0)

# Drop temporary columns used in merging
columns_to_drop = ['artist_id', 'user_id2', 'song_id', 'id']
usage_of_platform.drop(columns=columns_to_drop, inplace=True)

bins = [0, 8, 17, float('inf')]  # Define bin edges: 0-50 (low), 50-100 (mid), >100 (high)
labels = ['low', 'mid', 'high']

usage_of_platform['enganment_bins'] = pd.cut(usage_of_platform['engagement_count'], bins=bins, labels=labels, right=False)

usage_of_platform


In [None]:
# Data manipulation for price, number_of_resubscriptions, number_of_money_spent column

df_users_subs_info = df_users_subscriptions.merge(df_subscription_info, how='left', left_on='subscription_plan_id', right_on='id', suffixes=('_user_sub', 'sub'))
df_users_subs_info_grouped = df_users_subs_info.groupby(by='user_id')['price'].sum().reset_index()
print(df_users_subs_info_grouped)

df_users_subscriptions_grouped = df_users_subscriptions.groupby(by='user_id')['subscription_plan_id'].count().reset_index()


In [116]:
# Data manipulation for favorite genre
df_songs_artists = df_songs.merge(df_artists, how='left', left_on='artist_id', right_on='id', suffixes=('_song', '_artist'))

df_likes_songs_artists = df_likes.merge(df_songs_artists, how='left', left_on='song_id', right_on='id_song')
df_likes_songs_artists_grouped = df_likes_songs_artists.groupby(by=['user_id', 'genre'])['song_id'].count().reset_index()

idx = df_likes_songs_artists_grouped.groupby('user_id')['song_id'].idxmax()

# Select the rows with the maximum song_count for each user_id
favorite_genre = df_likes_songs_artists_grouped.loc[idx].reset_index(drop=True)

favorite_genre

Unnamed: 0,user_id,genre,song_id
0,1,Pop,2
1,2,Pop,4
2,3,Jazz,3
3,4,Pop,4
4,5,Pop,2
5,6,Pop,3
6,7,Pop,3
7,8,Hip-Hop,2
8,9,Pop,2
9,10,Pop,3


In [119]:
# Creating the final analysis df
Analysis_df = pd.DataFrame()
Analysis_df['User_id'] = df_users['id']
Analysis_df["Age"] =  df_users["date_of_birth"].apply(change_users_age)
Analysis_df['User_type'] = df_users['user_type']
Analysis_df['Enganment_bins'] = usage_of_platform['enganment_bins']
Analysis_df = Analysis_df.merge(df_users_subscriptions_grouped, how='left', left_on='User_id', right_on='user_id')
Analysis_df = Analysis_df.merge(df_users_subs_info_grouped, how='left', left_on='User_id', right_on='user_id')
Analysis_df.rename(columns={"subscription_plan_id": "subscription_count", "price": "money_spent"}, inplace=True)
Analysis_df.drop(columns=["user_id_x", "user_id_y"], inplace=True)
Analysis_df["Favorite_genre"] = favorite_genre["genre"]
Analysis_df = Analysis_df.loc[Analysis_df['User_type'] == 'premium']

Analysis_df

Unnamed: 0,User_id,Age,User_type,Enganment_bins,subscription_count,money_spent,Favorite_genre
1,2,39,premium,high,6.0,22494.0,Pop
2,3,32,premium,high,4.0,23496.0,Jazz
3,4,36,premium,high,4.0,53996.0,Pop
6,7,29,premium,high,4.0,21496.0,Pop
7,8,37,premium,high,4.0,45996.0,Hip-Hop
8,9,30,premium,high,4.0,15996.0,Pop
10,11,35,premium,high,2.0,21498.0,Hip-Hop
11,12,31,premium,high,1.0,1499.0,Pop
13,14,38,premium,high,2.0,19498.0,Pop
14,15,32,premium,high,2.0,5998.0,Jazz
