In [1819]:
#!pip install pandas
import pandas as pd
#!pip install numpy
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [1821]:
# Attempt to read the file with a different encoding, such as 'ISO-8859-1' or 'latin1'
df = pd.read_csv(r"C:\Users\info\spotify\StateProvincesTableFromMSWWID2016.csv", encoding='ISO-8859-1')

In [1822]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32833 entries, 0 to 32832
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   track_id                  32833 non-null  object 
 1   track_name                32828 non-null  object 
 2   track_artist              32828 non-null  object 
 3   track_popularity          32833 non-null  int64  
 4   track_album_id            32833 non-null  object 
 5   track_album_name          32828 non-null  object 
 6   track_album_release_date  32833 non-null  object 
 7   playlist_name             32833 non-null  object 
 8   playlist_id               32833 non-null  object 
 9   playlist_genre            32833 non-null  object 
 10  playlist_subgenre         32833 non-null  object 
 11  danceability              32833 non-null  float64
 12  energy                    32833 non-null  float64
 13  key                       32833 non-null  int64  
 14  loudne

In [1823]:
df.shape

(32833, 23)

In [1825]:
# Check missing values in a specific column
print(df.isnull().sum())


track_id                    0
track_name                  5
track_artist                5
track_popularity            0
track_album_id              0
track_album_name            5
track_album_release_date    0
playlist_name               0
playlist_id                 0
playlist_genre              0
playlist_subgenre           0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
duration_ms                 0
dtype: int64


In [1828]:
import pandas as pd

# Step 1: Clean the data by stripping any leading/trailing whitespaces and ensuring consistent case
df['track_id'] = df['track_id'].str.strip()  # Strip whitespaces from 'track_id'
df['playlist_genre'] = df['playlist_genre'].str.strip()  # Strip whitespaces from 'playlist_genre'

# Step 2: Define the genre priority order
genre_priority = ['edm', 'rap', 'pop', 'r&b', 'latin', 'rock']

# Step 3: Set the playlist_genre column as a categorical type with the specified order
df['playlist_genre'] = pd.Categorical(df['playlist_genre'], categories=genre_priority, ordered=True)

# Step 4: Sort the DataFrame by track_id and playlist_genre to prioritize higher genres
df_sorted = df.sort_values(by=['track_id', 'playlist_genre'], ascending=[True, True])


In [1830]:
# Step 1: Count the number of duplicates before removal
duplicates_before = df[df['track_id'].duplicated()]
initial_duplicates_count = duplicates_before.shape[0]
print(f"Number of duplicates in 'track_id' before removal: {initial_duplicates_count}")

# Step 2: Count the number of occurrences of each 'playlist_genre'
genre_counts = df['playlist_genre'].value_counts()

# Step 3: Sort the genres by size (largest first)
sorted_genres = genre_counts.index

# Step 4: Sort the DataFrame by genre size
df['playlist_genre_size'] = df['playlist_genre'].map(genre_counts)

# Step 5: Remove duplicates based on 'track_id', keeping the genre with the largest size
df = df.sort_values(by=['track_id', 'playlist_genre_size'], ascending=[True, False]).drop_duplicates(subset='track_id', keep='first')

# Step 6: Check for duplicates after removal
duplicates_after = df[df['track_id'].duplicated()]
remaining_duplicates_count = duplicates_after.shape[0]
print(f"Number of remaining duplicates in 'track_id' after removal: {remaining_duplicates_count}")

# Step 7: Optionally, display the rows that are duplicates (if any)
if remaining_duplicates_count > 0:
    print("The following rows have duplicate 'track_id's after removal:")
    print(duplicates_after)
else:
    print("No remaining duplicates in 'track_id' after removal.")

# Clean up the temporary 'playlist_genre_size' column
df = df.drop(columns=['playlist_genre_size'])

# Step 8: Display the result for the genres count in each category
genre_counts_after = df['playlist_genre'].value_counts()
print("\nNumber of playlist_genre in each category after removal:")
print(genre_counts_after)


Number of duplicates in 'track_id' before removal: 4477
Number of remaining duplicates in 'track_id' after removal: 0
No remaining duplicates in 'track_id' after removal.

Number of playlist_genre in each category after removal:
playlist_genre
rap      5057
r&b      4918
edm      4877
latin    4633
rock     4451
pop      4420
Name: count, dtype: int64


In [1831]:
import pandas as pd

# Assuming df is your DataFrame
if not df.empty:
    # Group by 'track_id' and 'playlist_genre' and count the occurrences
    track_genre_counts = df.groupby(['track_id', 'playlist_genre']).size().reset_index(name='count_per_genre')

In [1832]:
df.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
12644,0017A6SJgTbfQVU2EtsPNo,Pangarap,Barbie's Cradle,41,1srJQ0njEQgd8w4XSqI4JQ,Trip,01/01/2001,Pinoy Classic Rock,37i9dQZF1DWYDQ8wBxd7xt,rock,...,2,-10.068,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440
25675,002xjHwzEx66OWFV2IP9dk,The Others,RIKA,15,1ficfUnZMaY1QkNp15Slzm,The Others,26/01/2018,Groovy // Funky // Neo-Soul,0JmBB9HfrzDiZoPVRdv8ns,r&b,...,5,-6.242,1,0.0347,0.0651,0.0,0.212,0.698,150.863,197286
15061,004s3t0ONYlzxII9PLgU6z,I Feel Alive,Steady Rollin,28,3z04Lb9Dsilqw68SHt6jLB,Love & Loss,21/11/2017,Hard Rock Workout,3YouF0u7waJnolytf9JCXf,rock,...,9,-4.739,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512
2887,008MceT31RotUANsKuzy3L,Liquid Blue,The.madpix.project,24,1Z4ANBVuhTlS6DprlP0m1q,Liquid Blue,07/08/2015,Electropop - Pop,5TiiHps0hNCyQ6ijVkNZQs,pop,...,10,-5.644,0,0.054,0.000761,0.132,0.322,0.852,128.041,228565
1029,008rk8F6ZxspZT4bUlkIQG,Fever,YOSA & TAAR,38,2BuYm9UcKvI0ydXs5JKwt0,Fever,16/11/2018,Best of 2019 Dance Pop: Japan,37i9dQZF1DXdOtZGKonFlM,pop,...,1,-6.3,1,0.0499,0.114,0.000697,0.0881,0.496,129.884,236308


In [1834]:
df.isnull().sum()

track_id                    0
track_name                  4
track_artist                4
track_popularity            0
track_album_id              0
track_album_name            4
track_album_release_date    0
playlist_name               0
playlist_id                 0
playlist_genre              0
playlist_subgenre           0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
duration_ms                 0
dtype: int64

In [1836]:
print(df.columns)

Index(['track_id', 'track_name', 'track_artist', 'track_popularity',
       'track_album_id', 'track_album_name', 'track_album_release_date',
       'playlist_name', 'playlist_id', 'playlist_genre', 'playlist_subgenre',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms'],
      dtype='object')


In [1838]:
df_shape = df.shape
print(f"Shape of the DataFrame: {df_shape}")


Shape of the DataFrame: (28356, 23)


In [1840]:
#df = df[df['track_album_release_date'].str.match(r'^\d{4}$')]
#df.head()

In [1842]:
df.shape

(28356, 23)

In [1845]:
df

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
12644,0017A6SJgTbfQVU2EtsPNo,Pangarap,Barbie's Cradle,41,1srJQ0njEQgd8w4XSqI4JQ,Trip,01/01/2001,Pinoy Classic Rock,37i9dQZF1DWYDQ8wBxd7xt,rock,...,2,-10.068,1,0.0236,0.279000,0.011700,0.0887,0.566,97.091,235440
25675,002xjHwzEx66OWFV2IP9dk,The Others,RIKA,15,1ficfUnZMaY1QkNp15Slzm,The Others,26/01/2018,Groovy // Funky // Neo-Soul,0JmBB9HfrzDiZoPVRdv8ns,r&b,...,5,-6.242,1,0.0347,0.065100,0.000000,0.2120,0.698,150.863,197286
15061,004s3t0ONYlzxII9PLgU6z,I Feel Alive,Steady Rollin,28,3z04Lb9Dsilqw68SHt6jLB,Love & Loss,21/11/2017,Hard Rock Workout,3YouF0u7waJnolytf9JCXf,rock,...,9,-4.739,1,0.0442,0.011700,0.009940,0.3470,0.404,135.225,373512
2887,008MceT31RotUANsKuzy3L,Liquid Blue,The.madpix.project,24,1Z4ANBVuhTlS6DprlP0m1q,Liquid Blue,07/08/2015,Electropop - Pop,5TiiHps0hNCyQ6ijVkNZQs,pop,...,10,-5.644,0,0.0540,0.000761,0.132000,0.3220,0.852,128.041,228565
1029,008rk8F6ZxspZT4bUlkIQG,Fever,YOSA & TAAR,38,2BuYm9UcKvI0ydXs5JKwt0,Fever,16/11/2018,Best of 2019 Dance Pop: Japan,37i9dQZF1DXdOtZGKonFlM,pop,...,1,-6.300,1,0.0499,0.114000,0.000697,0.0881,0.496,129.884,236308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22211,7zxRMhXxJMQCeDDg0rKAVo,Some Way,NAV,72,4JR29CNW14Zc4Z5vflxkoJ,NAV,24/02/2017,PROJECT: Contemporary,6HaCi9bqaiuSZEDfCEmwyo,r&b,...,0,-6.103,1,0.3510,0.101000,0.000000,0.0919,0.340,154.962,179773
24247,7zyLObYw4QUKQDyZOb4J0Y,I'll Do 4 U (Re-Recorded / Remastered),Father MC,36,14HYMxFhpgDIr9cci1u0kt,I'll Do 4 U (Re-Recorded / Remastered),01/10/2010,New Jack Swing/ R&B Hits: 1987 - 2002,4sji14lrB5bgcr51lPALYH,r&b,...,1,-4.920,0,0.0633,0.143000,0.000000,0.0720,0.810,109.536,223890
3793,7zycSpvjDcqh6YT1FEl2kY,Anaconda,Nicki Minaj,49,5qs8T6ZHSrnllnOuUk6muC,The Pinkprint (Deluxe Edition),15/12/2014,10er Playlist,1kEczIkZH8IgaWT2BiApxZ,pop,...,2,-6.224,1,0.1800,0.067300,0.000006,0.2140,0.647,129.990,260240
26767,7zye9v6B785eFWEFYs13C2,Bound,Ponderosa Twins Plus One,40,1xdgLmTFMSyJyI5DJOOX7T,2+2+1 = (Digitally Remastered),09/07/2013,Sexy Soul 2020,5EMARioe9z9eKOeWIAC2JW,r&b,...,5,-6.457,0,0.0270,0.715000,0.000428,0.1150,0.657,142.218,191205


In [1847]:
df.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
12644,0017A6SJgTbfQVU2EtsPNo,Pangarap,Barbie's Cradle,41,1srJQ0njEQgd8w4XSqI4JQ,Trip,01/01/2001,Pinoy Classic Rock,37i9dQZF1DWYDQ8wBxd7xt,rock,...,2,-10.068,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440
25675,002xjHwzEx66OWFV2IP9dk,The Others,RIKA,15,1ficfUnZMaY1QkNp15Slzm,The Others,26/01/2018,Groovy // Funky // Neo-Soul,0JmBB9HfrzDiZoPVRdv8ns,r&b,...,5,-6.242,1,0.0347,0.0651,0.0,0.212,0.698,150.863,197286
15061,004s3t0ONYlzxII9PLgU6z,I Feel Alive,Steady Rollin,28,3z04Lb9Dsilqw68SHt6jLB,Love & Loss,21/11/2017,Hard Rock Workout,3YouF0u7waJnolytf9JCXf,rock,...,9,-4.739,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512
2887,008MceT31RotUANsKuzy3L,Liquid Blue,The.madpix.project,24,1Z4ANBVuhTlS6DprlP0m1q,Liquid Blue,07/08/2015,Electropop - Pop,5TiiHps0hNCyQ6ijVkNZQs,pop,...,10,-5.644,0,0.054,0.000761,0.132,0.322,0.852,128.041,228565
1029,008rk8F6ZxspZT4bUlkIQG,Fever,YOSA & TAAR,38,2BuYm9UcKvI0ydXs5JKwt0,Fever,16/11/2018,Best of 2019 Dance Pop: Japan,37i9dQZF1DXdOtZGKonFlM,pop,...,1,-6.3,1,0.0499,0.114,0.000697,0.0881,0.496,129.884,236308


In [1849]:
from IPython.display import display, HTML
display(HTML("<b>track_album_release_date</b>"))

In [1853]:
len(df['track_id'].unique())

28356

In [1855]:
df.shape

(28356, 23)

In [1857]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28356 entries, 12644 to 12608
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   track_id                  28356 non-null  object  
 1   track_name                28352 non-null  object  
 2   track_artist              28352 non-null  object  
 3   track_popularity          28356 non-null  int64   
 4   track_album_id            28356 non-null  object  
 5   track_album_name          28352 non-null  object  
 6   track_album_release_date  28356 non-null  object  
 7   playlist_name             28356 non-null  object  
 8   playlist_id               28356 non-null  object  
 9   playlist_genre            28356 non-null  category
 10  playlist_subgenre         28356 non-null  object  
 11  danceability              28356 non-null  float64 
 12  energy                    28356 non-null  float64 
 13  key                       28356 non-null  int64

In [1859]:
# Get the number of columns in the DataFrame
num_columns = df.shape[1]

# Print the number of columns
print(f"Number of columns in the DataFrame: {num_columns}")


Number of columns in the DataFrame: 23


In [1861]:
# Get unique values in the 'playlist_genre' column
unique_genres = df['playlist_genre'].unique()

# Display the unique genres
print(unique_genres)


['rock', 'r&b', 'pop', 'latin', 'edm', 'rap']
Categories (6, object): ['edm' < 'rap' < 'pop' < 'r&b' < 'latin' < 'rock']


In [1863]:
# Group by 'playlist_genre' and count the occurrences of each genre
genre_counts = df.groupby('playlist_genre').size().reset_index(name='count')

# Display the result
print("Number of playlist_genre in each category:")
print(genre_counts)


Number of playlist_genre in each category:
  playlist_genre  count
0            edm   4877
1            rap   5057
2            pop   4420
3            r&b   4918
4          latin   4633
5           rock   4451


In [1865]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28356 entries, 12644 to 12608
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   track_id                  28356 non-null  object  
 1   track_name                28352 non-null  object  
 2   track_artist              28352 non-null  object  
 3   track_popularity          28356 non-null  int64   
 4   track_album_id            28356 non-null  object  
 5   track_album_name          28352 non-null  object  
 6   track_album_release_date  28356 non-null  object  
 7   playlist_name             28356 non-null  object  
 8   playlist_id               28356 non-null  object  
 9   playlist_genre            28356 non-null  category
 10  playlist_subgenre         28356 non-null  object  
 11  danceability              28356 non-null  float64 
 12  energy                    28356 non-null  float64 
 13  key                       28356 non-null  int64

In [1867]:
print(df.columns)


Index(['track_id', 'track_name', 'track_artist', 'track_popularity',
       'track_album_id', 'track_album_name', 'track_album_release_date',
       'playlist_name', 'playlist_id', 'playlist_genre', 'playlist_subgenre',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms'],
      dtype='object')


In [1869]:
df['track_id'] = df['track_id'].astype('string')
df['track_name'] = df['track_name'].astype('string')
df['track_artist'] = df['track_artist'].astype('string')
df['track_album_id'] = df['track_album_id'].astype('string')
df['track_album_name'] = df['track_album_name'].astype('string')
df['playlist_name'] = df['playlist_name'].astype('string')
df['playlist_id'] = df['playlist_id'].astype('string')
df['playlist_genre'] = df['playlist_genre'].astype('category')
df['playlist_subgenre'] = df['playlist_subgenre'].astype('category')

In [1871]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28356 entries, 12644 to 12608
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   track_id                  28356 non-null  string  
 1   track_name                28352 non-null  string  
 2   track_artist              28352 non-null  string  
 3   track_popularity          28356 non-null  int64   
 4   track_album_id            28356 non-null  string  
 5   track_album_name          28352 non-null  string  
 6   track_album_release_date  28356 non-null  object  
 7   playlist_name             28356 non-null  string  
 8   playlist_id               28356 non-null  string  
 9   playlist_genre            28356 non-null  category
 10  playlist_subgenre         28356 non-null  category
 11  danceability              28356 non-null  float64 
 12  energy                    28356 non-null  float64 
 13  key                       28356 non-null  int64

In [1873]:
#chek 10:50
import pandas as pd

# Convert the entire column to datetime64[ns], allowing for mixed formats
df['track_album_release_date'] = pd.to_datetime(df['track_album_release_date'], errors='coerce', dayfirst=True)
df.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
12644,0017A6SJgTbfQVU2EtsPNo,Pangarap,Barbie's Cradle,41,1srJQ0njEQgd8w4XSqI4JQ,Trip,2001-01-01,Pinoy Classic Rock,37i9dQZF1DWYDQ8wBxd7xt,rock,...,2,-10.068,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440
25675,002xjHwzEx66OWFV2IP9dk,The Others,RIKA,15,1ficfUnZMaY1QkNp15Slzm,The Others,2018-01-26,Groovy // Funky // Neo-Soul,0JmBB9HfrzDiZoPVRdv8ns,r&b,...,5,-6.242,1,0.0347,0.0651,0.0,0.212,0.698,150.863,197286
15061,004s3t0ONYlzxII9PLgU6z,I Feel Alive,Steady Rollin,28,3z04Lb9Dsilqw68SHt6jLB,Love & Loss,2017-11-21,Hard Rock Workout,3YouF0u7waJnolytf9JCXf,rock,...,9,-4.739,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512
2887,008MceT31RotUANsKuzy3L,Liquid Blue,The.madpix.project,24,1Z4ANBVuhTlS6DprlP0m1q,Liquid Blue,2015-08-07,Electropop - Pop,5TiiHps0hNCyQ6ijVkNZQs,pop,...,10,-5.644,0,0.054,0.000761,0.132,0.322,0.852,128.041,228565
1029,008rk8F6ZxspZT4bUlkIQG,Fever,YOSA & TAAR,38,2BuYm9UcKvI0ydXs5JKwt0,Fever,2018-11-16,Best of 2019 Dance Pop: Japan,37i9dQZF1DXdOtZGKonFlM,pop,...,1,-6.3,1,0.0499,0.114,0.000697,0.0881,0.496,129.884,236308


In [1875]:
# Step 1: Ensure the 'track_album_release_date' is in datetime format
df['track_album_release_date'] = pd.to_datetime(df['track_album_release_date'], errors='coerce', dayfirst=True)

# Step 2: Create the 'track_album_release_year' and 'track_album_release_month' columns
df['track_album_release_year'] = df['track_album_release_date'].dt.year
df['track_album_release_month'] = df['track_album_release_date'].dt.month

# Step 3: Remove the original 'track_album_release_date' column
df = df.drop(columns=['track_album_release_date'])
df.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,playlist_name,playlist_id,playlist_genre,playlist_subgenre,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,track_album_release_year,track_album_release_month
12644,0017A6SJgTbfQVU2EtsPNo,Pangarap,Barbie's Cradle,41,1srJQ0njEQgd8w4XSqI4JQ,Trip,Pinoy Classic Rock,37i9dQZF1DWYDQ8wBxd7xt,rock,classic rock,...,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440,2001.0,1.0
25675,002xjHwzEx66OWFV2IP9dk,The Others,RIKA,15,1ficfUnZMaY1QkNp15Slzm,The Others,Groovy // Funky // Neo-Soul,0JmBB9HfrzDiZoPVRdv8ns,r&b,neo soul,...,1,0.0347,0.0651,0.0,0.212,0.698,150.863,197286,2018.0,1.0
15061,004s3t0ONYlzxII9PLgU6z,I Feel Alive,Steady Rollin,28,3z04Lb9Dsilqw68SHt6jLB,Love & Loss,Hard Rock Workout,3YouF0u7waJnolytf9JCXf,rock,hard rock,...,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512,2017.0,11.0
2887,008MceT31RotUANsKuzy3L,Liquid Blue,The.madpix.project,24,1Z4ANBVuhTlS6DprlP0m1q,Liquid Blue,Electropop - Pop,5TiiHps0hNCyQ6ijVkNZQs,pop,electropop,...,0,0.054,0.000761,0.132,0.322,0.852,128.041,228565,2015.0,8.0
1029,008rk8F6ZxspZT4bUlkIQG,Fever,YOSA & TAAR,38,2BuYm9UcKvI0ydXs5JKwt0,Fever,Best of 2019 Dance Pop: Japan,37i9dQZF1DXdOtZGKonFlM,pop,dance pop,...,1,0.0499,0.114,0.000697,0.0881,0.496,129.884,236308,2018.0,11.0


In [1877]:
df.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,playlist_name,playlist_id,playlist_genre,playlist_subgenre,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,track_album_release_year,track_album_release_month
12644,0017A6SJgTbfQVU2EtsPNo,Pangarap,Barbie's Cradle,41,1srJQ0njEQgd8w4XSqI4JQ,Trip,Pinoy Classic Rock,37i9dQZF1DWYDQ8wBxd7xt,rock,classic rock,...,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440,2001.0,1.0
25675,002xjHwzEx66OWFV2IP9dk,The Others,RIKA,15,1ficfUnZMaY1QkNp15Slzm,The Others,Groovy // Funky // Neo-Soul,0JmBB9HfrzDiZoPVRdv8ns,r&b,neo soul,...,1,0.0347,0.0651,0.0,0.212,0.698,150.863,197286,2018.0,1.0
15061,004s3t0ONYlzxII9PLgU6z,I Feel Alive,Steady Rollin,28,3z04Lb9Dsilqw68SHt6jLB,Love & Loss,Hard Rock Workout,3YouF0u7waJnolytf9JCXf,rock,hard rock,...,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512,2017.0,11.0
2887,008MceT31RotUANsKuzy3L,Liquid Blue,The.madpix.project,24,1Z4ANBVuhTlS6DprlP0m1q,Liquid Blue,Electropop - Pop,5TiiHps0hNCyQ6ijVkNZQs,pop,electropop,...,0,0.054,0.000761,0.132,0.322,0.852,128.041,228565,2015.0,8.0
1029,008rk8F6ZxspZT4bUlkIQG,Fever,YOSA & TAAR,38,2BuYm9UcKvI0ydXs5JKwt0,Fever,Best of 2019 Dance Pop: Japan,37i9dQZF1DXdOtZGKonFlM,pop,dance pop,...,1,0.0499,0.114,0.000697,0.0881,0.496,129.884,236308,2018.0,11.0


In [1879]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
track_popularity,28356.0,39.329771,23.702376,0.0,21.0,42.0,58.0,100.0
danceability,28356.0,0.653372,0.145785,0.0,0.561,0.67,0.76,0.983
energy,28356.0,0.698388,0.183503,0.000175,0.579,0.722,0.843,1.0
key,28356.0,5.368,3.613904,0.0,2.0,6.0,9.0,11.0
loudness,28356.0,-6.817696,3.036243,-46.448,-8.30925,-6.261,-4.709,1.275
mode,28356.0,0.565489,0.495701,0.0,0.0,1.0,1.0,1.0
speechiness,28356.0,0.107954,0.102556,0.0,0.041,0.0626,0.133,0.918
acousticness,28356.0,0.177176,0.222803,0.0,0.014375,0.0797,0.26,0.994
instrumentalness,28356.0,0.091117,0.232548,0.0,0.0,2.1e-05,0.00657,0.994
liveness,28356.0,0.190958,0.155894,0.0,0.0926,0.127,0.249,0.996


In [1881]:
# Check for missing values in all columns
missing_values = df.isnull().sum()

# Display the result
print(missing_values)

track_id                        0
track_name                      4
track_artist                    4
track_popularity                0
track_album_id                  0
track_album_name                4
playlist_name                   0
playlist_id                     0
playlist_genre                  0
playlist_subgenre               0
danceability                    0
energy                          0
key                             0
loudness                        0
mode                            0
speechiness                     0
acousticness                    0
instrumentalness                0
liveness                        0
valence                         0
tempo                           0
duration_ms                     0
track_album_release_year     1681
track_album_release_month    1681
dtype: int64


In [1883]:
# Display all rows with missing values
df[df.isnull().any(axis=1)]

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,playlist_name,playlist_id,playlist_genre,playlist_subgenre,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,track_album_release_year,track_album_release_month
12529,00qOE7OjRl0BpYiCiweZB2,Juke Box Hero,Foreigner,67,2Pw51hAGvWpTA3AYl2WVuu,4 (Expanded),Classic Rock Drive,37i9dQZF1DXdOEFt9ZX0dh,rock,classic rock,...,1,0.0654,0.082800,0.000000,0.0844,0.522,176.647,259800,,
20572,01QhVHvkGqzO861Kl3GhfX,I'll Be All You'll Ever Need,Trinere,27,5BC8h7zzDUifrb8PJJ4UTM,All Night,80's Freestyle/Disco Dance Party (Set Crossfad...,1oReEujyWpQv2OX68BVPPA,latin,latin hip hop,...,0,0.0546,0.007940,0.000290,0.0619,0.770,123.551,324360,,
24300,01gSIlj7mZnuRr7GWajrgT,Try My Love,Shinehead,32,3pcdxm6Bp0nrOFQdYkZnWp,Sidewalk University,90's NEW JACK SWING,0dmInkymNnOTWvEFamSNzb,r&b,new jack swing,...,0,0.1130,0.194000,0.000000,0.0761,0.752,103.307,264893,,
14108,01q4ccXbvPlCwZ1fPiFaeM,Girlfriend in a Coma - 2011 Remaster,The Smiths,56,7jfexk2w5aDI25njkN0UGg,"Strangeways, Here We Come",permanent wave,7rPsfDTqiZYIT4PVzQ4c0c,rock,permanent wave,...,1,0.0255,0.126000,0.000001,0.0995,0.960,104.333,122760,,
6865,02GwPAncasVL82yC7y2hmN,Raise Up,Petey Pablo,54,5HRSHdifYk7QXStjKB0SJZ,Diary of a Sinner: 1st Entry,Southern Hip Hop,4lcyWQDOzPfcbZrcBI3FOW,rap,southern hip hop,...,1,0.1670,0.142000,0.000000,0.3190,0.742,156.069,286427,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1699,7zHRcLHhkdkVDSwR0CKdLB,All the Pretty Girls,fun.,42,6CsgMETT9H9NzcQSMD4Dfq,Aim and Ignite,Pop Punk | Post-Hardcore,1YgWQAs1s77NzieIH4ARKn,pop,post-teen pop,...,1,0.0602,0.258000,0.000000,0.1990,0.773,131.036,202893,,
8332,7zKZwfo5J4KA2vBpQHfQWp,Heart Of A Hustler,Fat Pat,0,2rzVFboUmXKVqSr0WDHmGR,Greatest Hits,3rd Coast Classics,1QJ66s6YBZgxMUaVUyrhbo,rap,southern hip hop,...,1,0.2740,0.026600,0.000000,0.1690,0.684,168.403,326267,,
14082,7zSAI7Ae6DPVepN8MksKVZ,Space Age Love Song,A Flock Of Seagulls,50,48ajNqhmdKrGVwJo0UGMiV,We Are The '80s,permanent wave,7rPsfDTqiZYIT4PVzQ4c0c,rock,permanent wave,...,1,0.0311,0.000036,0.106000,0.2100,0.552,139.959,226693,,
11615,7zSDDsIlks515d0tZGM64x,The Next Time I Fall (with Amy Grant),Peter Cetera,57,1O2sEdKLsSHROEyYgUQmnb,Solitude / Solitaire,80s Pop & Rock Hits and Album Tracks,0XOIK4m26aeYSD61E5nSVW,rock,album rock,...,1,0.0280,0.488000,0.002210,0.0645,0.368,118.288,225973,,


In [1885]:
df['track_popularity'].value_counts()

track_popularity
0      2620
1       546
51      484
57      479
50      474
       ... 
97        3
95        2
100       1
99        1
96        1
Name: count, Length: 101, dtype: int64

In [1887]:
print(df.columns)

Index(['track_id', 'track_name', 'track_artist', 'track_popularity',
       'track_album_id', 'track_album_name', 'playlist_name', 'playlist_id',
       'playlist_genre', 'playlist_subgenre', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms',
       'track_album_release_year', 'track_album_release_month'],
      dtype='object')


In [1889]:
playlist_genre_counts = df['playlist_genre'].value_counts()

# Display the counts
print(playlist_genre_counts)          

playlist_genre
rap      5057
r&b      4918
edm      4877
latin    4633
rock     4451
pop      4420
Name: count, dtype: int64


In [1891]:
playlist_subgenre_counts = df['playlist_subgenre'].value_counts()

# Display the counts
print(playlist_subgenre_counts)         

playlist_subgenre
southern hip hop             1512
neo soul                     1496
progressive electro house    1460
indie poptimism              1457
electro house                1416
latin hip hop                1336
urban contemporary           1323
tropical                     1288
gangster rap                 1285
latin pop                    1230
hard rock                    1211
hip hop                      1190
classic rock                 1142
electropop                   1124
trap                         1070
album rock                   1065
new jack swing               1057
hip pop                      1042
big room                     1034
permanent wave               1033
dance pop                    1000
pop edm                       967
post-teen pop                 839
reggaeton                     779
Name: count, dtype: int64


In [1893]:
df.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,playlist_name,playlist_id,playlist_genre,playlist_subgenre,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,track_album_release_year,track_album_release_month
12644,0017A6SJgTbfQVU2EtsPNo,Pangarap,Barbie's Cradle,41,1srJQ0njEQgd8w4XSqI4JQ,Trip,Pinoy Classic Rock,37i9dQZF1DWYDQ8wBxd7xt,rock,classic rock,...,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440,2001.0,1.0
25675,002xjHwzEx66OWFV2IP9dk,The Others,RIKA,15,1ficfUnZMaY1QkNp15Slzm,The Others,Groovy // Funky // Neo-Soul,0JmBB9HfrzDiZoPVRdv8ns,r&b,neo soul,...,1,0.0347,0.0651,0.0,0.212,0.698,150.863,197286,2018.0,1.0
15061,004s3t0ONYlzxII9PLgU6z,I Feel Alive,Steady Rollin,28,3z04Lb9Dsilqw68SHt6jLB,Love & Loss,Hard Rock Workout,3YouF0u7waJnolytf9JCXf,rock,hard rock,...,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512,2017.0,11.0
2887,008MceT31RotUANsKuzy3L,Liquid Blue,The.madpix.project,24,1Z4ANBVuhTlS6DprlP0m1q,Liquid Blue,Electropop - Pop,5TiiHps0hNCyQ6ijVkNZQs,pop,electropop,...,0,0.054,0.000761,0.132,0.322,0.852,128.041,228565,2015.0,8.0
1029,008rk8F6ZxspZT4bUlkIQG,Fever,YOSA & TAAR,38,2BuYm9UcKvI0ydXs5JKwt0,Fever,Best of 2019 Dance Pop: Japan,37i9dQZF1DXdOtZGKonFlM,pop,dance pop,...,1,0.0499,0.114,0.000697,0.0881,0.496,129.884,236308,2018.0,11.0


In [1895]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28356 entries, 12644 to 12608
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   track_id                   28356 non-null  string  
 1   track_name                 28352 non-null  string  
 2   track_artist               28352 non-null  string  
 3   track_popularity           28356 non-null  int64   
 4   track_album_id             28356 non-null  string  
 5   track_album_name           28352 non-null  string  
 6   playlist_name              28356 non-null  string  
 7   playlist_id                28356 non-null  string  
 8   playlist_genre             28356 non-null  category
 9   playlist_subgenre          28356 non-null  category
 10  danceability               28356 non-null  float64 
 11  energy                     28356 non-null  float64 
 12  key                        28356 non-null  int64   
 13  loudness                   28356

In [1897]:
num_unique_subgenres = df['playlist_subgenre'].nunique()
print(f"Number of unique subgenres: {num_unique_subgenres}")

Number of unique subgenres: 24


In [1899]:
# Get the number of columns in the DataFrame
num_columns = df.shape[1]

# Print the number of columns
print(f"Number of columns in the DataFrame: {num_columns}")

Number of columns in the DataFrame: 24


In [1901]:
# חשב את מספר הפעמים שכל playlist_id מופיע
playlist_counts = df.groupby('playlist_id').size()

# הצג את ה-DataFrame עם כמות השורות לכל playlist_id
print(playlist_counts)


playlist_id
0275i1VNfBnsNbPl0QIBpG    91
03qQtbNHoJuFezRu2CnLuF    12
03sDEv7FN58Mb9CJOs1Tgn    38
06zrBJ5cts5aemZmqe80J7    46
07SNJ4MwYba9wwmzrbjmYi    94
                          ..
7sq6nuruoMyDhEWkX2oYOg    43
7tkgK1tm9hYkWp7EFyOcAr    27
7vJOXFe40axY7qS39vGDyH    19
7xWdFCrU5Gka6qp1ODrSdK    36
7xWuNevFBmwnFEg6wzdCc7    60
Length: 471, dtype: int64


In [1903]:
import pandas as pd
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Function to clean text (this will be used for all relevant columns)
def clean_text(text):
    if pd.isna(text):
        return ''  # If the text is NaN, return an empty string
    
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation (but keep numbers)
    text = ''.join([char for char in text if char not in string.punctuation])
    
    # Remove stop words (using sklearn's ENGLISH_STOP_WORDS or you can define your own list)
    text = ' '.join([word for word in text.split() if word not in ENGLISH_STOP_WORDS])
    
    return text

# Check if the columns exist before applying cleaning
if 'track_name' in df.columns:
    # Clean the 'track_name' column
    df['track_name'] = df['track_name'].apply(clean_text)
else:
    print("'track_name' column is not in the DataFrame")

if 'track_album_name' in df.columns:
    # Clean the 'track_album_name' column
    df['track_album_name'] = df['track_album_name'].apply(clean_text)
else:
    print("'track_album_name' column is not in the DataFrame")

if 'playlist_name' in df.columns:
    # Clean the 'playlist_name' column
    df['playlist_name'] = df['playlist_name'].apply(clean_text)
else:
    print("'playlist_name' column is not in the DataFrame")

if 'track_album_id' in df.columns:
    # Clean the 'track_album_id' column (if needed)
    df['track_album_id'] = df['track_album_id'].apply(clean_text)
else:
    print("'track_album_id' column is not in the DataFrame")

if 'track_artist' in df.columns:
    # Clean the 'track_artist' column
    df['track_artist'] = df['track_artist'].apply(clean_text)
else:
    print("'track_artist' column is not in the DataFrame")

# Step 1: Move cleaned columns from df to df1
df1 = df[['track_name', 'track_album_name', 'playlist_name']].copy()

# Step 2: Remove the cleaned columns from df
df.drop(columns=['track_name', 'track_album_name', 'playlist_name'], inplace=True)

In [1904]:
df.head()

Unnamed: 0,track_id,track_artist,track_popularity,track_album_id,playlist_id,playlist_genre,playlist_subgenre,danceability,energy,key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,track_album_release_year,track_album_release_month
12644,0017A6SJgTbfQVU2EtsPNo,barbies cradle,41,1srjq0njeqgd8w4xsqi4jq,37i9dQZF1DWYDQ8wBxd7xt,rock,classic rock,0.682,0.401,2,...,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440,2001.0,1.0
25675,002xjHwzEx66OWFV2IP9dk,rika,15,1ficfunzmay1qknp15slzm,0JmBB9HfrzDiZoPVRdv8ns,r&b,neo soul,0.582,0.704,5,...,1,0.0347,0.0651,0.0,0.212,0.698,150.863,197286,2018.0,1.0
15061,004s3t0ONYlzxII9PLgU6z,steady rollin,28,3z04lb9dsilqw68sht6jlb,3YouF0u7waJnolytf9JCXf,rock,hard rock,0.303,0.88,9,...,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512,2017.0,11.0
2887,008MceT31RotUANsKuzy3L,themadpixproject,24,1z4anbvuhtls6dprlp0m1q,5TiiHps0hNCyQ6ijVkNZQs,pop,electropop,0.659,0.794,10,...,0,0.054,0.000761,0.132,0.322,0.852,128.041,228565,2015.0,8.0
1029,008rk8F6ZxspZT4bUlkIQG,yosa taar,38,2buym9uckvi0ydxs5jkwt0,37i9dQZF1DXdOtZGKonFlM,pop,dance pop,0.662,0.838,1,...,1,0.0499,0.114,0.000697,0.0881,0.496,129.884,236308,2018.0,11.0


In [1905]:
df1.head()

Unnamed: 0,track_name,track_album_name,playlist_name
12644,pangarap,trip,pinoy classic rock
25675,,,groovy funky neosoul
15061,feel alive,love loss,hard rock workout
2887,liquid blue,liquid blue,electropop pop
1029,fever,fever,best 2019 dance pop japan


In [1906]:
import pandas as pd

# Assuming df is your DataFrame
# Convert 'track_id' and 'playlist_id' to lowercase
if 'track_id' in df.columns:
    df['track_id'] = df['track_id'].str.lower()

if 'playlist_id' in df.columns:
    df['playlist_id'] = df['playlist_id'].str.lower()

# Now, 'track_id' and 'playlist_id' columns are in lowercase


In [1911]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28356 entries, 12644 to 12608
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   track_name        28356 non-null  object
 1   track_album_name  28356 non-null  object
 2   playlist_name     28356 non-null  object
dtypes: object(3)
memory usage: 886.1+ KB


In [1913]:
df.head()

Unnamed: 0,track_id,track_artist,track_popularity,track_album_id,playlist_id,playlist_genre,playlist_subgenre,danceability,energy,key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,track_album_release_year,track_album_release_month
12644,0017a6sjgtbfqvu2etspno,barbies cradle,41,1srjq0njeqgd8w4xsqi4jq,37i9dqzf1dwydq8wbxd7xt,rock,classic rock,0.682,0.401,2,...,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440,2001.0,1.0
25675,002xjhwzex66owfv2ip9dk,rika,15,1ficfunzmay1qknp15slzm,0jmbb9hfrzdizopvrdv8ns,r&b,neo soul,0.582,0.704,5,...,1,0.0347,0.0651,0.0,0.212,0.698,150.863,197286,2018.0,1.0
15061,004s3t0onylzxii9plgu6z,steady rollin,28,3z04lb9dsilqw68sht6jlb,3youf0u7wajnolytf9jcxf,rock,hard rock,0.303,0.88,9,...,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512,2017.0,11.0
2887,008mcet31rotuanskuzy3l,themadpixproject,24,1z4anbvuhtls6dprlp0m1q,5tiihps0hncyq6ijvknzqs,pop,electropop,0.659,0.794,10,...,0,0.054,0.000761,0.132,0.322,0.852,128.041,228565,2015.0,8.0
1029,008rk8f6zxspzt4bulkiqg,yosa taar,38,2buym9uckvi0ydxs5jkwt0,37i9dqzf1dxdotzgkonflm,pop,dance pop,0.662,0.838,1,...,1,0.0499,0.114,0.000697,0.0881,0.496,129.884,236308,2018.0,11.0


In [1915]:
df1['track_name'] = df1['track_name'].astype('string')
df1['track_album_name'] = df1['track_album_name'].astype('string')
df1['playlist_name'] = df1['playlist_name'].astype('string')
df['track_album_id'] = df['track_album_id'].astype('string')
df['track_artist'] = df['track_artist'].astype('string')

In [1917]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28356 entries, 12644 to 12608
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   track_name        28356 non-null  string
 1   track_album_name  28356 non-null  string
 2   playlist_name     28356 non-null  string
dtypes: string(3)
memory usage: 886.1 KB


In [1919]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28356 entries, 12644 to 12608
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   track_id                   28356 non-null  string  
 1   track_artist               28356 non-null  string  
 2   track_popularity           28356 non-null  int64   
 3   track_album_id             28356 non-null  string  
 4   playlist_id                28356 non-null  string  
 5   playlist_genre             28356 non-null  category
 6   playlist_subgenre          28356 non-null  category
 7   danceability               28356 non-null  float64 
 8   energy                     28356 non-null  float64 
 9   key                        28356 non-null  int64   
 10  loudness                   28356 non-null  float64 
 11  mode                       28356 non-null  int64   
 12  speechiness                28356 non-null  float64 
 13  acousticness               28356

In [1921]:
# Step 1: Move the 'playlist_subgenre' column from df to df1
df1['playlist_subgenre'] = df['playlist_subgenre']

# Step 2: Remove the 'playlist_subgenre' column from df
df = df.drop(columns=['playlist_subgenre'])

In [1923]:
df


Unnamed: 0,track_id,track_artist,track_popularity,track_album_id,playlist_id,playlist_genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,track_album_release_year,track_album_release_month
12644,0017a6sjgtbfqvu2etspno,barbies cradle,41,1srjq0njeqgd8w4xsqi4jq,37i9dqzf1dwydq8wbxd7xt,rock,0.682,0.401,2,-10.068,1,0.0236,0.279000,0.011700,0.0887,0.566,97.091,235440,2001.0,1.0
25675,002xjhwzex66owfv2ip9dk,rika,15,1ficfunzmay1qknp15slzm,0jmbb9hfrzdizopvrdv8ns,r&b,0.582,0.704,5,-6.242,1,0.0347,0.065100,0.000000,0.2120,0.698,150.863,197286,2018.0,1.0
15061,004s3t0onylzxii9plgu6z,steady rollin,28,3z04lb9dsilqw68sht6jlb,3youf0u7wajnolytf9jcxf,rock,0.303,0.880,9,-4.739,1,0.0442,0.011700,0.009940,0.3470,0.404,135.225,373512,2017.0,11.0
2887,008mcet31rotuanskuzy3l,themadpixproject,24,1z4anbvuhtls6dprlp0m1q,5tiihps0hncyq6ijvknzqs,pop,0.659,0.794,10,-5.644,0,0.0540,0.000761,0.132000,0.3220,0.852,128.041,228565,2015.0,8.0
1029,008rk8f6zxspzt4bulkiqg,yosa taar,38,2buym9uckvi0ydxs5jkwt0,37i9dqzf1dxdotzgkonflm,pop,0.662,0.838,1,-6.300,1,0.0499,0.114000,0.000697,0.0881,0.496,129.884,236308,2018.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22211,7zxrmhxxjmqceddg0rkavo,nav,72,4jr29cnw14zc4z5vflxkoj,6haci9bqaiuszedfcemwyo,r&b,0.744,0.715,0,-6.103,1,0.3510,0.101000,0.000000,0.0919,0.340,154.962,179773,2017.0,2.0
24247,7zylobyw4qukqdyzob4j0y,father mc,36,14hymxfhpgdir9cci1u0kt,4sji14lrb5bgcr51lpalyh,r&b,0.832,0.666,1,-4.920,0,0.0633,0.143000,0.000000,0.0720,0.810,109.536,223890,2010.0,10.0
3793,7zycspvjdcqh6yt1fel2ky,nicki minaj,49,5qs8t6zhsrnllnouuk6muc,1keczikzh8igawt2biapxz,pop,0.963,0.603,2,-6.224,1,0.1800,0.067300,0.000006,0.2140,0.647,129.990,260240,2014.0,12.0
26767,7zye9v6b785efwefys13c2,ponderosa twins plus,40,1xdglmtfmsyjyi5djoox7t,5emarioe9z9ekoewiac2jw,r&b,0.458,0.540,5,-6.457,0,0.0270,0.715000,0.000428,0.1150,0.657,142.218,191205,2013.0,7.0


In [1925]:
import pandas as pd

# Step 1: Count the occurrences of each genre per track_id
genre_counts = df.groupby(['track_id', 'playlist_genre']).size().reset_index(name='genre_count')

# Step 2: Identify the most popular playlist_genre for each track_id
most_popular_genre = genre_counts.loc[genre_counts.groupby('track_id')['genre_count'].idxmax()]

# Step 3: Merge the most popular genre information back into the original DataFrame
df_with_popular_genre = df.merge(most_popular_genre[['track_id', 'playlist_genre']], on=['track_id', 'playlist_genre'], how='left')

# Step 4: Remove duplicates, keeping only the row with the most popular genre for each track_id
df_unique_tracks = df_with_popular_genre.drop_duplicates(subset='track_id', keep='first')

# Step 5: Optionally, display the result
print(df_unique_tracks.head())


                 track_id      track_artist  track_popularity  \
0  0017a6sjgtbfqvu2etspno    barbies cradle                41   
1  002xjhwzex66owfv2ip9dk              rika                15   
2  004s3t0onylzxii9plgu6z     steady rollin                28   
3  008mcet31rotuanskuzy3l  themadpixproject                24   
4  008rk8f6zxspzt4bulkiqg         yosa taar                38   

           track_album_id             playlist_id playlist_genre  \
0  1srjq0njeqgd8w4xsqi4jq  37i9dqzf1dwydq8wbxd7xt           rock   
1  1ficfunzmay1qknp15slzm  0jmbb9hfrzdizopvrdv8ns            r&b   
2  3z04lb9dsilqw68sht6jlb  3youf0u7wajnolytf9jcxf           rock   
3  1z4anbvuhtls6dprlp0m1q  5tiihps0hncyq6ijvknzqs            pop   
4  2buym9uckvi0ydxs5jkwt0  37i9dqzf1dxdotzgkonflm            pop   

   danceability  energy  key  loudness  mode  speechiness  acousticness  \
0         0.682   0.401    2   -10.068     1       0.0236      0.279000   
1         0.582   0.704    5    -6.242     1      

In [1926]:
df.head()

Unnamed: 0,track_id,track_artist,track_popularity,track_album_id,playlist_id,playlist_genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,track_album_release_year,track_album_release_month
12644,0017a6sjgtbfqvu2etspno,barbies cradle,41,1srjq0njeqgd8w4xsqi4jq,37i9dqzf1dwydq8wbxd7xt,rock,0.682,0.401,2,-10.068,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440,2001.0,1.0
25675,002xjhwzex66owfv2ip9dk,rika,15,1ficfunzmay1qknp15slzm,0jmbb9hfrzdizopvrdv8ns,r&b,0.582,0.704,5,-6.242,1,0.0347,0.0651,0.0,0.212,0.698,150.863,197286,2018.0,1.0
15061,004s3t0onylzxii9plgu6z,steady rollin,28,3z04lb9dsilqw68sht6jlb,3youf0u7wajnolytf9jcxf,rock,0.303,0.88,9,-4.739,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512,2017.0,11.0
2887,008mcet31rotuanskuzy3l,themadpixproject,24,1z4anbvuhtls6dprlp0m1q,5tiihps0hncyq6ijvknzqs,pop,0.659,0.794,10,-5.644,0,0.054,0.000761,0.132,0.322,0.852,128.041,228565,2015.0,8.0
1029,008rk8f6zxspzt4bulkiqg,yosa taar,38,2buym9uckvi0ydxs5jkwt0,37i9dqzf1dxdotzgkonflm,pop,0.662,0.838,1,-6.3,1,0.0499,0.114,0.000697,0.0881,0.496,129.884,236308,2018.0,11.0


In [1929]:
df.head()

Unnamed: 0,track_id,track_artist,track_popularity,track_album_id,playlist_id,playlist_genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,track_album_release_year,track_album_release_month
12644,0017a6sjgtbfqvu2etspno,barbies cradle,41,1srjq0njeqgd8w4xsqi4jq,37i9dqzf1dwydq8wbxd7xt,rock,0.682,0.401,2,-10.068,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440,2001.0,1.0
25675,002xjhwzex66owfv2ip9dk,rika,15,1ficfunzmay1qknp15slzm,0jmbb9hfrzdizopvrdv8ns,r&b,0.582,0.704,5,-6.242,1,0.0347,0.0651,0.0,0.212,0.698,150.863,197286,2018.0,1.0
15061,004s3t0onylzxii9plgu6z,steady rollin,28,3z04lb9dsilqw68sht6jlb,3youf0u7wajnolytf9jcxf,rock,0.303,0.88,9,-4.739,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512,2017.0,11.0
2887,008mcet31rotuanskuzy3l,themadpixproject,24,1z4anbvuhtls6dprlp0m1q,5tiihps0hncyq6ijvknzqs,pop,0.659,0.794,10,-5.644,0,0.054,0.000761,0.132,0.322,0.852,128.041,228565,2015.0,8.0
1029,008rk8f6zxspzt4bulkiqg,yosa taar,38,2buym9uckvi0ydxs5jkwt0,37i9dqzf1dxdotzgkonflm,pop,0.662,0.838,1,-6.3,1,0.0499,0.114,0.000697,0.0881,0.496,129.884,236308,2018.0,11.0


In [1931]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28356 entries, 12644 to 12608
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   track_id                   28356 non-null  string  
 1   track_artist               28356 non-null  string  
 2   track_popularity           28356 non-null  int64   
 3   track_album_id             28356 non-null  string  
 4   playlist_id                28356 non-null  string  
 5   playlist_genre             28356 non-null  category
 6   danceability               28356 non-null  float64 
 7   energy                     28356 non-null  float64 
 8   key                        28356 non-null  int64   
 9   loudness                   28356 non-null  float64 
 10  mode                       28356 non-null  int64   
 11  speechiness                28356 non-null  float64 
 12  acousticness               28356 non-null  float64 
 13  instrumentalness           28356

In [1933]:
df1

Unnamed: 0,track_name,track_album_name,playlist_name,playlist_subgenre
12644,pangarap,trip,pinoy classic rock,classic rock
25675,,,groovy funky neosoul,neo soul
15061,feel alive,love loss,hard rock workout,hard rock
2887,liquid blue,liquid blue,electropop pop,electropop
1029,fever,fever,best 2019 dance pop japan,dance pop
...,...,...,...,...
22211,way,nav,project contemporary,urban contemporary
24247,ill 4 u rerecorded remastered,ill 4 u rerecorded remastered,new jack swing rb hits 1987 2002,new jack swing
3793,anaconda,pinkprint deluxe edition,10er playlist,electropop
26767,bound,221 digitally remastered,sexy soul 2020,neo soul


In [1935]:
import pickle
#Write a pickle file
with open('popularity.pkl', 'wb') as f:
    pickle.dump(df, f)

In [1937]:
# Check missing values in a specific column
print(df.isnull().sum())


track_id                        0
track_artist                    0
track_popularity                0
track_album_id                  0
playlist_id                     0
playlist_genre                  0
danceability                    0
energy                          0
key                             0
loudness                        0
mode                            0
speechiness                     0
acousticness                    0
instrumentalness                0
liveness                        0
valence                         0
tempo                           0
duration_ms                     0
track_album_release_year     1681
track_album_release_month    1681
dtype: int64


In [1939]:
df.playlist_genre.unique()



['rock', 'r&b', 'pop', 'latin', 'edm', 'rap']
Categories (6, object): ['edm' < 'rap' < 'pop' < 'r&b' < 'latin' < 'rock']

In [1941]:
# Save df and df1 to CSV files with proper file extensions
df.to_csv('spotify_df_after_data_prep.csv', index=False)  # Adding .csv extension
df1.to_csv('spotify_df1_after_data_prep.csv', index=False)  # Adding .csv extension

# Save df and df1 to Pickle files
df.to_pickle('spotify_df_after_data_prep.pkl')  # Saving as .pkl
df1.to_pickle('spotify_df1_after_data_prep.pkl')  # Saving as .pkl

# Print confirmation
print("Spotify dataset saved as CSV and Pickle files.")


Spotify dataset saved as CSV and Pickle files.


In [1942]:
df1.head()

Unnamed: 0,track_name,track_album_name,playlist_name,playlist_subgenre
12644,pangarap,trip,pinoy classic rock,classic rock
25675,,,groovy funky neosoul,neo soul
15061,feel alive,love loss,hard rock workout,hard rock
2887,liquid blue,liquid blue,electropop pop,electropop
1029,fever,fever,best 2019 dance pop japan,dance pop


In [1945]:
df.head()

Unnamed: 0,track_id,track_artist,track_popularity,track_album_id,playlist_id,playlist_genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,track_album_release_year,track_album_release_month
12644,0017a6sjgtbfqvu2etspno,barbies cradle,41,1srjq0njeqgd8w4xsqi4jq,37i9dqzf1dwydq8wbxd7xt,rock,0.682,0.401,2,-10.068,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440,2001.0,1.0
25675,002xjhwzex66owfv2ip9dk,rika,15,1ficfunzmay1qknp15slzm,0jmbb9hfrzdizopvrdv8ns,r&b,0.582,0.704,5,-6.242,1,0.0347,0.0651,0.0,0.212,0.698,150.863,197286,2018.0,1.0
15061,004s3t0onylzxii9plgu6z,steady rollin,28,3z04lb9dsilqw68sht6jlb,3youf0u7wajnolytf9jcxf,rock,0.303,0.88,9,-4.739,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512,2017.0,11.0
2887,008mcet31rotuanskuzy3l,themadpixproject,24,1z4anbvuhtls6dprlp0m1q,5tiihps0hncyq6ijvknzqs,pop,0.659,0.794,10,-5.644,0,0.054,0.000761,0.132,0.322,0.852,128.041,228565,2015.0,8.0
1029,008rk8f6zxspzt4bulkiqg,yosa taar,38,2buym9uckvi0ydxs5jkwt0,37i9dqzf1dxdotzgkonflm,pop,0.662,0.838,1,-6.3,1,0.0499,0.114,0.000697,0.0881,0.496,129.884,236308,2018.0,11.0


In [1947]:
df.shape

(28356, 20)