In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# path = '/content/drive/MyDrive/Data Science/DataScienceGroup/1_Project_df_files/Coding_file/df_merge.csv'
path = 'spotify_final_dataset.csv'

In [4]:
df = pd.read_csv(path)

In [5]:
df.shape

(100000, 15)

In [6]:
df.isnull().sum()

track_pos                    0
track_artist_name            0
track_track_name             0
track_duration_ms            0
track_album_name             0
playlist_name                0
playlist_num_artists         0
playlist_num_albums          0
playlist_num_tracks          0
playlist_num_followers       0
playlist_num_edits           0
playlist_duration_ms         0
playlist_collaborative       0
bag_of_words              7757
sentiment_bag_of_words       0
dtype: int64

In [7]:
df.dropna(inplace=True)

In [8]:
# df = df_merge.copy()

In [9]:
df.head()

Unnamed: 0,track_pos,track_artist_name,track_track_name,track_duration_ms,track_album_name,playlist_name,playlist_num_artists,playlist_num_albums,playlist_num_tracks,playlist_num_followers,playlist_num_edits,playlist_duration_ms,playlist_collaborative,bag_of_words,sentiment_bag_of_words
0,0,The Jackson 5,ABC,174866,ABC,party party,116,142,152,1,3,39413578,False,jackson c easy love b baby michael sing come s...,0.7964
1,1,Streetlight Manifesto,Point/Counterpoint,327920,Everything Goes Numb,party party,116,142,152,1,3,39413578,False,know dont never would ill ive like wont cant im,0.1316
2,2,Michael Jackson,Billie Jean,293826,Thriller 25 Super Deluxe Edition,party party,116,142,152,1,3,39413578,False,jean one billie lover uh son baby kid hoo girl,0.5859
3,3,Green Day,Basket Case,181533,Dookie,party party,116,142,152,1,3,39413578,False,sometimes chorus give creeps mind plays tricks...,0.128
4,4,The White Stripes,Seven Nation Army,231800,Elephant,party party,116,142,152,1,3,39413578,False,im na gon back comin prechorus instrumental bl...,0.0


In [10]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load your dataset
# df = pd.read_csv("your_data.csv")  # Uncomment if loading from a CSV file

# Sample Data Preparation (Replace with your actual data)
# Assume `df` has columns like 'track_artist_name', 'track_name', 'playlist_name', and so on.

# Step 1: Apply TF-IDF to `bag_of_words` column to convert lyrics or descriptions to numerical features
tfidf = TfidfVectorizer(max_features=100)  # Limit to top 100 features for memory efficiency
tfidf_matrix = tfidf.fit_transform(df['bag_of_words'])

# Convert TF-IDF result to a DataFrame and concatenate with original `df`
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())
df = pd.concat([df.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Step 2: Select and scale the features for KNN
features = [
    'track_pos', 'track_duration_ms', 'playlist_num_tracks',
    'playlist_num_followers', 'playlist_duration_ms', 'sentiment_bag_of_words'
] + tfidf_df.columns.tolist()  # Include TF-IDF features in `features`

scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

# Step 3: Train the KNN model on the feature space
knn = NearestNeighbors(n_neighbors=5, algorithm='auto')  # Using more neighbors initially
knn.fit(df[features])

# Step 4: Define a function to recommend tracks without repetitions
def recommend_tracks(seed_tracks, num_recommendations=5):
    # Ensure there are no duplicates in the seed tracks
    seed_tracks = list(set(seed_tracks))

    # Find the rows corresponding to the seed tracks
    seed_data = df[df['track_track_name'].isin(seed_tracks)]
    if seed_data.empty:
        raise ValueError("Seed tracks not found in the dataset.")

    # Calculate the mean vector for the seed tracks (average over seed tracks' features)
    seed_vector = seed_data[features].mean(axis=0).values.reshape(1, -1)

    # Find the nearest neighbors to the seed vector
    distances, indices = knn.kneighbors(seed_vector, n_neighbors=30)

    # Extract recommended tracks, avoiding repeats in the seed playlist
    recommendations = []
    for idx in indices[0]:
        track_name = df.iloc[idx]['track_track_name']
        if track_name not in seed_tracks and track_name not in recommendations:
            recommendations.append(track_name)
        if len(recommendations) == num_recommendations:
            break

    # Return the recommended track names and other details
    recommended_tracks = df[df['track_track_name'].isin(recommendations)]
    return recommended_tracks[['track_artist_name', 'track_track_name']]

# Example usage
seed_tracks = ["ABC", "Basket Case"]  # Replace with actual seed tracks from the playlist

# Get 5 unique recommendations
recommendations = recommend_tracks(seed_tracks, num_recommendations=5)
print("Recommended Tracks:")
print(recommendations.drop_duplicates().head(5))


Recommended Tracks:
      track_artist_name                            track_track_name
5685           Wage War                                 Johnny Cash
8474            Son Lux                                        Easy
11415     Faith No More                                        Easy
13919               Cro                                        Easy
15383       The Beatles  Lucy In The Sky With Diamonds - Remastered


In [11]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

# Assuming `df` is your DataFrame

# Encode categorical features
label_encoder = LabelEncoder()
df['track_artist_encoded'] = label_encoder.fit_transform(df['track_artist_name'])
df['track_album_encoded'] = label_encoder.fit_transform(df['track_album_name'])
df['playlist_name_encoded'] = label_encoder.fit_transform(df['playlist_name'])

# Vectorize `bag_of_words`
tfidf = TfidfVectorizer(max_features=100)
bag_of_words_matrix = tfidf.fit_transform(df['bag_of_words']).toarray()
for i in range(bag_of_words_matrix.shape[1]):
    df[f'bow_{i}'] = bag_of_words_matrix[:, i]

# Prepare features
features = ['track_duration_ms', 'playlist_num_tracks', 'playlist_num_artists',
            'playlist_num_albums', 'playlist_num_followers', 'playlist_duration_ms',
            'sentiment_bag_of_words', 'track_artist_encoded', 'track_album_encoded',
            'playlist_name_encoded'] + [f'bow_{i}' for i in range(bag_of_words_matrix.shape[1])]
X = df[features]

# Fit Nearest Neighbors model
nbrs = NearestNeighbors(n_neighbors=10, algorithm='auto')
nbrs.fit(X)




In [12]:
# Select initial tracks and find neighbors
initial_indices = [0, 1, 2]  # replace with actual indices of initial set in playlist
initial_tracks = X.iloc[initial_indices]

# Get neighbors
distances, indices = nbrs.kneighbors(initial_tracks)

# Aggregate and filter top recommendations
recommended_indices = np.unique(indices.flatten())
recommended_tracks = df.iloc[recommended_indices]

# Remove duplicates and already-seen tracks
recommended_tracks = recommended_tracks[~recommended_tracks['track_track_name'].duplicated()]
recommended_tracks = recommended_tracks[~recommended_tracks.index.isin(initial_indices)]

# Get the top 5 unique recommendations
next_tracks = recommended_tracks.head(5)
print(next_tracks[['track_artist_name', 'track_track_name']])

      track_artist_name                 track_track_name
466           Sugarland                             Stay
5523       Sweet Valley                Sentimental Trash
35241            T-Pain                        Bartender
35250            T-Pain  Buy U a Drank (Shawty Snappin')
40687   Michael Jackson      P.Y.T. (Pretty Young Thing)


In [13]:
# Select initial tracks and find neighbors
initial_indices = [0, 1, 2, 24]  # replace with actual indices of initial set in playlist
initial_tracks = X.iloc[initial_indices]

# Get neighbors
distances, indices = nbrs.kneighbors(initial_tracks)

# Aggregate and filter top recommendations
recommended_indices = np.unique(indices.flatten())
recommended_tracks = df.iloc[recommended_indices]

# Remove duplicates and already-seen tracks
recommended_tracks = recommended_tracks[~recommended_tracks['track_track_name'].duplicated()]
recommended_tracks = recommended_tracks[~recommended_tracks.index.isin(initial_indices)]

# Get the top 5 unique recommendations
next_tracks = recommended_tracks.head(5)
print(next_tracks[['track_artist_name', 'track_track_name']])

      track_artist_name   track_track_name
466           Sugarland               Stay
5523       Sweet Valley  Sentimental Trash
8621          Grouplove        Tongue Tied
12827         Grinspoon     Chemical Heart
35241            T-Pain          Bartender


In [14]:
# Select initial tracks and find neighbors
initial_indices = [24,57,300,305,333,350]  # replace with actual indices of initial set in playlist
initial_tracks = X.iloc[initial_indices]

# Get neighbors
distances, indices = nbrs.kneighbors(initial_tracks)

# Aggregate and filter top recommendations
recommended_indices = np.unique(indices.flatten())
recommended_tracks = df.iloc[recommended_indices]

# Remove duplicates and already-seen tracks
recommended_tracks = recommended_tracks[~recommended_tracks['track_track_name'].duplicated()]
recommended_tracks = recommended_tracks[~recommended_tracks.index.isin(initial_indices)]

# Get the top 5 unique recommendations
next_tracks = recommended_tracks.head(5)
print(next_tracks[['track_artist_name', 'track_track_name']])

    track_artist_name                                   track_track_name
56             Boston                               Foreplay / Long Time
211        Lil Yachty                                        Fucked Over
276            Father                  Everybody in the Club Gettin Shot
303          Fat Nick  Bicky Robby (feat. Mikey the Magician & Shakew...
314          Fat Nick               Anthems for a Seventeen Year Old Boy


In [15]:
# Select initial tracks and find neighbors
initial_indices = [150,159]  # replace with actual indices of initial set in playlist
initial_tracks = X.iloc[initial_indices]

# Get neighbors
distances, indices = nbrs.kneighbors(initial_tracks)

# Aggregate and filter top recommendations
recommended_indices = np.unique(indices.flatten())
recommended_tracks = df.iloc[recommended_indices]

# Remove duplicates and already-seen tracks
recommended_tracks = recommended_tracks[~recommended_tracks['track_track_name'].duplicated()]
recommended_tracks = recommended_tracks[~recommended_tracks.index.isin(initial_indices)]

# Get the top 5 unique recommendations
next_tracks = recommended_tracks.head(5)
print(next_tracks[['track_artist_name', 'track_track_name']])

      track_artist_name                    track_track_name
661         Wiz Khalifa                  King Of Everything
1827         Luis Fonsi  Despacito (Featuring Daddy Yankee)
6220         Witt Lowry                     Kindest Regards
17146         Zella Day                            Hypnotic
21347       Wiz Khalifa           Lit (feat. Ty Dolla $ign)


In [16]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

# Assuming `df` is your DataFrame

# Encode categorical features
label_encoder = LabelEncoder()
df['track_artist_encoded'] = label_encoder.fit_transform(df['track_artist_name'])
df['track_album_encoded'] = label_encoder.fit_transform(df['track_album_name'])
df['playlist_name_encoded'] = label_encoder.fit_transform(df['playlist_name'])

# Vectorize `bag_of_words`
tfidf = TfidfVectorizer(max_features=100)
bag_of_words_matrix = tfidf.fit_transform(df['bag_of_words']).toarray()
for i in range(bag_of_words_matrix.shape[1]):
    df[f'bow_{i}'] = bag_of_words_matrix[:, i]

# Prepare features
features = ['track_duration_ms', 'playlist_num_tracks', 'playlist_num_artists',
            'playlist_num_albums', 'playlist_num_followers', 'playlist_duration_ms',
            'sentiment_bag_of_words', 'track_artist_encoded', 'track_album_encoded',
            'playlist_name_encoded'] + [f'bow_{i}' for i in range(bag_of_words_matrix.shape[1])]
X = df[features]

# Fit Nearest Neighbors model
nbrs = NearestNeighbors(n_neighbors=10, algorithm='auto', metric='cosine')
nbrs.fit(X)

# Select initial tracks and find neighbors
initial_indices = [0, 1, 2]  # replace with actual indices of initial set in playlist
initial_tracks = X.iloc[initial_indices]

# Get neighbors
distances, indices = nbrs.kneighbors(initial_tracks)

# Aggregate and filter top recommendations
recommended_indices = np.unique(indices.flatten())
recommended_tracks = df.iloc[recommended_indices]

# Remove duplicates and already-seen tracks
recommended_tracks = recommended_tracks[~recommended_tracks['track_track_name'].duplicated()]
recommended_tracks = recommended_tracks[~recommended_tracks.index.isin(initial_indices)]

# Get the top 5 unique recommendations
next_tracks = recommended_tracks.head(5)
print(next_tracks[['track_artist_name', 'track_track_name']])


         track_artist_name                                   track_track_name
22000  Panic! At The Disco  There's A Good Reason These Tables Are Numbere...
22009  Panic! At The Disco                 I Constantly Thank God For Esteban
22118  Panic! At The Disco                                      Time To Dance
28862                Migos                Bad and Boujee (feat. Lil Uzi Vert)
33582  Panic! At The Disco                         Build God, Then We'll Talk


# KNN Euclidean

In [17]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

# Assuming `df` is your DataFrame

# Encode categorical features
label_encoder = LabelEncoder()
df['track_artist_encoded'] = label_encoder.fit_transform(df['track_artist_name'])
df['track_album_encoded'] = label_encoder.fit_transform(df['track_album_name'])
df['playlist_name_encoded'] = label_encoder.fit_transform(df['playlist_name'])

# Vectorize `bag_of_words`
tfidf = TfidfVectorizer(max_features=100)
bag_of_words_matrix = tfidf.fit_transform(df['bag_of_words']).toarray()
for i in range(bag_of_words_matrix.shape[1]):
    df[f'bow_{i}'] = bag_of_words_matrix[:, i]

# Prepare features
features = ['track_duration_ms', 'playlist_num_tracks', 'playlist_num_artists',
            'playlist_num_albums', 'playlist_num_followers', 'playlist_duration_ms',
            'sentiment_bag_of_words', 'track_artist_encoded', 'track_album_encoded',
            'playlist_name_encoded'] + [f'bow_{i}' for i in range(bag_of_words_matrix.shape[1])]
X = df[features]

# Fit Nearest Neighbors model
nbrs = NearestNeighbors(n_neighbors=10, algorithm='auto', metric='euclidean')
nbrs.fit(X)



In [18]:
initial_indices = [200,206,190]  # replace with actual indices of initial set in playlist
initial_tracks = X.iloc[initial_indices]

# Get neighbors
distances, indices = nbrs.kneighbors(initial_tracks)

# Aggregate and filter top recommendations
recommended_indices = np.unique(indices.flatten())
recommended_tracks = df.iloc[recommended_indices]

# Remove duplicates and already-seen tracks
recommended_tracks = recommended_tracks[~recommended_tracks['track_track_name'].duplicated()]
recommended_tracks = recommended_tracks[~recommended_tracks.index.isin(initial_indices)]

# Get the top 5 unique recommendations
next_tracks = recommended_tracks.head(5)
print(next_tracks[['track_artist_name', 'track_track_name','playlist_name']])


    track_artist_name  track_track_name playlist_name
156    Bankroll Mafia            Hyenas           Rap
225       Xavier Wulf    Wulf Takahashi           Rap
233       Xavier Wulf  Akina Speed Star           Rap
234       Xavier Wulf  1st Summer Night           Rap
235       Xavier Wulf     Wulf of Akina           Rap


In [19]:
df.iloc[[200,206,190]].sentiment_bag_of_words.mean()

-1.6049679972737276

In [20]:
initial_indices = [1,2,200,206,190]  # replace with actual indices of initial set in playlist
initial_tracks = X.iloc[initial_indices]

# Get neighbors
distances, indices = nbrs.kneighbors(initial_tracks)

# Aggregate and filter top recommendations
recommended_indices = np.unique(indices.flatten())
recommended_tracks = df.iloc[recommended_indices]

# Remove duplicates and already-seen tracks
recommended_tracks = recommended_tracks[~recommended_tracks['track_track_name'].duplicated()]
recommended_tracks = recommended_tracks[~recommended_tracks.index.isin(initial_indices)]

# Get the top 5 unique recommendations
next_tracks = recommended_tracks.head(5)
print(next_tracks[['track_artist_name', 'track_track_name','playlist_name']])


    track_artist_name  track_track_name playlist_name
156    Bankroll Mafia            Hyenas           Rap
225       Xavier Wulf    Wulf Takahashi           Rap
233       Xavier Wulf  Akina Speed Star           Rap
234       Xavier Wulf  1st Summer Night           Rap
235       Xavier Wulf     Wulf of Akina           Rap


In [21]:
df[df.playlist_name == 'Piano Music']

Unnamed: 0,track_pos,track_artist_name,track_track_name,track_duration_ms,track_album_name,playlist_name,playlist_num_artists,playlist_num_albums,playlist_num_tracks,playlist_num_followers,...,bow_90,bow_91,bow_92,bow_93,bow_94,bow_95,bow_96,bow_97,bow_98,bow_99
63946,-1.104281,Michael Silverman,"Pavane In C Minor, Op. 50",-1.881811,"50 Best Loved Classical, Patriotic, And Christ...",Piano Music,12,21,-1.272971,-0.032845,...,0.0,0.0,0.0,0.0,0.424687,0.0,0.0,0.0,0.0,0.0
63947,-1.083129,Jeffrey Michael,All I Ask Of You,0.15312,Cinematic Fantasy,Piano Music,12,21,-1.272971,-0.032845,...,0.0,0.0,0.0,0.0,0.0,0.0,0.458421,0.0,0.0,0.0
63948,-0.998522,The O'Neill Brothers,The Prayer,-0.05052,The Journey,Piano Music,12,21,-1.272971,-0.032845,...,0.0,0.0,0.0,0.0,0.0,0.524071,0.0,0.0,0.0,0.0
63949,-0.97737,The O'Neill Brothers,I Will Be Here,0.590719,A Day To Remember - Instrumental Music for You...,Piano Music,12,21,-1.272971,-0.032845,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63950,-0.956218,The O'Neill Brothers,To Make You Feel My Love,-0.388269,Wedding Songs: Top 15 Wedding Ceremony Songs,Piano Music,12,21,-1.272971,-0.032845,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63951,-0.913915,David Tolk,In Reverence,0.221589,In Reverence,Piano Music,12,21,-1.272971,-0.032845,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63952,-0.85046,David Nevue,The Night Season,0.2835,Whisperings - The Best of David Nevue,Piano Music,12,21,-1.272971,-0.032845,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63953,-0.829308,David Nevue,Home,-0.207735,The Vigil,Piano Music,12,21,-1.272971,-0.032845,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63954,-0.808156,David Nevue,While the Trees Sleep,1.805185,While the Trees Sleep,Piano Music,12,21,-1.272971,-0.032845,...,0.0,0.0,0.0,0.0,0.670483,0.0,0.0,0.0,0.0,0.0
63955,-0.787004,Jeffrey Michael,Titanic,2.110654,Cinematic Fantasy,Piano Music,12,21,-1.272971,-0.032845,...,0.0,0.0,0.0,0.572412,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
initial_indices = [58709]  # replace with actual indices of initial set in playlist
initial_tracks = X.iloc[initial_indices]

# Get neighbors
distances, indices = nbrs.kneighbors(initial_tracks)

# Aggregate and filter top recommendations
recommended_indices = np.unique(indices.flatten())
recommended_tracks = df.iloc[recommended_indices]

# Remove duplicates and already-seen tracks
recommended_tracks = recommended_tracks[~recommended_tracks['track_track_name'].duplicated()]
recommended_tracks = recommended_tracks[~recommended_tracks.index.isin(initial_indices)]

# Get the top 5 unique recommendations
next_tracks = recommended_tracks.head(5)
print(next_tracks[['track_artist_name', 'track_track_name','playlist_name']])


      track_artist_name               track_track_name playlist_name
3914      Missy Elliott                        Work It          GALa
15459             Melvv                           Vibe  Experimental
24536       Marvin Gaye  Ain't No Mountain High Enough           Hap
50595        Matt Costa                    Mr. Pitiful   Alternative
58727            Metrik                     Starchaser           DnB


Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.7 kB)
Downloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.21.0-cp39-abi3-macosx_11_0_arm64.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
Successfully installed tokenizers-0.21.0 transformers-4.47.0
