In [2]:
import os

import pandas as pd
from pymongo import MongoClient

In [4]:
# The MongoDB connection string (which embeds a username and password) is read
# from the environment so that credentials never live in the notebook or its
# version history.
# NOTE(review): a password was previously committed in this cell; it should be
# rotated on the cluster.
connection_string = os.environ.get("MONGODB_URI")
if not connection_string:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "string before running this cell."
    )

client = MongoClient(connection_string)
db = client['MustreamDatabase']
collection = db['tracks']

# `find()` with no filter returns a cursor over every document in the collection.
data = collection.find()

# Materialise the cursor into a dataframe (one row per document).
spotify_songs_df = pd.DataFrame(data)

In [33]:
# Show the dtype pandas inferred for each column of the dataframe built from the
# MongoDB `tracks` collection.
spotify_songs_df.dtypes

_id                          object
track_id                     object
track_name                   object
track_artist                 object
track_popularity              int64
track_album_id               object
track_album_name             object
track_album_release_date     object
playlist_name                object
playlist_id                  object
playlist_genre               object
playlist_subgenre            object
danceability                float64
energy                      float64
key                           int64
loudness                    float64
mode                          int64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
duration_ms                   int64
dtype: object

In [34]:
import torch
from collections import defaultdict

In [35]:
# Integer lookup tables used to address the embedding layers.
def _index_by_unique_position(column):
    """Map each distinct value of ``column`` to its 0-based position in ``column.unique()``."""
    distinct = column.unique()
    return dict(zip(distinct, range(len(distinct))))


genre_to_idx = _index_by_unique_position(spotify_songs_df['playlist_subgenre'])
artist_to_idx = _index_by_unique_position(spotify_songs_df['track_artist'])
track_to_idx = _index_by_unique_position(spotify_songs_df['track_name'])

# Reverse lookup: integer index -> track name.
idx_to_track = dict(zip(track_to_idx.values(), track_to_idx.keys()))


In [19]:
# Distinct sub-genres among tracks whose popularity is above 80.
print(spotify_songs_df.loc[spotify_songs_df['track_popularity'] > 80, 'playlist_subgenre'].unique())

['dance pop' 'post-teen pop' 'electropop' 'indie poptimism' 'hip hop'
 'southern hip hop' 'gangster rap' 'trap' 'album rock' 'classic rock'
 'permanent wave' 'hard rock' 'tropical' 'latin pop' 'reggaeton'
 'latin hip hop' 'urban contemporary' 'hip pop' 'neo soul' 'electro house'
 'big room' 'pop edm' 'progressive electro house']


In [10]:
# Distinct artists among tracks whose popularity is above 90.
print(spotify_songs_df.loc[spotify_songs_df['track_popularity'] > 90, 'track_artist'].unique())

['The Black Eyed Peas' 'MEDUZA' 'Billie Eilish' 'Regard' 'KAROL G'
 'Shawn Mendes' 'Maroon 5' 'The Weeknd' 'Juice WRLD' 'Justin Bieber'
 'Tones and I' 'Lewis Capaldi' 'J Balvin' 'Selena Gomez' 'Anuel AA'
 'Sam Smith' 'Ed Sheeran' 'Harry Styles' 'Travis Scott' 'Rauw Alejandro'
 'Post Malone' 'Dua Lipa' 'blackbear' 'Dan + Shay' 'Y2K' 'Camila Cabello'
 'Trevor Daniel' 'Tyga' 'Don Toliver' 'DaBaby' 'Future' 'Roddy Ricch'
 'Lil Uzi Vert' 'Bad Bunny' 'Dalex' 'Arizona Zervas']


In [36]:
# Dataset summary, one value per line: row count, number of distinct artists,
# number of distinct sub-genres, and the list of sub-genre names.
print(
    spotify_songs_df.shape[0],
    spotify_songs_df['track_artist'].nunique(),
    spotify_songs_df['playlist_subgenre'].nunique(),
    spotify_songs_df['playlist_subgenre'].unique().tolist(),
    sep='\n',
)

32833
10693
24
['dance pop', 'post-teen pop', 'electropop', 'indie poptimism', 'hip hop', 'southern hip hop', 'gangster rap', 'trap', 'album rock', 'classic rock', 'permanent wave', 'hard rock', 'tropical', 'latin pop', 'reggaeton', 'latin hip hop', 'urban contemporary', 'hip pop', 'new jack swing', 'neo soul', 'electro house', 'big room', 'pop edm', 'progressive electro house']


In [37]:
import torch.nn as nn

class SongRecommender(nn.Module):
    """Score (genre, artist, song) index triples with a linear layer over embeddings.

    Each index is looked up in its own ``nn.Embedding`` table, the three vectors
    are concatenated along the feature axis, and a single ``nn.Linear`` layer
    projects the result to one scalar score.

    Args:
        num_songs: Number of rows in the song embedding table.
        num_genres: Number of rows in the genre embedding table.
        num_artists: Number of rows in the artist embedding table.
        embedding_size: Width of every embedding vector.
    """

    def __init__(self, num_songs, num_genres, num_artists, embedding_size):
        super().__init__()
        self.song_embedding = nn.Embedding(num_songs, embedding_size)
        self.genre_embedding = nn.Embedding(num_genres, embedding_size)
        self.artist_embedding = nn.Embedding(num_artists, embedding_size)
        # The linear layer's input width is the combined size of the three embeddings.
        self.fc = nn.Linear(embedding_size * 3, 1)

    def forward(self, genre_indices, artist_indices, song_indices):
        """Return one score per (genre, artist, song) triple.

        Args:
            genre_indices: Long tensor of genre indices.
            artist_indices: Long tensor of artist indices, same shape as
                ``genre_indices``.
            song_indices: Long tensor of song indices, same shape as
                ``genre_indices``.

        Returns:
            A tensor of scores with the same shape as the index tensors: a 0-d
            tensor for scalar indices, shape ``(B,)`` for a batch of ``B``
            (including ``B == 1``).
        """
        # Concatenate on the last (feature) axis so scalar, 1-D and higher-rank
        # index tensors are all handled without manual unsqueezing.
        combined = torch.cat(
            (
                self.genre_embedding(genre_indices),
                self.artist_embedding(artist_indices),
                self.song_embedding(song_indices),
            ),
            dim=-1,
        )
        # Drop only the trailing size-1 output axis; a bare squeeze() would also
        # collapse a batch dimension of size 1.
        return self.fc(combined).squeeze(-1)

# Width of every embedding vector.
embedding_size = 50

# Embedding table sizes are taken from the lookup dictionaries.
num_songs, num_genres, num_artists = (
    len(track_to_idx),
    len(genre_to_idx),
    len(artist_to_idx),
)

model = SongRecommender(
    num_songs=num_songs,
    num_genres=num_genres,
    num_artists=num_artists,
    embedding_size=embedding_size,
)

# Train the model (You should train it with your actual data)
# NOTE(review): no training step is present in this cell, so the weights written
# below are the initial ones unless training is added here.

# Persist the whole module object (architecture and weights), not only a state_dict.
model_filename = 'mustream-recommender.pth'
torch.save(model, model_filename)


In [38]:
def recommend_songs(model, user_like_genres, user_like_artists, top_k=10):
    """Return the names of the ``top_k`` highest-scoring tracks for a user.

    The user's liked genres and artists are each reduced to one averaged
    embedding; every track in ``track_to_idx`` is then scored by feeding
    ``[avg_genre, avg_artist, song_embedding]`` through ``model.fc``.

    Args:
        model: Module exposing ``genre_embedding``, ``artist_embedding``,
            ``song_embedding`` and ``fc``. It is switched to eval mode as a
            side effect.
        user_like_genres: Iterable of genre names; names missing from
            ``genre_to_idx`` are ignored.
        user_like_artists: Iterable of artist names; names missing from
            ``artist_to_idx`` are ignored.
        top_k: Maximum number of tracks to return. Values larger than the
            number of tracks are clamped.

    Returns:
        A list of track names ordered from highest to lowest score. The list is
        empty when no liked genre or no liked artist is known, or when
        ``top_k`` is not positive.
    """
    # Keep only the preferences the lookup tables know about.
    known_genres = [genre_to_idx[genre] for genre in user_like_genres if genre in genre_to_idx]
    known_artists = [artist_to_idx[artist] for artist in user_like_artists if artist in artist_to_idx]

    num_tracks = len(track_to_idx)
    # torch.topk raises if k exceeds the number of candidates, so clamp it.
    k = min(top_k, num_tracks)

    if not known_genres or not known_artists or k <= 0:
        return []

    # Build index tensors on the model's device so a model moved off the CPU works too.
    device = model.fc.weight.device
    genre_indices = torch.tensor(known_genres, dtype=torch.long, device=device)
    artist_indices = torch.tensor(known_artists, dtype=torch.long, device=device)
    song_indices = torch.arange(num_tracks, dtype=torch.long, device=device)

    model.eval()
    # Pure inference: keep every lookup under no_grad so no autograd graph is
    # built over the full song embedding table.
    with torch.no_grad():
        avg_genre_embedding = model.genre_embedding(genre_indices).mean(dim=0, keepdim=True)
        avg_artist_embedding = model.artist_embedding(artist_indices).mean(dim=0, keepdim=True)
        song_embeddings = model.song_embedding(song_indices)

        # Broadcast the two user vectors across all tracks (expand makes no copy),
        # then concatenate along the feature axis in the order fc expects.
        combined_embeddings = torch.cat(
            (
                avg_genre_embedding.expand(num_tracks, -1),
                avg_artist_embedding.expand(num_tracks, -1),
                song_embeddings,
            ),
            dim=1,
        )

        # squeeze(-1) keeps a 1-D score vector even when there is a single track.
        song_scores = model.fc(combined_embeddings).squeeze(-1)
        top_song_indices = torch.topk(song_scores, k, largest=True).indices

    # Map indices back to track names using plain Python ints as keys.
    return [idx_to_track[idx] for idx in top_song_indices.tolist()]


In [39]:
# Example user preferences
user_like_genres = ['electropop', 'hip hop']  # Example genres
user_like_artists = ['Ed Sheeran', 'The Beatles']  # Example artists

# Get recommendations (assuming the model is already trained).
# The `model` created earlier in this notebook is reused on purpose:
# re-instantiating SongRecommender here would replace any trained weights with
# a fresh random initialisation and make the recommendations meaningless.
recommended_songs = recommend_songs(model, user_like_genres, user_like_artists)
print("Recommended Songs:", recommended_songs)

Recommended Songs: ['Tommy Gun', 'How We Move (feat. King L)', 'Conga', 'Méditerranée', 'All Night Long - Zakkov Remix', 'One Wine (feat. Major Lazer)', 'Until', 'Cataclysm جائحة', 'Psycho Killer - 2005 Remaster', 'Waverunners']
