<a href="https://colab.research.google.com/github/dev-yusupov/ml-projects/blob/main/Musicoo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Preprocessing**

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Read spotify_data.csv, discard every row that contains a missing value,
# and renumber the surviving rows 0..n-1 (the old index is not kept).
df = (
    pd.read_csv("spotify_data.csv")
    .dropna(axis=0)
    .reset_index(drop=True)
)

In [None]:
# Column names, one per line.
# NOTE(review): this list is not referenced again in the visible cells, and
# 'index' does not appear among the columns shown by the frame preview —
# confirm whether it is still needed.
columns = [
    'index',
    'Unnamed: 0',
    'artist_name',
    'track_name',
    'track_id',
    'popularity',
    'year',
    'genre',
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature',
]

# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166.0,3.0
1,1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,3,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387.0,4.0
2,2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,3,-13.711,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960.0,4.0
3,3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,10,-9.845,1,0.0363,0.807,0.0,0.0797,0.508,204.961,304293.0,4.0
4,4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.791,6,-5.419,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320.0,4.0


In [None]:
# Standardize the continuous audio features (zero mean, unit variance) so they
# are on a comparable scale before any similarity computation.
scaler = StandardScaler()
numerical_cols = ["danceability", "energy", "loudness", "speechiness", "acousticness", "instrumentalness",
                  "liveness", "valence", "tempo", "duration_ms"]
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Encode categorical variables
# A single LabelEncoder refit on every column only remembers the mapping of the
# LAST column, which makes it impossible to turn encoded track/artist ids back
# into text. Keep one fitted encoder per column instead, e.g.
#   label_encoders["track_name"].inverse_transform([code])
# NOTE: run this cell once per fresh load of `df`; re-running it would fit the
# encoders on already-encoded integers and lose the original strings.
categorical_cols = ["artist_name", "track_name", "track_id", "genre", "key"]
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Backward compatibility: `label_encoder` is still defined and ends up with the
# classes of the last encoded column, exactly as before. It is a separate
# object, so code that refits it cannot corrupt `label_encoders`.
label_encoder = LabelEncoder().fit(label_encoders[categorical_cols[-1]].classes_)

In [None]:
# Preview the first five rows after scaling and label encoding.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,541,2509,4578,68,2012,0,-0.046171,-1.180304,4,-0.177757,1,-0.389774,1.112289,-0.786464,-0.488707,-0.992529,0.401178,-0.232911,3.0
1,1,541,59,1668,50,2012,0,0.424051,-0.630242,3,-0.218606,1,-0.677281,0.492352,-0.786428,-0.585958,0.409071,0.631896,-0.393953,4.0
2,2,585,1353,6476,57,2012,0,-0.437142,-1.431657,3,-0.832241,1,-0.567995,0.09525,-0.786333,-0.62961,-0.970163,0.619979,-0.782875,4.0
3,3,163,1674,5434,58,2012,0,-0.52696,-1.36973,10,-0.139595,1,-0.500742,1.435113,-0.786464,-0.683761,0.382977,2.837577,0.201386,4.0
4,4,57,4705,6107,54,2012,0,-0.326191,0.597379,6,0.653383,0,-0.603303,-0.662958,-0.735924,-0.516335,-0.701771,1.710647,-0.204778,4.0


In [None]:
# Define features for similarity calculation
# The audio features were standardized earlier, so they can be used directly.
audio_feature_cols = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                      'instrumentalness', 'liveness', 'valence', 'tempo']

# `genre` is a nominal category. Its label-encoded id is an arbitrary integer:
# feeding it to cosine similarity as a magnitude lets large ids swamp the
# standardized features and makes "nearby" ids look similar for no reason.
# One-hot encode it so that sharing a genre simply adds to the similarity.
# dtype=float keeps the combined frame purely numeric.
genre_dummies = pd.get_dummies(df['genre'], prefix='genre', dtype=float)
features = pd.concat([df[audio_feature_cols], genre_dummies], axis=1)

# Calculate similarity matrix
# With a single argument the rows are compared against themselves, giving an
# (n_tracks, n_tracks) array; memory grows quadratically with the row count.
similarity_matrix = cosine_similarity(features)

# Function to decode track ID to song name and artist name
def decode_track_id(track_id, encoders=None):
    """Return the track name and artist name stored in ``df`` for ``track_id``.

    The first row of the module-level ``df`` whose ``track_id`` column equals
    ``track_id`` is used. The values come straight from that row's
    ``track_name`` and ``artist_name`` columns, so if those columns hold
    label-encoded integers the integers are returned unless ``encoders`` is
    supplied.

    Previously the shared ``label_encoder`` was refit here on the already
    encoded name columns and the *track id* was passed to
    ``inverse_transform`` for both outputs, which returned the same unrelated
    number twice (or raised when the id was out of range) and mutated global
    state on every call.

    Parameters
    ----------
    track_id : scalar
        Value to look up in ``df['track_id']``.
    encoders : mapping, optional
        Maps a column name (``'track_name'`` and/or ``'artist_name'``) to a
        fitted object exposing ``inverse_transform``. When given, the stored
        code of that column is converted back with it.

    Returns
    -------
    tuple
        ``(track_name, artist_name)``, or ``("Unknown", "Unknown")`` when no
        row matches.
    """
    track_info = df[df['track_id'] == track_id]
    if track_info.empty:
        return "Unknown", "Unknown"

    # Read each value through its own column: taking the whole row first would
    # upcast an all-numeric row to float and turn integer codes into 12.0.
    track_name = track_info['track_name'].iloc[0]
    artist_name = track_info['artist_name'].iloc[0]

    if encoders is not None:
        if 'track_name' in encoders:
            track_name = encoders['track_name'].inverse_transform([track_name])[0]
        if 'artist_name' in encoders:
            artist_name = encoders['artist_name'].inverse_transform([artist_name])[0]

    return track_name, artist_name




# Recommendation function with similarity score threshold
def recommend_tracks(selected_tracks, n_recommendations=3, similarity_threshold=0.1):
    """Recommend the tracks most similar, on average, to the selected ones.

    Each candidate row is scored by the mean of its entries in the module-level
    ``similarity_matrix`` over all selected rows. Candidates scoring above
    ``similarity_threshold`` are ranked from highest to lowest score and the
    first ``n_recommendations`` are returned.

    Fixes over the previous version:
    * results are ranked by score; before, the rows with the highest *index*
      that passed the threshold were returned regardless of score;
    * the selected tracks themselves are never recommended back;
    * an empty selection (or a non-positive ``n_recommendations``) yields an
      empty list instead of failing.

    Parameters
    ----------
    selected_tracks : sequence of tuples
        Each tuple's first element is the positional row index of a selected
        track in ``similarity_matrix`` / ``df``; other elements are ignored.
    n_recommendations : int, default 3
        Maximum number of recommendations to return.
    similarity_threshold : float, default 0.1
        Only candidates with an average score strictly above this are kept.

    Returns
    -------
    list of (track_id, score)
        ``track_id`` is taken from ``df['track_id']``; best match first.
    """
    selected_indices = [track[0] for track in selected_tracks]
    if not selected_indices or n_recommendations <= 0:
        return []

    # Average every candidate's similarity to all of the selected tracks.
    avg_similarity_scores = np.asarray(similarity_matrix)[selected_indices].mean(axis=0)

    # Filter recommendations by similarity score threshold
    eligible = avg_similarity_scores > similarity_threshold
    eligible[selected_indices] = False  # a seed is trivially similar to itself
    candidate_indices = np.flatnonzero(eligible)

    # Stable sort on the negated scores: highest first, ties in row order.
    ranking = np.argsort(-avg_similarity_scores[candidate_indices], kind="stable")
    top_recommendations_indices = candidate_indices[ranking][:n_recommendations]

    # Column-wise lookup keeps the id's dtype (a row lookup would upcast it).
    track_ids = df['track_id']
    return [(track_ids.iloc[index], avg_similarity_scores[index])
            for index in top_recommendations_indices]



In [None]:


# User inputs number of random songs to select
raw_count = input("Enter the number of random songs to select: ")
try:
    n_random_songs = int(raw_count)
except ValueError:
    raise ValueError(f"Expected a whole number of songs, got {raw_count!r}") from None
if n_random_songs < 1:
    # Zero or a negative count would otherwise fail later with an unrelated error.
    raise ValueError("The number of random songs must be at least 1")

# Select a random subset of the dataset for recommendation
subset_df = df.sample(n=min(n_random_songs, len(df)), replace=False)

# Build (row index, track, artist, genre, year) tuples column by column.
# Iterating whole rows would upcast an all-numeric row to float and print
# integer values such as the year as 2012.0.
selected_tracks = list(zip(
    subset_df.index,
    subset_df['track_name'],
    subset_df['artist_name'],
    subset_df['genre'],
    subset_df['year'],
))

# Print selected random songs
print("\nSelected random songs:")
for i, (index, track, artist, genre, year) in enumerate(selected_tracks, start=1):
    print(f"{i}. {track} by {artist} ({genre}, {year})")

# Get recommendations for selected songs
# (nothing to recommend from when the frame itself is empty)
recommendations = recommend_tracks(selected_tracks) if selected_tracks else []

# Print recommendations
print("\nGeneral recommendations based on the selected songs:")
for i, (track_id, similarity_score) in enumerate(recommendations, start=1):
    decoded_track_name, decoded_artist_name = decode_track_id(track_id)
    print(f"{i}. {decoded_track_name} by {decoded_artist_name} (Similarity Score: {similarity_score:.2f})")