In [1]:
import os
import pandas as pd

def read_all_csv_in_folder(folder_path):
    """Read every ``.csv`` file directly inside ``folder_path`` into one DataFrame.

    Files are read in sorted path order so the row order of the result is the
    same on every run (``os.listdir`` returns entries in arbitrary order).
    Malformed lines are skipped (``on_bad_lines='skip'``).

    Parameters
    ----------
    folder_path : str or path-like
        Directory to scan. Sub-directories are not searched.

    Returns
    -------
    pandas.DataFrame
        All CSV contents concatenated row-wise with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the folder contains no ``.csv`` files.
    """
    csv_paths = sorted(
        os.path.join(folder_path, entry)
        for entry in os.listdir(folder_path)
        if entry.endswith('.csv')
    )
    # A directory that happens to be named "*.csv" cannot be parsed; skip it.
    csv_paths = [path for path in csv_paths if os.path.isfile(path)]

    if not csv_paths:
        # Same exception type pd.concat([]) would raise, but with a message
        # that points at the actual cause.
        raise ValueError(f"No CSV files found in {folder_path!r}")

    dataframes = [pd.read_csv(path, on_bad_lines='skip') for path in csv_paths]
    return pd.concat(dataframes, ignore_index=True)

# Load one combined frame per source folder
audio_df = read_all_csv_in_folder('./audio_features')
track_df = read_all_csv_in_folder('./tracks')

# Join audio features to track metadata on the shared 'id' column
# (default inner join; overlapping column names get _x / _y suffixes)
merged_df = audio_df.merge(track_df, on='id')

merged_df.head()


Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,time_signature,artist_id,album_id,disc_number,explicit,name,track_number,popularity,duration_ms_y,spotify_url
0,6YV2AI87l1n2fzqU8Dyo05,0.402,0.514,1,-7.322,0,0.0471,0.106,0.0,0.181,...,4,3TVXtAsR1Inumwj472S9r4,4Q7cRXio6mF2ImVUCcezPO,1,True,Virginia Beach,1,57.0,251094,https://open.spotify.com/track/6YV2AI87l1n2fzq...
1,59ZmQR4pwCaa5iM3veM7Cs,0.646,0.605,0,-7.59,1,0.188,0.0636,0.0,0.253,...,3,3TVXtAsR1Inumwj472S9r4,4Q7cRXio6mF2ImVUCcezPO,1,True,Amen (feat. Teezo Touchdown),2,49.0,141212,https://open.spotify.com/track/59ZmQR4pwCaa5iM...
2,2R30S0W4JCM9JaQWlpmeWn,0.571,0.55,0,-6.567,1,0.106,0.0548,0.0,0.179,...,4,3TVXtAsR1Inumwj472S9r4,4Q7cRXio6mF2ImVUCcezPO,1,True,Calling For You (feat. 21 Savage),3,50.0,285630,https://open.spotify.com/track/2R30S0W4JCM9JaQ...
3,6LFW4dEsLeiGluniXRgVYr,0.697,0.32,7,-9.347,0,0.214,0.0127,0.0,0.18,...,4,3TVXtAsR1Inumwj472S9r4,4Q7cRXio6mF2ImVUCcezPO,1,True,Fear Of Heights,4,49.0,155154,https://open.spotify.com/track/6LFW4dEsLeiGlun...
4,1us5wNgZc0YLT8RQQs2Q7L,0.566,0.564,10,-7.88,0,0.277,0.0149,0.0,0.104,...,4,3TVXtAsR1Inumwj472S9r4,4Q7cRXio6mF2ImVUCcezPO,1,True,Daylight,5,48.0,164142,https://open.spotify.com/track/1us5wNgZc0YLT8R...


Drop columns that are not needed from the DataFrame

In [2]:
# Identifier/URL columns and the second duration column produced by the merge
columns_to_drop = ['id', 'artist_id', 'album_id', 'disc_number', 'spotify_url', 'duration_ms_y']

# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
merged_df = merged_df.drop(columns=columns_to_drop, errors='ignore')

merged_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms_x,time_signature,explicit,name,track_number,popularity
0,0.402,0.514,1,-7.322,0,0.0471,0.106,0.0,0.181,0.2,120.009,251095,4,True,Virginia Beach,1,57.0
1,0.646,0.605,0,-7.59,1,0.188,0.0636,0.0,0.253,0.309,90.156,141213,3,True,Amen (feat. Teezo Touchdown),2,49.0
2,0.571,0.55,0,-6.567,1,0.106,0.0548,0.0,0.179,0.104,120.947,285630,4,True,Calling For You (feat. 21 Savage),3,50.0
3,0.697,0.32,7,-9.347,0,0.214,0.0127,0.0,0.18,0.0658,136.976,155154,4,True,Fear Of Heights,4,49.0
4,0.566,0.564,10,-7.88,0,0.277,0.0149,0.0,0.104,0.0667,139.92,164143,4,True,Daylight,5,48.0


In [3]:
# Boolean flag -> 0/1 integer
merged_df['explicit'] = merged_df['explicit'].astype(int)

# Force these columns to numeric; unparseable values become NaN
for numeric_col in ('track_number', 'popularity', 'duration_ms_x'):
    merged_df[numeric_col] = pd.to_numeric(merged_df[numeric_col], errors='coerce')


In [4]:
# List the dtype of every column in merged_df
print(merged_df.dtypes)


danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
duration_ms_x         int64
time_signature        int64
explicit              int32
name                 object
track_number        float64
popularity          float64
dtype: object


In [5]:
# Numeric columns used as the model's feature vector
features = [
    'danceability', 'energy', 'key', 'loudness',
    'mode', 'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo', 'duration_ms_x',
    'time_signature', 'explicit', 'track_number', 'popularity',
]

# Feature matrix: all rows of merged_df, restricted to the feature columns
X = merged_df.loc[:, features]


In [None]:
# Share of missing values per feature column, expressed as a percentage
nan_percentage = X.isna().mean().mul(100)

# Print the results
print("Phần trăm dữ liệu NaN trong mỗi cột:", nan_percentage, sep="\n")

Phần trăm dữ liệu NaN trong mỗi cột:
danceability        0.000000
energy              0.000000
key                 0.000000
loudness            0.000000
mode                0.000000
speechiness         0.000000
acousticness        0.000000
instrumentalness    0.000000
liveness            0.000000
valence             0.000000
tempo               0.000000
duration_ms_x       0.000000
time_signature      0.000000
explicit            0.000000
track_number        0.019715
popularity          0.011829
dtype: float64


In [8]:
# Discard every row of the feature matrix that has a NaN in any column
X_cleaned = X.dropna(axis=0, how='any')

In [15]:
# Keep exactly the rows that survived X.dropna(), so positional indices returned
# by a model fitted on X_cleaned address the same rows in merged_df. A plain
# merged_df.dropna() would also drop rows with NaN in non-feature columns
# (e.g. 'name') and shift every later position out of step with X_cleaned.
merged_df = merged_df.loc[X_cleaned.index]

In [9]:
from sklearn.neighbors import NearestNeighbors

# Initialize the unsupervised nearest-neighbours index.
# Each query will return the 5 closest rows of the fitted matrix.
knn = NearestNeighbors(n_neighbors=5)  # You can adjust the number of neighbors

# Fit the model on the NaN-free feature matrix. Neighbour indices returned later
# are row positions within X_cleaned.
# NOTE(review): features are passed unscaled; large-magnitude columns such as
# duration_ms_x presumably dominate the distance — consider standardizing.
knn.fit(X_cleaned)


In [10]:
def recommend_songs(song_list, df, knn_model):
    """Recommend songs that are nearest neighbours of the given seed songs.

    Parameters
    ----------
    song_list : list of str
        Seed song titles; each is matched exactly against ``df['name']``.
        Titles that are not found are ignored. When a title occurs more than
        once, the first matching row is used.
    df : pandas.DataFrame
        Frame with a ``'name'`` column and the feature columns the model was
        fitted on. Its rows must be in the same order as the matrix passed to
        ``knn_model.fit`` because neighbour indices are positional.
    knn_model : fitted estimator exposing ``kneighbors(X)``
        Typically ``sklearn.neighbors.NearestNeighbors``.

    Returns
    -------
    numpy.ndarray
        Unique recommended titles in first-seen order, excluding every title
        in ``song_list``. Empty when no seed song is found.
    """
    # Prefer the column names recorded by the estimator at fit time; fall back
    # to the notebook-level feature matrix for estimators that lack them.
    feature_cols = getattr(knn_model, "feature_names_in_", None)
    if feature_cols is None:
        feature_cols = X.columns
    feature_cols = list(feature_cols)

    neighbour_positions = []
    for song in song_list:
        matches = df.index[df['name'] == song]
        if matches.empty:
            continue
        # Look the row up by label in df itself (not by position in a global
        # frame) and keep it 2-D with column names for the estimator.
        query = df.loc[[matches[0]], feature_cols]
        _, idx = knn_model.kneighbors(query)
        neighbour_positions.extend(int(i) for i in idx[0])

    # De-duplicate while preserving order so the result is deterministic.
    ordered_positions = list(dict.fromkeys(neighbour_positions))
    candidate_names = df.iloc[ordered_positions]['name']

    # Return the suggested songs, excluding the songs that were passed in.
    # (Filtering must be by title: the collected values are row positions.)
    return candidate_names[~candidate_names.isin(song_list)].unique()


In [None]:
# Seed titles for the KNN-based recommender
song_list = ['Hello', '7 years', "November Rain"]
recommended_songs = recommend_songs(song_list, merged_df, knn)

# Header line followed by the array of recommended titles
print("Các bài hát được đề xuất:", recommended_songs, sep="\n")

Các bài hát được đề xuất:
['Insane'
 "Je Ne Veux Pas La Fin De Nous (I Just Can't Stop Loving You) - French Version"
 'My Name Is - Instrumental']




In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def recommend_songs(song_list, df, num_recommendations=5, feature_cols=None):
    """Recommend songs by cosine similarity to the mean of the seed songs.

    Parameters
    ----------
    song_list : list of str
        Seed song titles, matched exactly against ``df['name']``.
    df : pandas.DataFrame
        Frame containing a ``'name'`` column and the feature columns.
    num_recommendations : int, default 5
        Maximum number of distinct titles to return.
    feature_cols : list of str, optional
        Feature columns to compare on. Defaults to the notebook-level
        ``features`` list, as the original implementation did.

    Returns
    -------
    numpy.ndarray or str
        Up to ``num_recommendations`` distinct titles ordered from most to
        least similar, never including a title from ``song_list``. If none of
        the seed titles exists in ``df`` a message string is returned instead.
    """
    if feature_cols is None:
        feature_cols = features

    seed_mask = df['name'].isin(song_list)
    if not seed_mask.any():
        return "Không tìm thấy bài hát trong danh sách dữ liệu."

    feature_matrix = df[feature_cols]

    # Average the feature vectors of the seed songs
    feature_avg = feature_matrix[seed_mask].mean(axis=0)

    # Similarity of the averaged vector to every song in df
    # NOTE(review): features are compared unscaled; large-magnitude columns
    # (e.g. duration_ms_x) presumably dominate — consider standardizing.
    similarities = cosine_similarity(
        feature_avg.to_numpy().reshape(1, -1), feature_matrix.to_numpy()
    )[0]

    # Rank all songs from most to least similar (stable sort keeps ties in
    # row order, so the output is deterministic)
    ranked_positions = np.argsort(-similarities, kind="stable")
    ranked_names = df['name'].iloc[ranked_positions]

    # Drop the seed songs and de-duplicate BEFORE truncating, so the caller
    # gets up to num_recommendations new titles rather than fewer.
    candidates = ranked_names[~ranked_names.isin(song_list)]
    return candidates.unique()[:num_recommendations]

# Features used for the similarity measurement.
# This rebinds the notebook-level `features` list; unlike the earlier
# definition it does not include 'track_number'.
features = [
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence',
    'tempo', 'explicit', 'duration_ms_x', 'time_signature', 'popularity',
]

# Call the recommendation function
song_list = ['Hello', '7 years', "November Rain"]
recommended_songs = recommend_songs(song_list, merged_df)
print("Các bài hát được đề xuất:", recommended_songs, sep="\n")

[1839, 1868, 1890, 6653, 6667, 7778, 8379, 8681, 11170, 11218, 11909, 24688]
Các bài hát được đề xuất:
['Se Porta Mal' 'D.D.' 'Y No Le Conviene'
 'Burn It Down - One More Light Live' 'Good In Bed']
