In [8]:
import pandas as pd
import random

# Shuffle dataset function
def shuffle_dataset(data):
    """
    Shuffle the dataset randomly.
    
    Args:
        data (DataFrame): The input pandas DataFrame to shuffle.
    
    Returns:
        DataFrame: Shuffled DataFrame.
    """
    data = data.sample(frac=1, random_state=random.randint(1, 1000)).reset_index(drop=True)
    return data

# Merge Sort implementation for `track_genre` and `popularity`
def merge_sort_genre_popularity(data, key1, key2):
    """
    Merge Sort implementation for sorting by two keys:
    - Primary Key: `key1` (e.g., 'track_genre') in ascending order.
    - Secondary Key: `key2` (e.g., 'popularity') in descending order within key1 groups.
    
    Args:
        data (list): List of dictionaries to be sorted.
        key1 (str): Primary sorting key (e.g., 'track_genre').
        key2 (str): Secondary sorting key (e.g., 'popularity').

    Returns:
        list: Sorted list of dictionaries.
    """
    if len(data) <= 1:
        return data

    mid = len(data) // 2
    left = merge_sort_genre_popularity(data[:mid], key1, key2)
    right = merge_sort_genre_popularity(data[mid:], key1, key2)

    return merge_genre_popularity(left, right, key1, key2)

def merge_genre_popularity(left, right, key1, key2):
    """
    Merge function for sorting by two keys.

    Args:
        left (list): Left half of the data.
        right (list): Right half of the data.
        key1 (str): Primary sorting key.
        key2 (str): Secondary sorting key.

    Returns:
        list: Merged and sorted list.
    """
    sorted_data = []
    i = j = 0

    while i < len(left) and j < len(right):
        # Compare by primary key (track_genre)
        if left[i][key1] < right[j][key1]:
            sorted_data.append(left[i])
            i += 1
        elif left[i][key1] > right[j][key1]:
            sorted_data.append(right[j])
            j += 1
        else:
            # If primary key is equal, compare by secondary key (popularity, descending)
            if left[i][key2] >= right[j][key2]:
                sorted_data.append(left[i])
                i += 1
            else:
                sorted_data.append(right[j])
                j += 1

    # Append remaining elements
    sorted_data.extend(left[i:])
    sorted_data.extend(right[j:])

    return sorted_data

# Main workflow
file_path = 'filtered_music_data.csv'  # Replace with the correct path to your file
music_data = pd.read_csv(file_path)

# Shuffle the dataset
shuffled_data = shuffle_dataset(music_data)

print(shuffled_data.head(20))  # Display the first 20 rows of the shuffled dataset

# Convert shuffled data to a list of dictionaries for sorting
data_list = shuffled_data.to_dict('records')

# Perform Merge Sort
sorted_data = merge_sort_genre_popularity(data_list, key1="track_genre", key2="popularity")

# Convert back to DataFrame
sorted_df = pd.DataFrame(sorted_data)

# Save or print the sorted DataFrame
print(sorted_df.head(20))  # Display the first 20 rows of the sorted dataset
# sorted_df.to_csv("sorted_music_data.csv", index=False)  # Save to a new CSV file


                  track_id                                            artists  \
0   1uUf8MUvBnhC8TudDxV2hj                                   Impaled Nazarene   
1   2eFjKl5cyPPYElDByCh6Tb                                 ILLENIUM;iann dior   
2   32YMjckfKOLVITAfe6zK9F                                       Ran-D;Redixx   
3   5N37thhxSu00xqUKO1tTvn            Patricio Rey y sus Redonditos de Ricota   
4   5Xw7riSdSL7mWmxXZCBZrM                                Mad Maxx;Shivadelic   
5   3G8zJFJ29WNAktaMbKW1hX                                    The Temptations   
6   2Rs1cBCuncS95efeyANpne                        Sarah, the Illstrumentalist   
7   249e2fS064cOwvWXm4JU28                                             Big Up   
8   4wRYpv96z3nmsiC9f3BdWh                                           LaLa Hsu   
9   5ptQepn6HP5jMllqHCmr4F                                           Emre Fel   
10  1fbHsIsgVoZsltBtQhSgvc                                        Ata Ebtekar   
11  4vxeeLNZHjOPGMwZA0fHQi  

In [10]:
import pandas as pd
from memory_profiler import memory_usage
import time
import random

# Load the dataset
df = pd.read_csv('music_data.csv')

# Select only the relevant columns
filtered_df = df[['track_id', 'artists', 'album_name', 'track_name', 'popularity', 'track_genre']]

# Shuffle dataset function
def shuffle_dataset(data):
    data = data.sample(frac=1, random_state=random.randint(1, 1000)).reset_index(drop=True)
    return data

# Merge Sort implementation
def merge_sort(data, key1, key2):
    if len(data) <= 1:
        return data
    
    mid = len(data) // 2
    left = merge_sort(data[:mid], key1, key2)
    right = merge_sort(data[mid:], key1, key2)
    
    return merge(left, right, key1, key2)

def merge(left, right, key1, key2):
    sorted_data = []
    i = j = 0

    while i < len(left) and j < len(right):
        if left[i][key1] < right[j][key1]:
            sorted_data.append(left[i])
            i += 1
        elif left[i][key1] > right[j][key1]:
            sorted_data.append(right[j])
            j += 1
        else:
            if left[i][key2] >= right[j][key2]:  # Descending order for popularity
                sorted_data.append(left[i])
                i += 1
            else:
                sorted_data.append(right[j])
                j += 1

    sorted_data.extend(left[i:])
    sorted_data.extend(right[j:])
    return sorted_data

# Quick Sort implementation
def quick_sort(data, key1, key2, low=0, high=None):
    if high is None:
        high = len(data) - 1

    if low < high:
        pivot_index = partition(data, key1, key2, low, high)
        quick_sort(data, key1, key2, low, pivot_index - 1)
        quick_sort(data, key1, key2, pivot_index + 1, high)

def median_of_three(data, key1, key2, low, high):
    mid = (low + high) // 2
    candidates = [low, mid, high]
    candidates.sort(key=lambda x: (data[x][key1], -data[x][key2]))
    return candidates[1]

def partition(data, key1, key2, low, high):
    pivot_index = median_of_three(data, key1, key2, low, high)
    data[pivot_index], data[high] = data[high], data[pivot_index]
    pivot = data[high]
    i = low - 1
    for j in range(low, high):
        if data[j][key1] < pivot[key1] or (data[j][key1] == pivot[key1] and data[j][key2] >= pivot[key2]):
            i += 1
            data[i], data[j] = data[j], data[i]
    data[i + 1], data[high] = data[high], data[i + 1]
    return i + 1

# Benchmarking function
def benchmark_sort(sort_function, sort_name):
    total_time = 0
    total_memory = 0
    runs = 5  # Number of runs
    sorted_data = None

    for _ in range(runs):
        # Shuffle the dataset
        shuffled_dataset = shuffle_dataset(filtered_df)
        data_list = shuffled_dataset.to_dict('records')

        # Measure memory usage and execution time
        mem_usage = memory_usage((sort_function, (data_list, 'track_genre', 'popularity')), max_iterations=1)
        start_time = time.time()
        sort_function(data_list, 'track_genre', 'popularity')
        end_time = time.time()

        # Calculate time and memory used
        time_used = end_time - start_time
        memory_used = max(mem_usage) - min(mem_usage)

        # Add to totals
        total_time += time_used
        total_memory += memory_used

        print(f"{sort_name} {_+1} attempt Execution Time: {time_used:.6f} seconds")
        print(f"{sort_name} {_+1} attempt Peak Memory Usage: {memory_used:.2f} MB")

        # Save the sorted data from the last run
        if _ == runs - 1:
            sorted_data = data_list

    # Calculate averages
    avg_time = total_time / runs
    avg_memory = total_memory / runs

    print(f"{sort_name} Average Execution Time: {avg_time:.6f} seconds")
    print(f"{sort_name} Average Peak Memory Usage: {avg_memory:.2f} MB")

    return sorted_data

# Run both sorts and benchmark them
print("\nBenchmarking Merge Sort:")
sorted_data_merge = benchmark_sort(merge_sort, "Merge Sort")

print("\nBenchmarking Quick Sort:")
sorted_data_quick = benchmark_sort(quick_sort, "Quick Sort")

# Display the sorted dataset
print("\nSorted Dataset (Merge Sort):")
sorted_df_merge = pd.DataFrame(sorted_data_merge)
print(sorted_df_merge.head())

print("\nSorted Dataset (Quick Sort):")
sorted_df_quick = pd.DataFrame(sorted_data_quick)
print(sorted_df_quick.head())


Benchmarking Merge Sort:
Merge Sort 1 attempt Execution Time: 0.794799 seconds
Merge Sort 1 attempt Peak Memory Usage: 2.20 MB
Merge Sort 2 attempt Execution Time: 0.843546 seconds
Merge Sort 2 attempt Peak Memory Usage: 2.32 MB
Merge Sort 3 attempt Execution Time: 0.791416 seconds
Merge Sort 3 attempt Peak Memory Usage: 0.01 MB
Merge Sort 4 attempt Execution Time: 0.780108 seconds
Merge Sort 4 attempt Peak Memory Usage: 0.88 MB
Merge Sort 5 attempt Execution Time: 0.797138 seconds
Merge Sort 5 attempt Peak Memory Usage: 0.55 MB
Merge Sort Average Execution Time: 0.801402 seconds
Merge Sort Average Peak Memory Usage: 1.19 MB

Benchmarking Quick Sort:
Quick Sort 1 attempt Execution Time: 2.242891 seconds
Quick Sort 1 attempt Peak Memory Usage: 0.27 MB
Quick Sort 2 attempt Execution Time: 2.254413 seconds
Quick Sort 2 attempt Peak Memory Usage: 0.00 MB
Quick Sort 3 attempt Execution Time: 2.229511 seconds
Quick Sort 3 attempt Peak Memory Usage: 0.18 MB
Quick Sort 4 attempt Execution Tim