In [4]:
import os
import json
import pandas as pd
import csv

def process_playlists(json_file_path, csv_file_path):
    """Reduce one playlist slice to the tracks that exist in the lyrics dataset.

    The JSON file is expected to hold a top-level ``'playlists'`` list whose
    items each carry a ``'tracks'`` list of dicts with a ``'track_name'`` key.
    The CSV file is expected to provide the columns ``track_name``,
    ``Keywords``, ``Mood`` and ``track_genre``.

    For every playlist:
      * the playlist-level fields ``collaborative``, ``pid``, ``modified_at``,
        ``num_tracks``, ``num_albums`` and ``num_followers`` are removed;
      * tracks whose name (compared case-insensitively) is not in the CSV are
        dropped;
      * surviving tracks lose ``track_uri``, ``artist_uri``, ``album_uri`` and
        ``duration_ms`` and gain ``keywords``, ``mood`` and ``track_genre``;
      * the playlist is kept only if more than 10 tracks survive, in which
        case ``pos`` is renumbered from 0 in the surviving order.

    Args:
        json_file_path: Path of the playlist slice to read (UTF-8 JSON).
        csv_file_path: Path of the lyrics/mood CSV.

    Returns:
        The loaded JSON dict with ``data['playlists']`` replaced by the list
        of playlists that met the criteria. All other top-level keys are
        left untouched.

    Raises:
        KeyError: If the JSON or CSV lacks one of the expected keys/columns.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Rows without a track name can never be matched, so drop them up front.
    moods = pd.read_csv(csv_file_path).dropna(subset=['track_name'])

    # Playlist track names are lower-cased before lookup, so the keys must be
    # lower-cased too; otherwise any CSV title with a capital letter is
    # silently unmatched. When several rows share a name, the last one wins.
    lookup_names = moods['track_name'].astype(str).str.lower()
    mood_keywords_dict = dict(zip(
        lookup_names,
        moods[['Keywords', 'Mood', 'track_genre']].to_dict('records'),
    ))

    processed_playlists = []

    for playlist in data['playlists']:
        # Remove playlist-level metadata that is not needed downstream.
        for field in ['collaborative', 'pid', 'modified_at', 'num_tracks', 'num_albums', 'num_followers']:
            playlist.pop(field, None)

        filtered_tracks = []
        for track in playlist['tracks']:
            track_name = track['track_name'].lower()
            if track_name not in mood_keywords_dict:
                continue

            # Remove track-level fields that are not needed downstream.
            for field in ['track_uri', 'artist_uri', 'album_uri', 'duration_ms']:
                track.pop(field, None)

            # Enrich the track with the annotations from the lyrics dataset.
            track_info = mood_keywords_dict[track_name]
            track['keywords'] = track_info['Keywords']
            track['mood'] = track_info['Mood']
            track['track_genre'] = track_info['track_genre']
            filtered_tracks.append(track)

        # Keep only playlists with more than 10 usable tracks.
        if len(filtered_tracks) > 10:
            # Positions must be contiguous again after filtering.
            for i, track in enumerate(filtered_tracks):
                track['pos'] = i

            playlist['tracks'] = filtered_tracks
            processed_playlists.append(playlist)

    data['playlists'] = processed_playlists
    return data

# Folder holding the raw `mpd.slice.*.json` files; processed files are written
# back into the same folder.
directory = 'F:\\学习资料\\2023 Fall\\ece1786\\ECE-1786_project\\data\\data'
csv_file_path = os.path.join('F:\\学习资料\\2023 Fall\\ece1786\\ECE-1786_project', 'processed_songs_2w.csv')

# NOTE(review): nothing below appends to this list, so it stays empty; each
# slice's result is only written to disk.
processed_data_collection = []

# Slices are named by their inclusive playlist-id range: 0-999, 1000-1999, ...
for i in range(1000):  # Adjust the range according to the actual number of files
    start_index = i * 1000
    end_index = start_index + 999
    json_file_name = f'mpd.slice.{start_index}-{end_index}.json'
    json_file_path = os.path.join(directory, json_file_name)

    # Missing slices are reported and skipped rather than treated as errors.
    if not os.path.exists(json_file_path):
        print(f"File {json_file_name} does not exist.")
        continue

    processed_data = process_playlists(json_file_path, csv_file_path)

    # Persist the filtered slice alongside its source file.
    output_file_name = f'processed_{start_index}-{end_index}.json'
    output_file_path = os.path.join(directory, output_file_name)
    with open(output_file_path, 'w') as outfile:
        json.dump(processed_data, outfile, indent=4)
    print(f"Processed data saved to {output_file_name}")

# All results now live in the processed_*.json files; processed_data_collection
# is still an empty list at this point.


Processed data saved to processed_0-999.json
Processed data saved to processed_1000-1999.json
Processed data saved to processed_2000-2999.json
Processed data saved to processed_3000-3999.json
Processed data saved to processed_4000-4999.json
Processed data saved to processed_5000-5999.json
Processed data saved to processed_6000-6999.json
Processed data saved to processed_7000-7999.json
Processed data saved to processed_8000-8999.json
Processed data saved to processed_9000-9999.json
Processed data saved to processed_10000-10999.json
Processed data saved to processed_11000-11999.json
Processed data saved to processed_12000-12999.json
Processed data saved to processed_13000-13999.json
Processed data saved to processed_14000-14999.json
Processed data saved to processed_15000-15999.json
Processed data saved to processed_16000-16999.json
Processed data saved to processed_17000-17999.json
Processed data saved to processed_18000-18999.json
Processed data saved to processed_19000-19999.json
Proc

In [5]:


def convert_csv_to_json(csv_file_path, json_file_path):
    """Export the processed lyrics CSV as a JSON document.

    Per the original author's note, the JSON output is intended for use with
    ChromaDB. Each exported CSV row becomes one dict under the top-level
    ``'track_data'`` list, holding the fields ``track_name``, ``track_id``,
    ``artist_name``, ``release_date``, ``track_genre``, ``topic``,
    ``Keywords`` and ``Mood``.

    A row is skipped when any of its columns (including columns that are not
    exported) is missing or holds the literal string ``'None'``.

    Args:
        csv_file_path: Path of the UTF-8 CSV file to read.
        json_file_path: Path of the JSON file to write (overwritten).

    Raises:
        KeyError: If the CSV lacks one of the exported columns.
    """
    exported_fields = (
        'track_name',
        'track_id',
        'artist_name',
        'release_date',
        'track_genre',
        'topic',
        'Keywords',
        'Mood',
    )
    track_data = []

    # newline='' lets the csv module do its own line handling, so line breaks
    # inside quoted fields are kept exactly as stored instead of being
    # rewritten by universal-newline translation.
    with open(csv_file_path, mode='r', encoding='utf-8', newline='') as file:
        csv_reader = csv.DictReader(file)
        for row in csv_reader:
            # DictReader pads short rows with real None values; the dataset
            # itself marks missing values with the string 'None'. Skip both.
            if any(value is None or value == 'None' for value in row.values()):
                continue

            track_data.append({field: row[field] for field in exported_fields})

    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump({'track_data': track_data}, json_file, indent=4)





# Example usage: both paths are relative, so they resolve against the
# notebook's current working directory.
json_file_path = 'processed_songs_2w_gpt.json'
csv_file_path = 'processed_songs_2w.csv'
convert_csv_to_json(csv_file_path=csv_file_path, json_file_path=json_file_path)
