# Projeto final

Este projeto tem como objetivo desenvolver uma solução para o desafio [RecSys Challenge 2018](http://www.recsyschallenge.com/2018/)

Disciplina: EEL 410250 - Aprendizado de Máquina<br>
Aluno: Gustavo de Paula Santos<br>
Matrícula: 19100833<br>

In [1]:
import csv
import json
import os

import autorootcwd
import ijson
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from IPython.display import clear_output
from torch.utils.data import DataLoader, Dataset

In [2]:
class MPDDataset(Dataset):
    def __init__(self, json_dir):
        self.json_files = [os.path.join(json_dir, f) for f in os.listdir(json_dir) if f.endswith('.json')]
        self.file_indices = []
        self._index_files()

    def _index_files(self):
        """Create an index of files and their respective playlist positions"""
        for file_idx, json_file in enumerate(self.json_files):
            with open(json_file, 'r') as f:
                data = json.load(f)
                for playlist_idx, playlist in enumerate(data['playlists']):
                    self.file_indices.append((file_idx, playlist_idx))

    def __len__(self):
        return len(self.file_indices)

    def __getitem__(self, idx):
        file_idx, playlist_idx = self.file_indices[idx]
        json_file = self.json_files[file_idx]
        with open(json_file, 'r') as f:
            data = json.load(f)
            playlist = data['playlists'][playlist_idx]
            return self._process_playlist(playlist)

    def _process_playlist(self, playlist):
        playlist_id = playlist['pid']
        playlist_name = playlist['name']
        tracks = playlist['tracks']
        processed_tracks = []
        for track in tracks:
            track_id = track['track_uri']
            track_name = track['track_name']
            artist_id = track['artist_uri']
            artist_name = track['artist_name']
            processed_tracks.append({
                'track_id': track_id,
                'track_name': track_name,
                'artist_id': artist_id,
                'artist_name': artist_name,
            })
        return {
            'playlist_id': playlist_id,
            'playlist_name': playlist_name,
            'tracks': processed_tracks
        }

In [2]:
class million_playlist_dataset(torch.utils.data.Dataset):
    def __init__(self, features):
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx]

In [3]:
def preprocess_and_save(data, output_file):
    with open(output_file, 'w', newline='') as csvfile:
        fieldnames = ['playlist_id', 'playlist_name', 'track_id', 'track_name', 'artist_id', 'artist_name']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for playlist in data:
            playlist_id = playlist['pid']
            playlist_name = playlist['name']
            tracks = playlist['tracks']
            for track in tracks:
                track_id = track['track_uri']
                track_name = track['track_name']
                artist_id = track['artist_uri']
                artist_name = track['artist_name']
                writer.writerow({
                    'playlist_id': playlist_id,
                    'playlist_name': playlist_name,
                    'track_id': track_id,
                    'track_name': track_name,
                    'artist_id': artist_id,
                    'artist_name': artist_name,
                })

In [4]:
def stream_json(file_path):
    with open(file_path, 'r') as file:
        objects = ijson.items(file, 'playlists.item')
        for obj in objects:
            yield obj

In [5]:
folder_path = "../spotify_million_playlist_dataset/data/"
files = os.listdir(folder_path)
files = [file for file in files if file.endswith(".json")]
files = sorted(files)

In [8]:
for file in tqdm(files):
    print(f"Processing file: {file}")
    filepath = os.path.join(folder_path, file)
    output_csv_file = os.path.join( "data/", file.replace(".json", ".csv"))
    
    data_generator = stream_json(filepath)
    preprocess_and_save(data_generator, output_csv_file)
    print(f"Done processing file: {file}")
    clear_output(wait=True)

100%|██████████| 1000/1000 [15:46<00:00,  1.06it/s]


In [4]:
# Usage example
json_dir = '../spotify_million_playlist_dataset/data/'
dataset = MPDDataset(json_dir)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

KeyboardInterrupt: 

In [19]:
len(dataloader) *32

1000000

In [17]:
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)
dataloader.dataset[0]

{'playlist_id': 549000,
 'playlist_name': 'Bob Dylan',
 'tracks': [{'track_id': 'spotify:track:6QHYEZlm9wyfXfEM1vSu1P',
   'track_name': 'Boots of Spanish Leather',
   'artist_id': 'spotify:artist:74ASZWbe4lXaubB36ztrGX',
   'artist_name': 'Bob Dylan'},
  {'track_id': 'spotify:track:3RkQ3UwOyPqpIiIvGVewuU',
   'track_name': 'Mr. Tambourine Man',
   'artist_id': 'spotify:artist:74ASZWbe4lXaubB36ztrGX',
   'artist_name': 'Bob Dylan'},
  {'track_id': 'spotify:track:0ju1jP0cSPJ8tmojYBEI89',
   'track_name': "Danny's Song",
   'artist_id': 'spotify:artist:7emRV8AluG3d4e5T0DZiK9',
   'artist_name': 'Loggins & Messina'},
  {'track_id': 'spotify:track:7ny2ATvjtKszCpLpfsGnVQ',
   'track_name': "A Hard Rain's A-Gonna Fall",
   'artist_id': 'spotify:artist:74ASZWbe4lXaubB36ztrGX',
   'artist_name': 'Bob Dylan'},
  {'track_id': 'spotify:track:18GiV1BaXzPVYpp9rmOg0E',
   'track_name': "Blowin' In the Wind",
   'artist_id': 'spotify:artist:74ASZWbe4lXaubB36ztrGX',
   'artist_name': 'Bob Dylan'},
  {