In [2]:
import pyspark
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql import SparkSession

spark = SparkSession.builder \
         .master("local[16]") \
         .appName("Exp") \
         .getOrCreate()

spark.conf.set("spark.executor.memory", "8g")
spark.conf.set("spark.driver.memory", "8g")
spark.conf.set("spark.driver.maxResultSize", "4g")
spark.conf.set("spark.sql.broadcastTimeout", "900")

In [17]:
### GENRE DATASET

df = spark.read.json('json/balanced_track_with_genre.json')
df = df.withColumnRenamed("track_id", "MSD_track_id")
df = df.withColumn("MSD_track_id", sf.substring(sf.col("MSD_track_id"), 0, 17))

df.show(5)

+-------+-----------------+
|  genre|     MSD_track_id|
+-------+-----------------+
|hip-hop|TRZWOFZ12903CEB1D|
|hip-hop|TRKLOMN12903CE2A6|
|hip-hop|TRAETOX128F933AD7|
|hip-hop|TRLURAC128F933AD7|
|hip-hop|TRJSOGI128F933AD8|
+-------+-----------------+
only showing top 5 rows



In [18]:
from pyspark.sql import functions as sf

base_path = "/nfs/msd/mp3/MSD/audio/"

path_map = spark.read.csv("../msd/mp3/MSD/dataset_annotation.csv", header="true")
path_map = path_map.withColumnRenamed("id", "MSD_track_id")

path_map = path_map.withColumn("path", sf.concat(sf.lit(base_path), 
                                                 sf.lit(sf.col('filename').substr(1,1)),
                                                 sf.lit("/"),
                                                 sf.lit(sf.col('filename').substr(2,1)),
                                                 sf.lit("/"),
                                                 sf.col('filename')))


path_map = path_map.repartition(80)

In [20]:
df_joined = df.join(path_map, "MSD_track_id")

df_joined.show(5)

+-----------------+--------------------+----------------+------------------+--------------------+
|     MSD_track_id|               genre|        filename|             label|                path|
+-----------------+--------------------+----------------+------------------+--------------------+
|TRYHFCD128E07826A|classic pop and rock|  11146.clip.mp3|ARD3LXU1187B9ABFC5|/nfs/msd/mp3/MSD/...|
|TRYHFCD128E07826A|                punk|  11146.clip.mp3|ARD3LXU1187B9ABFC5|/nfs/msd/mp3/MSD/...|
|TRTFUBW128F42958F|classic pop and rock|1141957.clip.mp3|ARLH9TN1187B98E170|/nfs/msd/mp3/MSD/...|
|TRMXDKU128F428F27|      jazz and blues|1101597.clip.mp3|ARU1K2U1187FB48529|/nfs/msd/mp3/MSD/...|
|TREWNVA128F429CD1|dance and electro...|1858978.clip.mp3|ARTTHF71187B9895A0|/nfs/msd/mp3/MSD/...|
+-----------------+--------------------+----------------+------------------+--------------------+
only showing top 5 rows



In [21]:
from source.sound_transforms import log_mel_spectrogram
from pyspark.sql.functions import udf, array, struct
from pyspark.sql.types import *
import librosa
import lmdb


def write_to_db(row):
    path = row["path"]
    msd_id = row["MSD_track_id"].encode()
    
    env = lmdb.open('data/MSD/MSD_ID_to_log_mel_spectrogram.lmdb', subdir=False,
            map_size=1e12 )
    
    with env.begin(write=False) as txn:
        data = txn.get(msd_id)
    
    if data is None:
        try:
            x, sr = librosa.load(path)
        except:
            return "error"
        features = log_mel_spectrogram(x,sr)

        with env.begin(write=True) as txn:
            txn.put(msd_id, features.tobytes())
            
        return 1
    
    return 0

to_db_udf = udf(write_to_db, returnType=ArrayType(FloatType()))

In [None]:
df_written = df_joined.withColumn("written", to_db_udf(struct("*")))

df_written.collect()
# df_written.groupBy('written').count().show()

env = lmdb.open('data/MSD/MSD_ID_to_log_mel_spectrogram.lmdb', subdir=False,
                map_size=1e12 )

env.stat()

In [3]:
mtat = spark.read.option("sep", "\t").csv('/nfs/subtasks/MagnaTagATune/annotations_final.csv', header="true")

#mtat.groupBy('no voice').count().show()
mtat.show(5)

+-------+--------+------+----+--------+---------+-----+------+-----------+--------------+--------+-----+------+------------+----------+------+--------+-----+-------+-----+---+--------+-----+----------+------+-------+--------+-----+---------+------+---------+---------+-----+------+----+------+-----+-------+-----+--------+----+-------+----+------------+----+-----+-------+-------+-----+----+----------+--------------+-------+----------+----+----+-----+----------+-----+-----+-------+----+----------+--------+----------+----+--------+--------+----+-------+------+-----+----------+-------------+--------+---------+--------+---------+-----+--------+-------+------+-----+-----+-------+----+---------------+---------------+-----------+-----------+----------------+-------+------+-------+------+------+----+----------+---------+-----+-----------+----+--------+------+---+---+------------+------+-----+-------+-----+-----------+-----------+------+-----+-----+---+----+---------+----------+----+----+--------

In [39]:
base_path = "../subtasks/MagnaTagATune/mp3/"
full_path = mtat.withColumn("path", sf.concat(sf.lit(base_path), sf.col('mp3_path')))

full_path.count()

25863

In [41]:
from source.datasets.sound_transforms import log_mel_spectrogram
from pyspark.sql.functions import udf, array, struct
from pyspark.sql.types import *
import librosa
import lmdb


def write_to_db(row):
    path = row["path"]
    clip_id = row["clip_id"].encode()
    
    env = lmdb.open('data/MTAT/clip_id_to_log_mel_spectrogram.lmdb', subdir=False,
            map_size=1e12 )
    
    with env.begin(write=False) as txn:
        data = txn.get(clip_id)
    
    if data is None:
        try:
            x, sr = librosa.load(path)
        except:
            return "error"
        features = log_mel_spectrogram(x,sr)

        with env.begin(write=True) as txn:
            txn.put(clip_id, features.tobytes())
            
        return 1
    
    return 0

to_db_udf = udf(write_to_db, returnType=ArrayType(FloatType()))

In [42]:
full_path_written = full_path.withColumn("written", to_db_udf(struct("*")))
full_path_written.collect()

env = lmdb.open('data/MTAT/clip_id_to_log_mel_spectrogram.lmdb', subdir=False,
                map_size=1e12 )
print("done")

env.stat()

done


{'psize': 4096,
 'depth': 3,
 'branch_pages': 3,
 'leaf_pages': 275,
 'overflow_pages': 9413040,
 'entries': 25860}

In [56]:
import csv
from tqdm import tqdm

ids = set()

for row in csv.reader(open('/nfs/subtasks/MagnaTagATune/annotations_final.csv', 'r'), delimiter='\t'):
    
    ids.update([row[0]])    
    
len(ids)

env = lmdb.open('data/MTAT/clip_id_to_log_mel_spectrogram.lmdb', subdir=False,
            map_size=1e12 )

for clip_id in tqdm(ids):
    with env.begin(write=False) as txn:
        data = txn.get(clip_id.encode())
        
    if data is None:
        print(clip_id)

  9%|▊         | 2235/25864 [00:01<00:13, 1732.87it/s]

35644


 46%|████▌     | 11942/25864 [00:06<00:07, 1770.92it/s]

57881


 61%|██████    | 15698/25864 [00:08<00:05, 1787.33it/s]

clip_id


 74%|███████▍  | 19079/25864 [00:10<00:03, 1744.08it/s]

55753


100%|██████████| 25864/25864 [00:14<00:00, 1741.68it/s]


In [57]:
ids.remove("35644")
ids.remove("57881")
ids.remove("55753")

In [58]:


len(ids)

25861