In [1]:
# IPython magic: set the Application log level to ERROR.
%config Application.log_level="ERROR"
import logging

# Configure the root logger with a timestamped, level-aligned line format.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)-8s %(message)s',
)

# Root logger used by every cell below; its level is then raised to ERROR,
# so the info/debug calls in the scraping cells stay silent.
logger = logging.getLogger()
logger.setLevel(logging.ERROR)

In [2]:
import requests
import time
import json
import itertools
import random

In [3]:
def get_rest_with_wait(url,wait_time=1,timeout=30):
    """Sleep for ``wait_time`` seconds, GET ``url`` and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Address to request.
    wait_time : float
        Seconds to sleep before the request is sent (throttles successive calls).
    timeout : float or tuple or None
        Passed to ``requests.get``. Without a timeout a stalled connection
        blocks forever; pass ``None`` to get that unbounded wait back.

    Returns
    -------
    Whatever ``response.json()`` decodes from the body.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when ``timeout`` expires.
    ValueError
        When the body is not valid JSON.
    """
    logger.debug('Grabbing '+url+'...')
    time.sleep(wait_time)
    return requests.get(url,timeout=timeout).json()

In [4]:
# URL templates; the {placeholders} are filled in with str.format by the scraping cells.

# Member listing: {cnt} entries per page, page number {pag}.
users = (
    'https://bsaber.com/wp-json/buddypress/v1/members'
    '?per_page={cnt}&page={pag}'
)

# Songs bookmarked by {usr}: {cnt} entries per page, page number {pag}.
songs_bookmarked_by = (
    'https://bsaber.com/wp-json/bsaber-api/songs/'
    '?bookmarked_by={usr}&count={cnt}&page={pag}'
)

# Ratings of the song identified by {song_key}.
song_ratings = (
    'https://bsaber.com/wp-json/bsaber-api/songs/'
    '{song_key}/ratings'
)

In [21]:
# --- Throttling -------------------------------------------------------------
wait = 0.5                    # seconds slept before every request

# --- Users ------------------------------------------------------------------
usr_cnt = 50                  # users requested per page
usr_offs = 1
usr_pags = 5                  # how many pages are sampled
max_usr_pags = 4500           # upper bound of the page range that is sampled from
users_file = './src/data/users.txt'

# --- Bookmarks --------------------------------------------------------------
bkm_cnt = 200                 # bookmarked songs requested per page
bookmarks_file = './src/data/bookmarks.txt'

# --- Songs ------------------------------------------------------------------
drop_song_keys = ['song_key', 'hash']   # fields left out of the stored song metadata
songs_file = './src/data/songs.txt'
songs_enhanced_file = './src/data/songs_enhanced.txt'

# Echo the main output paths as the cell result.
(users_file, bookmarks_file, songs_file)

('./src/data/users.txt', './src/data/bookmarks.txt', './src/data/songs.txt')

In [9]:
# Scrape a random sample of member-list pages and append every not-yet-seen
# user as an "id,name" row to users_file.
users_set=set()  # names collected during this run; iterated by the bookmarks cell
pag=0
aux=''  # last progress line; pre-set so the final padding works when no page is visited
print(time.strftime('%Y%m%d-%H%M%S',time.localtime())+' - Started scrapping users...')
# Sample from usr_offs upwards instead of from 0: the WordPress REST API numbers
# collection pages from 1, so page 0 is not a valid request.
for page in random.sample(range(usr_offs,max_usr_pags+usr_offs),usr_pags):
    pag=pag+1
    logger.info('Working on page {} ({} of {})...'.format(page,pag,usr_pags))
    aux=time.strftime('%Y%m%d-%H%M%S',time.localtime())+' - Working on page {} ({} of {})...'.format(page,pag,usr_pags)
    print(aux,end = "\r")
    url=users.format(cnt=usr_cnt,pag=page)
    try:
        chunk=get_rest_with_wait(url,wait)
    except Exception as err:
        # Skip the page: falling through would either hit an undefined `chunk`
        # (first iteration) or silently re-process the previous page's users.
        logger.error('Skipping page {}: {}'.format(page,err))
        continue
    if not isinstance(chunk,list):
        # An error payload is a JSON object, not a list of users; iterating it
        # would walk over its keys and fail on user['id'].
        logger.error('Skipping page {}: unexpected response {!r}'.format(page,chunk))
        continue
    new_users=[(user['id'],user['name']) for user in chunk if user['name'] not in users_set]
    with open(users_file,'a+') as f:
        logger.debug('Adding {} users to {}...'.format(len(new_users),users_file))
        f.writelines(['{},{}\n'.format(user[0],user[1]) for user in new_users])
    users_set.update([user[1] for user in new_users])
# Overwrite the last "\r" progress line, padding with blanks to erase its tail.
old_len=len(aux)
aux=time.strftime('%Y%m%d-%H%M%S',time.localtime())+' - Done scrapping users!'
print(aux+' '*(old_len-len(aux)))

20211208-163349 - Started scrapping users...
20211208-163440 - Done scrapping users!           


In [10]:
# For every scraped user, page through their bookmarked songs, append one
# "user,key1,key2,..." row to bookmarks_file and one "key,<json>" row per
# newly seen song to songs_file.
songs_set=set()  # song keys already written to songs_file during this run
usr=0
usrs=len(users_set)
old_len=0
print(time.strftime('%Y%m%d-%H%M%S',time.localtime())+' - Started scrapping bookmarks...')
for user in users_set:
    usr=usr+1
    logger.info('Working on user {} of {} - {}...'.format(usr,usrs,user))
    aux=time.strftime('%Y%m%d-%H%M%S',time.localtime())+' - Working on user {} of {} - {}...'.format(usr,usrs,user)
    # Pad with blanks so a shorter line fully overwrites the previous "\r" line.
    print(aux+' '*(old_len-len(aux)),end="\r")
    old_len=len(aux)
    bookmarks_lst=[]       # this user's bookmarked keys, in the order received
    bookmarks_seen=set()   # same keys, for O(1) duplicate checks (also within one chunk)
    new_songs={}           # key -> metadata for songs not yet in songs_set
    pag=1
    while pag:
        logger.debug('Working on page {} of user {}...'.format(pag,user))
        url=songs_bookmarked_by.format(usr=user,cnt=bkm_cnt,pag=pag)
        try:
            chunk=get_rest_with_wait(url,wait)
            for song in chunk['songs']:
                song_key=song['song_key']
                if song_key=='':
                    continue
                if song_key not in bookmarks_seen:
                    bookmarks_seen.add(song_key)
                    bookmarks_lst.append(song_key)
                if song_key not in songs_set:
                    new_songs[song_key]={key:song[key] for key in song if key not in drop_song_keys}
            # A falsy next_page ends the loop.
            pag=chunk['next_page']
        except Exception as err:
            # Best effort: keep what was collected so far for this user and move on.
            # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt through.
            logger.error('Stopping at page {} of user {}: {}'.format(pag,user,err))
            pag=None
    if len(bookmarks_lst)>0:
        with open(bookmarks_file,'a+') as f:
            logger.debug('Adding {}\'s {} bookmarks to {}...'.format(user,len(bookmarks_lst),bookmarks_file))
            f.write('{},{}\n'.format(user,','.join(bookmarks_lst)))
        with open(songs_file,'a+') as f:
            logger.debug('Adding {} new songs to {}...'.format(len(new_songs),songs_file))
            f.writelines(['{},{}\n'.format(new_song,json.dumps(new_songs[new_song])) for new_song in new_songs])
        songs_set.update(new_songs.keys())

aux=time.strftime('%Y%m%d-%H%M%S',time.localtime())+' - Done scrapping bookmarks!'
print(aux+' '*(old_len-len(aux)))

20211208-164355 - Started scrapping bookmarks...
20211208-165205 - Done scrapping bookmarks!                               


### Quality control

In [11]:
# Read bookmarks_file back into {user: set of song keys}. Each row is
# "user,key1,key2,...": the first comma separates the user from the keys.
with open(bookmarks_file,'r') as f:
    split_rows=(row.strip().split(',',1) for row in f)
    bookmarks_dict={parts[0]:set(parts[1].split(',')) for parts in split_rows}
len(bookmarks_dict)

103

In [12]:
# Users whose bookmark set consists solely of the empty key.
[user for user,keys in bookmarks_dict.items() if keys=={''}]

[]

In [13]:
# Users whose bookmark set contains the empty key at all.
[user for user,keys in bookmarks_dict.items() if '' in keys]

[]

In [19]:
# Read songs_file back into {song key: metadata dict}. Each row is
# "key,<json>": only the first comma is a separator, the rest is the JSON payload.
with open(songs_file,'r') as f:
    split_rows=(row.strip().split(',',1) for row in f)
    songs_dict={parts[0]:json.loads(parts[1]) for parts in split_rows}
len(songs_dict)

3494

In [20]:
# Sanity check: True would mean a song was stored under an empty key.
'' in songs_dict

False

### Enhance songs with ratings (takes forever: one request per song)

In [None]:
# For every song in songs_file, fetch its ratings, merge them into the stored
# metadata and append the result as a "key,<json>" row to songs_enhanced_file.
print(time.strftime('%Y%m%d-%H%M%S',time.localtime())+' - Started enhancing songs metadata...')
# Count the rows up front for the progress line; the context manager closes
# the extra handle instead of leaking it.
with open(songs_file,'r') as f_count:
    sngs=sum(1 for _row in f_count)
with open(songs_file,'r') as f_songs:
    sng=0
    old_len=0
    for line in f_songs:
        sng=sng+1
        # Only the first comma is a separator; the remainder is the JSON payload.
        song_key,meta=line.strip().split(',',1)
        meta=json.loads(meta)
        logger.info('Working on song {} of {} - {}...'.format(sng,sngs,meta['title']))
        aux=time.strftime('%Y%m%d-%H%M%S',time.localtime())+' - Working on song {} of {} - {}...'.format(sng,sngs,meta['title'])
        # Pad with blanks so a shorter line fully overwrites the previous "\r" line.
        print(aux+' '*(old_len-len(aux)),end="\r")
        old_len=len(aux)
        url=song_ratings.format(song_key=song_key)
        try:
            meta.update(get_rest_with_wait(url,wait))
        except Exception as err:
            # One failed request must not abort a run that takes this long:
            # report the song and carry on with the next one.
            logger.error('Skipping ratings for song {}: {}'.format(song_key,err))
            continue
        # Re-opened per song so every finished row is on disk if the run is interrupted.
        with open(songs_enhanced_file,'a+') as f_enhc:
            logger.debug('Adding {} to {}...'.format(meta['title'],songs_enhanced_file))
            f_enhc.write('{},{}\n'.format(song_key,json.dumps(meta)))
aux=time.strftime('%Y%m%d-%H%M%S',time.localtime())+' - Done enhancing song metadata!'
print(aux+' '*(old_len-len(aux)))