In [1]:
import logging
# Make sure the root logger has a handler to attach the formatter to.
logging.basicConfig()

# Root logger, emitting everything from DEBUG upwards.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Timestamped, column-aligned records:
# "<date> <time> <AM/PM> <logger name> <level> <message>".
formatter = logging.Formatter(
    fmt='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
)
logger.handlers[0].setFormatter(formatter)

logger.info('Hello world!')

12/08/2021 05:04:06 PM root         INFO     Hello world!


In [2]:
import os
import sys
# Record which machine produced the outputs below (network node name from uname).
logger.debug('Running on {}!'.format(os.uname().nodename))

12/08/2021 05:04:08 PM root         DEBUG    Running on MightyWalrus!


In [3]:
# Candidate locations of the project's local modules, in priority order.
root_dirs=[
    './lib/',
]
# Use the first candidate that exists on this machine.
for root_dir_candidate in root_dirs:
    if os.path.exists(root_dir_candidate):
        root_dir=root_dir_candidate
        break
else:
    # No candidate exists: fail with a clear message instead of a NameError on
    # `root_dir` below (fresh kernel) or silently reusing a stale `root_dir`
    # left over from an earlier run (dirty kernel).
    raise FileNotFoundError(
        'None of the candidate code directories exists: {}'.format(root_dirs))
logger.debug('Adding code at {} to the path...'.format(root_dir))
# Put the directory first so its modules take precedence when imported.
sys.path.insert(0, root_dir)

12/08/2021 05:04:19 PM root         DEBUG    Adding code at ./lib/ to the path...


In [5]:
import itertools
import collections
import pandas as pd
import parallelizer

In [22]:
# Bookmark-list length window: lists shorter than min_bookmarks or longer than
# max_bookmarks are dropped when the bookmarks file is loaded.
min_bookmarks, max_bookmarks = 5, 250
# Co-occurrence threshold: the export keeps only pairs whose count is strictly
# greater than this value.
min_num_pairs = 1

# Input file (one bookmark list per row) and output file (one song pair per row).
bookmarks_file = './src/data/bookmarks.txt'
likeness_file = './src/data/likeness.txt'
(bookmarks_file, likeness_file)

('./src/data/bookmarks.txt', './src/data/likeness.txt')

In [16]:
%%time
# Parse one bookmark list per row. The first comma-separated field is discarded
# (presumably a user id -- confirm against the data file); the remaining fields
# are the bookmarked items.
bookmarks_lst=[]
with open(bookmarks_file,'r') as f:
    for row in f:  # stream the file rather than materialising it with readlines()
        _,_,bookmarks_field=row.strip().partition(',')
        if not bookmarks_field:
            # Blank line, or nothing after the first comma: there is no bookmark
            # to keep. (Indexing `split(',',1)[1]` would raise IndexError on a
            # comma-less row and yield a bogus [''] list for a trailing comma.)
            continue
        bookmarks=bookmarks_field.split(',')
        # Keep only lists whose length lies inside the configured window.
        if min_bookmarks<=len(bookmarks)<=max_bookmarks:
            bookmarks_lst.append(bookmarks)
# Shortest lists first; the sort is stable, so file order is kept among ties.
bookmarks_lst=sorted(bookmarks_lst,key=len)
len(bookmarks_lst)

CPU times: user 4.27 ms, sys: 0 ns, total: 4.27 ms
Wall time: 2.81 ms


72

### Raw processing

In [17]:
%%time
# Every unordered pair of items bookmarked together, across all bookmark lists.
# Sorting each list first gives every pair a canonical (smaller, larger) order,
# so the same pair coming from different lists is counted under a single key.
song_pairs = list(itertools.chain.from_iterable(
    itertools.combinations(sorted(user_bookmarks), 2)
    for user_bookmarks in bookmarks_lst
))
song_pair_counts = collections.Counter(song_pairs)
song_pair_counts.most_common(5)

CPU times: user 38.5 ms, sys: 4.23 ms, total: 42.7 ms
Wall time: 41.6 ms


[(('570', '9d7'), 14),
 (('2144', '570'), 11),
 (('2087', '570'), 10),
 (('2144', '8553'), 10),
 (('570', '8553'), 10)]

In [18]:
%%time
# Export "first,second,count" rows for the pairs seen more than min_num_pairs times.
with open(likeness_file,'w') as f:
    f.writelines(
        '{},{},{}\n'.format(first, second, count)
        for (first, second), count in song_pair_counts.items()
        if count > min_num_pairs
    )

CPU times: user 27.5 ms, sys: 2.69 ms, total: 30.2 ms
Wall time: 28.8 ms


### Parallelized, with Counters

In [19]:
%%time
def ct_process_user_bookmarks(bookmarks):
    """Return all 2-item combinations of one bookmark list.

    Args:
        bookmarks: iterable of sortable items (one user's bookmarks).

    Returns:
        A list of 2-tuples drawn from the sorted bookmarks, so each tuple is in
        ascending order. Fewer than two bookmarks gives an empty list.
    """
    ordered = sorted(bookmarks)
    pairs = itertools.combinations(ordered, 2)
    return list(pairs)
# Run the per-list pair generation through the project's parallelizer helper.
# NOTE(review): `out` is assumed to be an iterable holding one list of pairs per
# entry of bookmarks_lst -- confirm against lib/parallelizer.
out = parallelizer.parallelize(
    parallelizer.function_wrapper,
    bookmarks_lst,
    (ct_process_user_bookmarks,),
    threads=4,
    timer_step=10,
)
# Flatten the per-list results into one list of pairs and count them in one go.
ct_song_pairs = list(itertools.chain.from_iterable(out))
ct_song_pair_counts = collections.Counter(ct_song_pairs)
ct_song_pair_counts.most_common(5)

CPU times: user 61 ms, sys: 4.66 ms, total: 65.7 ms
Wall time: 122 ms


[(('570', '9d7'), 14),
 (('2144', '570'), 11),
 (('2087', '570'), 10),
 (('2144', '8553'), 10),
 (('570', '8553'), 10)]

### Parallelized, with DataFrames

In [20]:
%%time
def pd_process_user_bookmarks(bookmarks):
    """Count the 2-item combinations of one bookmark list.

    Args:
        bookmarks: iterable of sortable, hashable items (one user's bookmarks).

    Returns:
        A collections.Counter keyed by ascending 2-tuples taken from the sorted
        bookmarks. Fewer than two bookmarks gives an empty Counter.
    """
    pair_counts = collections.Counter()
    pair_counts.update(itertools.combinations(sorted(bookmarks), 2))
    return pair_counts
# Run the per-list pair counting through the project's parallelizer helper.
# NOTE(review): `out` is assumed to hold one Counter per entry of bookmarks_lst --
# confirm against lib/parallelizer.
out=parallelizer.parallelize(parallelizer.function_wrapper,bookmarks_lst,(pd_process_user_bookmarks,),
                             threads=4,timer_step=10)
# One single-column frame per Counter (index: song pair, value: count).
per_user_frames=[pd.DataFrame.from_dict(o,orient='index') for o in out]
# Stack the frames, move the pair index into a column and name both columns.
# `set_axis` already returns a new frame; its `inplace` keyword was deprecated in
# pandas 1.5 and removed in 2.0, so passing it raises TypeError on current pandas.
pd_out=pd.concat(per_user_frames).reset_index().set_axis(['song_pair','counts'],axis=1)
# Sum the per-list counts of each pair to get the overall co-occurrence count.
pd_song_pair_counts=pd_out.groupby('song_pair').sum()
pd_song_pair_counts.sort_values('counts',ascending=False).head()

CPU times: user 279 ms, sys: 35.1 ms, total: 314 ms
Wall time: 318 ms


Unnamed: 0_level_0,counts
song_pair,Unnamed: 1_level_1
"(570, 9d7)",14
"(2144, 570)",11
"(570, 5fc)",10
"(2144, 8553)",10
"(2087, 570)",10


### So...

Fancy in this case is not faster.

In any case, with 16 GB of RAM one cannot build the full intermediate list of pairs for more than ~25000 users; a more map-reduce-like strategy would be needed. One can code it on a single node by accumulating song pairs user by user, but it becomes slow as the number of accumulated pairs increases.