In [102]:
import json
import string
import nltk
from multiprocessing.pool import Pool
from functools import reduce
from collections import Counter
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize

In [103]:
# Tokens to throw away after tokenizing: every ASCII punctuation character,
# plus the multi-character tokens nltk's word_tokenize produces for double
# quotes ("``" for an opening quote, "''" for a closing one) and an ellipsis.
punctuation = set(string.punctuation).union({'``', "''", '...'})
# Reach the corpus through the nltk package instead of the imported
# `stopwords` name: this line rebinds `stopwords` to a plain set, so calling
# `stopwords.words` would fail the second time the cell is executed.
stopwords = set(nltk.corpus.stopwords.words('english'))
# For every WordNet noun synset, keep the part of its name before the first '.'.
nouns = set(x.name().split('.', 1)[0] for x in wordnet.all_synsets('n'))

In [104]:
def merge_counters(a, b):
    '''Fold the counts of b into a and return a.

    Impure: a is modified in place through its update method and is the very
    object handed back, which makes this usable as a reduce step.
    NOTE(review): a and b are presumably meant to be distinct objects - confirm.
    '''
    a.update(b)
    return a

def flatten(x):
    '''Lazily yield each element of each inner iterable of x, in order.'''
    for inner in x:
        for item in inner:
            yield item

def strip(string):
    '''Tokenize string with nltk and yield only the tokens worth keeping.

    A token is dropped when it appears in the module-level stopwords set or
    the module-level punctuation set; the rest are yielded in their original
    order. Note that the parameter name hides the string module in here.
    '''
    for token in word_tokenize(string):
        if token in stopwords or token in punctuation:
            continue
        yield token

def strip_no_tokenize(string):
    '''Split string on whitespace and yield the words that are not stopwords.

    No tokenizer is involved, so punctuation stays attached to the words.
    '''
    # TODO: trailing punctuation
    for word in string.split():
        if word not in stopwords:
            yield word

In [120]:
def count_pairs(lyric):
    '''Count the end-word pairs of one song (couplets and AxA rhymes).

    The lyric is split on newlines, every line is tokenized with strip, lines
    left with no tokens are discarded, and each pair produced by
    get_word_pairs is counted as an unordered frozenset.

    Returns a Counter mapping frozenset pairs to occurrence counts.
    '''
    # tokenize each raw line into a concrete list so emptiness can be tested
    tokenized = (list(strip(raw_line)) for raw_line in lyric.split('\n'))
    # drop blank lines and lines made up solely of stopwords/punctuation
    non_empty = (tokens for tokens in tokenized if tokens)
    pair_counts = Counter()
    for pair in get_word_pairs(non_empty):
        pair_counts[frozenset(pair)] += 1
    return pair_counts

def get_word_pairs(lines):
    '''Yield end-word pairs for a song (couplets and AxA rhymes).

    The last word of each line is paired first with the last word of the line
    two positions earlier (AxA) and then with that of the line directly before
    it (couplet). A pair is not produced when both words are equal or when
    either of them is falsy.

    lines: iterable of token sequences, one per lyric line. Empty sequences
        have no last word; they are skipped and do not separate the lines
        around them.

    Yields (earlier_word, last_word) tuples.
    '''
    two_back = one_back = None
    for line in lines:
        if not line:
            # line[-1] would raise IndexError here; ignore the line instead
            continue
        last_word = line[-1]
        if last_word:
            for earlier in (two_back, one_back):
                if earlier and earlier != last_word:
                    yield earlier, last_word
        # slide the two-line window forward
        two_back, one_back = one_back, last_word
        
        

In [162]:
def chunk(l, n):
    """Cut l into consecutive slices of length n and yield them in order.

    The last slice is shorter when len(l) is not a multiple of n.
    """
    total = len(l)
    for start in range(0, total, n):
        stop = start + n
        yield l[start:stop]

def reduce_helper(func, chunk_, init):
    '''Left-fold chunk_ with func, starting from init, and return the result.

    Defined at module level so multi_reduce can pass it to Pool.starmap as the
    per-chunk worker.
    '''
    accumulated = init
    for element in chunk_:
        accumulated = func(accumulated, element)
    return accumulated

def multi_reduce(func, coll, init_func=None, chunksize=None, processes=4):
    '''Reduce coll with func, spreading the work over a process pool.

    coll is cut into at most `processes` contiguous chunks. Each chunk is
    folded in a worker by reduce_helper, then the partial results are folded
    together in the calling process. Because partial results are combined,
    the answer only matches a plain reduce when func is associative.

    func: two-argument reducer. It is sent to the worker processes, so it has
        to be picklable (a module-level function rather than a lambda).
    coll: any iterable; it is materialized as a list unless it already is a
        list or tuple.
    init_func: zero-argument callable returning a fresh initial value. It is
        called once per chunk and once more for the final fold. When None, no
        initial value is used and the first element of each chunk seeds it.
    chunksize: forwarded unchanged to Pool.starmap.
    processes: size of the pool and upper bound on the number of chunks.

    Returns the reduced value; for an empty coll that is init_func().
    Raises TypeError when coll is empty and init_func is None.
    '''
    if not isinstance(coll, (list, tuple)):
        coll = list(coll)
    if not coll:
        # nothing to distribute; mirror functools.reduce on an empty sequence
        if init_func is None:
            raise TypeError('multi_reduce() of empty collection with no init_func')
        return init_func()
    # Ceiling division: the size is never 0 (which range() rejects as a step)
    # and the remainder does not spill into an extra, tiny chunk.
    size = -(-len(coll) // processes)
    if init_func is None:
        args = [(func, part[1:], part[0]) for part in chunk(coll, size)]
    else:
        args = [(func, part, init_func()) for part in chunk(coll, size)]
    with Pool(processes) as pool:
        results = pool.starmap(reduce_helper, args, chunksize)
    if init_func is None:
        return reduce(func, results)
    return reduce(func, results, init_func())

In [166]:
# Worker count shared by the mapping pool and the parallel reduction.
N_PROCESSES = 6

# JSON text is UTF-8; state it explicitly instead of relying on the
# platform's default encoding.
with open('lyrics.json', encoding='utf-8') as f:
    lyric_json = json.load(f)
# Keep only the top-level entries whose value is non-empty.
lyric_dicts = (songs for songs in lyric_json.values() if songs)
# Each kept value is itself a dict whose values are lyric strings
# (presumably artist -> {title: lyric}; verify against the scraper),
# so flatten to a single stream of lower-cased lyrics.
lyrics = (text.lower() for songs in lyric_dicts for text in songs.values())
# One Counter of end-word pairs per song, computed in parallel.
with Pool(N_PROCESSES) as pool:
    counted_lyrics = pool.map(count_pairs, lyrics, chunksize=100)
# Merge the per-song Counters into one overall Counter.
counts = multi_reduce(merge_counters, counted_lyrics, Counter,
                      processes=N_PROCESSES, chunksize=100)

3486
3486
3486
3486
3486
3486
1


In [None]:
# counts = reduce(lambda prev, curr: merge_counters(prev, curr), counted_lyrics, Counter())

In [167]:
# Rank the 1000 most frequent end-word pairs, numbering them from 1.
# most_common(n) picks out just the top n entries rather than sorting and
# enumerating every pair first; ties keep their first-encountered order.
list(enumerate(counts.most_common(1000), 1))

[(1, (frozenset({'go', 'know'}), 1401)),
 (2, (frozenset({'night', 'right'}), 871)),
 (3, (frozenset({'girl', 'world'}), 870)),
 (4, (frozenset({'mind', 'time'}), 839)),
 (5, (frozenset({'baby', 'yeah'}), 761)),
 (6, (frozenset({'baby', 'love'}), 754)),
 (7, (frozenset({'know', 'love'}), 714)),
 (8, (frozenset({'mine', 'time'}), 689)),
 (9, (frozenset({'love', 'yeah'}), 654)),
 (10, (frozenset({'right', 'tonight'}), 637)),
 (11, (frozenset({'oh', 'yeah'}), 626)),
 (12, (frozenset({'say', 'way'}), 599)),
 (13, (frozenset({'baby', 'crazy'}), 594)),
 (14, (frozenset({'away', 'day'}), 576)),
 (15, (frozenset({'bitch', 'shit'}), 567)),
 (16, (frozenset({'right', 'yeah'}), 531)),
 (17, (frozenset({'away', 'say'}), 524)),
 (18, (frozenset({'nigga', 'niggas'}), 516)),
 (19, (frozenset({'life', 'right'}), 512)),
 (20, (frozenset({'enough', 'love'}), 508)),
 (21, (frozenset({'love', 'oh'}), 495)),
 (22, (frozenset({'day', 'way'}), 484)),
 (23, (frozenset({'love', 'time'}), 477)),
 (24, (frozense

In [168]:
# Scratch check: building a Counter from each inner list counts its elements.
[Counter(sample) for sample in [[1], [1]]]

[Counter({1: 1}), Counter({1: 1})]