In [1]:
import os
import re
import string
import random

NEWLINE_TOKEN = 'newlinetoken'

In [2]:
corpus = []

base_dir = 'data'

for artist in os.listdir(base_dir):
    for file in os.listdir(f'{base_dir}/{artist}'):
        with open(f'{base_dir}/{artist}/{file}', 'rb') as f:
            text = f.read().decode('utf-8')
            text = re.sub(r'\[(.+)\]', ' ', text)
            text = text.replace('\n', f' {NEWLINE_TOKEN} ')
            text = ' '.join(text.split())
            text = text.lower()
            text = text.translate(str.maketrans('', '', string.punctuation))
        corpus.extend(text.split())

i = 0
while i < len(corpus)-1:
    if corpus[i] == corpus[i+1] and corpus[i] == NEWLINE_TOKEN:
        del corpus[i]
    else:
        i += 1
if corpus[0] == NEWLINE_TOKEN:
    del corpus[0]

In [3]:
len(corpus)

5658

In [4]:
len(set(corpus))

782

In [5]:
class Ngram(object):
    def __init__(self, words):
        self.words = words
    
    def __str__(self):
        return ', '.join(self.words)

    def __repr__(self):
        return self.__str__()

    def __hash__(self):
        return hash(tuple(self.words))

In [6]:
class Vertex(object):
    def __init__(self, ngram):
        self.ngram = ngram
        self.adjacent = {}

    @property
    def neighbors(self):
        return [neighbor for (neighbor, _) in self.adjacent.items()]

    @property
    def neighbors_weights(self):
        return [weight for (_, weight) in self.adjacent.items()]
    
    def increment_edge(self, vertex):
        self.adjacent[vertex] = self.adjacent.get(vertex, 0) + 1

    def next_vertex(self):
        return random.choices(self.neighbors, weights=self.neighbors_weights)[0]

    def __str__(self):
        return f'Vertex: {self.ngram}'
  
    def __repr__(self):
        return self.__str__()

In [7]:
class Graph(object):
    def __init__(self):
        self.vertices = {}

    def get_vertex(self, words):
        ngram = Ngram(words)
        if ngram.__str__() not in self.vertices:
            self.vertices[ngram.__repr__()] = Vertex(ngram)
        return self.vertices[ngram.__repr__()]
  
    def get_next_vertex(self, vertex):
        return self.vertices[vertex.ngram.__repr__()].next_vertex()

In [8]:
ORDER = 2

In [9]:
graph = Graph()
prev_words = None

for i in range(len(corpus)):
    words = corpus[i:i+ORDER]
    words_vertex = graph.get_vertex(words)

    if prev_words:
        prev_words.increment_edge(words_vertex)

    prev_words = words_vertex

In [10]:
len([vertex for vertex in graph.vertices.values()])

2119

In [11]:
def compose(graph, seed=None, length=100):
    if seed:
        seed = seed.split()
    else:
        random_index = random.randint(0, len(corpus) - ORDER)
        seed = corpus[random_index:random_index + ORDER]
    
    assert len(seed) == ORDER

    composition = seed
    vertex = graph.get_vertex(seed)
    for _ in range(length):
        vertex = graph.get_next_vertex(vertex)
        composition.append(vertex.ngram.words[-1])

    return composition

In [12]:
composition = compose(graph)
for word in composition:
    if word == NEWLINE_TOKEN:
        print()
    else:
        print(word, end=' ')

de miedo con locura 
y no puedo ocultar que te mueva 
intenta verme y ponte a prueba 
pues tu pronostico es estar contigo 
y aunque lo hagas sin ganas 
yo te amare ladrona 
aunque hay suficientes heridos 
habra una cancion 
al montarme en ese avion 
y no te debo nada oh 
sabiendo que tus besos matan morire de amor 
oh oh 
tambien escondido quedo tu recuerdo que se hiciera eterno 
desde el momento en que vi tu mirada 
porque yo 
en un solo baile te entregue mi 