# Generate Phrases from Blonde

**Part 1: Naive Frequency of words**
- Count Vectorizer, and LSA
- lsa.components_ and lsa.explained_variance_ratio_ matrices 
- generate n words based on relative probabilities

**Part 2: Use Markov Chains to predict continued words**
- Create a markov chain with each word in the corpus
- pick starting word randomly
- generate n-1 words based on markov chain

In [1]:
import pandas as pd

df = pd.read_csv('Frank_Lyrics.csv')
df.head()

Unnamed: 0,track,album,lyrics
0,Strawberry Swing,"Nostalgia, Ultra",when we were kids we handpainted strawberries...
1,Novacane,"Nostalgia, Ultra",i think i started somethin i got what i wante...
2,We All Try,"Nostalgia, Ultra",i believe jehovah jireh i believe theres heav...
3,Bitches Talkin' (Metal Gear Solid),"Nostalgia, Ultra",stop let me go you dont got no jodeci all you...
4,Songs For Women,"Nostalgia, Ultra",when i was younger i used to wonder like if i...


## Using LSA's frequency proportions to predict word accurances 

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer
from sklearn.decomposition import TruncatedSVD # LSA
import numpy as np

class nlp_pipe:
    
    def __init__(self, vectorizer, tokenizer):
        self.tokenizer = tokenizer # groups words
        self.vectorizer = vectorizer #counts words
    
    def fit_transform(self, df):
        documents = df.copy()
        for idx, document in enumerate(documents):
            documents[idx] = self._tok_stem(document)
        doc_vect = self.vectorizer.fit_transform(documents)
        return doc_vect, self.vectorizer.get_feature_names()
    
    # internal -- tokenizes and stems words in a string
    def _tok_stem(self, text_to_fit_on):
        tokenized_text = self.tokenizer.tokenize(text_to_fit_on)
        return ' '.join(tokenized_text)

In [8]:
nlp = nlp_pipe(CountVectorizer(ngram_range=(1, 2)),
               TreebankWordTokenizer())

X = df[df['album'] == 'Blonde'].reset_index()['lyrics']

doc_spars, terms = nlp.fit_transform(X)

doc_term = pd.DataFrame(doc_spars.toarray(), columns=terms)
doc_term

Unnamed: 0,16,16 how,180,180 on,1998,1998 my,20,20 years,2009,2009 aint,...,youre taking,youre tired,yourself,yourself and,yourself be,yourself rely,youve,youve ever,zip,zip down
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,3,1,1,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
comps = 8
lsa = TruncatedSVD(comps)
doc_topic = lsa.fit_transform(doc_term)
topic_percent = lsa.explained_variance_ratio_
topic_percent

array([0.17523092, 0.14285216, 0.11301132, 0.10258867, 0.07568818,
       0.06912978, 0.06380062, 0.05396418])

In [51]:
sum(topic_percent)

0.7962658295562356

In [54]:
index_ls = [f"component_{x+1}" for x in range(comps)]

topic_term_percent = lsa.components_.round(3)

topic_word = pd.DataFrame(topic_term_percent,
             index = index_ls,
             columns = None)
topic_word

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4693,4694,4695,4696,4697,4698,4699,4700,4701,4702
component_1,0.001,0.001,0.004,0.004,0.005,0.005,0.001,0.001,0.004,0.004,...,0.001,0.001,0.002,0.001,0.001,0.001,0.004,0.004,0.004,0.004
component_2,-0.001,-0.001,-0.009,-0.009,0.014,0.014,-0.002,-0.002,-0.009,-0.009,...,-0.001,-0.001,-0.004,-0.001,-0.001,-0.001,-0.009,-0.009,-0.009,-0.009
component_3,0.001,0.001,-0.01,-0.01,-0.003,-0.003,0.002,0.002,-0.01,-0.01,...,0.0,0.001,0.0,0.0,0.0,0.0,-0.01,-0.01,-0.01,-0.01
component_4,0.002,0.002,-0.008,-0.008,-0.004,-0.004,-0.002,-0.002,-0.008,-0.008,...,0.0,0.002,0.0,0.0,0.0,0.0,-0.008,-0.008,-0.008,-0.008
component_5,-0.0,-0.0,-0.0,-0.0,-0.003,-0.003,-0.004,-0.004,-0.0,-0.0,...,-0.001,-0.0,-0.002,-0.001,-0.001,-0.001,-0.0,-0.0,-0.0,-0.0
component_6,0.002,0.002,-0.005,-0.005,0.0,0.0,0.002,0.002,-0.005,-0.005,...,0.002,0.002,0.005,0.002,0.002,0.002,-0.005,-0.005,-0.005,-0.005
component_7,0.003,0.003,-0.004,-0.004,-0.0,-0.0,-0.0,-0.0,-0.004,-0.004,...,0.001,0.003,0.002,0.001,0.001,0.001,-0.004,-0.004,-0.004,-0.004
component_8,0.0,0.0,-0.002,-0.002,0.001,0.001,-0.0,-0.0,-0.002,-0.002,...,0.002,0.0,0.007,0.002,0.002,0.002,-0.002,-0.002,-0.002,-0.002


In [68]:
### topic_percent
### topic_term_percent
### terms

# function takes in weights and number of words to return, and returns
# corresponding indeces
def get_index(weights, n_words):
    length = len(weights)
    temp_list = np.random.choice(length, n_words, p=weights)
    if n_words == 1:
        return temp_list[0]
    else:
        return temp_list

def normalize(values):
    values = [value if value >=0 else 0 for value in values]
    total = sum(values)
    return [value/total for value in values]

# prints generated documents given probabilities and num of words
def make_doc(topic_word_probs, words, n_words,
             topic_probs=np.random.dirichlet((1, 1, 1))):
    
    topic_idx = get_index(normalize(topic_probs), 1)
    
    if sum(topic_term_percent[topic_idx]) != 1.0:
        weights = normalize(topic_term_percent[topic_idx])
    else:
        weights = topic_term_percent[topic_idx]
    
    word_idxs = get_index(weights, n_words)
    
    document = ''
    for idx in word_idxs:
        document += words[idx] + ' '
    
    return document


# iterating through different docs
print(make_doc(topic_term_percent, terms, n_words=40, topic_probs=topic_percent))

nothing keep every night you call handmade you working shut out stressed round needs waited stayin day shit kumbaya shit fuck up it last shit nani na though up every play mmm and the na feel wan na shit every want to want your dont even my every off know night if day shit fuckin lighters til in me broke beginnings wake you come could you just everyday 


In [69]:
print(make_doc(topic_term_percent, terms, n_words=40, topic_probs=topic_percent))

youre spitting ta hit will solo solo we both solo ssolo got heaven theres theres heaven in grams hit solo solo then everything comin youre sky on fire solo got hate time we chance again so function am prefer you now low and function now summer last and the myself solo for the to vapors we summer solo sky heaven 


In [71]:
print(make_doc(topic_term_percent, terms, n_words=40, topic_probs=topic_percent))

shine hair spitting game care know you miss make place number one ill for me your nigga we have tears between yall body jump our make summer tears our got someone for aap take scales keep ta self summer last me for spitting him you got but if me yeah you miss must demons like never 


## Using Markov Chains to predict word sequence

In [78]:
lyrics_corp = ' '.join(df['lyrics'])

markov_dict = {}

previous_word = None

for word in lyrics_corp.split():
    if previous_word:
        markov_dict[previous_word].append(word)
    if word not in markov_dict:
        markov_dict[word] = []
    previous_word = word

markov_dict

{'when': ['we',
  'im',
  'im',
  'im',
  'you',
  'i',
  'i',
  'the',
  'those',
  'those',
  'the',
  'the',
  'he',
  'i',
  'you',
  'we',
  'i',
  'i',
  'i',
  'i',
  'i',
  'i',
  'i',
  'i',
  'you',
  'things',
  'you',
  'you',
  'you',
  'your',
  'i',
  'i',
  'im',
  'im',
  'im',
  'im',
  'i',
  'i',
  'i',
  'i',
  'i',
  'you',
  'your',
  'all',
  'they',
  'they',
  'i',
  'im',
  'they',
  'they',
  'you',
  'we',
  'the',
  'the',
  'the',
  'im',
  'you',
  'im',
  'you',
  'you',
  'the',
  'it',
  'you',
  'it',
  'you',
  'it',
  'you',
  'it',
  'you',
  'it',
  'you',
  'i',
  'i',
  'you',
  'youre',
  'youre',
  'im',
  'you',
  'i',
  'i',
  'i',
  'hes',
  'were',
  'we',
  'to',
  'you',
  'you',
  'i',
  'i',
  'i',
  'i',
  'i',
  'im',
  'we',
  'i',
  'im',
  'we',
  'we',
  'the',
  'the',
  'im',
  'im',
  'im',
  'im',
  'im',
  'im',
  'im',
  'im',
  'im',
  'im',
  'im',
  'im',
  'im',
  'im',
  'im',
  'im',
  'im',
  'im',
  'im',
  'im',
 

In [81]:
import random

def generate(dictionary, num_words):
    first_word = random.choice([i for i in dictionary])
    sentence = [first_word]
    
    for index in range(num_words-1):
        possibly_next = dictionary[sentence[index]]
        sentence.append(random.choice(possibly_next))
    
    sentence[0] = sentence[0].capitalize()
    
    return " ".join(sentence)

generate(markov_dict, 15)

'Wishin away the rhythm on the rate is sky but yeah im up here spaceships.'

In [90]:
generate(markov_dict, 40)

'Parachute or youll never see it wont risk her cant do anything for modeling thick girls are times theyre coming yeah yeah oh no no one here before and when i wouldnt be im right here without you simply hard.'

In [91]:
generate(markov_dict, 40)

'Freaks aint quite me you say nope i was my dad id be making out waiting for the waters deep inside not the start breaking the way too i didnt care about to my car im spending mula he doesnt.'

In [98]:
generate(markov_dict, 30)

'Ours picture four cousins stayed and steel steel steel a love and this side fuck boys in straight lines talking bout damn right there you wanna dream i need ya.'