# Song Analyser

The Python modules used can be downloaded using

```pip -r requirements.txt```

## Imports

In [58]:
import random
import re
import string
import time
import tkinter
from collections import Counter

import numpy as np
import pandas as pd
import pronouncing

## Methods

In [44]:
def remove_punc(df, column_name):
    punctuation = [p for p in string.punctuation]
    df[column_name] = df[column_name].str.replace(r'[^\w\s]+', '')


def seperate_abvs(df, column_name):
    df[column_name] = df[column_name].replace(['youre ','Youre '], ['you are ']*2, regex=True)
    df[column_name] = df[column_name].replace(['Im '], ['I am '], regex=True)
    df[column_name] = df[column_name].replace(['Ive ','youve ','Youve '], ['I have ','you have ','you have '], regex=True)
    df[column_name] = df[column_name].replace(['Ill ','Youll ','youll '], ['I will ','You will ','you will '], regex=True)
    df[column_name] = df[column_name].replace(['Id ','Youd ','youd '], ['I would ','You had ', 'You had '], regex=True) # just randomly picked the def of 'd
    df[column_name] = df[column_name].replace(['weve ','Weve '], ['we have ']*2, regex=True)


def rhyme_count(lyrics_row):
    '''
    Return how many times there was a rhyme
    
    Param
    -----
    lyrics_row (str)

    Output
    ------
    int
    '''
    pars = [el for el in lyrics_row if el != '']
    prev_line = pars[0]
    rhyme_score = 0
    for par in pars[1:]:
        line = par
        rhymes = pronouncing.rhymes(line.split()[-1])
        if (prev_line.split()[-1] in rhymes):
            rhyme_score += 1
        prev_line = line
    return rhyme_score


def find_followers(row, word):
    '''
    Returns words that follow the given word and their frequencies.

    Output
    ------
    Counter {word: word_frequency}
    '''
    # keeping the newlines allows for structure.
    lyrics_list = re.findall(r'\S+|\n', row)

    # Find list of words that follow the given word.
    # Check if the word is the last word as well.
    followers =  [lyrics_list[i+1] for i, el in enumerate(lyrics_list) if (el==word and i+1<len(lyrics_list))]

    return Counter(followers)


def find_follower_probabilities(df):
    '''
    Create a dictionary of words and probability of other words to follow it.

    Param
    -----
    df (DataFrame)
    '''
    # First, create the dictionary of (unique) words
    word_keys = set()
    for row in df.lyrics.str.lower():
        word_keys = word_keys.union(set(re.findall(r'\S+|\n',row))) 
    words_dict = {key: None for key in word_keys}

    # Now calculate the (simple) probability of a word following another word
    # for each word, find what other words follow it, sum the total
    # and each words frequency
    for word in words_dict.keys():
        words_dict[word] = df.lyrics.str.lower().apply(lambda row: find_followers(row, word)).sum()
        
        # normalise the data to get a probability.
        total = sum(words_dict[word].values(), 0.0)
        words_dict[word] = {key: val/total for key, val in words_dict[word].items()}

    return words_dict


def generate_lyrics(prob_dict):
    '''
    Generate lyrics.

    Param
    -----
    prob_dict (dict) : output of find_follower_probabilities.

    Output
    ------
    str : generated lyrics as string.
    '''
    end_words =  [key for key in prob_dict.keys() if key[-1] == '.']
    word = np.random.choice([el for el in list(prob_dict) if el not in end_words])
    max_length =  random.randint(df.lyrics.str.split().str.len().min(),
                                df.lyrics.str.split().str.len().max()) * 0.75
    lyrics_list = [word.capitalize()]
    while (max_length >= 0):

        if (max_length > 0):
            followers = list(prob_dict[word.lower()].keys())
            follower_probabilities = list(prob_dict[word.lower()].values())
            word = np.random.choice(followers, p=follower_probabilities)
            if ('\n' in lyrics_list[-1] and word != '\n'):
                word = word.capitalize()
            lyrics_list.append(word)
        else:
            lyrics_list.append(np.random.choice(end_words))
        max_length -= 1

    return ' '.join(lyrics_list)


def format_lyrics(lyrics):
    '''
    Format lyrics: fix open/closed paranthesis, remove extra newlines and spaces.

    Param
    -----
    lyrics (str)

    Output
    ------
    str : formated lyrics.
    '''
    # Remove whitespace between newline chars
    formated_lyrics = lyrics.replace('\n ', '\n')
    formated_lyrics = formated_lyrics.replace(' \n', '\n')

    # Remove extra newline chars
    extra_newlines_regex = re.compile('(?:\n){3,}')
    formated_lyrics = re.sub(extra_newlines_regex, '\n\n', formated_lyrics)

    # Close open paranthesis and remove ) if there was no (
    i = 0
    open_flag = False
    closed_paran = ''
    for char in formated_lyrics:
        if (char == '\n' and open_flag):
            char = ')\n'
            open_flag = False
        if (char == '('):
            open_flag = True
        if (char == ')'):
            if (not open_flag):
                char = ''
            open_flag = False
        i += 1
        closed_paran += char

    formated_lyrics = closed_paran

    return formated_lyrics


## Analysis

Basic analysis of the lyrics. Two main dataframes are created here. The raw data is stored in **master_df** and it is used in lyric generation later on; The cleaned data is stored in **df** and it is used for the analysis.

In [77]:
master_df = pd.read_csv('lyrics_data_master.csv', index_col=0)
df = master_df.copy()

### Cleaning the data

- Separate the _album year_ from the _album_ column in **df**.
- Remove brackets, extra spaces and punctuation.
- Seperate abbreviations such as I've to I have. 

In [78]:
df.album = df.album.str.replace('album: ', '')
df['year'] = df['album'].str[-7:].replace(['\(','\)'], ['']*2, regex=True).str.strip()
df['album'] = df['album'].str[:-7]
remove_punc(df, 'lyrics')
seperate_abvs(df, 'lyrics')

### Word Count For All Songs

In [79]:
all_words = df.lyrics.str.lower().str.split(expand=True).stack()
all_words_count = all_words.value_counts()

temp = all_words_count.sort_values(ascending=False).reset_index(name='Count').rename(columns={'index':'Word'})
temp['% of Total'] = temp['Count'] / temp['Count'].sum() * 100
temp.index = temp.index + 1

#### Top 10 Word Count

In [80]:
temp.head(10)

Unnamed: 0,Word,Count,% of Total
1,you,2577,6.010916
2,i,2549,5.945606
3,the,1163,2.712726
4,to,922,2.150588
5,me,803,1.873017
6,it,737,1.719071
7,and,672,1.567457
8,a,596,1.390185
9,my,593,1.383187
10,are,520,1.212913


##### Removing the irrelevant words

Words such as _a, the, and, from, to, it, that, thats, or, this, of, is, are, am, these_ are not meaningful and so we can remove them to get a better idea of what are top 10 words used:

In [81]:
all_words_count_filtered = all_words_count.drop(index=['a', 'the', 'and', 'from', 'to', 'it', 'that','thats', 'or', 'this','of','is','are', 'am', 'these'])
temp = all_words_count_filtered.sort_values(ascending=False).reset_index(name='Count').rename(columns={'index':'Word'})
temp['% of Total'] = temp['Count'] / temp['Count'].sum() * 100
temp.index = temp.index + 1

#### Top 10 Word Count (filtered)

In [50]:
temp.head(10)

Unnamed: 0,Word,Count,% of Total
1,you,2577,7.112301
2,i,2549,7.035023
3,me,803,2.216212
4,my,593,1.63663
5,your,470,1.29716
6,oh,466,1.28612
7,love,428,1.181244
8,in,407,1.123285
9,be,405,1.117766
10,know,396,1.092926


### Check For Rhyming

In [84]:
temp = df[['album','year', 'title']].rename(columns={'album':'Album', 'title':'Song Title', 'year':'Year'})
temp['Rhyme Count'] = df.lyrics.apply(lambda row: rhyme_count(row.split('\n')))
temp = temp.sort_values(by=['Rhyme Count', 'Year'], ascending=(False,True), ignore_index=True)
temp.index = temp.index + 1


#### Top 5 Songs with Most Rhyme Count

In [85]:
temp.head(10)

Unnamed: 0,Album,Year,Song Title,Rhyme Count
1,"""Backstreet Boys""",1995,I Wanna Be With You,14
2,"""Backstreet's Back""",1997,If I Don't Have You,14
3,"""DNA""",2019,No Place,12
4,"""Millennium""",1999,I Want It That Way,10
5,"""Millennium""",1999,I Want It That Way,10
6,"""Never Gone""",2005,Never Gone,10
7,"""Black & Blue""",2000,Shining Star,9
8,"""Unbreakable""",2007,You Can Let Go,9
9,"""In A World Like This""",2013,Make Believe,9
10,"""Backstreet Boys""",1995,Nobody But You,8


##### Putting together the number songs with/without rhymes for each album year

In [86]:
rhyming_songs = temp.loc[temp['Rhyme Count'] > 0, 'Year'].value_counts().reset_index(name='Total Songs with Rhymes')
no_rhyming_songs = temp.loc[temp['Rhyme Count'] == 0, 'Year'].value_counts().reset_index(name='Total Songs with No Rhymes')
rhyme_totals = rhyming_songs.merge(no_rhyming_songs, on='index', how='outer', sort=True).rename(columns={'index':'Year'})
rhyme_totals_pvt = rhyme_totals.pivot_table(index='Year', margins=True, margins_name='Total:', aggfunc=sum)
no_rhymes_percent = len(temp[temp['Rhyme Count'] == 0]) / len(temp)

rhyme_totals_pvt

Unnamed: 0_level_0,Total Songs with No Rhymes,Total Songs with Rhymes
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1995,3,11
1997,1,11
1999,1,13
2000,1,13
2005,3,13
2007,11,11
2009,4,9
2011,1,3
2013,4,12
2019,6,9


In [62]:
print('Therefore, {:.0%} of the songs did not have any rhymes in it.\n'.format(no_rhymes_percent))


Therefore, 25% of the songs did not have any rhymes in it.



## Generate Lyrics Using Markov Chain

Use the **master_df** with no cleaning. This is because we want to keep the structure of the text (punctuation, new lines, etc.). First, we create a dictionary of probabilities of words and their _followers_. A _follower_ is a word that comes after a given word. The dictionary contains every _unique words_ as **keys** and the _follower-probability pairs_ as **values**.

In [None]:
follower_probabilities_dict = find_follower_probabilities(master_df)
lyrics = generate_lyrics(follower_probabilities_dict)

##### Format the generated lyrics by removing open paranthesis and so on.

In [70]:
formated_lyrics = format_lyrics(lyrics)

##### Generate a new song title

In [74]:
canditates = [ el for el in formated_lyrics.split() if ('[' not in el)]
title_length = random.randint(1, 5)
title = ' '.join([canditates[random.randint(0, len(canditates))].capitalize() for i in range(title_length)])

And there we have it, out own lyrics with a title:

In [75]:
lyrics_with_title = '{title}\n{sep}\n\n{formated_lyrics}'.format(title=title,sep='-'*len(title),formated_lyrics=formated_lyrics)
print(lyrics_with_title)

Breathing My Let
----------------

Worth it no stronger than you remember those tear us to see
My side
Nobody but i had it in love me starts fillin' up all of you, i'm gonna have finished
Baby,
And kevin:]
That's barely breathing
That we never let go
That i just when you go
Ain't nothin' but now i’m falling apart
Yeah
I'll keep coming up from the chateau
We never make you are you knew you back right
So, don't ever take it makes you don't wanna share you hands on y'all

(you know, i never make alone)
Roll with you can't let it all the party at
It's killing me show me it's now i was
Took you to make you through all of affection
[spoken:]
All a time to tremble
My life
Cause i be afraid, don't want you shut me
[chorus]
And fight until she can make it all around, all a raining day you'll always say goodbye to you know that it's getting closer, closer
No one like a first time in my heart, you're bigger
All about
More
Just me? is for you back
[nick, howie:] can't believe, you are
You're under