In [3]:
import json
import pickle
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.plotly as py
import psycopg2 as pg2
import requests
from psycopg2.extensions import AsIs
from psycopg2.extras import Json, RealDictCursor
from sqlalchemy import create_engine

%matplotlib inline
%run ../assets/sql_cred.py

In [3]:
def filename_format_log(file_path, 
                        logfile = '../assets/file_log.txt', 
                        now = None, 
                        file_description = None): 
    """Prefix a file name with a timestamp and append an entry to a log file.

    Parameters
    ----------
    file_path : str
        Relative path that must contain a file extension.
    logfile : str
        Text file the "<name>: <description>" line is appended to.
    now : int, optional
        Timestamp to embed. Defaults to the current epoch second, resolved on
        every call (a default of ``round(time.time())`` in the signature would
        be frozen at definition time).
    file_description : str, optional
        Log text; defaults to "Saved at: <UTC time of now>".

    Returns
    -------
    tuple
        (formatted_name, now, file_description)

    Raises
    ------
    NameError
        If ``file_path`` has no usable file extension.
    """
    if now is None:
        now = round(time.time())

    # An extension is a '.' that is neither the first character nor part of '..'.
    try:
        has_extension = re.search(r'(?<!^)(?<!\.)\.(?!\.)', file_path) is not None
    except TypeError:
        has_extension = False
    if not has_extension:
        raise NameError('Please enter a relative path with a file extension.')

    # Locate the "<word>_<word>." stem; the timestamp is inserted right before it.
    # NOTE: [A-z] and [0-z] also cover some punctuation (including '_'); kept
    # unchanged so existing file names are stamped at the same position.
    stem = re.search(r'(?<!^)(?<!\.)[A-z]+_[0-z]+(?=\.)', file_path)
    if stem is not None:
        stamp = stem.start()
    else:
        # No underscore-style stem: stamp the final path component instead of crashing.
        stamp = max(file_path.rfind('/'), file_path.rfind('\\')) + 1

    formatted_name = f'{file_path[:stamp]}{now}_{file_path[stamp:]}'
    if not file_description:
        file_description = f'Saved at: {time.asctime(time.gmtime(now))}'
    with open(logfile, 'a+') as f:
        f.write(f'{formatted_name}: {file_description}\n')
    return formatted_name, now, file_description

In [4]:
# SQLAlchemy engine built from ENGINE (presumably defined by the sql_cred.py script run above — verify).
engine = create_engine(ENGINE)

In [5]:
def con_cur_to_db(dbname=DBNAME, dict_cur=None):
    """Connect to `dbname` with psycopg2 and return (connection, cursor).

    When `dict_cur` is truthy the cursor is created with RealDictCursor;
    otherwise the connection's default cursor is used.
    """
    connection = pg2.connect(host=IP_ADDRESS,
                             dbname=dbname,
                             user=USER,
                             password=PASSWORD)
    cursor_options = {'cursor_factory': RealDictCursor} if dict_cur else {}
    return connection, connection.cursor(**cursor_options)
    
def execute_query(query, dbname=DBNAME, dict_cur=None, command=False):
    """Run `query` against `dbname` and always close the connection.

    Parameters
    ----------
    query : str
        SQL text, executed as given (no parameter binding) — pass trusted SQL only.
    dbname : str
        Database to connect to.
    dict_cur : bool, optional
        Forwarded to con_cur_to_db to select a RealDictCursor.
    command : bool
        False: fetch and return all rows. True: commit and return None.
    """
    con, cur = con_cur_to_db(dbname, dict_cur)
    try:
        cur.execute(f'{query}')
        if not command:
            return cur.fetchall()
        con.commit()  # persist the command before disconnecting
    finally:
        # Close even when execute/fetchall raises, so failed queries do not leak connections.
        con.close()

In [6]:
# Load the whole track_list table as dict rows and index the frame by track_id.
query = '''SELECT * FROM track_list;'''
response = execute_query(query, dict_cur=True)
track_df = pd.DataFrame(response).set_index('track_id')
track_df.head()

Unnamed: 0_level_0,album_name,artist_name,lyrics,playlist_id,playlist_name,playlist_owner,track_name
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0h7TlF8gKb61aSm874s3cV,I Can't Tell You How Much It Hurts,moow,\n\nIf your needle is near\nNeedle is near\nYo...,37i9dQZF1DXarebqD2nAVg,Tender,spotify,You'r in My Head
6koowTu9pFHPEcZnACLKbK,Coming Home,Leon Bridges,\n\n[Verse 1]\nBrown skin girl on the other si...,37i9dQZF1DX4adj7PFEBwf,Wedding Bells,spotify,Brown Skin Girl
1JkhKUXAoNivi87ipmV3rp,Back To Love (Deluxe Version),Anthony Hamilton,"\n\n[Verse 1]\nIt's simple, I love it\nHaving ...",37i9dQZF1DX4adj7PFEBwf,Wedding Bells,spotify,Best of Me
51lPx6ZCSalL2kvSrDUyJc,The Search for Everything,John Mayer,\n\n[Intro: Whistling]\n\n[Verse 1]\nA great b...,37i9dQZF1DX4adj7PFEBwf,Wedding Bells,spotify,You're Gonna Live Forever in Me
3vqlZUIT3rEmLaYKDBfb4Q,Songs In The Key Of Life,Stevie Wonder,\n\n[Verse 1]\nIsn't she lovely\nIsn't she won...,37i9dQZF1DX4adj7PFEBwf,Wedding Bells,spotify,Isn't She Lovely


In [7]:
# Keep only the lyrics column (as a one-column frame) for text cleaning.
lyric_df = track_df.loc[:, ['lyrics']]

In [8]:
# Discard rows that have no lyrics.
lyric_df = lyric_df.dropna(axis='index')

In [9]:
# Row/column count after dropping rows with missing lyrics.
lyric_df.shape

(2276, 1)

In [10]:
# Preview the raw lyrics before cleaning.
lyric_df.head()

Unnamed: 0_level_0,lyrics
track_id,Unnamed: 1_level_1
0h7TlF8gKb61aSm874s3cV,\n\nIf your needle is near\nNeedle is near\nYo...
6koowTu9pFHPEcZnACLKbK,\n\n[Verse 1]\nBrown skin girl on the other si...
1JkhKUXAoNivi87ipmV3rp,"\n\n[Verse 1]\nIt's simple, I love it\nHaving ..."
51lPx6ZCSalL2kvSrDUyJc,\n\n[Intro: Whistling]\n\n[Verse 1]\nA great b...
3vqlZUIT3rEmLaYKDBfb4Q,\n\n[Verse 1]\nIsn't she lovely\nIsn't she won...


In [11]:
def clean_lyrics(lyrics, keep_tags=False, keep_nl=False):
    """Normalise raw lyrics to lowercase a-z words.

    Parameters
    ----------
    lyrics : str
        Raw lyrics text, possibly containing bracketed tags such as "[Verse 1]".
    keep_tags : bool
        If False, bracketed tags are removed together with their text. If True,
        only the brackets disappear (non-letters are stripped) and the tag words stay.
    keep_nl : bool
        If True, single newlines are kept as a standalone ' \n ' token; otherwise
        they become spaces. Blank lines (double newlines) always become one space.

    Returns
    -------
    str
        The cleaned text.
    """
    text = lyrics.lower()

    if not keep_tags:
        # Match each tag on its own: a greedy '.+' would also delete the lyrics
        # between two tags that share a line (e.g. "[chorus] la la [x2]").
        text = re.sub(r'\[[^\]\n]*\]', '', text)

    # Drop everything except lowercase letters and whitespace.
    text = re.sub(r'[^a-z\s]', '', text)

    text = text.strip()
    text = re.sub('\n\n', ' ', text)

    text = text.strip()

    if keep_nl:
        text = re.sub('\n', ' \n ', text)
    else:
        text = re.sub('\n', ' ', text)

    return text

In [12]:
# Clean every lyric with tags removed and line breaks kept as ' \n ' tokens.
lyric_df['clean_lyrics'] = [
    clean_lyrics(raw_text, keep_tags=False, keep_nl=True)
    for raw_text in lyric_df['lyrics']
]

In [13]:
# Compare raw and cleaned lyrics side by side.
lyric_df.head()

Unnamed: 0_level_0,lyrics,clean_lyrics
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0h7TlF8gKb61aSm874s3cV,\n\nIf your needle is near\nNeedle is near\nYo...,if your needle is near \n needle is near \n yo...
6koowTu9pFHPEcZnACLKbK,\n\n[Verse 1]\nBrown skin girl on the other si...,brown skin girl on the other side of the room ...
1JkhKUXAoNivi87ipmV3rp,"\n\n[Verse 1]\nIt's simple, I love it\nHaving ...",its simple i love it \n having you near me hav...
51lPx6ZCSalL2kvSrDUyJc,\n\n[Intro: Whistling]\n\n[Verse 1]\nA great b...,a great big bang and dinosaurs \n fiery rainin...
3vqlZUIT3rEmLaYKDBfb4Q,\n\n[Verse 1]\nIsn't she lovely\nIsn't she won...,isnt she lovely \n isnt she wonderful \n isnt ...


In [14]:
# Flag tracks containing common Spanish/French function words as non-English.
# A non-capturing group avoids pandas' "This pattern has match groups" warning.
not_eng = lyric_df[lyric_df['clean_lyrics'].str.contains(r'\s(?:del|que|vous|etre)\s')].index.tolist()


This pattern has match groups. To actually get the groups, use str.extract.



In [15]:
# Remove the tracks flagged as non-English.
lyric_df = lyric_df.drop(index=not_eng)

In [16]:
# Collect tracks whose "lyrics" are only the placeholder text for missing lyrics.
placeholder_mask = lyric_df['clean_lyrics'].str.contains('lyrics for this song have yet')
not_lyrics = lyric_df.index[placeholder_mask].tolist()

In [17]:
# Remove the placeholder-only tracks.
lyric_df = lyric_df.drop(index=not_lyrics)

In [23]:
# Flag tracks whose cleaned text contains a run of six or more whitespace characters.
# No capture group is needed for a boolean test (avoids the pandas match-group warning).
long_gap = lyric_df[lyric_df['clean_lyrics'].str.contains(r'\s{6,}')].index.tolist()


This pattern has match groups. To actually get the groups, use str.extract.



In [24]:
# Number of index labels flagged for long whitespace gaps.
len(long_gap)

41

In [26]:
# Remove the tracks with long whitespace gaps.
lyric_df = lyric_df.drop(index=long_gap)

In [27]:
# Preview the frame after the filtering steps above.
lyric_df.head()

Unnamed: 0_level_0,lyrics,clean_lyrics
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0h7TlF8gKb61aSm874s3cV,\n\nIf your needle is near\nNeedle is near\nYo...,if your needle is near \n needle is near \n yo...
6koowTu9pFHPEcZnACLKbK,\n\n[Verse 1]\nBrown skin girl on the other si...,brown skin girl on the other side of the room ...
1JkhKUXAoNivi87ipmV3rp,"\n\n[Verse 1]\nIt's simple, I love it\nHaving ...",its simple i love it \n having you near me hav...
51lPx6ZCSalL2kvSrDUyJc,\n\n[Intro: Whistling]\n\n[Verse 1]\nA great b...,a great big bang and dinosaurs \n fiery rainin...
3vqlZUIT3rEmLaYKDBfb4Q,\n\n[Verse 1]\nIsn't she lovely\nIsn't she won...,isnt she lovely \n isnt she wonderful \n isnt ...


In [28]:
# Count / unique / top / freq summary; unique < count indicates duplicated lyrics.
lyric_df.describe()

Unnamed: 0,lyrics,clean_lyrics
count,1907,1907.0
unique,1792,1787.0
top,\n\n[Verse 1]\nWhat would I do without your sm...,
freq,4,6.0


In [29]:
# Keep the first occurrence of each distinct cleaned lyric.
# NOTE(review): the describe() output above lists an empty string as the most
# frequent clean_lyrics value; one such row survives this step — confirm that is intended.
lyric_df = lyric_df.drop_duplicates(subset='clean_lyrics', keep='first')

In [30]:
# Re-check the summary after removing duplicate cleaned lyrics.
lyric_df.describe()

Unnamed: 0,lyrics,clean_lyrics
count,1787,1787
unique,1787,1787
top,\n\n[Verse 1]\nThere were nights when the wind...,look at me \n you think that im tryin to fight...
freq,1,1


In [None]:
# Build three parallel views of the cleaned lyrics:
#   corpus      - flat list of every space-separated token across all tracks
#                 (built before newline removal, so '\n' tokens are included)
#   lines_track - per track, the text split on '\n'
#   words_track - per track, the tokens with the '\n ' markers removed
corpus = []
words_track = []
lines_track = []

# Iterate the column directly instead of iterrows() + track[0]: positional
# integer lookups on a label-indexed Series are deprecated in recent pandas.
for lyrics in lyric_df['clean_lyrics']:
    # words in full set
    lyrics_spaced = re.sub(r'( +)', ' ', lyrics)  # collapse runs of spaces
    corpus.extend(lyrics_spaced.split(' '))

    # lines in track
    lines_track.append(lyrics_spaced.split('\n'))

    # words in track w/o \n
    lyrics_spaced = re.sub(r'\n ', '', lyrics_spaced)
    words_track.append(lyrics_spaced.split(' '))

In [None]:
# Distinct tokens across the whole corpus.
vocab = set(corpus)

In [None]:
# Vocabulary size.
len(vocab)

In [None]:
# Token count for each track.
count_words_track = [len(track_words) for track_words in words_track]

In [None]:
# Sanity check: one count per track.
len(count_words_track)

In [None]:
# Attach the per-track token counts (positional, same order as lyric_df rows).
lyric_df = lyric_df.assign(total_words_track=count_words_track)

In [None]:
# Preview with the new total_words_track column.
lyric_df.head()

In [None]:
# Set of distinct tokens for each track.
unique_words_track = [set(track_words) for track_words in words_track]

In [None]:
# Number of distinct tokens for each track.
count_unique_words_track = list(map(len, unique_words_track))

In [None]:
# Attach the per-track distinct-token counts.
lyric_df = lyric_df.assign(unique_words_track=count_unique_words_track)

In [None]:
# Preview with the new unique_words_track column.
lyric_df.head()

In [None]:
# Average number of tokens per track.
np.mean(count_words_track)

In [None]:
# Mean token length (in characters) within each track.
len_words_track = [
    np.mean([len(word) for word in track_words])
    for track_words in words_track
]

In [None]:
# Average of the per-track mean token lengths.
np.mean(len_words_track)

In [None]:
# Compute the per-track line counts here: count_lines_track is otherwise only
# defined in a later cell, so this assignment fails on a fresh top-to-bottom run.
count_lines_track = [len(track_lines) for track_lines in lines_track]
lyric_df['total_lines_track'] = count_lines_track

In [None]:
# Set of distinct lines for each track.
unique_lines_track = [set(track_lines) for track_lines in lines_track]

In [None]:
# Number of distinct lines for each track.
count_unique_lines_track = list(map(len, unique_lines_track))

In [None]:
# Attach the per-track distinct-line counts.
lyric_df = lyric_df.assign(unique_lines_track=count_unique_lines_track)

In [None]:
# Preview with the line-count columns added.
lyric_df.head()

In [None]:
# Number of '\n'-separated lines in each track.
count_lines_track = [len(track_lines) for track_lines in lines_track]

In [None]:
# Average number of lines per track.
np.mean(count_lines_track)

In [None]:
# Total character count over the distinct tokens in vocab.
len_word = sum(len(token) for token in vocab)

In [None]:
# Average character length over the distinct tokens in vocab (each token counted once).
mean_len_word = len_word / len(vocab)
mean_len_word

In [None]:
# Inspect the line lists of the first two tracks.
lines_track[:2]

In [None]:
# For each track: mean number of space-separated tokens per line, rounded to 1 decimal.
mean_words_line = [
    np.around(np.mean([len(line.split(' ')) for line in track_lines]), 1)
    for track_lines in lines_track
]

In [None]:
# Per-track mean words per line.
mean_words_line

In [None]:
# Tokenise every lyric line of eda_df on single spaces.
# NOTE(review): eda_df is only built in a cell further down this notebook; on a
# fresh top-to-bottom run this cell raises NameError unless that cell is moved up.
# Iterating the column avoids iterrows() + positional line[0], which is
# deprecated on label-indexed Series in recent pandas.
words_line = [line.split(' ') for line in eda_df['line']]

In [None]:
# Token count per lyric line, then the overall mean.
# (mean_words_line is rebound here from the earlier per-track list to one number.)
count_words_line = [len(tokens) for tokens in words_line]
mean_words_line = np.mean(count_words_line)

In [None]:
# Overall mean words per line.
mean_words_line

In [None]:
# Distribution of lines per track, clipped to 0-125 lines.
plt.hist(count_lines_track, range=(0, 125))
plt.xlabel('Lines per track')
plt.ylabel('Number of tracks')
plt.title('Lines per track');

In [None]:
# Distribution of tokens per track, clipped to 0-1000 tokens.
plt.hist(count_words_track, range=(0, 1000))
plt.xlabel('Words per track')
plt.ylabel('Number of tracks')
plt.title('Words per track');

In [None]:
# Distribution of tokens per lyric line, clipped to 0-20 tokens.
plt.hist(count_words_line, range=(0,20))
plt.xlabel('Words per line')
plt.ylabel('Number of lines')
plt.title('Words per line');

In [None]:
# Distribution of the per-track mean token length.
plt.hist(len_words_track)
plt.xlabel('Mean word length (characters)')
plt.ylabel('Number of tracks')
plt.title('Mean word length per track');

In [None]:
# Interactive histogram of words per line.
# The trace class is go.Histogram (capital H); go.histogram is not the constructor.
# NOTE(review): x previously received words_line (a list of token lists), which
# cannot be binned; the per-line counts are used instead, matching the matplotlib
# histogram of count_words_line — confirm this was the intent.
trace = go.Histogram(x=count_words_line)
data = [trace]
py.iplot(data)

In [31]:
# formatted_name, now, file_description= filename_format_log(file_path = '../assets/clean_lyrics.csv')

# lyric_df.to_csv(formatted_name, index=False)

In [32]:
# Explode each track's raw lyrics into one row per line:
#   id       - track index label from lyric_df
#   line     - the lyric line text
#   line_num - 1-based position of the line inside its verse
#   tag      - lower-cased section tag, or a generated '[verse N]'
#
# Rows are collected in a list and turned into a DataFrame once: appending to a
# DataFrame inside the loop copies the frame on every row (quadratic) and
# DataFrame.append no longer exists in pandas >= 2.0.
#
# NOTE(review): in the ':' / '–' / '(' patterns below, '[^\[]+' may run past the
# tag's closing ']' up to the next '[', so lyric text containing those characters
# can be removed along with the tag. The patterns are left unchanged; verify.
eda_rows = []

for idx, raw_lyrics in lyric_df['lyrics'].items():
    track = re.sub(r'(\[[^\]]+\]\n\n)', '', raw_lyrics) #removes tags without lyrics
    track = re.sub(r'\[[^\[]+(:[^\]]+)\]', '', track) #removes all text after :
    track = re.sub(r'\[[^\[]+(–[^\]]+)|\[[^\[]+(-\s[^\]]+)\]', '', track) #removes all after –
    track = re.sub(r'\[[^\[]+(\(([^\]]+))\]', '', track) #removes all after ()
    track = re.sub(r'(^|)(\[Ad-Libs\])|(\[Keisha\])|(\[Shane Powers\])', '', track) #removes artist and non-standard tags
    track = re.sub(r'(^|)(\[Break\])|(\[Vocals / Lyrics By\])|(\[Skit\])', '', track) #removes artist and non-standard tags

    tags = re.findall(r'(\[[^\]]+\])', track) #finds all tags
    track = re.sub(r'(\[[^\]]+\])', '', track) #removes all tags

    # Verses are separated by blank lines; lines within a verse by single newlines.
    verses = track.strip().split('\n\n')
    track_split = [verse.strip().split('\n') for verse in verses]

    v_count = len(track_split)
    # Original tags are trusted only when there is exactly one per verse.
    use_found_tags = len(tags) == v_count

    for i, verse_lines in enumerate(track_split):
        tag = tags[i] if use_found_tags else f'[verse {(i+1)}]'

        for j, line in enumerate(verse_lines):
            eda_rows.append({
                'id': idx,
                'line': line,
                'line_num': (j+1),
                'tag': tag.lower()
            })

eda_df = pd.DataFrame(eda_rows, columns=['id', 'line', 'line_num', 'tag'])

In [33]:
# Preview only the first rows rather than rendering the whole line-level frame.
eda_df.head(10)

Unnamed: 0,id,line,line_num,tag
0,0h7TlF8gKb61aSm874s3cV,If your needle is near,1,[verse 1]
1,0h7TlF8gKb61aSm874s3cV,Needle is near,2,[verse 1]
2,0h7TlF8gKb61aSm874s3cV,You can take my blood,3,[verse 1]
3,0h7TlF8gKb61aSm874s3cV,Oh I saved it for you,4,[verse 1]
4,0h7TlF8gKb61aSm874s3cV,Hundreds of drops,5,[verse 1]
5,0h7TlF8gKb61aSm874s3cV,Running red,6,[verse 1]
6,0h7TlF8gKb61aSm874s3cV,Needle is near,1,[verse 2]
7,0h7TlF8gKb61aSm874s3cV,You can take my blood,2,[verse 2]
8,0h7TlF8gKb61aSm874s3cV,Oh I saved it for you,3,[verse 2]
9,0h7TlF8gKb61aSm874s3cV,Hundreds of drops,4,[verse 2]


In [63]:
# formatted_name, now, file_description = filename_format_log(file_path ='../assets/eda_df.csv')
# eda_df.to_csv(formatted_name, index=False)

In [64]:
# eda_df = pd.read_csv('../assets/1548892595_eda_df.csv')

In [65]:
# Tags that label fewer than 10 lines.
tag_counts = eda_df.tag.value_counts()
tag_counts[tag_counts < 10]

[verse  4]              9
[verse 33]              9
[jazz]                  9
[chorus/outro]          9
[rod stewart]           9
[mike]                  8
[pre-chorus 3]          8
[brian morgan]          8
[verse 30]              8
[coko]                  8
[sisqo]                 8
[verse 34]              8
[chorus 3/outro]        7
[verse 32]              7
[verse  2]              7
[verse 31]              7
[hook 1]                7
[verse 2 / outro]       7
[vesre 1]               6
[verse 36]              6
[verso 2]               6
[verso 3]               6
[verso 1]               5
[verse 5/bridge]        5
[instrumental break]    5
[carols]                5
[verse 38]              5
[​chorus]               5
[verse 59]              5
[alt. chorus]           5
                       ..
[verse 71]              2
[verse 86]              2
[verse 93]              2
[bridge 4x]             2
[verse 82]              2
[verse 90]              2
[verse 95]              2
[verse 70]  

In [66]:
# Row labels of every line belonging to track 4TYZXfu6VeblQMK2TwbDte.
drop = eda_df.index[eda_df['id'] == '4TYZXfu6VeblQMK2TwbDte'].tolist()

In [67]:
# Remove that track's rows.
eda_df = eda_df.drop(index=drop)

In [68]:
# Rows tagged '[verse 55]'.
eda_df[eda_df['tag'] == '[verse 55]']

Unnamed: 0,id,line,line_num,tag
2786,6xYd4zCVeSp80Un2Rl9wDs,Cruisin' is made for love,1,[verse 55]


In [69]:
# Raw lyrics of the track that produced the '[verse 55]' tag.
lyric_df.loc['6xYd4zCVeSp80Un2Rl9wDs', 'lyrics']

"\n\nBaby let's cruise away from here\n\nDon't be confused baby, the way is clear\n\nAnd if you want it you got it forever\n\nOh, this is not a one night stand baby\n\nLet the music take your mind\n\nAnd just release and you will find, baby\n\nWe're going to fly away\n\nGlad you're going my way\n\nI love it when we're cruisin' together\n\nMusic was made for love\n\nCruisin' is made for love\n\nI love it when we're cruisin' together\n\nBaby, tonight belongs to us\n\nEverything's right, do what you must, baby\n\nAnd inch by inch we get closer and closer\n\nEvery little part of each other ooh, baby, baby\n\nLet the music take your mind\n\nJust release and you will find, baby\n\nWe're going to fly away\n\nGlad you're going my way\n\nI love it when we're cruisin' together\n\nMusic was made for love\n\nCruisin' is made for love\n\nI love you when we're cruisin' together\n\nWe're going to fly away\n\nGlad you're going my way\n\nI love it when we're cruisin' together\n\nMusic was made for love

In [70]:
# Row labels of every line of track 6xYd4zCVeSp80Un2Rl9wDs.
off_verse = eda_df.index[eda_df.id == '6xYd4zCVeSp80Un2Rl9wDs'].tolist()

In [71]:
# Collapse this track into a single '[verse 1]'.
# One .loc call on the frame: chained `eda_df.tag.loc[...] = ...` can write to
# a temporary object and is not guaranteed to update eda_df.
eda_df.loc[off_verse, 'tag'] = '[verse 1]'

In [72]:
# Renumber the track's lines 1..N; N comes from the selection itself instead of
# a hard-coded row count, which would raise if the number of rows ever changed.
eda_df.loc[off_verse,'line_num'] = list(range(1, len(off_verse) + 1))

In [73]:
# Tag frequencies after the manual retagging.
eda_df.tag.value_counts()

[verse 1]                                                                                                                                                                                      11237
[verse 2]                                                                                                                                                                                       9976
[chorus]                                                                                                                                                                                        8023
[verse 3]                                                                                                                                                                                       7825
[verse 4]                                                                                                                                                                                       6687
[verse 5]      

In [74]:
# Rows tagged '[verse 28]'.
eda_df[eda_df['tag'] == '[verse 28]']

Unnamed: 0,id,line,line_num,tag
7666,27zrFrtUtWl2urlvjOn5xc,Jinan bomeneun neol uyeonhi bwatgo,1,[verse 28]
7667,27zrFrtUtWl2urlvjOn5xc,Deo isangeun uyeoni aniyeosseumyeon hae,2,[verse 28]
7668,27zrFrtUtWl2urlvjOn5xc,Seuchigiman haetdeon neoui soneul japgo,3,[verse 28]
7669,27zrFrtUtWl2urlvjOn5xc,Eotteon girirado gachi georeosseumyeon hae,4,[verse 28]
7823,5efB9wfc6dn3pzll9ElIrH,Baby I’m so Lonely so Lonely,1,[verse 28]
7824,5efB9wfc6dn3pzll9ElIrH,I feel like I’m alone,2,[verse 28]
7825,5efB9wfc6dn3pzll9ElIrH,"Still, I don’t wanna hide it from you",3,[verse 28]
7826,5efB9wfc6dn3pzll9ElIrH,But I’m used to just holding it in,4,[verse 28]
7827,5efB9wfc6dn3pzll9ElIrH,Understand me,5,[verse 28]
9426,2za6PlkAQAMMu0VFsfBoHC,The engineers then proceeded to inspect the Sc...,1,[verse 28]


In [75]:
# Row labels of every line of track 2za6PlkAQAMMu0VFsfBoHC.
leagues = eda_df.index[eda_df.id == '2za6PlkAQAMMu0VFsfBoHC'].tolist()

In [76]:
# Remove that track's rows.
eda_df = eda_df.drop(index=leagues)

In [1]:
# Summary of the line-level frame.
# NOTE(review): the recorded output is a NameError from a kernel in which eda_df
# had not been built yet; re-run after the cell that constructs eda_df.
eda_df.describe()

NameError: name 'eda_df' is not defined

In [None]:
# Inspect every row of track 3yPMLvxeEor7rbDLDseJFt.
eda_df[eda_df.id == '3yPMLvxeEor7rbDLDseJFt']

In [None]:
# Row labels of every line of track 3yPMLvxeEor7rbDLDseJFt.
tracks = eda_df.index[eda_df.id == '3yPMLvxeEor7rbDLDseJFt'].tolist()

In [None]:
# Remove that track's rows.
eda_df = eda_df.drop(index=tracks)

In [None]:
# Summary after removing the track above.
eda_df.describe()

In [None]:
# Tag frequencies after the removals.
eda_df.tag.value_counts()

In [None]:
# Rows whose tag contains a newline character.
eda_df[eda_df.tag.str.contains('\n')]

In [None]:
# Overwrite this track's raw lyrics with a hand-corrected version.
# Row and column go in one .loc call: `lyric_df.loc[label].lyrics = ...` assigns
# to a temporary row copy and can leave lyric_df unchanged.
lyric_df.loc['1Ob94QesoxHSBrypyBbdfd', 'lyrics'] = "\n\n[Verse 1]\nWe've been here before, surrounded in the cold\nYou take me to places I've never known\nAnd you push me to places I'll never go\nI would die for you\nTell me the truth\nAm I all that you would need?\nIf we're keeping it a hundred, you're all that I need for me\nFrom the jump until forever from now, I would ride for you\n\n[Chorus]\nBut you got me shot down by love\nYou got my heart now\nWhy won't you stop now?\nOh I've been knocked down by you\nYou got my heart now\nWhy won't you stop now?\n\n[Verse 2]\nI've been through it whole\nI've been through the worst\nBut I never knew how much our love could hurt\nOver my family I put you first\nWriting out my feelings is the only thing that work\nI don't hear from the friends I thought were mine too\nBut I hold on to the poems I would write you\nHappy seventeen, I saw us two and you saw three\nI guess our lines are structured like a haiku\n\n[Chorus 2]\nYou got me shot down by love\nAnd you got my heart now\nWhy won't you stop now?\nOh I've been knocked down by you\nAnd you got my heart now\nWhy won't you stop now?\n\n[Bridge]\nKnocked down, knocked down\nKnocked down, knocked down\nKnocked down\nKnocked down\n\n[Chorus]\nI've been shot down by love\nYou got my heart now\nWhy won't you stop now?\nI've been knocked down by you\nAnd you got my heart now\nWhy won't you stop now?\n\n"

In [None]:
# Row labels whose tag contains a newline character.
bad_tag = eda_df.index[eda_df.tag.str.contains('\n')].tolist()

In [None]:
# Remove the rows with newline-containing tags.
eda_df = eda_df.drop(index=bad_tag)

In [None]:
# Tag frequencies after dropping the malformed tags.
eda_df.tag.value_counts()

In [None]:
# Interactive histogram of tag frequencies.
# Fixes the misspelled constructor name: the trace class is go.Histogram.
trace = go.Histogram(x=eda_df.tag)
data = [trace]
py.iplot(data)