# Top Charts Exploratory Data Analysis

## Loading Dependencies

In [36]:
import pandas as pd
from collections import Counter
import altair as alt
import nltk
import regex as re

## Loading in Data

In [2]:
df = pd.read_csv('cleaned_data/all_top_songs_with_genres_nolist.csv')
# preview of dataframe
df.head()

Unnamed: 0,artist,track,genius_artist,genius_track,lyrics,genre
0,"The Weeknd, Daft Punk",Starboy,The weeknd,Starboy,"I'm tryna put you in the worst mood, ah P1 ...","r-b, pop"
1,"The Chainsmokers, Halsey",Closer,The chainsmokers,Closer,"Hey, I was doing just fine before I met you...",pop
2,Clean Bandit,Rockabye (feat. Sean Paul & Anne-Marie),Clean bandit,Rockabye,Call it love and devotion Call it a mom's a...,pop
3,"DJ Snake, Justin Bieber",Let Me Love You,Dj snake,Let me love you,I used to believe We were burnin' on the ed...,pop
4,"ZAYN, Taylor Swift",I Don’t Wanna Live Forever (Fifty Shades Darke...,Zayn,I dont wanna live forever,Been sitting eyes wide open behind these fo...,"r-b, pop"


## Cleaning Up List of Genres

In [3]:
# cleaning up the genres column on copy of dataframe
df_ = df.copy()
df_['genre'] = df_['genre'].str.split(", ")


In [4]:
# add all values to a list to generate a unique list of values
genres_list = []
for idx, value in enumerate(df_['genre']):
    genres_list.extend(value)

### Adding in Columns for genres

In [5]:
df_['pop'] = df.genre.str.contains('pop')==True
df_['rb'] = df.genre.str.contains('r-b')==True
df_['rap'] = df.genre.str.contains('rap')==True
df_['rock'] = df.genre.str.contains('rock')==True
df_['non-music'] = df.genre.str.contains('non-music')==True
df_['country'] = df.genre.str.contains('country')==True
df_['no_genre'] = df.genre.str.contains('m')==True

In [6]:
df_['pop'] = df_['pop'].astype(int)
df_['rb'] = df_['rb'].astype(int)
df_['rap'] = df_['rap'].astype(int)
df_['rock'] = df_['rock'].astype(int)
df_['non-music'] = df_['non-music'].astype(int)
df_['country'] = df_['country'].astype(int)
df_['no_genre'] = df_['no_genre'].astype(int)
df_.head()

Unnamed: 0,artist,track,genius_artist,genius_track,lyrics,genre,pop,rb,rap,rock,non-music,country,no_genre
0,"The Weeknd, Daft Punk",Starboy,The weeknd,Starboy,"I'm tryna put you in the worst mood, ah P1 ...","[r-b, pop]",1,1,0,0,0,0,0
1,"The Chainsmokers, Halsey",Closer,The chainsmokers,Closer,"Hey, I was doing just fine before I met you...",[pop],1,0,0,0,0,0,0
2,Clean Bandit,Rockabye (feat. Sean Paul & Anne-Marie),Clean bandit,Rockabye,Call it love and devotion Call it a mom's a...,[pop],1,0,0,0,0,0,0
3,"DJ Snake, Justin Bieber",Let Me Love You,Dj snake,Let me love you,I used to believe We were burnin' on the ed...,[pop],1,0,0,0,0,0,0
4,"ZAYN, Taylor Swift",I Don’t Wanna Live Forever (Fifty Shades Darke...,Zayn,I dont wanna live forever,Been sitting eyes wide open behind these fo...,"[r-b, pop]",1,1,0,0,0,0,0


In [7]:
### Saving to CSV
df_.to_csv('cleaned_data/OHE_all_top_songs.csv', index=False)

In [8]:
df_[df_['non-music'] == 1]['artist']

202         Don Omar, Zion & Lennox
586                    Lil Uzi Vert
591                    Lil Uzi Vert
614     G-Eazy, A$AP Rocky, Cardi B
700                          Eminem
750                          Eminem
758                          Eminem
763                          Eminem
767                          Eminem
769            G-Eazy, Charlie Puth
770                          Eminem
771                          Eminem
773                          Eminem
775                          Eminem
777                          Eminem
810                    Travis Scott
829                      Juice WRLD
1016                     Juice WRLD
1060                     Juice WRLD
1061                     Juice WRLD
1574                   Taylor Swift
1858                     Kanye West
2021                   Travis Scott
2093                         Eminem
2260                    Don Toliver
2312                         Eminem
2317                         Eminem
2466                    Don 

In [9]:
# drop non-music and bc they are all either having another genre or missing a genre
df_ = df_.drop(columns=['non-music'])

In [10]:
missing_genres = []
for i in range(len(df_.artist)):
    if sum(df_.iloc[i,6:11]) > 0:
        item = 0
        missing_genres.append(item)
    else:
        item = 1
        missing_genres.append(item)

In [11]:
df_['no_genre'] = missing_genres

## Visualizations

In [12]:
genre_frequencies = dict(Counter(genres_list))

genre_frequencies_df = pd.DataFrame.from_records([genre_frequencies])
genre_frequencies_df = genre_frequencies_df.rename(index={0:'counts'}).T.reset_index().rename(columns={'index':'genres'})
genre_frequencies

{'r-b': 520,
 'pop': 1912,
 'rap': 1463,
 'rock': 225,
 'non-music': 35,
 'country': 42,
 'm': 148}

In [13]:
bars = alt.Chart(data=genre_frequencies_df).mark_bar().encode(
x= 'genres',
y = 'counts',
color = 'genres'
)
text = bars.mark_text(
    align='center',
    # baseline='top',
    dy=-10 
).encode(
    text='counts:Q',
)

(bars + text).properties(height=500, width = 400,title = "Frequency of Genres on Top 200 Charts").configure_range(
    category={'scheme': 'tableau10'}
)

There seem to be data that is labeled as non-music which is strange because there shouldn't be any labeled non-music. If there is another genre listed, remove non-music

# Keyword Extraction

In [22]:
### Importing More Dependencies
from resources.word_extraction.text_cleaning import lem_stem_text
from resources.word_extraction.stopwords import remove_stopw, get_stopwords
from resources.analyze import find_keywords, find_instances

In [24]:
responses = list(df_['lyrics'])
concat_response = " ".join(df_["lyrics"].to_list())
keywords_df = find_keywords(concat_response)
responses_w_stopwords = " ".join(df_['lyrics'].to_list())
keywords_df['context_fragments'] = keywords_df['word'].progress_apply(find_instances, unaltered_string_list=responses_w_stopwords)
keywords_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unigram_df['associated_phrases'] = unigram_df['word'].apply(word_in_list, args=(keywords,))


  0%|          | 0/18 [00:00<?, ?it/s]

Unnamed: 0,word,associated_phrases,context_fragments
3,yeah,"[yeah yeah yeah, fuck yeah yeah, baby baby yea...","[...do what I got to do."", yeah So, rockabye, ..."
10,baby,"[baby baby yeah, yeah baby, bad bitch baby, ba...",[]
23,love,"[love hey yeah, love ooh ooh, baby i'ma love, ...","[...it with her face, man, I love my baby, ah ..."
52,fuck,"[fuck yeah yeah, yeah i'ma fuck, bitch wanna f...",[]
57,nigga,"[yeah niggas wanna, yeah ayy niggas, nigga fuc...","[...a motherfuckin' starboy Every day, a nigga..."
58,bitch,"[bad bitch baby, bitch wanna fuck, yeah bad bi...","[...in a week too, yah Main bitch out of your ..."
72,make,"[wanna make love, yeah make love, i'ma make, n...","[...frustration Clean Bandit, Sean-da-Paul, An..."
79,ooh,"[love ooh ooh, yeah ooh, ooh ooh yeah, christm...","[...they be like Ooh, so player ooh Everywhere..."
81,back,"[back high bitch, bitch back yeah, walk back y...",[...that you stole From your roommate back in ...
82,wanna,"[bitch wanna fuck, yeah niggas wanna, wanna ma...",[...is happening to me? I don't wanna live for...


In [15]:
df_.head()

Unnamed: 0,artist,track,genius_artist,genius_track,lyrics,genre,pop,rb,rap,rock,country,no_genre
0,"The Weeknd, Daft Punk",Starboy,The weeknd,Starboy,"I'm tryna put you in the worst mood, ah P1 ...","[r-b, pop]",1,1,0,0,0,0
1,"The Chainsmokers, Halsey",Closer,The chainsmokers,Closer,"Hey, I was doing just fine before I met you...",[pop],1,0,0,0,0,0
2,Clean Bandit,Rockabye (feat. Sean Paul & Anne-Marie),Clean bandit,Rockabye,Call it love and devotion Call it a mom's a...,[pop],1,0,0,0,0,0
3,"DJ Snake, Justin Bieber",Let Me Love You,Dj snake,Let me love you,I used to believe We were burnin' on the ed...,[pop],1,0,0,0,0,0
4,"ZAYN, Taylor Swift",I Don’t Wanna Live Forever (Fifty Shades Darke...,Zayn,I dont wanna live forever,Been sitting eyes wide open behind these fo...,"[r-b, pop]",1,1,0,0,0,0


In [16]:
df_['cleaned_lyrics'] = df_['lyrics'].str.replace('[^\w\s]','')
df_['cleaned_lyrics'] = df_['cleaned_lyrics'].str.replace('missing lyrics','')
df_['cleaned_lyrics'] = df_['cleaned_lyrics'].apply(remove_stopw)
df_['cleaned_lyrics'] = df_['cleaned_lyrics'].apply(lem_stem_text)
df_['cleaned_lyrics'] = df_.cleaned_lyrics.str.strip().str.split(' ')

In [17]:
df_

Unnamed: 0,artist,track,genius_artist,genius_track,lyrics,genre,pop,rb,rap,rock,country,no_genre,cleaned_lyrics
0,"The Weeknd, Daft Punk",Starboy,The weeknd,Starboy,"I'm tryna put you in the worst mood, ah P1 ...","[r-b, pop]",1,1,0,0,0,0,"[im, tryna, worst, mood, ah, p1, cleaner, chur..."
1,"The Chainsmokers, Halsey",Closer,The chainsmokers,Closer,"Hey, I was doing just fine before I met you...",[pop],1,0,0,0,0,0,"[hey, fine, met, drink, issu, im, hey, friend,..."
2,Clean Bandit,Rockabye (feat. Sean Paul & Anne-Marie),Clean bandit,Rockabye,Call it love and devotion Call it a mom's a...,[pop],1,0,0,0,0,0,"[call, love, devot, call, mom, ador, foundat, ..."
3,"DJ Snake, Justin Bieber",Let Me Love You,Dj snake,Let me love you,I used to believe We were burnin' on the ed...,[pop],1,0,0,0,0,0,"[burnin, edg, somethin, beauti, somethin, beau..."
4,"ZAYN, Taylor Swift",I Don’t Wanna Live Forever (Fifty Shades Darke...,Zayn,I dont wanna live forever,Been sitting eyes wide open behind these fo...,"[r-b, pop]",1,1,0,0,0,0,"[sit, eye, wide, wall, hope, youd, call, cruel..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3419,Sfera Ebbasta,Hollywood (feat. Diplo),Sfera ebbasta,Hollywood,La mia ex mi scrive Mi dice che le manco ...,[pop],1,0,0,0,0,0,"[scrive, dice, che, manco, da, morir, quando, ..."
3420,Trio Vegabajeño,CANTARES DE NAVIDAD,Trio vegabajeno,Cantares de navidad,"Navidad que vuelve, tradición del año Uno...",[pop],1,0,0,0,0,0,"[navidad, vuelv, tradición, año, van, alegr, v..."
3421,"Camilo, El Alfa",BEBÉ,Camilo,Bebe,"Un, dos, tres y El la'o de tu cama que ...",[pop],1,0,0,0,0,0,"[do, tre, lao, cama, calient, está, congelando..."
3422,Taylor Swift,long story short,Taylor swift,Long story short,missing lyrics,[m],0,0,0,0,0,1,[]


In [18]:
## getting a list of all lemmed and stemmed keywords without stopwords
lyrics_wordlist = df_['cleaned_lyrics'].tolist()
words_list = []
for i in lyrics_wordlist:
    words_list.extend(i)
len(words_list)

579547

In [19]:
# Creating a DataFrame of the Word Counts
lyric_word_frequencies = pd.DataFrame.from_dict(Counter(words_list), orient = 'index').reset_index()
lyric_word_frequencies = lyric_word_frequencies.rename(columns={'index':'word', 0:'count'})
lyric_word_frequencies = lyric_word_frequencies.sort_values(by = "count", ascending = False)
lyric_word_frequencies

Unnamed: 0,word,count
0,im,15249
214,yeah,9620
40,dont,8962
33,love,6381
34,babi,4687
...,...,...
23310,børn,1
23307,lore,1
23306,yoke,1
23305,esso,1


In [20]:
lyric_word_frequencies.head(20)

Unnamed: 0,word,count
0,im,15249
214,yeah,9620
40,dont,8962
33,love,6381
34,babi,4687
52,nigga,4462
19,bitch,3959
134,aint,3938
250,wanna,3580
247,feel,3389


In [46]:
lyric_word_frequencies.to_csv('cleaned_data/lyric_word_frequencies.csv', index = False)

In [44]:
top_100 = lyric_word_frequencies[:100]
top_100

Unnamed: 0,word,count
0,im,15249
214,yeah,9620
40,dont,8962
33,love,6381
34,babi,4687
...,...,...
456,fuckin,673
306,chang,667
723,diamond,653
1202,má,645


In [43]:
top_100['context_fragments'] = top_100['word'].progress_apply(find_instances, unaltered_string_list=responses_w_stopwords)
top_100

  0%|          | 0/100 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_100['context_fragments'] = top_100['word'].progress_apply(find_instances, unaltered_string_list=responses_w_stopwords)


Unnamed: 0,word,count,context_fragments
0,im,15249,[]
214,yeah,9620,"[...do what I got to do."", yeah So, rockabye, ..."
40,dont,8962,"[...life shit, niggas make one I dont c...]"
33,love,6381,"[...it with her face, man, I love my baby, ah ..."
34,babi,4687,[]
...,...,...,...
456,fuckin,673,[]
306,chang,667,[]
723,diamond,653,[]
1202,má,645,[]
