# Top Charts Exploratory Data Analysis

## Loading Dependencies

In [191]:
import pandas as pd
from collections import Counter
import altair as alt
import nltk
import regex as re

## Loading in Data

In [192]:
df = pd.read_csv('cleaned_data/all_top_songs_with_genres_nolist.csv')
# preview of dataframe
df.head()

Unnamed: 0,artist,track,genius_artist,genius_track,lyrics,genre
0,"The Weeknd, Daft Punk",Starboy,The weeknd,Starboy,"I'm tryna put you in the worst mood, ah P1 ...","r-b, pop"
1,"The Chainsmokers, Halsey",Closer,The chainsmokers,Closer,"Hey, I was doing just fine before I met you...",pop
2,Clean Bandit,Rockabye (feat. Sean Paul & Anne-Marie),Clean bandit,Rockabye,Call it love and devotion Call it a mom's a...,pop
3,"DJ Snake, Justin Bieber",Let Me Love You,Dj snake,Let me love you,I used to believe We were burnin' on the ed...,pop
4,"ZAYN, Taylor Swift",I Don’t Wanna Live Forever (Fifty Shades Darke...,Zayn,I dont wanna live forever,Been sitting eyes wide open behind these fo...,"r-b, pop"


## Cleaning Up List of Genres

In [193]:
# cleaning up the genres column on copy of dataframe
df_ = df.copy()
df_['genre'] = df_['genre'].str.split(", ")


In [194]:
# add all values to a list to generate a unique list of values
genres_list = []
for idx, value in enumerate(df_['genre']):
    genres_list.extend(value)

### Adding in Columns for genres

In [195]:
df_['pop'] = df.genre.str.contains('pop')==True
df_['rb'] = df.genre.str.contains('r-b')==True
df_['rap'] = df.genre.str.contains('rap')==True
df_['rock'] = df.genre.str.contains('rock')==True
df_['non-music'] = df.genre.str.contains('non-music')==True
df_['country'] = df.genre.str.contains('country')==True
df_['no_genre'] = df.genre.str.contains('m')==True

In [196]:
df_['pop'] = df_['pop'].astype(int)
df_['rb'] = df_['rb'].astype(int)
df_['rap'] = df_['rap'].astype(int)
df_['rock'] = df_['rock'].astype(int)
df_['non-music'] = df_['non-music'].astype(int)
df_['country'] = df_['country'].astype(int)
df_['no_genre'] = df_['no_genre'].astype(int)
df_.head()

Unnamed: 0,artist,track,genius_artist,genius_track,lyrics,genre,pop,rb,rap,rock,non-music,country,no_genre
0,"The Weeknd, Daft Punk",Starboy,The weeknd,Starboy,"I'm tryna put you in the worst mood, ah P1 ...","[r-b, pop]",1,1,0,0,0,0,0
1,"The Chainsmokers, Halsey",Closer,The chainsmokers,Closer,"Hey, I was doing just fine before I met you...",[pop],1,0,0,0,0,0,0
2,Clean Bandit,Rockabye (feat. Sean Paul & Anne-Marie),Clean bandit,Rockabye,Call it love and devotion Call it a mom's a...,[pop],1,0,0,0,0,0,0
3,"DJ Snake, Justin Bieber",Let Me Love You,Dj snake,Let me love you,I used to believe We were burnin' on the ed...,[pop],1,0,0,0,0,0,0
4,"ZAYN, Taylor Swift",I Don’t Wanna Live Forever (Fifty Shades Darke...,Zayn,I dont wanna live forever,Been sitting eyes wide open behind these fo...,"[r-b, pop]",1,1,0,0,0,0,0


In [197]:
### Saving to CSV
df_.to_csv('cleaned_data/OHE_all_top_songs.csv', index=False)

In [198]:
df_[df_['non-music'] == 1]['artist']

202         Don Omar, Zion & Lennox
586                    Lil Uzi Vert
591                    Lil Uzi Vert
614     G-Eazy, A$AP Rocky, Cardi B
700                          Eminem
750                          Eminem
758                          Eminem
763                          Eminem
767                          Eminem
769            G-Eazy, Charlie Puth
770                          Eminem
771                          Eminem
773                          Eminem
775                          Eminem
777                          Eminem
810                    Travis Scott
829                      Juice WRLD
1016                     Juice WRLD
1060                     Juice WRLD
1061                     Juice WRLD
1574                   Taylor Swift
1858                     Kanye West
2021                   Travis Scott
2093                         Eminem
2260                    Don Toliver
2312                         Eminem
2317                         Eminem
2466                    Don 

In [199]:
# drop non-music and bc they are all either having another genre or missing a genre
df_ = df_.drop(columns=['non-music'])

In [200]:
missing_genres = []
for i in range(len(df_.artist)):
    if sum(df_.iloc[i,6:11]) > 0:
        item = 0
        missing_genres.append(item)
    else:
        item = 1
        missing_genres.append(item)

In [201]:
df_['no_genre'] = missing_genres

## Visualizations

In [202]:
genre_frequencies = dict(Counter(genres_list))
genre_frequencies

{'r-b': 520,
 'pop': 1912,
 'rap': 1463,
 'rock': 225,
 'non-music': 35,
 'country': 42,
 'm': 148}

In [203]:
genre_frequencies_df = pd.DataFrame.from_records([genre_frequencies])
genre_frequencies_df = genre_frequencies_df.rename(index={0:'counts'}).T.reset_index().rename(columns={'index':'genres'})
genre_frequencies_df = genre_frequencies_df[genre_frequencies_df['genres'].isin(['r-b', 'pop', 'rap', 'rock', 'country'])]
genre_frequencies_df.to_csv('cleaned_data/genre_song_counts.csv', index = False)

In [204]:
bars = alt.Chart(data=genre_frequencies_df).mark_bar().encode(
x= 'genres',
y = 'counts',
color = 'genres'
)
text = bars.mark_text(
    align='center',
    # baseline='top',
    dy=-10 
).encode(
    text='counts:Q',
)

(bars + text).properties(height=500, width = 400,title = "Frequency of Genres on Top 200 Charts").configure_range(
    category={'scheme': 'tableau10'}
)

There seem to be data that is labeled as non-music which is strange because there shouldn't be any labeled non-music. If there is another genre listed, remove non-music

# Keyword Extraction of all Genres

In [205]:
### Importing More Dependencies
from resources.word_extraction.text_cleaning import lem_stem_text
from resources.word_extraction.stopwords import remove_stopw, get_stopwords
from resources.analyze import find_keywords, find_instances

In [206]:
df_['cleaned_lyrics'] = df_['lyrics'].str.replace('[^\w\s]','')
df_['cleaned_lyrics'] = df_['cleaned_lyrics'].str.replace('missing lyrics','')
df_['cleaned_lyrics'] = df_['cleaned_lyrics'].apply(remove_stopw)
df_['cleaned_lyrics'] = df_['cleaned_lyrics'].apply(lem_stem_text)
df_['cleaned_lyrics'] = df_.cleaned_lyrics.str.strip().str.split(' ')

In [207]:
df_

Unnamed: 0,artist,track,genius_artist,genius_track,lyrics,genre,pop,rb,rap,rock,country,no_genre,cleaned_lyrics
0,"The Weeknd, Daft Punk",Starboy,The weeknd,Starboy,"I'm tryna put you in the worst mood, ah P1 ...","[r-b, pop]",1,1,0,0,0,0,"[im, tryna, worst, mood, ah, p1, cleaner, chur..."
1,"The Chainsmokers, Halsey",Closer,The chainsmokers,Closer,"Hey, I was doing just fine before I met you...",[pop],1,0,0,0,0,0,"[hey, fine, met, drink, issu, im, hey, friend,..."
2,Clean Bandit,Rockabye (feat. Sean Paul & Anne-Marie),Clean bandit,Rockabye,Call it love and devotion Call it a mom's a...,[pop],1,0,0,0,0,0,"[call, love, devot, call, mom, ador, foundat, ..."
3,"DJ Snake, Justin Bieber",Let Me Love You,Dj snake,Let me love you,I used to believe We were burnin' on the ed...,[pop],1,0,0,0,0,0,"[burnin, edg, somethin, beauti, somethin, beau..."
4,"ZAYN, Taylor Swift",I Don’t Wanna Live Forever (Fifty Shades Darke...,Zayn,I dont wanna live forever,Been sitting eyes wide open behind these fo...,"[r-b, pop]",1,1,0,0,0,0,"[sit, eye, wide, wall, hope, youd, call, cruel..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3419,Sfera Ebbasta,Hollywood (feat. Diplo),Sfera ebbasta,Hollywood,La mia ex mi scrive Mi dice che le manco ...,[pop],1,0,0,0,0,0,"[scrive, dice, che, manco, da, morir, quando, ..."
3420,Trio Vegabajeño,CANTARES DE NAVIDAD,Trio vegabajeno,Cantares de navidad,"Navidad que vuelve, tradición del año Uno...",[pop],1,0,0,0,0,0,"[navidad, vuelv, tradición, año, van, alegr, v..."
3421,"Camilo, El Alfa",BEBÉ,Camilo,Bebe,"Un, dos, tres y El la'o de tu cama que ...",[pop],1,0,0,0,0,0,"[do, tre, lao, cama, calient, está, congelando..."
3422,Taylor Swift,long story short,Taylor swift,Long story short,missing lyrics,[m],0,0,0,0,0,1,[]


In [208]:
## getting a list of all lemmed and stemmed keywords without stopwords
lyrics_wordlist = df_['cleaned_lyrics'].tolist()
words_list = []
for i in lyrics_wordlist:
    words_list.extend(i)
len(words_list)

579547

In [209]:
# Creating a DataFrame of the Word Counts
lyric_word_frequencies = pd.DataFrame.from_dict(Counter(words_list), orient = 'index').reset_index()
lyric_word_frequencies = lyric_word_frequencies.rename(columns={'index':'word', 0:'count'})
lyric_word_frequencies = lyric_word_frequencies.sort_values(by = "count", ascending = False)
lyric_word_frequencies

Unnamed: 0,word,count
0,im,15249
214,yeah,9620
40,dont,8962
33,love,6381
34,babi,4687
...,...,...
23310,børn,1
23307,lore,1
23306,yoke,1
23305,esso,1


In [210]:
lyric_word_frequencies.head(20)

Unnamed: 0,word,count
0,im,15249
214,yeah,9620
40,dont,8962
33,love,6381
34,babi,4687
52,nigga,4462
19,bitch,3959
134,aint,3938
250,wanna,3580
247,feel,3389


In [211]:
lyric_word_frequencies.to_csv('cleaned_data/lyric_word_frequencies.csv', index = False)

In [212]:
top_100 = lyric_word_frequencies[:100]
top_100

Unnamed: 0,word,count
0,im,15249
214,yeah,9620
40,dont,8962
33,love,6381
34,babi,4687
...,...,...
456,fuckin,673
306,chang,667
723,diamond,653
1202,má,645


## Top Words by Genre

In [213]:
pd.Series(genres_list).unique()

array(['r-b', 'pop', 'rap', 'rock', 'non-music', 'country', 'm'],
      dtype=object)

In [214]:
pop = df_[df_['pop'] == 1]
rb = df_[df_['rb'] == 1]
rap = df_[df_['rap'] == 1]
rock = df_[df_['rock'] == 1]
country = df_[df_['country'] == 1]
m = df_[df_['no_genre'] == 1]

In [215]:
def top_lyrics(df, dfname):
    '''Function to find the top lyric unigrams based on a df containing lyrics'''
    ## getting a list of all lemmed and stemmed keywords without stopwords
    lyrics_wordlist = df['cleaned_lyrics'].tolist()
    words_list = []
    for i in lyrics_wordlist:
        words_list.extend(i)
    len(words_list)
    # Creating a DataFrame of the Word Counts
    lyric_word_frequencies = pd.DataFrame.from_dict(Counter(words_list), orient = 'index').reset_index()
    lyric_word_frequencies = lyric_word_frequencies.rename(columns={'index':'word', 0:'count'})
    lyric_word_frequencies = lyric_word_frequencies.sort_values(by = "count", ascending = False)
    lyric_word_frequencies['genre'] = dfname
    return lyric_word_frequencies

In [216]:
rb_lyrics = top_lyrics(rb, 'r-b')[:15]
rb_lyrics

Unnamed: 0,word,count,genre
0,im,2541,r-b
40,dont,1932,r-b
145,yeah,1915,r-b
33,love,1532,r-b
34,babi,1044,r-b
175,your,760,r-b
109,feel,703,r-b
113,wanna,698,r-b
138,aint,681,r-b
91,girl,664,r-b


In [217]:
pop_lyrics = top_lyrics(pop, 'pop')[:15]

In [218]:
country_lyrics = top_lyrics(country, 'country')[:15]

In [219]:
rock_lyrics = top_lyrics(rock, 'rock')[:15]

In [220]:
rap_lyrics = top_lyrics(rap, 'rap')[:15]

In [232]:
full_lyrics = pd.concat([pop_lyrics,country_lyrics,rock_lyrics,rap_lyrics,rb_lyrics])
full_lyrics

Unnamed: 0,word,count,genre
0,im,7016,pop
40,dont,4766,pop
33,love,4693,pop
214,yeah,4178,pop
34,babi,3043,pop
...,...,...,...
116,ill,478,r-b
111,night,474,r-b
223,fuck,438,r-b
127,life,395,r-b


In [233]:
full_lyrics.to_csv('cleaned_data/lyric_frequencies/top15_all_genres_lyric_frequencies.csv', index = False)

## Top Songs By Genre 

I forgot to get the top songs by genre streams so I am re importing the top 200 files and the previously created OHE (one-hot-encoded) df to create a new df with the streams

In [2]:
import pandas as pd

In [8]:
## OTHER MISC DATA CLEANING 
df1 = pd.read_csv('/Users/daphneyang/Desktop/5YMIDS_SP21/w209/spotify-visualizations/cleaned_data/2017_weekly_all_locations_top200.csv')
df2 = pd.read_csv('/Users/daphneyang/Desktop/5YMIDS_SP21/w209/spotify-visualizations/cleaned_data/2018_weekly_all_locations_top200.csv')
df3 = pd.read_csv('/Users/daphneyang/Desktop/5YMIDS_SP21/w209/spotify-visualizations/cleaned_data/2019_weekly_all_locations_top200.csv')
df4 = pd.read_csv('/Users/daphneyang/Desktop/5YMIDS_SP21/w209/spotify-visualizations/cleaned_data/2020_weekly_all_locations_top200.csv')
df = pd.concat([df1, df2, df3, df4])
df['streams'] = df['streams'].str.replace(",", '').astype(int)
global_df = df[df['country_chart'].str.contains("Global")]
global_df_total = global_df.groupby(["track", 'spotify_link']).sum().reset_index()
lyrics_df = pd.read_csv('/Users/daphneyang/Desktop/5YMIDS_SP21/w209/spotify-visualizations/cleaned_data/OHE_all_top_songs.csv')
merged_df = pd.merge(lyrics_df, global_df_total, "inner", on = "track")
merged_df = merged_df.rename(columns={'streams': "total_streams"})
merged_df

Unnamed: 0,artist,track,genius_artist,genius_track,lyrics,genre,pop,rb,rap,rock,non-music,country,no_genre,spotify_link,rank,total_streams
0,"The Weeknd, Daft Punk",Starboy,The weeknd,Starboy,"I'm tryna put you in the worst mood, ah P1 ...","['r-b', 'pop']",1,1,0,0,0,0,0,https://open.spotify.com/track/5aAx2yezTd8zXrk...,6529.0,486509487
1,"The Weeknd, Daft Punk",Starboy,The weeknd,Starboy,"I'm tryna put you in the worst mood, ah P1 ...","['r-b', 'pop']",1,1,0,0,0,0,0,https://open.spotify.com/track/7MXVkk9YMctZqd1...,912.0,24036547
2,"The Chainsmokers, Halsey",Closer,The chainsmokers,Closer,"Hey, I was doing just fine before I met you...",['pop'],1,0,0,0,0,0,0,https://open.spotify.com/track/7BKLCZ1jbUBVqRi...,19446.0,910179014
3,Clean Bandit,Rockabye (feat. Sean Paul & Anne-Marie),Clean bandit,Rockabye,Call it love and devotion Call it a mom's a...,['pop'],1,0,0,0,0,0,0,https://open.spotify.com/track/5knuzwU65gJK7IF...,4366.0,479452880
4,"DJ Snake, Justin Bieber",Let Me Love You,Dj snake,Let me love you,I used to believe We were burnin' on the ed...,['pop'],1,0,0,0,0,0,0,https://open.spotify.com/track/4pdPtRcBmOSQDlJ...,4072.0,291862091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4231,Sfera Ebbasta,Hollywood (feat. Diplo),Sfera ebbasta,Hollywood,La mia ex mi scrive Mi dice che le manco ...,['pop'],1,0,0,0,0,0,0,https://open.spotify.com/track/4CvW0iHAs7yJhbp...,192.0,4920218
4232,Trio Vegabajeño,CANTARES DE NAVIDAD,Trio vegabajeno,Cantares de navidad,"Navidad que vuelve, tradición del año Uno...",['pop'],1,0,0,0,0,0,0,https://open.spotify.com/track/2Iv6YBslkxslMJQ...,171.0,5842320
4233,"Camilo, El Alfa",BEBÉ,Camilo,Bebe,"Un, dos, tres y El la'o de tu cama que ...",['pop'],1,0,0,0,0,0,0,https://open.spotify.com/track/7D7EH7MGyNHWSkq...,308.0,26124781
4234,Taylor Swift,long story short,Taylor swift,Long story short,missing lyrics,['m'],0,0,0,0,0,0,1,https://open.spotify.com/track/0vVMlbdYx2080Oa...,45.0,13583538


In [48]:
pop = merged_df[merged_df['pop'] == 1][['track', 'artist', 'total_streams', 'spotify_link']].reset_index(drop=True).sort_values(by=['total_streams'], ascending = False)[:11]
pop['genre'] = 'pop'
rb = merged_df[merged_df['rb'] == 1][['track', 'artist', 'total_streams', 'spotify_link']].reset_index(drop=True).sort_values(by=['total_streams'], ascending = False)[:11]
rb['genre'] = 'r-b'
rap = merged_df[merged_df['rap'] == 1][['track', 'artist', 'total_streams', 'spotify_link']].reset_index(drop=True).sort_values(by=['total_streams'], ascending = False)[:13]
rap['genre'] = 'rap'
rock = merged_df[merged_df['rock'] == 1][['track', 'artist', 'total_streams', 'spotify_link']].reset_index(drop=True).sort_values(by=['total_streams'], ascending = False)[:13]
rock['genre'] = 'rock'
country = merged_df[merged_df['country'] == 1][['track', 'artist', 'total_streams', 'spotify_link']].reset_index(drop=True).sort_values(by=['total_streams'], ascending = False)[:12]
country['genre'] = 'country'
df_output = pd.concat([pop, rb, rap, rock, country])

In [49]:
df_output

Unnamed: 0,track,artist,total_streams,spotify_link,genre
164,Shape of You,Ed Sheeran,2561858449,https://open.spotify.com/track/7qiZfU4dY1lWllz...,pop
1181,Dance Monkey,Tones And I,1977350595,https://open.spotify.com/track/1rgnBhdG2JDFTbY...,pop
984,bad guy,Billie Eilish,1626562811,https://open.spotify.com/track/2Fxmhks0bxGSBdJ...,pop
232,Perfect,Ed Sheeran,1525083056,https://open.spotify.com/track/0tgVpDi06FyKpA1...,pop
887,Someone You Loved,Lewis Capaldi,1460020967,https://open.spotify.com/track/7qEHsqek33rTcFN...,pop
751,SAD!,XXXTENTACION,1384600058,https://open.spotify.com/track/3ee8Jmje8o58CHK...,pop
1132,Señorita,"Shawn Mendes, Camila Cabello",1341350079,https://open.spotify.com/track/6v3KW9xbzN5yKLt...,pop
7,Say You Won't Let Go,James Arthur,1297986004,https://open.spotify.com/track/5uCax9HTNlzGybI...,pop
237,Happier,"Marshmello, Bastille",1294102267,https://open.spotify.com/track/2dpaYNEQHiRxtZb...,pop
235,Happier,Ed Sheeran,1294102267,https://open.spotify.com/track/2dpaYNEQHiRxtZb...,pop


In [55]:
df_output.iloc[59][3]

'https://open.spotify.com/track/5f1joOtoMeyppIcJGZQvqJ'

In [24]:
# Change all links to embed links
df_output.to_csv('../cleaned_data/top10_by_genre_all_time.csv', index = False)

### Creating All Topic Songs With Years 

In [1]:
import pandas as pd 

df1 = pd.read_csv('/Users/daphneyang/Desktop/5YMIDS_SP21/w209/spotify-visualizations/cleaned_data/2017_weekly_all_locations_top200.csv')
df1['year'] = '2017'
df2 = pd.read_csv('/Users/daphneyang/Desktop/5YMIDS_SP21/w209/spotify-visualizations/cleaned_data/2018_weekly_all_locations_top200.csv')
df2['year'] = '2018'
df3 = pd.read_csv('/Users/daphneyang/Desktop/5YMIDS_SP21/w209/spotify-visualizations/cleaned_data/2019_weekly_all_locations_top200.csv')
df3['year'] = '2019'
df4 = pd.read_csv('/Users/daphneyang/Desktop/5YMIDS_SP21/w209/spotify-visualizations/cleaned_data/2020_weekly_all_locations_top200.csv')
df4['year'] = '2020'

In [2]:
df = pd.concat([df1, df2, df3, df4])

all_locations_df_max = df.groupby(["track", 'artist','country_chart', 'year']).max().reset_index()[['track','artist',"year", 'streams', "country_chart",'spotify_link']]

all_locations_df_max

Unnamed: 0,track,artist,year,streams,country_chart,spotify_link
0,!,O.S.T.R.,2018,63895,Poland,https://open.spotify.com/track/5dwTUHjTcaMMkCD...
1,!,"Samey, Gleb",2019,35871,Czech Republic,https://open.spotify.com/track/1A05ibu1DXGIt0F...
2,!,"Samey, Gleb",2019,59254,Slovakia,https://open.spotify.com/track/1A05ibu1DXGIt0F...
3,!,"Samey, Gleb",2020,13698,Slovakia,https://open.spotify.com/track/1A05ibu1DXGIt0F...
4,!,Trippie Redd,2019,268954,Canada,https://open.spotify.com/track/1qtHdSxiavDgUPZ...
...,...,...,...,...,...,...
240691,항상 (HANGSANG),"j-hope, Supreme Boi",2018,75326,Indonesia,https://open.spotify.com/track/4mYu3kfBCW6qiTD...
240692,항상 (HANGSANG),"j-hope, Supreme Boi",2018,2705,Lithuania,https://open.spotify.com/track/4mYu3kfBCW6qiTD...
240693,항상 (HANGSANG),"j-hope, Supreme Boi",2018,36333,Malaysia,https://open.spotify.com/track/4mYu3kfBCW6qiTD...
240694,항상 (HANGSANG),"j-hope, Supreme Boi",2018,27905,Taiwan,https://open.spotify.com/track/4mYu3kfBCW6qiTD...


In [3]:
all_locations_df_max.to_csv("cleaned_data/2017_2020_all_locations_max_streams.csv", index = False)