In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
dta = pd.read_csv("master-process-data.csv")

In [10]:
subset = dta[['genre','lyrics']]
dta.dropna(subset=['genre', 'lyrics'], inplace=True)

#replace new line with space
dta = dta.replace({'\n': ' '}, regex=True)

#only have strings as lyrics
dta = dta[dta['lyrics'].apply(type) == str]

In [11]:
#Songs per genre
print(dta['genre'].value_counts())

Rock             109221
rap               53557
Pop               40466
Hip-Hop           24849
Not Available     23934
Metal             23759
Country           14387
Jazz               7971
Electronic         7966
Other              5189
R&B                3401
Indie              3149
Folk               2243
Name: genre, dtype: int64


In [10]:
#Average length of songs in each genre
dta['word_num'] = dta['lyrics'].str.split().str.len()
avg_len = dta.groupby('genre')['word_num'].mean()
dta = dta.drop(columns = ['word_num'])
print(avg_len)

genre
Country          185.787099
Electronic       193.070550
Folk             180.422648
Hip-Hop          489.371645
Indie            196.710384
Jazz             171.952954
Metal            168.438108
Not Available    209.335798
Other            223.667566
Pop              245.412050
R&B              224.786827
Rock             190.341143
rap              310.597662
Name: word_num, dtype: float64


In [117]:
#most common metal word
print("Most Common Metal Words: ")
Counter("Most common metal: \n ".join(dta.loc[dta['genre']=='Metal','lyrics']).split()).most_common(100)

Most Common Metal Words: 


[('the', 210628),
 ('i', 94608),
 ('to', 92835),
 ('of', 86348),
 ('you', 82233),
 ('and', 77236),
 ('a', 66455),
 ('in', 60893),
 ('my', 55688),
 ('your', 43802),
 ('me', 42062),
 ('is', 39997),
 ('for', 32104),
 ('it', 29504),
 ('all', 27296),
 ('we', 25380),
 ('on', 25226),
 ('this', 24877),
 ('that', 24170),
 ('common', 23942),
 ('metal:', 23758),
 ('will', 22767),
 ('with', 22007),
 ('be', 21276),
 ('no', 21137),
 ("i'm", 18065),
 ('are', 18004),
 ('so', 17830),
 ('from', 17778),
 ('but', 15340),
 ('now', 15040),
 ('what', 14323),
 ('as', 13714),
 ("it's", 13663),
 ('life', 13445),
 ('one', 12876),
 ('time', 12860),
 ('by', 12593),
 ("don't", 12582),
 ('they', 12462),
 ('not', 12287),
 ('see', 12222),
 ('our', 12174),
 ('never', 12069),
 ('like', 12005),
 ('when', 11622),
 ('have', 11499),
 ('can', 11465),
 ('out', 11201),
 ('just', 10989),
 ('know', 10474),
 ('up', 10159),
 ('was', 9793),
 ('down', 9731),
 ('at', 9030),
 ('world', 8335),
 ('do', 8230),
 ('am', 8211),
 ('their', 8

In [118]:
#most common rap word
print("Most Common Rap Words: ")
Counter(" ".join(dta.loc[dta['genre']=='rap','lyrics']).split()).most_common(100)

Most Common Rap Words: 


[('the', 581723),
 ('i', 581385),
 ('you', 543237),
 ('and', 341599),
 ('to', 336701),
 ('a', 327513),
 ('me', 269735),
 ('my', 248757),
 ('it', 225884),
 ('in', 202607),
 ('on', 167148),
 ('that', 166719),
 ("i'm", 158552),
 ('of', 140781),
 ('your', 139002),
 ('all', 114892),
 ('like', 113253),
 ('be', 111492),
 ('for', 110151),
 ('we', 106683),
 ('love', 106463),
 ("don't", 106261),
 ('is', 105542),
 ('with', 102690),
 ('know', 101093),
 ('up', 99786),
 ('so', 93774),
 ('but', 92127),
 ('yeah', 91588),
 ('got', 89995),
 ('no', 89927),
 ('just', 87898),
 ('this', 82071),
 ('get', 80879),
 ("it's", 78804),
 ('oh', 75817),
 ('when', 75768),
 ('what', 72382),
 ('do', 71217),
 ('now', 65151),
 ('they', 64454),
 ('baby', 64044),
 ('if', 62639),
 ('can', 62202),
 ('out', 60836),
 ('she', 56029),
 ('was', 55450),
 ('go', 54723),
 ('down', 50390),
 ('one', 49764),
 ("ain't", 49189),
 ('see', 46677),
 ('time', 46552),
 ('never', 46423),
 ("you're", 45625),
 ('want', 44823),
 ("can't", 44071),

In [119]:
#most common rock word
print("Most Common Rock Words: ")
Counter(" ".join(dta.loc[dta['genre']=='Rock','lyrics']).split()).most_common(100)

Most Common Rock Words: 


[('the', 809641),
 ('i', 656646),
 ('you', 653628),
 ('to', 475124),
 ('and', 469616),
 ('a', 402732),
 ('me', 296258),
 ('in', 266901),
 ('my', 249782),
 ('it', 235627),
 ('of', 230703),
 ('your', 198446),
 ('on', 176022),
 ('that', 173440),
 ('all', 162517),
 ("i'm", 154407),
 ('is', 150609),
 ('for', 140552),
 ('be', 134327),
 ('we', 125082),
 ('so', 124034),
 ("don't", 123951),
 ('but', 118421),
 ("it's", 113100),
 ('no', 111772),
 ('know', 109285),
 ('just', 104807),
 ('this', 103436),
 ('with', 101522),
 ('love', 101394),
 ('when', 92817),
 ('like', 92004),
 ('what', 91059),
 ('up', 84689),
 ('can', 82579),
 ('now', 81113),
 ('oh', 78889),
 ('down', 77905),
 ('out', 77462),
 ('do', 75551),
 ("you're", 74413),
 ('got', 73020),
 ('time', 72931),
 ('are', 72930),
 ('if', 72181),
 ('was', 69977),
 ('never', 67879),
 ('one', 67405),
 ('will', 67286),
 ('see', 65942),
 ('not', 64567),
 ('get', 64453),
 ('go', 64414),
 ('have', 59884),
 ('she', 59340),
 ('they', 58957),
 ('want', 58198)

In [120]:
#most common R&B word
print("Most Common R&B Words: ")
Counter(" ".join(dta.loc[dta['genre']=='R&B','lyrics']).split()).most_common(100)

Most Common R&B Words: 


[('you', 32919),
 ('i', 29075),
 ('the', 23521),
 ('to', 18520),
 ('and', 16474),
 ('me', 13747),
 ('a', 13241),
 ('my', 10566),
 ('it', 10561),
 ('in', 8419),
 ('that', 7579),
 ('your', 7550),
 ('love', 7292),
 ('of', 6988),
 ('on', 6863),
 ("i'm", 6468),
 ('be', 6041),
 ('all', 5877),
 ("don't", 5796),
 ('for', 5545),
 ('baby', 5439),
 ('know', 5260),
 ('is', 5249),
 ('we', 5234),
 ('so', 5213),
 ('just', 4964),
 ('no', 4735),
 ('oh', 4672),
 ('do', 4518),
 ('but', 4450),
 ('with', 3970),
 ('when', 3942),
 ("it's", 3891),
 ('what', 3698),
 ('can', 3669),
 ('got', 3618),
 ('like', 3492),
 ('this', 3435),
 ('now', 3418),
 ('get', 3147),
 ('want', 3124),
 ('if', 3108),
 ("you're", 3085),
 ('yeah', 3048),
 ('up', 2820),
 ('time', 2726),
 ("can't", 2693),
 ('go', 2682),
 ('one', 2640),
 ('was', 2596),
 ('have', 2521),
 ('never', 2495),
 ('see', 2461),
 ('make', 2403),
 ('out', 2397),
 ('let', 2373),
 ('down', 2319),
 ('not', 2272),
 ('girl', 2257),
 ('come', 2241),
 ('say', 2223),
 ('will

In [121]:
#most common Hip-Hop word
print("Most Common Hip-Hop Words: ")
Counter(" ".join(dta.loc[dta['genre']=='Hip-Hop','lyrics']).split()).most_common(100)

Most Common Hip-Hop Words: 


[('the', 450218),
 ('i', 353479),
 ('you', 308374),
 ('a', 250858),
 ('and', 234834),
 ('to', 223949),
 ('my', 170342),
 ('it', 159682),
 ('me', 154743),
 ('in', 147375),
 ("i'm", 120826),
 ('on', 118469),
 ('that', 111389),
 ('like', 98684),
 ('your', 88240),
 ('of', 86955),
 ('we', 84634),
 ('with', 82997),
 ('up', 82735),
 ('get', 74615),
 ('for', 73990),
 ('is', 71635),
 ('all', 68589),
 ('got', 67075),
 ('be', 65619),
 ('so', 63652),
 ('this', 62253),
 ("don't", 61965),
 ('know', 61145),
 ('but', 59970),
 ('they', 57558),
 ('no', 56305),
 ('what', 50052),
 ('just', 49478),
 ('when', 49221),
 ("it's", 48499),
 ('out', 47113),
 ('do', 43763),
 ("ain't", 43180),
 ('now', 42529),
 ('nigga', 41514),
 ('if', 40561),
 ('can', 37785),
 ('shit', 35739),
 ('was', 35513),
 ('go', 34011),
 ('see', 33939),
 ('she', 33621),
 ('from', 32929),
 ('down', 32711),
 ('love', 31559),
 ('yeah', 31334),
 ('back', 30983),
 ('niggas', 29988),
 ('ya', 29538),
 ('make', 29401),
 ('at', 29294),
 ('one', 2875

In [122]:
#most common Indie word
print("Most Common Indie Words: ")
Counter(" ".join(dta.loc[dta['genre']=='Indie','lyrics']).split()).most_common(100)

Most Common Indie Words: 


[('the', 24740),
 ('i', 21569),
 ('you', 19957),
 ('and', 15990),
 ('to', 13959),
 ('a', 11280),
 ('me', 8709),
 ('in', 8265),
 ('my', 8157),
 ('it', 7041),
 ('of', 6594),
 ('your', 6177),
 ('that', 5677),
 ('all', 5071),
 ('on', 4788),
 ("i'm", 4507),
 ('for', 4348),
 ('be', 4220),
 ('is', 4205),
 ('we', 4125),
 ('but', 3907),
 ("don't", 3814),
 ('know', 3757),
 ('so', 3727),
 ('love', 3190),
 ("it's", 3148),
 ('with', 3143),
 ('like', 3060),
 ('just', 3033),
 ('when', 3030),
 ('no', 2893),
 ('this', 2865),
 ('up', 2861),
 ('what', 2823),
 ('oh', 2813),
 ('if', 2605),
 ('are', 2470),
 ('out', 2388),
 ('can', 2307),
 ('was', 2276),
 ('will', 2262),
 ("you're", 2213),
 ('not', 2211),
 ('now', 2181),
 ('have', 2138),
 ('down', 2119),
 ('do', 2119),
 ('go', 2085),
 ('time', 2045),
 ('they', 2012),
 ('from', 1932),
 ('never', 1896),
 ('get', 1850),
 ('one', 1784),
 ('come', 1725),
 ('at', 1672),
 ('see', 1670),
 ('as', 1633),
 ("i'll", 1619),
 ('got', 1618),
 ('say', 1571),
 ('way', 1489),

In [123]:
#most common Country word
print("Most Common Country Words: ")
Counter(" ".join(dta.loc[dta['genre']=='Country','lyrics']).split()).most_common(100)

Most Common Country Words: 


[('the', 103946),
 ('i', 91420),
 ('you', 75324),
 ('and', 73807),
 ('a', 62347),
 ('to', 60305),
 ('me', 39825),
 ('my', 37632),
 ('in', 36284),
 ('of', 30986),
 ('that', 29505),
 ('it', 26451),
 ('on', 25861),
 ('love', 20291),
 ('your', 19921),
 ('but', 19511),
 ('be', 19131),
 ("i'm", 19047),
 ('all', 18788),
 ('for', 18742),
 ('just', 16255),
 ('when', 15497),
 ('is', 15311),
 ('with', 14174),
 ("don't", 13762),
 ('so', 13720),
 ('we', 13520),
 ('know', 12836),
 ('like', 12678),
 ('was', 12618),
 ("it's", 12256),
 ('if', 11755),
 ('this', 10946),
 ('she', 10912),
 ('up', 10692),
 ('her', 10548),
 ('no', 10455),
 ('he', 10382),
 ('down', 10304),
 ('one', 10000),
 ('can', 9934),
 ('got', 9531),
 ('what', 9493),
 ('time', 9379),
 ('now', 9061),
 ('oh', 8808),
 ('go', 8795),
 ('out', 8715),
 ('as', 8663),
 ("i'll", 8599),
 ('do', 8576),
 ('never', 8525),
 ('have', 8361),
 ("you're", 8129),
 ('way', 7788),
 ('back', 7770),
 ('from', 7676),
 ("i've", 7544),
 ('at', 7498),
 ('there', 743

In [124]:
#most common Electronic word
print("Most Common Electronic Words: ")
Counter(" ".join(dta.loc[dta['genre']=='Electronic','lyrics']).split()).most_common(100)

Most Common Electronic Words: 


[('the', 54787),
 ('you', 53148),
 ('i', 48938),
 ('to', 32062),
 ('and', 30006),
 ('me', 27230),
 ('a', 27126),
 ('it', 21646),
 ('my', 19177),
 ('in', 18843),
 ('your', 15685),
 ('of', 13780),
 ('is', 12497),
 ('on', 12385),
 ('we', 12359),
 ("i'm", 11794),
 ('that', 11301),
 ('all', 11087),
 ('love', 10569),
 ('be', 10221),
 ('for', 9759),
 ('so', 9459),
 ("don't", 8830),
 ('no', 8691),
 ('this', 8055),
 ('know', 7853),
 ('up', 7845),
 ('with', 7589),
 ('like', 7422),
 ('but', 7379),
 ('just', 7364),
 ('can', 7066),
 ('what', 7047),
 ('do', 7026),
 ("it's", 6967),
 ('oh', 6924),
 ('when', 6670),
 ('now', 6546),
 ('get', 6308),
 ('are', 6224),
 ('go', 5989),
 ('come', 5375),
 ("you're", 5177),
 ('will', 5166),
 ('see', 5111),
 ('want', 5078),
 ('if', 4995),
 ('got', 4964),
 ('out', 4921),
 ('down', 4916),
 ('feel', 4864),
 ('time', 4858),
 ('one', 4823),
 ("can't", 4656),
 ('never', 4611),
 ('have', 4295),
 ('not', 4164),
 ('make', 4106),
 ('take', 4059),
 ('let', 4009),
 ('baby', 38

In [125]:
#most common Folk word
print("Most Common Folk Words: ")
Counter(" ".join(dta.loc[dta['genre']=='Folk','lyrics']).split()).most_common(100)

Most Common Folk Words: 


[('the', 18186),
 ('and', 10907),
 ('i', 10343),
 ('a', 8933),
 ('to', 7949),
 ('you', 7897),
 ('in', 5596),
 ('of', 5079),
 ('my', 4758),
 ('me', 4652),
 ('on', 3119),
 ('it', 3104),
 ('is', 3008),
 ('that', 2914),
 ('for', 2730),
 ('your', 2625),
 ('all', 2434),
 ('we', 2210),
 ('be', 2141),
 ('with', 1952),
 ('but', 1936),
 ('so', 1825),
 ('love', 1705),
 ('when', 1676),
 ('was', 1663),
 ('no', 1616),
 ("i'm", 1590),
 ('are', 1533),
 ('will', 1492),
 ('this', 1428),
 ('he', 1390),
 ('they', 1367),
 ('go', 1366),
 ('like', 1351),
 ('oh', 1344),
 ('from', 1320),
 ("it's", 1295),
 ('as', 1222),
 ('know', 1222),
 ('down', 1193),
 ('her', 1172),
 ('if', 1159),
 ('not', 1143),
 ('can', 1131),
 ('she', 1118),
 ('now', 1117),
 ('just', 1098),
 ('have', 1076),
 ('by', 1075),
 ('up', 1071),
 ("don't", 1058),
 ('out', 1051),
 ('what', 1029),
 ('at', 1027),
 ('never', 1025),
 ('there', 996),
 ('his', 992),
 ('see', 990),
 ('do', 988),
 ('time', 987),
 ('one', 918),
 ('an', 891),
 ('where', 858)

In [126]:
#most common Jazz word
print("Most Common Jazz Words: ")
Counter(" ".join(dta.loc[dta['genre']=='Jazz','lyrics']).split()).most_common(100)

Most Common Jazz Words: 


[('the', 46689),
 ('you', 43949),
 ('i', 42839),
 ('and', 30622),
 ('to', 30230),
 ('a', 28193),
 ('me', 22346),
 ('my', 20166),
 ('in', 17312),
 ('of', 14219),
 ('love', 13746),
 ('that', 13104),
 ('it', 12830),
 ('your', 11238),
 ('be', 11117),
 ('is', 10435),
 ('on', 10062),
 ("i'm", 9656),
 ('for', 9404),
 ('all', 9216),
 ('so', 8624),
 ('but', 8129),
 ('with', 7745),
 ('when', 7724),
 ('just', 6934),
 ('know', 6919),
 ("don't", 6876),
 ('no', 6809),
 ('do', 6258),
 ('we', 6153),
 ("it's", 5908),
 ('oh', 5907),
 ('if', 5587),
 ('like', 5569),
 ('what', 5543),
 ('can', 5403),
 ('got', 5237),
 ('this', 4921),
 ('baby', 4736),
 ('now', 4653),
 ('was', 4577),
 ("you're", 4458),
 ('have', 4388),
 ('up', 4380),
 ('one', 4301),
 ('are', 4167),
 ('never', 4167),
 ('go', 4165),
 ('will', 4098),
 ('come', 4031),
 ('see', 3796),
 ('heart', 3795),
 ('get', 3786),
 ('time', 3781),
 ('as', 3769),
 ('say', 3708),
 ('he', 3704),
 ('down', 3696),
 ("i'll", 3630),
 ('they', 3515),
 ('out', 3497),
 (

In [93]:
def metrics(text, genre):
    #number of distinct words in a lyric
    counter = Counter(text[text['genre'] == genre]['lyrics'])
    distinct_words = len(counter)

    #average number of tokens in a lyric
    len_toks = sum(text[text['genre'] == genre]['t-lyric'].str.len())/len(text[text['genre']== genre]['t-lyric'])
  
    
    #average line length 
    subset = text[text['genre'] == genre]
    songs = list(subset[subset['lyrics'].notnull()]['lyrics'].str.split(pat = "\n"))#songs

    song_line_len = []
    for song in songs:
        line_lens = [] 
        for line in song:
            line_lens.append(len(line.split(" ")))
        avg_line_per_song = sum(line_lens)/len(song)
        song_line_len.append(avg_line_per_song)
    avg_line_len = sum(song_line_len)/len(songs)
    
    
    #average word length
    song_line_len = []
    for song in songs:
        line_lens = []
        for line in song:
            line_lens.append(len(line)/len(line.split(" ")))
        avg_line_per_song = sum(line_lens)/len(song)
        song_line_len.append(avg_line_per_song)
    avg_word_len = sum(song_line_len)/len(songs)

    
    return distinct_words, len_toks, avg_word_len, avg_line_len

In [94]:
metrics(dta, 'Metal')

(22446, 1150.7218741199663, 5.475789991276703, 8.225387365296852)

In [None]:
metric(dta, 'Rock')

In [96]:
metrics(dta, 'Jazz')

(6812, 649.7764040356914, 5.001229703760138, 7.017426795587943)

In [97]:
metrics(dta, 'rap')

(47574, 2413.218287378423, 4.513449685931287, 7.014357455270661)

In [98]:
metrics(dta, 'Indie')

(3024, 859.9513258897418, 4.954726674096404, 7.103142997146679)

In [99]:
metrics(dta, 'Folk')

(1999, 1004.5695772909596, 5.4225414676646935, 6.641886005213627)

In [100]:
metrics(dta, 'Electronic')

(7044, 756.0140080222154, 5.221263824531677, 5.9708978039941)

In [102]:
metrics(dta, 'R&B')

(3156, 1010.1563605728728, 4.8092544596103615, 8.148398223844897)

In [101]:
metrics(dta,'Hip-Hop')

(22613, 2795.0239952892684, 4.955525258445672, 9.301993085782962)