In [51]:
import lucem_illud_2020 #pip install -U git+git://github.com/Computational-Content-Analysis-2020/lucem_illud_2020.git

#All these packages need to be installed from pip
#For ML
import sklearn
import sklearn.naive_bayes
import sklearn.tree
import sklearn.ensemble
import sklearn.neural_network
import sklearn.decomposition

import nltk #For tokenizing and normalizing
import numpy as np #arrays
import matplotlib.pyplot as plt #Plots
import matplotlib.colors # For nice colours
import seaborn #Makes plots look nice, also heatmaps
import scipy as sp #for interp

#These are from the standard library (except pandas and requests, which are third-party pip packages)
import collections
import os
import os.path
import random
import re
import glob
import pandas as pd
import requests
import json
import math

#This 'magic' command makes the plots work better
#in the notebook, don't use it outside of a notebook.
#Also you can ignore the warning
%matplotlib inline

In [53]:
# Load the lyrics table from disk and preview its first five rows.
# The preview shows year, artist, genre, lyrics, normalized tokens,
# sentiment and repetity columns.
lyrics_df = pd.read_csv('cleaned_lyrics_sentiment_repetity.csv')
lyrics_df.head(5)

Unnamed: 0,year,artist,genre,lyrics,normalized,sentiment,repetity
0,2009,beyonce-knowles,Pop,"oh baby, how you doing? you know i'm gonna cut...","['oh', 'baby', 'know', 'be', 'gon', 'na', 'cut...",0.9976,1.173333
1,2009,beyonce-knowles,Pop,"playin' everything so easy, it's like you seem...","['playin', 'easy', 'like', 'sure', 'way', 'not...",0.9658,0.86
2,2009,beyonce-knowles,Pop,if you search for tenderness it isn't hard to ...,"['search', 'tenderness', 'hard', 'find', 'love...",0.9696,0.266667
3,2009,beyonce-knowles,Pop,"oh oh oh i, oh oh oh i if i wrote a book abo...","['oh', 'oh', 'oh', 'oh', 'oh', 'oh', 'write', ...",0.9993,2.491803
4,2009,beyonce-knowles,Pop,"party the people, the people the party it's po...","['party', 'people', 'people', 'party', 'pop', ...",0.8659,6.333333


In [12]:
# Years 2001-2016 inclusive, with 2003 left out.
# NOTE(review): presumably 2003 is excluded because not every genre has
# lyrics for that year -- confirm against the data.
common_year = [year for year in range(2001, 2017) if year != 2003]
common_year

[2001,
 2002,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016]

In [20]:
# Distinct genre labels, in order of first appearance in lyrics_df.
genres = lyrics_df['genre'].unique()
genres

array(['Pop', 'Hip-Hop', 'Rock', 'Metal', 'Country', 'Jazz', 'Electronic',
       'Folk', 'R&B', 'Indie'], dtype=object)

In [13]:
# Keep only the songs whose release year is one of the years in common_year.
in_common_year = lyrics_df['year'].isin(common_year)
lyrics_common_df = lyrics_df.loc[in_common_year]
lyrics_common_df

Unnamed: 0,year,artist,genre,lyrics,normalized,sentiment,repetity
0,2009,beyonce-knowles,Pop,"oh baby, how you doing? you know i'm gonna cut...","['oh', 'baby', 'know', 'be', 'gon', 'na', 'cut...",0.9976,1.173333
1,2009,beyonce-knowles,Pop,"playin' everything so easy, it's like you seem...","['playin', 'easy', 'like', 'sure', 'way', 'not...",0.9658,0.860000
2,2009,beyonce-knowles,Pop,if you search for tenderness it isn't hard to ...,"['search', 'tenderness', 'hard', 'find', 'love...",0.9696,0.266667
3,2009,beyonce-knowles,Pop,"oh oh oh i, oh oh oh i if i wrote a book abo...","['oh', 'oh', 'oh', 'oh', 'oh', 'oh', 'write', ...",0.9993,2.491803
4,2009,beyonce-knowles,Pop,"party the people, the people the party it's po...","['party', 'people', 'people', 'party', 'pop', ...",0.8659,6.333333
...,...,...,...,...,...,...,...
210637,2012,edens-edge,Country,"i gotta say boy, after only just a couple of d...","['get', 'ta', 'boy', 'couple', 'date', 'hand',...",0.9545,0.987342
210638,2012,edens-edge,Country,i helped you find her diamond ring you made me...,"['help', 'find', 'diamond', 'ring', 'try', 'to...",-0.9716,1.095238
210639,2012,edens-edge,Country,look at the couple in the corner booth looks a...,"['look', 'couple', 'corner', 'booth', 'look', ...",0.9605,0.593220
210640,2012,edens-edge,Country,when i fly off this mortal earth and i'm measu...,"['fly', 'mortal', 'earth', 'be', 'measure', 'd...",0.9844,0.460000


In [14]:
# Group the filtered songs by release year; aggregated in the cells below.
common_year_groups = lyrics_common_df.groupby(by=['year'])

In [15]:
# Per-year mean of the two numeric score columns.
# The columns are selected explicitly: the frame also holds string columns
# (artist, genre, lyrics, normalized), and on pandas >= 2.0 a bare
# `.mean()` over those raises TypeError instead of silently dropping them.
common_year_groups[['sentiment', 'repetity']].mean()

Unnamed: 0_level_0,sentiment,repetity
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,0.266337,0.935288
2002,0.273494,0.969924
2004,0.198268,1.009129
2005,0.22589,0.991125
2006,0.174423,0.857848
2007,0.080768,0.739456
2008,0.301647,1.030628
2009,0.236103,1.076836
2010,0.214268,1.031949
2011,0.181011,1.146004


In [54]:
from scipy.stats import linregress as reg
from scipy.stats import pearsonr

# Pearson correlation between year and the yearly mean repetity.
# Only the 'repetity' column is aggregated, and only once: averaging the
# whole frame twice is wasteful and raises TypeError on pandas >= 2.0
# because of the string columns.
yearly_repetity_mean = common_year_groups['repetity'].mean()
pearsonr(yearly_repetity_mean.index, yearly_repetity_mean)

(0.679476199575991, 0.0053313340569050025)

In [55]:
# Pearson correlation between year and the yearly mean sentiment.
# Only the 'sentiment' column is aggregated, and only once: averaging the
# whole frame twice is wasteful and raises TypeError on pandas >= 2.0
# because of the string columns.
yearly_sentiment_mean = common_year_groups['sentiment'].mean()
pearsonr(yearly_sentiment_mean.index, yearly_sentiment_mean)

(-0.1915315864451747, 0.49409002613122616)

In [18]:
# Map each (genre, year) key to the sub-frame of songs in that group.
# The groupby is materialised with list() first so dict() receives plain
# (key, frame) pairs.
lyrics_common_groups = dict(list(lyrics_common_df.groupby(['genre', 'year'])))

In [23]:
# Year x genre table of mean sentiment.
# Rows are the years of common_year in ascending order, columns follow the
# order of `genres`; each cell is the mean 'sentiment' of that (genre, year)
# group. Built in one constructor call with an explicit float dtype, rather
# than filling an empty object-dtype frame cell by cell, so the columns are
# numeric from the start.
# A (genre, year) pair with no group still raises KeyError, as before.
ordered_years = sorted(common_year)
sentiment_df = pd.DataFrame(
    {
        genre: [
            lyrics_common_groups[(genre, year)]['sentiment'].mean()
            for year in ordered_years
        ]
        for genre in genres
    },
    index=ordered_years,
    columns=genres,
    dtype='float64',
)
sentiment_df

Unnamed: 0,Pop,Hip-Hop,Rock,Metal,Country,Jazz,Electronic,Folk,R&B,Indie
2001,0.561187,0.0153528,0.12345,-0.256171,0.421255,0.587382,0.583297,0.507067,0.290856,-0.0716444
2002,0.514294,-0.0689954,0.245807,-0.694074,0.732795,0.509867,0.0193773,0.397593,0.985567,0.36719
2004,0.495696,-0.153171,0.177522,-0.287252,0.483398,0.562246,0.266942,0.9808,0.16875,-0.3276
2005,0.448412,0.0868693,0.158518,-0.253346,0.436231,0.548007,0.313839,0.586793,0.732067,0.244826
2006,0.488689,-0.217464,0.203294,-0.428721,0.461107,0.5114,0.257389,0.320655,0.308509,0.3005
2007,0.427405,-0.183011,0.104723,-0.372852,0.465504,0.563342,0.175254,0.249876,0.120563,0.219498
2008,0.484086,0.0133088,0.265256,-0.371675,0.435629,0.616327,0.350729,0.439821,0.535073,0.265636
2009,0.397441,0.211598,0.199051,-0.366247,0.467814,0.457688,0.282119,0.309465,0.204435,0.316415
2010,0.43852,0.119656,0.207419,-0.463523,0.42665,0.517994,0.258067,0.40616,0.152536,0.346716
2011,0.433881,-0.0464636,0.177583,-0.334817,0.414155,0.509723,0.150569,0.2402,0.520635,0.18561


In [26]:
# Save the year x genre sentiment table.
# The index holds the years, so it is written out as a 'year' column.
# With index=None the years were dropped and rows could not be mapped back
# to a year (2003 is missing, so row position is not enough).
sentiment_df.to_csv('sentiment_time_df.csv', index=True, index_label='year')

In [48]:
# Linear regression (scipy linregress) of mean sentiment on year, one fit
# per genre column, listed by ascending p-value.
sentiment_regression_check = [
    (genre, reg(sentiment_df.index, sentiment_df[genre].astype('float')))
    for genre in sentiment_df.columns
]
sorted(sentiment_regression_check, key=lambda genre_fit: genre_fit[1].pvalue)

[('Pop',
  LinregressResult(slope=-0.008292729981277775, intercept=17.11085635302054, rvalue=-0.8146444214183751, pvalue=0.00021682621179737894, stderr=0.0016374059350652578)),
 ('Folk',
  LinregressResult(slope=-0.02422148937424058, intercept=49.04384637865648, rvalue=-0.5681203378299373, pvalue=0.02714116909574274, stderr=0.00973105442965978)),
 ('Rock',
  LinregressResult(slope=0.003853617747426188, intercept=-7.543884329984611, rvalue=0.4039622964245925, pvalue=0.13535351122762262, stderr=0.0024203084314136088)),
 ('Jazz',
  LinregressResult(slope=-0.004585408687291808, intercept=9.740403066101992, rvalue=-0.39389979400254466, pvalue=0.14629948204844853, stderr=0.002967622207460514)),
 ('Country',
  LinregressResult(slope=-0.005360389837121256, intercept=11.237170283658585, rvalue=-0.3287249180654308, pvalue=0.23157868885094066, stderr=0.004271298036798859)),
 ('R&B',
  LinregressResult(slope=-0.01628836081275763, intercept=33.09020828899795, rvalue=-0.31762902131873877, pvalue=0.2

In [56]:
# Pearson correlation of mean sentiment with year, one (r, p-value) result
# per genre column, listed by ascending p-value (second element of the result).
sentiment_correlation_check = [
    (genre, pearsonr(sentiment_df.index, sentiment_df[genre].astype('float')))
    for genre in sentiment_df.columns
]
sorted(sentiment_correlation_check, key=lambda genre_corr: genre_corr[1][1])

[('Pop', (-0.8146444214183752, 0.00021682621179737832)),
 ('Folk', (-0.5681203378299373, 0.027141169095742757)),
 ('Rock', (0.40396229642459247, 0.13535351122762268)),
 ('Jazz', (-0.3938997940025447, 0.14629948204844853)),
 ('Country', (-0.3287249180654308, 0.2315786888509409)),
 ('R&B', (-0.31762902131873894, 0.2486490889663158)),
 ('Indie', (0.2787234658866497, 0.3144343929685324)),
 ('Hip-Hop', (-0.1464601043220405, 0.6024709352886676)),
 ('Metal', (0.10550858590074721, 0.7082334583158681)),
 ('Electronic', (-0.0726956637918658, 0.7968211077372607))]

In [24]:
# Year x genre table of mean repetity.
# Rows are the years of common_year in ascending order, columns follow the
# order of `genres`; each cell is the mean 'repetity' of that (genre, year)
# group. Built in one constructor call with an explicit float dtype, rather
# than filling an empty object-dtype frame cell by cell, so the columns are
# numeric from the start.
# A (genre, year) pair with no group still raises KeyError, as before.
ordered_years = sorted(common_year)
repetity_df = pd.DataFrame(
    {
        genre: [
            lyrics_common_groups[(genre, year)]['repetity'].mean()
            for year in ordered_years
        ]
        for genre in genres
    },
    index=ordered_years,
    columns=genres,
    dtype='float64',
)
repetity_df

Unnamed: 0,Pop,Hip-Hop,Rock,Metal,Country,Jazz,Electronic,Folk,R&B,Indie
2001,1.2042,0.582258,0.908926,0.647426,0.644645,0.792154,2.36135,0.495613,0.919645,0.452662
2002,1.13512,0.734352,0.959227,0.580729,0.727578,0.899644,0.593097,1.37174,2.82667,1.94907
2004,1.17629,0.793174,0.998166,0.586028,0.695082,0.835781,3.36551,1.18885,1.66594,2.06332
2005,1.14772,0.852224,0.934747,0.628966,0.761637,1.1334,2.20732,0.921475,1.27418,0.907575
2006,1.09977,0.726639,0.844761,0.474312,0.745443,0.875603,1.47205,0.668374,0.957886,0.806259
2007,1.01902,0.720313,0.777589,0.458171,0.688476,0.948129,1.44365,0.619429,0.884628,0.694462
2008,1.21583,0.916766,0.982482,0.742347,0.780983,1.11862,1.87091,0.815363,1.44176,0.841456
2009,1.28722,1.00112,1.04685,0.735527,0.768285,0.99872,1.59923,0.667635,1.24391,0.896671
2010,1.31454,0.89427,1.02016,0.678364,0.787659,0.933265,1.48103,0.909947,0.995205,0.868014
2011,1.39935,0.958637,1.14368,0.790394,0.79541,1.01374,1.7274,0.838822,1.24588,0.784021


In [25]:
# Save the year x genre repetity table.
# The index holds the years, so it is written out as a 'year' column.
# With index=None the years were dropped and rows could not be mapped back
# to a year (2003 is missing, so row position is not enough).
repetity_df.to_csv('repetity_time_df.csv', index=True, index_label='year')

In [50]:
# Linear regression (scipy linregress) of mean repetity on year, one fit
# per genre column, listed by ascending p-value.
repetity_regression_check = [
    (genre, reg(repetity_df.index, repetity_df[genre].astype('float')))
    for genre in repetity_df.columns
]
sorted(repetity_regression_check, key=lambda genre_fit: genre_fit[1].pvalue)

[('Country',
  LinregressResult(slope=0.020898446411215816, intercept=-41.17740844698617, rvalue=0.8962215105502735, pvalue=6.142244710126311e-06, stderr=0.0028689644663784067)),
 ('Hip-Hop',
  LinregressResult(slope=0.0265430894756173, intercept=-52.44399588461116, rvalue=0.8827298474546476, pvalue=1.3144523520969144e-05, stderr=0.003918679385600042)),
 ('Pop',
  LinregressResult(slope=0.020546972883401987, intercept=-40.0202366111583, rvalue=0.7684283642643486, pvalue=0.0008173778250795005, stderr=0.00474579748816495)),
 ('Metal',
  LinregressResult(slope=0.022900428186646227, intercept=-45.30770660123473, rvalue=0.7455840696948226, pvalue=0.0014186349618000279, stderr=0.0056769862448008265)),
 ('Rock',
  LinregressResult(slope=0.01310943816760269, intercept=-25.340752896451704, rvalue=0.6389152601385703, pvalue=0.010346691547866708, stderr=0.004377760103854075)),
 ('R&B',
  LinregressResult(slope=-0.02474088486999247, intercept=51.037101285982956, rvalue=-0.24072494169145953, pvalue

In [58]:
# Pearson correlation of mean repetity with year, one (r, p-value) result
# per genre column, listed by ascending p-value (second element of the result).
repetity_correlation_check = [
    (genre, pearsonr(repetity_df.index, repetity_df[genre].astype('float')))
    for genre in repetity_df.columns
]
sorted(repetity_correlation_check, key=lambda genre_corr: genre_corr[1][1])

[('Country', (0.8962215105502739, 6.14224471012619e-06)),
 ('Hip-Hop', (0.8827298474546472, 1.3144523520969363e-05)),
 ('Pop', (0.7684283642643488, 0.0008173778250794968)),
 ('Metal', (0.7455840696948226, 0.0014186349618000292)),
 ('Rock', (0.6389152601385703, 0.010346691547866708)),
 ('R&B', (-0.2407249416914595, 0.3874485208010618)),
 ('Indie', (-0.2208646554408409, 0.4289215657871394)),
 ('Jazz', (0.1767798773556231, 0.5285225021246217)),
 ('Electronic', (-0.15867873207006922, 0.5721803008470447)),
 ('Folk', (0.003692810181724668, 0.9895788840854265))]