In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import re 
import numpy as np
from sklearn.linear_model import LinearRegression
from pystout import pystout
import statsmodels.api as sm

In [7]:
df = pd.read_csv("data/full_dataset_sentiment_analysis.csv")

In [9]:
# A small final pre-processing
df['year'] = df['title'].str.extract('(\d{4})', expand=True)
df['country'] = df['title'].str.extract(r'Top Songs of (\d\d\d\d): (.*)')[1]
df = df.drop(columns=["enumerated_list", "lyrics_not_found", "Unnamed: 0.3", "Unnamed: 0.2", "Unnamed: 0.1", "Unnamed: 0"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 952 entries, 0 to 951
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   songs      952 non-null    object 
 1   artist     952 non-null    object 
 2   title      952 non-null    object 
 3   lyrics     952 non-null    object 
 4   sentiment  952 non-null    object 
 5   neg        952 non-null    float64
 6   neu        952 non-null    float64
 7   pos        952 non-null    float64
 8   compound   952 non-null    float64
 9   year       952 non-null    object 
 10  country    952 non-null    object 
dtypes: float64(4), object(7)
memory usage: 81.9+ KB


In [10]:
df_cov = pd.read_csv("data/lists of songs/full_dataset.csv")

In [11]:
df_cov = df_cov.loc[:, ['danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'year', 'country', 'artist', 'songs']]
df_cov["year"]=df_cov["year"].astype("str")
df_cov["minor_mode"]=np.where(df_cov["mode"]==1, 0, 1)

In [13]:
df_merged = pd.merge(df,df_cov,how='left',left_on=["country", "year", "artist", "songs"], right_on=["country", "year", "artist", "songs"])
df_merged = df_merged.replace(np.nan, 0)
df_merged.replace([np.inf, -np.inf], 0, inplace=True)

In [14]:
# Computing ratio and gloom index

df_merged["ratio"]=df_merged["neg"]/df_merged["pos"]
df_merged["gloom_index"]=(((1-df_merged["valence"])+(1-df_merged['danceability']))/2+df_merged["neg"]*df_merged["speechiness"]*2)

## Implementing a Diff-in-Diff model

Our proposed model is the following:
$Y= \beta_0 + \beta_1covid + \beta_2lockdown + \beta_3(covid*lockdown) + \epsilon$

Where:

+ $Y = \Delta_{sentiment}$, considering as main measure a ratio negative/positive sentiment in song lyrics
+ $\beta_1covid$ are time fixed effects (pre-covid/post covid)
+ $\beta_2lockdown$ are group (country) fixed effects
+ $\beta_3(covid*lockdown)$ are the effects of the treatment variable

Treatment: strict lockdowns at the country level.

In [53]:
def x_y (treated, y_selected, df):
    dataset = df[(df["country"]==treated) | (df["country"]=="New Zealand")]
    dataset = dataset[(dataset["year"]=="2019") | (dataset["year"]=="2020") | (dataset["year"]=="2021")]

    # Time variable
    dataset["time"]=np.where(dataset["year"]=="2019", 0, 1)
    # group variable (treatment)
    dataset['group'] = np.where(dataset['country']=="New Zealand", 0,1)
    #creating interaction variable
    dataset['did'] = dataset['time'] * dataset['group']
    
    #Splitting variables into dependent (x) and independent (y) variables

    x = dataset.loc[:, ["time","group","did"]]
    y = dataset[y_selected]
    y = y.replace(np.nan, 0)
    y.replace([np.inf, -np.inf], 0, inplace=True)


    return x, y

In [54]:
# Instantiating models

x, y= x_y(treated="Australia", y_selected="ratio", df=df_merged)
X2 = sm.add_constant(x)
model1 = sm.OLS(y,X2).fit()

x, y= x_y(treated="USA", y_selected="ratio", df=df_merged)
X2 = sm.add_constant(x)
model2 = sm.OLS(y,X2).fit()

x, y= x_y(treated="Australia", y_selected="gloom_index", df=df_merged)
X2 = sm.add_constant(x)
model3 = sm.OLS(y,X2).fit()

x, y= x_y(treated="USA", y_selected="gloom_index", df=df_merged)
X2 = sm.add_constant(x)
model4 = sm.OLS(y,X2).fit()

In [55]:
print(model4.summary())

                            OLS Regression Results                            
Dep. Variable:            gloom_index   R-squared:                       0.029
Model:                            OLS   Adj. R-squared:                  0.023
Method:                 Least Squares   F-statistic:                     4.653
Date:                Fri, 03 Mar 2023   Prob (F-statistic):            0.00324
Time:                        11:41:29   Log-Likelihood:                 158.98
No. Observations:                 464   AIC:                            -310.0
Df Residuals:                     460   BIC:                            -293.4
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.4768      0.020     24.249      0.0

In [None]:
# =============================================================================
# Print result
# =============================================================================
pystout(models=[model1,model2,model3,model4],
        file='test_table.tex',
        addnotes=['* Significant at the 0.05 level'],
        digits=2,
        endog_names=['AUS','USA','AUS','USA'],
        varlabels={'const':'Constant','time':'Time (Covid)','group':'Country', 'did':'Treatment(DiD)'},
        #addrows={'Test':['A','Test','Row','Here','Too']},
        mgroups={'Sentiment':[1,2],'Gloom Index':[3,4]},
        modstat={'nobs':'Obs','rsquared_adj':'Adj. R\sym{2}','fvalue':'F-stat'}
        )

Robustness checks: 
1) taking 2022 as treatment year

In [17]:
def x_y_22 (treated, y_selected, df):
    dataset = df[(df["country"]==treated) | (df["country"]=="New Zealand")]
    dataset = dataset[(dataset["year"]=="2019") | (dataset["year"]=="2022")]

    # Time variable
    dataset["time"]=np.where(dataset["year"]=="2019", 0, 1)
    # group variable (treatment)
    dataset['group'] = np.where(dataset['country']=="New Zealand", 0,1)
    #creating interaction variable
    dataset['did'] = dataset['time'] * dataset['group']
    
    #Splitting variables into dependent (x) and independent (y) variables

    x = dataset.loc[:, ["time","group","did"]]
    y = dataset[y_selected]
    y = y.replace(np.nan, 0)
    y.replace([np.inf, -np.inf], 0, inplace=True)


    return x, y

In [19]:
# Instantiating models

x, y= x_y_22(treated="Australia", y_selected="ratio", df=df_merged)
X2 = sm.add_constant(x)
model1 = sm.OLS(y,X2).fit()

x, y= x_y_22(treated="USA", y_selected="ratio", df=df_merged)
X2 = sm.add_constant(x)
model2 = sm.OLS(y,X2).fit()

x, y= x_y_22(treated="Australia", y_selected="gloom_index", df=df_merged)
X2 = sm.add_constant(x)
model3 = sm.OLS(y,X2).fit()

x, y= x_y_22(treated="USA", y_selected="gloom_index", df=df_merged)
X2 = sm.add_constant(x)
model4 = sm.OLS(y,X2).fit()

In [24]:
# =============================================================================
# Print result
# =============================================================================
pystout(models=[model1,model2,model3,model4],
        file='test_table_2022.tex',
        addnotes=['* Significant at the 0.05 level'],
        digits=2,
        endog_names=['AUS','USA','AUS','USA'],
        varlabels={'const':'Constant','time':'Time (Covid)','group':'Country', 'did':'Treatment(DiD)'},
        #addrows={'Test':['A','Test','Row','Here','Too']},
        mgroups={'Sentiment':[1,2],'Gloom Index':[3,4]},
        modstat={'nobs':'Obs','rsquared_adj':'Adj. R\sym{2}','fvalue':'F-stat'}
        )

  options = options.append(pd.DataFrame([r],index=[value]))
  options = options.append(pd.DataFrame([r],index=[value]))
  options = options.append(pd.DataFrame([r],index=[value]))


2) Taking Canada instead of US as high stringency country. Their mean score is similar, so the model should translate

In [65]:
df_can = pd.read_csv("data/ds_sentiment_canada.csv")

In [66]:
df_cov = pd.read_csv("data/lists of songs/dataframe_song_features_canada.csv")

In [67]:
# A small final pre-processing
df_can['year'] = df_can['title'].str.extract('(\d{4})', expand=True)
df_can['country'] = df_can['title'].str.extract(r'Top Songs of (\d\d\d\d): (.*)')[1]
df_can.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0.2      198 non-null    int64  
 1   Unnamed: 0.1      198 non-null    int64  
 2   Unnamed: 0        198 non-null    int64  
 3   songs             198 non-null    object 
 4   artist            198 non-null    object 
 5   title             198 non-null    object 
 6   lyrics            198 non-null    object 
 7   enumerated_list   198 non-null    object 
 8   lyrics_not_found  198 non-null    bool   
 9   sentiment         198 non-null    object 
 10  neg               198 non-null    float64
 11  neu               198 non-null    float64
 12  pos               198 non-null    float64
 13  compound          198 non-null    float64
 14  year              198 non-null    object 
 15  country           198 non-null    object 
dtypes: bool(1), float64(4), int64(3), object(8)


In [69]:
df_cov['year'] = df_cov['title'].str.extract('(\d{4})', expand=True)
df_cov['country'] = df_cov['title'].str.extract(r'Top Songs of (\d\d\d\d): (.*)')[1]

In [70]:
df_cov = df_cov.loc[:, ['danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'year', 'country', 'artist', 'songs']]

df_cov["minor_mode"]=np.where(df_cov["mode"]==1, 0, 1)

In [71]:
df_merged_2 = pd.merge(df_can,df_cov,how='left',left_on=["country", "year", "artist", "songs"], right_on=["country", "year", "artist", "songs"])
df_merged_2 = df_merged_2.replace(np.nan, 0)
df_merged_2.replace([np.inf, -np.inf], 0, inplace=True)

In [72]:
# Computing ratio and gloom index

df_merged_2["ratio"]=df_merged_2["neg"]/df_merged_2["pos"]
df_merged_2["gloom_index"]=(((1-df_merged_2["valence"])+(1-df_merged_2['danceability']))/2+df_merged_2["neg"]*df_merged_2["speechiness"]*2)

In [73]:
canada = df_merged_2[["country", "year", "ratio", "gloom_index"]]

In [75]:
others= df_merged[["country", "year", "ratio", "gloom_index"]]

In [76]:
final = pd.concat([canada, others])

In [77]:
final["country"].value_counts()

Australia      335
New Zealand    328
USA            303
Canada         198
Name: country, dtype: int64

In [78]:
# Instantiating models

x, y= x_y(treated="Australia", y_selected="ratio", df=final)
X2 = sm.add_constant(x)
model1 = sm.OLS(y,X2).fit()

x, y= x_y(treated="Canada", y_selected="ratio", df=final)
X2 = sm.add_constant(x)
model2 = sm.OLS(y,X2).fit()

x, y= x_y(treated="Australia", y_selected="gloom_index", df=final)
X2 = sm.add_constant(x)
model3 = sm.OLS(y,X2).fit()

x, y= x_y(treated="Canada", y_selected="gloom_index", df=final)
X2 = sm.add_constant(x)
model4 = sm.OLS(y,X2).fit()

In [83]:
print(model4.summary())

                            OLS Regression Results                            
Dep. Variable:            gloom_index   R-squared:                       0.021
Model:                            OLS   Adj. R-squared:                  0.014
Method:                 Least Squares   F-statistic:                     3.067
Date:                Fri, 03 Mar 2023   Prob (F-statistic):             0.0278
Time:                        11:45:40   Log-Likelihood:                 130.82
No. Observations:                 441   AIC:                            -253.6
Df Residuals:                     437   BIC:                            -237.3
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.4768      0.021     23.154      0.0

In [84]:
# =============================================================================
# Print result
# =============================================================================
pystout(models=[model1,model2,model3,model4],
        file='test_table_canada.tex',
        addnotes=['* Significant at the 0.05 level'],
        digits=2,
        endog_names=['Mid','High','Mid','High'],
        varlabels={'const':'Constant','time':'Time (Covid)','group':'Country', 'did':'Treatment(DiD)'},
        #addrows={'Test':['A','Test','Row','Here','Too']},
        mgroups={'Sentiment':[1,2],'Gloom Index':[3,4]},
        modstat={'nobs':'Obs','rsquared_adj':'Adj. R\sym{2}','fvalue':'F-stat'}
        )

  options = options.append(pd.DataFrame([r],index=[value]))
  options = options.append(pd.DataFrame([r],index=[value]))
  options = options.append(pd.DataFrame([r],index=[value]))
