In [1]:
import pandas as pd
import pickle5 as pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# to visualize the column transformer and pipeline
from sklearn import set_config
set_config(display='diagram')

## Load the dicts with the lyrics

In [2]:
with open('./data/lyrics_fa.pkl', 'rb') as f:
    all_lyrics_fr = pickle.load(f)

In [3]:
with open('./data/lyrics_ga.pkl', 'rb') as f:
    all_lyrics_ga = pickle.load(f)

In [4]:
with open('./data/lyrics_lc.pkl', 'rb') as f:
    all_lyrics_lc = pickle.load(f)

# 1. Compare two rock bands

## 1.1 Create the corpus with 30 songs each (Rock/Rock)

In [5]:
lyrics_list_fr = [value for value in all_lyrics_fr.values()][:30]

In [6]:
lyrics_list_ga = [value for value in all_lyrics_ga.values()][:30]

In [7]:
#Create the corpus by concatenating the two list
corpus_rr = lyrics_list_fr + lyrics_list_ga

In [8]:
labels_rr = ["Frightened Rabbit"] * 30 + ["Gaslight Anthem"] * 30
len(labels_rr)

60

## 1.2 CountVectorize() the corpus (Rock/Rock)

In [12]:
#Pure Vecotrizer
vectorizer = CountVectorizer()
X_rr = vectorizer.fit_transform(corpus_rr)
X_df_rr = pd.DataFrame(X_rr.todense(), columns=vectorizer.get_feature_names_out(), index=labels_rr)
X_df_rr.shape

(60, 1422)

In [13]:
#Remove 'stop words'
vectorizer = CountVectorizer(stop_words="english")
X_rr = vectorizer.fit_transform(corpus_rr)
X_df_rr = pd.DataFrame(X_rr.todense(), columns=vectorizer.get_feature_names_out(), index=labels_rr)
X_df_rr.shape

(60, 1229)

In [14]:
#Remove words that occure in more than x% of the songs
vectorizer = CountVectorizer(stop_words="english", max_df=0.1)
X_rr = vectorizer.fit_transform(corpus_rr)
X_df_rr = pd.DataFrame(X_rr.todense(), columns=vectorizer.get_feature_names_out(), index=labels_rr)
X_df_rr.shape

(60, 1143)

## 1.3 Apply TI-IDF (Rock/Rock)

In [15]:
tf = TfidfTransformer()
X_norm_rr = tf.fit_transform(X_rr)

## 1.4 Classification Model (Rock/Rock)

In [16]:
X_rr=X_norm_rr
y_rr=labels_rr

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_rr,y_rr, random_state=42)

In [19]:
m = LogisticRegression()
m.fit(X_train, y_train)

In [20]:
print(f"Comparison of two Rock artists:\n"
      f"The R2-score for the test data is {round(m.score(X_test, y_test),3)} and {round(m.score(X_train, y_train), 3)} for the train data")

Comparison of two Rock artists:
The R2-score for the test data is 0.067 and 0.756 for the train data


# 2. Compare a rock band and a rap artist

## 2.1 Create the corpus with 30 rock and 27 rap songs

In [22]:
lyrics_list_lc = [value for value in all_lyrics_lc.values()]

In [23]:
#Create the corpus by concatenating the two list
corpus_rh = lyrics_list_ga + lyrics_list_lc

In [26]:
labels_rh = ["Gaslight Anthem"] * 30 + ["Loyale Carner"] * 27

## 2.2 CountVectorize() the corpus (Rock/Rap)

In [21]:
#Pure Vecotrizer
vectorizer = CountVectorizer()
X_rh = vectorizer.fit_transform(corpus_rh)
X_df_rh = pd.DataFrame(X_rh.todense(), columns=vectorizer.get_feature_names_out(), index=labels_rh)
X_df_rh.shape

(57, 2860)

In [27]:
#Remove 'stop words'
vectorizer = CountVectorizer(stop_words="english")
X_rh = vectorizer.fit_transform(corpus_rh)
X_df_rh = pd.DataFrame(X_rh.todense(), columns=vectorizer.get_feature_names_out(), index=labels_rh)
X_df_rh.shape

(57, 2632)

In [28]:
#Remove words that occure in more than x% of the songs
vectorizer = CountVectorizer(stop_words="english", max_df=0.1)
X_rh = vectorizer.fit_transform(corpus_rh)
X_df_rh = pd.DataFrame(X_rh.todense(), columns=vectorizer.get_feature_names_out(), index=labels_rh)
X_df_rh.shape

(57, 2459)

## 2.3 Apply TI-IDF (Rock/Rap)

In [31]:
tf = TfidfTransformer()
X_norm_rh = tf.fit_transform(X_rh)

## 2.4 Classification Model (Rock/Rap)

In [32]:
X_rh=X_norm_rh
y_rh=labels_rh

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_rh,y_rh, random_state=42)

In [34]:
m = LogisticRegression()
m.fit(X_train, y_train)

In [35]:
print(f"Comparison of a rock group and a rap artist:\n"
      f"The R2-score for the test data is {round(m.score(X_test, y_test),3)} and {round(m.score(X_train, y_train), 3)} for the train data")

Comparison of a rock group and a rap artist:
The R2-score for the test data is 0.667 and 1.0 for the train data
