In [43]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

df_raw = pd.read_csv(
    "lyrics/lyrics.csv",
    index_col=0,
    names=["song_name", "year", "interpreter", "genre", "lyrics"],
)
# Set the default maximum width of columns
pd.set_option("display.max_colwidth", 100)

# Data quality

In [44]:
# replace "-" by " "
df_raw["interpreter"] = df_raw["interpreter"].str.replace("-", " ").str.title()
df_raw["song_name"] = df_raw["song_name"].str.replace("-", " ").str.title()

# Identify which interpreters are the same

In [45]:
def is_same(name1, name2):
    set1 = set(name1.split())
    set2 = set(name2.split())
    if set1 == set2:
        return True    
    return False

interpreters = df_raw['interpreter'].unique().tolist()

interpreter_mapping={}
for first in interpreters:
    for second in interpreters:
        if first != second and is_same(first, second) and len(first) <= len(second):
            if not second in interpreter_mapping:
                if not first in interpreter_mapping:
                    interpreter_mapping[second] = first
                else:
                    if interpreter_mapping[first] != second:
                        interpreter_mapping[second] = interpreter_mapping[first]
            else:
                if not first in interpreter_mapping:
                    if not second in interpreter_mapping:
                        interpreter_mapping[first] = second
                    else:
                        if interpreter_mapping[second] != first:
                            interpreter_mapping[first] = interpreter_mapping[second]

In [46]:
from fuzzywuzzy import fuzz

# threshold is defined so that Beyonce and Beyonce Knowles are similar enough
threshold = 63
grouped = df_raw.groupby(['song_name', 'genre'])
interpreter_mapping={}
for name, group in grouped:
    for i in range(len(group)):
        for j in range(i+1, len(group)):
            interpreter_score = fuzz.ratio(group.iloc[i]['interpreter'], group.iloc[j]['interpreter'])
            if (interpreter_score > threshold):
                first=group.iloc[i]['interpreter']
                second=group.iloc[j]['interpreter']
                if first != second:
                    if len(first) < len(second):
                        if not second in interpreter_mapping:
                            if not first in interpreter_mapping:
                                interpreter_mapping[second] = first
                            else:
                                interpreter_mapping[second] = interpreter_mapping[first]
                    else:
                        if not first in interpreter_mapping:
                            if not second in interpreter_mapping:
                                interpreter_mapping[first] = second
                            else:
                                interpreter_mapping[first] = interpreter_mapping[second]

In [47]:
df_raw['interpreter'] = df_raw['interpreter'].replace(interpreter_mapping)

In [48]:
df_raw['interpreter'] = df_raw['interpreter'].replace(interpreter_mapping)

In [49]:
df = df_raw.dropna()
df = df[(df["year"] > 1960) & (df["year"] < 2023)]

In [50]:
df["lyrics"] = df["lyrics"].str.replace(r"\[.*?\]", "", regex=True)
df["lyrics"] = df["lyrics"].str.replace(r"\(.*?\)", "", regex=True)

In [51]:
df['lyrics_length'] = df['lyrics'].str.len()
df['has_lyrics'] = df['lyrics'].apply(lambda x: 1 if x != '' else 0)
df = df.sort_values(['has_lyrics', 'year', 'lyrics_length'], ascending=[False, True, False])

df = df.drop_duplicates(subset=['song_name', 'interpreter'], keep='first')
df = df[(df['has_lyrics'] == 1)]
df = df.drop(columns=['lyrics_length', 'has_lyrics'])

# Add new features

In [52]:
df["word_count"] = df["lyrics"].apply(lambda x: len(str(x).split()))

df["unique_word_count"] = df["lyrics"].apply(lambda x: len(set(str(x).split())))

df["average_word_length"] = df["lyrics"].apply(
    lambda x: np.mean([len(word) for word in x.split() if word])
)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [53]:
df = df[df['genre'] != 'Not Available']

interpreter_counts = df['interpreter'].value_counts()
interpreters_with_many_songs = interpreter_counts[interpreter_counts > 100].index
df = df[df['interpreter'].isin(interpreters_with_many_songs)]


# Language detection

In [55]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 0
def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return None

df['language'] = df['lyrics'].apply(detect_language)

In [56]:
df.to_csv('preprocessed_filtered.csv', index=False)