<a href="https://colab.research.google.com/github/figgarnold/Eurovision/blob/main/AudioFeatures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set Up

Installations:

In [None]:
# System packages needed for Essentia
!apt-get install -y build-essential libfftw3-dev libeigen3-dev libsamplerate0-dev

# Clone Essentia and build it
!git clone https://github.com/MTG/essentia.git
%cd essentia
!./waf configure --with-python
!./waf build
!./waf install

# Add Essentia to Python path
import sys
sys.path.append("/usr/local/lib/python3.8/dist-packages")  # adjust if needed

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9ubuntu3).
libfftw3-dev is already the newest version (3.3.8-2ubuntu8).
libsamplerate0-dev is already the newest version (0.2.2-1build1).
libeigen3-dev is already the newest version (3.4.0-2ubuntu2).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
fatal: destination path 'essentia' already exists and is not an empty directory.
/content/essentia
  # libs_paths = [';packaging\win32_3rdparty\\' + lib + '\lib\pkgconfig' for lib in libs_3rdparty]
[32m[0mSetting top to                           :[0m [0m[32m[32m/content/essentia[0m [0m
[32m[0mSetting out to                           :[0m [0m[32m[32m/content/essentia/build[0m [0m
→ configuring the project in /content/essentia
→ Building in release mode
[32m[0mChecking for 'g++' (C++ compiler)        :[0m [0m[32m[32m/usr/bin/g++[0m [0m
[32m[0mChecking for 'gc

In [1]:
!pip install Essentia

Collecting Essentia
  Downloading essentia-2.1b6.dev1389-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading essentia-2.1b6.dev1389-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Essentia
Successfully installed Essentia-2.1b6.dev1389


In [2]:
!pip install yt_dlp
!apt-get install -y nodejs



Collecting yt_dlp
  Downloading yt_dlp-2025.11.12-py3-none-any.whl.metadata (180 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/180.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m174.1/180.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.0/180.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading yt_dlp-2025.11.12-py3-none-any.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt_dlp
Successfully installed yt_dlp-2025.11.12
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  javascript-common libc-ares2 libjs-highlight.js libnode72 nodejs-doc
Suggested packages:
  apache2 | lighttpd | httpd npm
Th

Imports:

In [3]:
import yt_dlp
import pandas as pd
import essentia.standard as es
import os
import glob

In [4]:
print(os.getcwd())

/content


In [14]:
df = pd.read_csv("/content/tokenisedlyrics.csv")
df = df[df["top3"] == True]

# Cookies

In [17]:
from google.colab import files
uploaded = files.upload()  # choose cookies.txt

Saving cookies.txt to cookies.txt


# Helper Functions

Download Helper:

In [12]:
def download_audio(youtube_url, base_name, cookies_file="cookies.txt", cookie_from_browser=None):
    """
    Download audio from YouTube as WAV using yt-dlp.
    Always returns the correct .wav filename.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': f'/content/audio/%{base_name}.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        'extractor_args': {'youtube': {'player_client': ['default']}},
        'quiet': True,
        'ignoreerrors': True
    }
    if cookies_file:
        ydl_opts['cookiefile'] = cookies_file
    if cookie_from_browser:
        ydl_opts['cookiefrombrowser'] = cookie_from_browser

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(youtube_url, download=True)
        final_path = ydl.prepare_filename(info)
        # force extension to .wav
        final_path = os.path.splitext(final_path)[0] + ".wav"
        return final_path

Feature Helper:

In [13]:
def extract_essentia_features(audio_file):
    audio = es.MonoLoader(filename=audio_file)()

    danceability = es.Danceability()(audio)
    bpm, _, _, _, rhythm_strength = es.RhythmExtractor2013(method="multifeature")(audio)

    tonal_features = es.TonalExtractor()(audio)
    key = tonal_features[0]
    scale = tonal_features[1]

    return {
        "danceability": danceability,
        "bpm": bpm,
        "rhythm_strength": rhythm_strength,
        "key": key,
        "scale": scale
    }


# Pipeline

Batch Run:

In [21]:
results = []
batch_size = 10
cookies_path = "cookies.txt"  # optional, if uploaded

# resume at a specific index if needed
start_index = 200

for start in range(start_index, len(df), batch_size):
    batch = df.iloc[start:start+batch_size]
    print(f"Processing batch {start}–{start+len(batch)-1}")

    for idx, row in batch.iterrows():
        url = row["youtube_url"]
        title = str(row.get("title", f"track_{idx}"))

        try:
            audio_path = download_audio(url, base_name=title, cookies_file=cookies_path)

            if not os.path.exists(audio_path):
                print(f"✗ No audio file created for row {idx} (title='{title}')")
                continue

            feats = extract_essentia_features(audio_path)
            feats["title"] = title
            feats["youtube_url"] = url
            results.append(feats)

            os.remove(audio_path)  # free disk space
            print(f"✓ {title} processed (index {idx})")

        except Exception as e:
            print(f"Error with {title} (index {idx}): {e}")
            continue

    # Save after each batch
    pd.DataFrame(results).to_csv("features_partial.csv", mode="a", header=False, index=False)
    results = []  # clear memory before next batch
    print(f"✔ Batch {start}–{start+len(batch)-1} saved")

Processing batch 140–149




✓ track_809 processed (index 809)




✓ track_815 processed (index 815)




✓ track_826 processed (index 826)




✓ track_831 processed (index 831)




✓ track_838 processed (index 838)




✓ track_861 processed (index 861)




✓ track_863 processed (index 863)




✓ track_864 processed (index 864)




✓ track_866 processed (index 866)




✓ track_872 processed (index 872)
✔ Batch 140–149 saved
Processing batch 150–159


ERROR: [youtube] 0b3ywwJMZLQ: Video unavailable


Error with track_884 (index 884): 'NoneType' object has no attribute 'setdefault'




✓ track_887 processed (index 887)




✓ track_892 processed (index 892)




✓ track_899 processed (index 899)




✓ track_910 processed (index 910)




✓ track_919 processed (index 919)




✓ track_924 processed (index 924)




✓ track_930 processed (index 930)




✓ track_953 processed (index 953)




✓ track_954 processed (index 954)
✔ Batch 150–159 saved
Processing batch 160–169




✓ track_969 processed (index 969)




✓ track_999 processed (index 999)




✓ track_1002 processed (index 1002)




✓ track_1006 processed (index 1006)




✓ track_1041 processed (index 1041)




✓ track_1043 processed (index 1043)




✓ track_1044 processed (index 1044)




✓ track_1086 processed (index 1086)




✓ track_1089 processed (index 1089)




✓ track_1092 processed (index 1092)
✔ Batch 160–169 saved
Processing batch 170–179




✓ track_1118 processed (index 1118)




✓ track_1122 processed (index 1122)




✓ track_1131 processed (index 1131)




✓ track_1167 processed (index 1167)




✓ track_1172 processed (index 1172)




✓ track_1175 processed (index 1175)




✓ track_1199 processed (index 1199)




✓ track_1204 processed (index 1204)




✓ track_1211 processed (index 1211)




✓ track_1241 processed (index 1241)
✔ Batch 170–179 saved
Processing batch 180–189




✓ track_1252 processed (index 1252)




✓ track_1259 processed (index 1259)




✓ track_1295 processed (index 1295)




✓ track_1297 processed (index 1297)




✓ track_1299 processed (index 1299)




✓ track_1327 processed (index 1327)




✓ track_1329 processed (index 1329)








✓ track_1340 processed (index 1340)




✓ track_1363 processed (index 1363)




✓ track_1378 processed (index 1378)
✔ Batch 180–189 saved
Processing batch 190–199




✓ track_1380 processed (index 1380)




✓ track_1406 processed (index 1406)




✓ track_1411 processed (index 1411)








✓ track_1414 processed (index 1414)




✓ track_1442 processed (index 1442)




✓ track_1446 processed (index 1446)




✓ track_1460 processed (index 1460)




✓ track_1482 processed (index 1482)




✓ track_1499 processed (index 1499)




✓ track_1502 processed (index 1502)
✔ Batch 190–199 saved
Processing batch 200–209








✓ track_1525 processed (index 1525)




✓ track_1532 processed (index 1532)




✓ track_1542 processed (index 1542)




✓ track_1613 processed (index 1613)








✓ track_1622 processed (index 1622)




✓ track_1626 processed (index 1626)
Error with track_1651 (index 1651): expected string or bytes-like object, got 'float'




✓ track_1653 processed (index 1653)






KeyboardInterrupt: 

# Consolidate

Merge features into original data frame and output new csv:

In [None]:
all_files = glob.glob("features_partial*.csv")
dfs = [pd.read_csv(f) for f in all_files]
final_df = pd.concat(dfs, ignore_index=True)

final_df.to_csv("features_all.csv", index=False)
print(final_df.head())

In [None]:
lyrics_df = pd.read_csv("translatedlyrics.csv")
features_df = pd.read_csv("features_all.csv")

print(lyrics_df.columns)
print(features_df.columns)


In [None]:
merged_df = pd.merge(
    lyrics_df,
    features_df,
    on=["title", "youtube_url"],   # join on both to be safe
    how="left"                     # keep all lyrics, even if audio missing
)

In [None]:
merged_df.to_csv("lyrics_audio_merged.csv", index=False)
print(merged_df.head())