In [None]:
import sys
# Install boto3 by running pip as a module of the interpreter behind this kernel
# (sys.executable), rather than whichever `pip` happens to be first on PATH.
!{sys.executable} -m pip install boto3

Collecting boto3
  Downloading boto3-1.42.40-py3-none-any.whl.metadata (6.8 kB)
Collecting botocore<1.43.0,>=1.42.40 (from boto3)
  Downloading botocore-1.42.40-py3-none-any.whl.metadata (5.9 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.1.0-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.17.0,>=0.16.0 (from boto3)
  Downloading s3transfer-0.16.0-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.42.40-py3-none-any.whl (140 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.42.40-py3-none-any.whl (14.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m89.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jmespath-1.1.0-py3-none-any.whl (20 kB)
Downloading s3transfer-0.16.0-py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m12.3 MB/s[0m eta [36m0:

## Connect to S3 Bucket

In [None]:
import boto3
from google.colab import userdata

# Credentials are looked up in Colab Secrets at run time, so no key material
# is stored in the notebook itself.
aws_access_key_id = userdata.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = userdata.get('AWS_SECRET_ACCESS_KEY')

# S3 client for the us-west-2 region, authenticated with the keys above.
s3 = boto3.client(
    's3',
    region_name='us-west-2',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

# Sanity check: print every key returned by one listing call under raw-data/.
response = s3.list_objects_v2(
    Prefix='raw-data/',
    Bucket='mids-capstone-music-ad-matching-2026',
)
listed = response.get('Contents', [])
for obj in listed:
    print(obj['Key'])

raw-data/
raw-data/fma/
raw-data/meta-ads/
raw-data/million-song/
raw-data/synthetic-annotations/


In [None]:
# Test 1: ask S3 for the buckets visible to these credentials and show their names.
response = s3.list_buckets()
bucket_names = [bucket['Name'] for bucket in response['Buckets']]
print("Buckets:", bucket_names)

Buckets: ['mids-capstone-music-ad-matching-2026']


## Load Million Song Subset to S3

In [None]:
from google.colab import drive, userdata
import boto3, os, tarfile

# Mount Drive so the archive stored there is readable from this runtime
drive.mount('/content/drive')

# S3 client (credentials come from Colab Secrets, never hardcoded)
s3 = boto3.client(
    's3',
    aws_access_key_id=userdata.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=userdata.get('AWS_SECRET_ACCESS_KEY'),
    region_name='us-west-2'
)

BUCKET = 'mids-capstone-music-ad-matching-2026'

# Update this path to wherever you dropped it in Drive
TARZ_PATH = '/content/drive/MyDrive/Colab_Notebooks/MIDS/210/millionsongsubset.tar.gz'

# Extract the gzip-compressed tarball into /content/msd_subset.
# filter='data' opts in to tarfile's safe extraction filter: it refuses members
# that would land outside the destination (absolute paths, `..`, escaping links)
# and silences the warning that extractall() emits when no filter is given.
print("Extracting...")
with tarfile.open(TARZ_PATH, 'r:gz') as tar:
    tar.extractall('/content/msd_subset', filter='data')

# Upload every .h5 file found under the extraction directory to
# raw-data/million-song/, keeping its path relative to that directory.
print("Uploading HDF5 files to S3...")
msd_root = '/content/msd_subset'
count = 0
for root, dirs, files in os.walk(msd_root):
    for f in files:
        if not f.endswith('.h5'):
            continue
        local_path = os.path.join(root, f)
        # Preserve the A/B/C/trackid.h5 structure. relpath is anchored at the
        # extraction root, so it cannot be confused by 'msd_subset/' appearing
        # again deeper in the path; S3 keys always use forward slashes.
        rel_path = os.path.relpath(local_path, msd_root).replace(os.sep, '/')
        s3_key = f'raw-data/million-song/{rel_path}'
        s3.upload_file(local_path, BUCKET, s3_key)
        count += 1
        # Progress line every 500 uploads
        if count % 500 == 0:
            print(f"  ...{count} files uploaded")

print(f"  ✓ Done: {count} HDF5 files")

# Upload SQLite additional files: every .db file under the extraction
# directory goes to raw-data/million-song/<filename> (directory structure is
# flattened, so two .db files with the same name would overwrite each other).
print("Uploading SQLite files...")
db_count = 0
for root, dirs, files in os.walk('/content/msd_subset'):
    for f in files:
        if f.endswith('.db'):
            local_path = os.path.join(root, f)
            s3_key = f'raw-data/million-song/{f}'
            s3.upload_file(local_path, BUCKET, s3_key)
            db_count += 1
            print(f"  ✓ {s3_key}")

# Make the "nothing matched" case visible instead of finishing silently.
if db_count == 0:
    print("  ✗ No .db files found under /content/msd_subset")

# Verify: count and size everything stored under raw-data/million-song/.
# A single list_objects_v2 call returns at most 1000 keys, so the listing is
# walked page by page; otherwise the totals are capped at the first page.
print("\n--- Verification ---")
paginator = s3.get_paginator('list_objects_v2')
objects = []
for page in paginator.paginate(Bucket=BUCKET, Prefix='raw-data/million-song/'):
    objects.extend(page.get('Contents', []))
total_mb = sum(o['Size'] for o in objects) / 1e6
print(f"raw-data/million-song/: {len(objects)} files, {total_mb:.1f} MB")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Extracting...


  tar.extractall('/content/msd_subset')


Uploading HDF5 files to S3...
  ...500 files uploaded
  ...1000 files uploaded
  ...1500 files uploaded
  ...2000 files uploaded
  ...2500 files uploaded
  ...3000 files uploaded
  ...3500 files uploaded
  ...4000 files uploaded
  ...4500 files uploaded
  ...5000 files uploaded
  ...5500 files uploaded
  ...6000 files uploaded
  ...6500 files uploaded
  ...7000 files uploaded
  ...7500 files uploaded
  ...8000 files uploaded
  ...8500 files uploaded
  ...9000 files uploaded
  ...9500 files uploaded
  ...10000 files uploaded
  ✓ Done: 10000 HDF5 files
Uploading SQLite files...

--- Verification ---
raw-data/million-song/: 1000 files, 268.4 MB


## Load FMA Metadata (fma_metadata.zip) to S3

In [None]:
import boto3, os, zipfile, hashlib
import urllib.request
from google.colab import userdata

# Bucket, source URL and the digest the downloaded archive must match
BUCKET = 'mids-capstone-music-ad-matching-2026'
FMA_URL = 'https://os.unil.cloud.switch.ch/fma/fma_metadata.zip'
EXPECTED_SHA1 = 'f0df49ffe5f2a6008d7dc83c6915b31835dfe733'

# S3 client (credentials come from Colab Secrets)
s3 = boto3.client(
    's3',
    region_name='us-west-2',
    aws_access_key_id=userdata.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=userdata.get('AWS_SECRET_ACCESS_KEY'),
)

# Fetch the archive into the current working directory
print("Downloading fma_metadata.zip (342 MB)...")
urllib.request.urlretrieve(FMA_URL, 'fma_metadata.zip')
downloaded_mb = os.path.getsize('fma_metadata.zip') / 1e6
print(f"  Downloaded: {downloaded_mb:.1f} MB")

# Hash the file in 8192-byte reads and compare with the expected digest;
# a mismatch stops the cell before anything is extracted or uploaded.
print("Verifying sha1...")
sha1 = hashlib.sha1()
with open('fma_metadata.zip', 'rb') as archive:
    for chunk in iter(lambda: archive.read(8192), b''):
        sha1.update(chunk)
actual_sha1 = sha1.hexdigest()
assert actual_sha1 == EXPECTED_SHA1, f"SHA1 mismatch! Expected {EXPECTED_SHA1}, got {actual_sha1}"
print(f"  ✓ SHA1 verified: {actual_sha1}")

# Unpack the archive and show which members it contained
print("Extracting...")
with zipfile.ZipFile('fma_metadata.zip', 'r') as z:
    z.extractall('/content/fma_metadata')
    members = z.namelist()
    print(f"  Contents: {members}")

# Push the selected CSVs to raw-data/fma/; a missing file is reported and skipped
FMA_FILES = ['tracks.csv', 'genres.csv', 'features.csv', 'echonest.csv']
print("Uploading to S3...")
for f in FMA_FILES:
    # The zip unpacks into its own fma_metadata/ folder, hence the nested path
    local_path = f'/content/fma_metadata/fma_metadata/{f}'
    s3_key = f'raw-data/fma/{f}'
    if not os.path.exists(local_path):
        print(f"  ✗ {f} not found")
        continue
    s3.upload_file(local_path, BUCKET, s3_key)
    size_mb = os.path.getsize(local_path) / 1e6
    print(f"  ✓ {s3_key} ({size_mb:.1f} MB)")

# List what is now stored under raw-data/fma/ together with each object's size
print("\n--- S3 Verification ---")
response = s3.list_objects_v2(Prefix='raw-data/fma/', Bucket=BUCKET)
for obj in response.get('Contents', []):
    size_mb = obj['Size'] / 1e6
    print(f"  {obj['Key']} ({size_mb:.1f} MB)")

Downloading fma_metadata.zip (342 MB)...
  Downloaded: 358.4 MB
Verifying sha1...
  ✓ SHA1 verified: f0df49ffe5f2a6008d7dc83c6915b31835dfe733
Extracting...
  Contents: ['fma_metadata/README.txt', 'fma_metadata/checksums', 'fma_metadata/not_found.pickle', 'fma_metadata/raw_genres.csv', 'fma_metadata/raw_albums.csv', 'fma_metadata/raw_artists.csv', 'fma_metadata/raw_tracks.csv', 'fma_metadata/tracks.csv', 'fma_metadata/genres.csv', 'fma_metadata/raw_echonest.csv', 'fma_metadata/echonest.csv', 'fma_metadata/features.csv']
Uploading to S3...
  ✓ raw-data/fma/tracks.csv (260.4 MB)
  ✓ raw-data/fma/genres.csv (0.0 MB)
  ✓ raw-data/fma/features.csv (951.1 MB)
  ✓ raw-data/fma/echonest.csv (44.0 MB)

--- S3 Verification ---
  raw-data/fma/ (0.0 MB)
  raw-data/fma/echonest.csv (44.0 MB)
  raw-data/fma/features.csv (951.1 MB)
  raw-data/fma/genres.csv (0.0 MB)
  raw-data/fma/tracks.csv (260.4 MB)


## Load IAB Content Taxonomy to S3

In [None]:
import boto3, os
import urllib.request
from google.colab import userdata

BUCKET = 'mids-capstone-music-ad-matching-2026'

# S3 client (credentials come from Colab Secrets)
s3 = boto3.client(
    's3',
    region_name='us-west-2',
    aws_access_key_id=userdata.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=userdata.get('AWS_SECRET_ACCESS_KEY'),
)

# Raw GitHub URL of the taxonomy TSV (taken from the repository's develop branch)
IAB_URL = 'https://raw.githubusercontent.com/InteractiveAdvertisingBureau/Taxonomies/develop/Content%20Taxonomies/Content%20Taxonomy%203.1.tsv'

# Fetch the file into the working directory and report its size in KB
print("Downloading IAB Content Taxonomy 3.1...")
urllib.request.urlretrieve(IAB_URL, 'Content_Taxonomy_3.1.tsv')
size = os.path.getsize('Content_Taxonomy_3.1.tsv')
size_kb = size / 1e3
print(f"  Downloaded: {size_kb:.1f} KB")

# Stored under meta-ads/ because this file is the ad-side taxonomy
s3_key = 'raw-data/meta-ads/IAB_Content_Taxonomy_3.1.tsv'
s3.upload_file('Content_Taxonomy_3.1.tsv', BUCKET, s3_key)
print(f"  ✓ Uploaded: {s3_key}")

# Quick peek at the structure
import pandas as pd
# The first line of the TSV is a banner row ("Relational ID System",
# "Content Taxonomy v3.1 Tiered Categories", ...); the actual column labels
# (Unique ID, Parent, Name, ...) sit on the second line. Reading with the
# default header=0 yields 'Unnamed: N' columns and pushes the real labels into
# the first data row, so point pandas at the second line instead.
df = pd.read_csv('Content_Taxonomy_3.1.tsv', sep='\t', header=1)
print(f"\n  Shape: {df.shape}")
print(f"  Columns: {df.columns.tolist()}")
print(f"\n{df.head(10)}")

Downloading IAB Content Taxonomy 3.1...
  Downloaded: 47.3 KB
  ✓ Uploaded: raw-data/meta-ads/IAB_Content_Taxonomy_3.1.tsv

  Shape: (705, 8)
  Columns: ['Relational ID System', 'Unnamed: 1', 'Unnamed: 2', 'Content Taxonomy v3.1 Tiered Categories', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Extension']

  Relational ID System Unnamed: 1                        Unnamed: 2  \
0            Unique ID     Parent                              Name   
1                  150        NaN                       Attractions   
2                  151        150         Amusement and Theme Parks   
3                  179        150                Bars & Restaurants   
4                  181        150                Casinos & Gambling   
5                  153        150  Historic Site and Landmark Tours   
6                  154        150          Malls & Shopping Centers   
7                  155        150               Museums & Galleries   
8                  158        150                       

In [None]:
# Test: write a small text object under test-uploads/ to confirm that uploads
# to the bucket succeed with the current credentials.
s3.put_object(
    Key='test-uploads/test-from-jason.txt',
    Body='Hello from Jason - IAM user test',
    Bucket='mids-capstone-music-ad-matching-2026',
)
print("Upload successful!")

Upload successful!


## Consolidate Million Song HDF5 Files to Metadata CSV

In [None]:
# Install h5py (quietly, via -q); it is used below to open the Million Song .h5 files.
!pip install h5py -q

import h5py
import pandas as pd
import boto3, os
from google.colab import userdata

# S3 client (credentials come from Colab Secrets)
s3 = boto3.client(
    's3',
    aws_access_key_id=userdata.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=userdata.get('AWS_SECRET_ACCESS_KEY'),
    region_name='us-west-2'
)

BUCKET = 'mids-capstone-music-ad-matching-2026'

# First, look at one HDF5 file to see what fields are available.
# Pages are fetched lazily and the search stops at the first .h5 key, so this
# still works when the first few keys under the prefix are not .h5 files.
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=BUCKET, Prefix='raw-data/million-song/')
sample_key = next(
    (
        obj['Key']
        for page in pages
        for obj in page.get('Contents', [])
        if obj['Key'].endswith('.h5')
    ),
    None,
)
# Fail with a clear message instead of passing None to download_file.
if sample_key is None:
    raise FileNotFoundError(
        f"No .h5 objects found under s3://{BUCKET}/raw-data/million-song/"
    )
print(f"Found: {sample_key}")

# Download that single file to inspect it locally
s3.download_file(BUCKET, sample_key, 'sample.h5')

# Print the name of every group/dataset in the file
with h5py.File('sample.h5', 'r') as f:
    def print_structure(name, obj):
        """Callback for visititems: print the member's path (returns None so the walk continues)."""
        print(name)
    f.visititems(print_structure)

Found: raw-data/million-song/MillionSongSubset/A/A/A/TRAAAAW128F429D538.h5
analysis
analysis/bars_confidence
analysis/bars_start
analysis/beats_confidence
analysis/beats_start
analysis/sections_confidence
analysis/sections_start
analysis/segments_confidence
analysis/segments_loudness_max
analysis/segments_loudness_max_time
analysis/segments_loudness_start
analysis/segments_pitches
analysis/segments_start
analysis/segments_timbre
analysis/songs
analysis/tatums_confidence
analysis/tatums_start
metadata
metadata/artist_terms
metadata/artist_terms_freq
metadata/artist_terms_weight
metadata/similar_artists
metadata/songs
musicbrainz
musicbrainz/artist_mbtags
musicbrainz/artist_mbtags_count
musicbrainz/songs


In [None]:
# Show the field names and first record of the two per-song tables, then the
# full contents of the artist-term arrays, all from the sample file.
with h5py.File('sample.h5', 'r') as h5:
    song_tables = (
        ("=== metadata/songs ===", 'metadata/songs'),
        ("\n=== analysis/songs ===", 'analysis/songs'),
    )
    for banner, path in song_tables:
        print(banner)
        table = h5[path]
        print(table.dtype.names)
        print(table[0])

    term_arrays = (
        ("\n=== artist_terms ===", 'metadata/artist_terms'),
        ("\n=== artist_terms_freq ===", 'metadata/artist_terms_freq'),
    )
    for banner, path in term_arrays:
        print(banner)
        print(h5[path][:])

=== metadata/songs ===
('analyzer_version', 'artist_7digitalid', 'artist_familiarity', 'artist_hotttnesss', 'artist_id', 'artist_latitude', 'artist_location', 'artist_longitude', 'artist_mbid', 'artist_name', 'artist_playmeid', 'genre', 'idx_artist_terms', 'idx_similar_artists', 'release', 'release_7digitalid', 'song_hotttnesss', 'song_id', 'title', 'track_7digitalid')
(b'', 165270, 0.5817937658450281, 0.4019975433642836, b'ARD7TVE1187B99BFB1', nan, b'California - LA', nan, b'e77e51a5-4761-45b3-9847-2051f811e366', b'Casual', 4479, b'', 0, 0, b'Fear Itself', 300848, 0.6021199899057548, b'SOMZWCG12A8C13C480', b"I Didn't Mean To", 3401791)

=== analysis/songs ===
('analysis_sample_rate', 'audio_md5', 'danceability', 'duration', 'end_of_fade_in', 'energy', 'idx_bars_confidence', 'idx_bars_start', 'idx_beats_confidence', 'idx_beats_start', 'idx_sections_confidence', 'idx_sections_start', 'idx_segments_confidence', 'idx_segments_loudness_max', 'idx_segments_loudness_max_time', 'idx_segments_

In [None]:
import h5py
import pandas as pd
import boto3, os
from google.colab import userdata

# Target bucket for everything this cell reads and writes
BUCKET = 'mids-capstone-music-ad-matching-2026'

# S3 client (credentials come from Colab Secrets)
s3 = boto3.client(
    's3',
    region_name='us-west-2',
    aws_access_key_id=userdata.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=userdata.get('AWS_SECRET_ACCESS_KEY'),
)

def extract_song(filepath):
    """Extract one flat record of song-level fields from a single MSD HDF5 file.

    Reads the first row of ``metadata/songs`` and ``analysis/songs`` plus the
    ``metadata/artist_terms`` array.

    Args:
        filepath: Path to a local ``.h5`` file.

    Returns:
        dict with identifier, metadata, audio-analysis and tag fields, or
        ``None`` if the file could not be read (the error is printed, not
        raised, so one bad file does not abort a bulk run).
    """
    def _text(raw):
        # Free-text fields may contain bytes that are not valid UTF-8; drop them.
        return raw.decode('utf-8', errors='ignore')

    try:
        with h5py.File(filepath, 'r') as f:
            meta = f['metadata/songs'][0]
            analysis = f['analysis/songs'][0]

            # Artist terms (tags) — joined as a pipe-delimited string
            terms = f['metadata/artist_terms'][:]
            terms_str = '|'.join(_text(t) for t in terms)

            return {
                # Metadata (identifiers are decoded strictly)
                'track_id': analysis['track_id'].decode(),
                'song_id': meta['song_id'].decode(),
                'artist_id': meta['artist_id'].decode(),
                'artist_name': _text(meta['artist_name']),
                'title': _text(meta['title']),
                'release': _text(meta['release']),
                'artist_location': _text(meta['artist_location']),
                'artist_latitude': meta['artist_latitude'],
                'artist_longitude': meta['artist_longitude'],
                'artist_familiarity': meta['artist_familiarity'],
                'artist_hotttnesss': meta['artist_hotttnesss'],
                'song_hotttnesss': meta['song_hotttnesss'],
                'genre': _text(meta['genre']),

                # Audio analysis
                'duration': analysis['duration'],
                'tempo': analysis['tempo'],
                'loudness': analysis['loudness'],
                'danceability': analysis['danceability'],
                'energy': analysis['energy'],
                'key': analysis['key'],
                'key_confidence': analysis['key_confidence'],
                'mode': analysis['mode'],
                'mode_confidence': analysis['mode_confidence'],
                'time_signature': analysis['time_signature'],
                'time_signature_confidence': analysis['time_signature_confidence'],
                'start_of_fade_out': analysis['start_of_fade_out'],
                'end_of_fade_in': analysis['end_of_fade_in'],

                # Tags
                'artist_terms': terms_str,
                'num_artist_terms': len(terms),
            }
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller skip the file.
        print(f"  Error reading {filepath}: {e}")
        return None

# List every .h5 key under raw-data/million-song/ (paginated, so all keys are seen)
print("Listing all HDF5 files in S3...")
paginator = s3.get_paginator('list_objects_v2')
h5_keys = []
for page in paginator.paginate(Bucket=BUCKET, Prefix='raw-data/million-song/'):
    for obj in page.get('Contents', []):
        if obj['Key'].endswith('.h5'):
            h5_keys.append(obj['Key'])
print(f"  Found {len(h5_keys)} HDF5 files")

# Process one file at a time: download, extract a row, delete the local copy.
rows = []
batch_size = 500  # progress is printed every batch_size files (no actual batching)
# The scratch directory only needs to be created once, not on every iteration.
os.makedirs('/content/msd_temp', exist_ok=True)
for i, key in enumerate(h5_keys):
    local_path = f'/content/msd_temp/{os.path.basename(key)}'
    s3.download_file(BUCKET, key, local_path)

    try:
        row = extract_song(local_path)
    finally:
        # Clean up to save disk space, even if the cell is interrupted mid-file
        os.remove(local_path)

    # extract_song returns None for unreadable files; those are skipped
    if row:
        rows.append(row)

    if (i + 1) % batch_size == 0:
        print(f"  ...processed {i + 1}/{len(h5_keys)} files")

# Turn the collected records into a DataFrame and print a quick summary
print(f"\nBuilding DataFrame from {len(rows)} songs...")
df = pd.DataFrame(rows)
for summary in (df.shape, df.head(), df.dtypes):
    print(summary)

# Write the table to local disk, then copy it to processed-data/ in the bucket
output_path = '/content/million_song_subset.csv'
df.to_csv(output_path, index=False)
csv_mb = os.path.getsize(output_path) / 1e6
print(f"\nSaved: {csv_mb:.1f} MB")

s3_key = 'processed-data/million_song_subset.csv'
s3.upload_file(output_path, BUCKET, s3_key)
print(f"  ✓ Uploaded: {s3_key}")

Listing all HDF5 files in S3...
  Found 10000 HDF5 files
  ...processed 500/10000 files
  ...processed 1000/10000 files
  ...processed 1500/10000 files
  ...processed 2000/10000 files
  ...processed 2500/10000 files
  ...processed 3000/10000 files
  ...processed 3500/10000 files
  ...processed 4000/10000 files
  ...processed 4500/10000 files
  ...processed 5000/10000 files
  ...processed 5500/10000 files
  ...processed 6000/10000 files
  ...processed 6500/10000 files
  ...processed 7000/10000 files
  ...processed 7500/10000 files
  ...processed 8000/10000 files
  ...processed 8500/10000 files
  ...processed 9000/10000 files
  ...processed 9500/10000 files
  ...processed 10000/10000 files

Building DataFrame from 10000 songs...
(10000, 28)
             track_id             song_id           artist_id  \
0  TRAAAAW128F429D538  SOMZWCG12A8C13C480  ARD7TVE1187B99BFB1   
1  TRAAABD128F429CF47  SOCIWDW12A8C13D406  ARMJAGH1187FB546F3   
2  TRAAADZ128F9348C2E  SOXVLOJ12AB0189215  ARKRRTF1187B9

## Extract Million Song Segment Data to CSV

In [None]:
import h5py
import pandas as pd
import boto3, os
from google.colab import userdata

# Target bucket for everything this cell reads and writes
BUCKET = 'mids-capstone-music-ad-matching-2026'

# S3 client (credentials come from Colab Secrets)
s3 = boto3.client(
    's3',
    region_name='us-west-2',
    aws_access_key_id=userdata.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=userdata.get('AWS_SECRET_ACCESS_KEY'),
)

def extract_segments(filepath):
    """Return one record per segment of an MSD HDF5 file.

    Each record carries the song id, the segment's index and start value, its
    ``loudness_max`` value, and 12 ``timbre_*`` plus 12 ``pitch_*`` values.
    On any read error the problem is printed and an empty list is returned.
    """
    try:
        with h5py.File(filepath, 'r') as h5:
            sid = h5['metadata/songs'][0]['song_id'].decode()

            starts = h5['analysis/segments_start'][:]
            timbre = h5['analysis/segments_timbre'][:]      # (n_segments, 12)
            pitches = h5['analysis/segments_pitches'][:]    # (n_segments, 12)
            loud_max = h5['analysis/segments_loudness_max'][:]

            records = []
            for seg_idx, seg_start in enumerate(starts):
                record = {
                    'song_id': sid,
                    'segment_idx': seg_idx,
                    'segment_start': seg_start,
                    'loudness_max': loud_max[seg_idx],
                }
                # Timbre columns first, then pitch chroma columns
                record.update(
                    {f'timbre_{dim}': timbre[seg_idx][dim] for dim in range(12)}
                )
                record.update(
                    {f'pitch_{dim}': pitches[seg_idx][dim] for dim in range(12)}
                )
                records.append(record)

            return records
    except Exception as e:
        print(f"  Error: {filepath}: {e}")
        return []

# List every .h5 key under raw-data/million-song/ (paginated, so all keys are seen)
print("Listing HDF5 files...")
paginator = s3.get_paginator('list_objects_v2')
h5_keys = []
for page in paginator.paginate(Bucket=BUCKET, Prefix='raw-data/million-song/'):
    for obj in page.get('Contents', []):
        if obj['Key'].endswith('.h5'):
            h5_keys.append(obj['Key'])
print(f"  Found {len(h5_keys)} HDF5 files")

# Extract segments one file at a time: download, extract, delete the local copy.
# NOTE(review): all_rows keeps one dict per segment for every file in memory;
# consider writing per-file results out incrementally if memory becomes tight.
all_rows = []
# The scratch directory only needs to be created once, not on every iteration.
os.makedirs('/content/msd_temp', exist_ok=True)
for i, key in enumerate(h5_keys):
    local_path = f'/content/msd_temp/{os.path.basename(key)}'
    s3.download_file(BUCKET, key, local_path)

    try:
        rows = extract_segments(local_path)
    finally:
        # Remove the temp file even if the cell is interrupted mid-file
        os.remove(local_path)
    all_rows.extend(rows)

    # Progress line every 500 files
    if (i + 1) % 500 == 0:
        print(f"  ...processed {i + 1}/{len(h5_keys)} files, {len(all_rows):,} segments so far")

# Turn the collected segment records into a DataFrame and preview it
print(f"\nBuilding DataFrame from {len(all_rows):,} segments...")
df_segments = pd.DataFrame(all_rows)
print(f"  Shape: {df_segments.shape}")
print(df_segments.head())

# Write the table to local disk, then copy it to processed-data/ in the bucket
output_path = '/content/million_song_subset_segments.csv'
df_segments.to_csv(output_path, index=False)
csv_mb = os.path.getsize(output_path) / 1e6
print(f"\nSaved: {csv_mb:.1f} MB")

s3_key = 'processed-data/million_song_subset_segments.csv'
s3.upload_file(output_path, BUCKET, s3_key)
print(f"  ✓ Uploaded: {s3_key}")

Listing HDF5 files...
  Found 10000 HDF5 files
  ...processed 500/10000 files, 424,567 segments so far
  ...processed 1000/10000 files, 846,275 segments so far
  ...processed 1500/10000 files, 1,279,106 segments so far
  ...processed 2000/10000 files, 1,715,918 segments so far
  ...processed 2500/10000 files, 2,127,161 segments so far
  ...processed 3000/10000 files, 2,558,562 segments so far
  ...processed 3500/10000 files, 2,987,276 segments so far
  ...processed 4000/10000 files, 3,428,612 segments so far
  ...processed 4500/10000 files, 3,858,940 segments so far
  ...processed 5000/10000 files, 4,291,283 segments so far
  ...processed 5500/10000 files, 4,718,649 segments so far
  ...processed 6000/10000 files, 5,150,237 segments so far
  ...processed 6500/10000 files, 5,578,771 segments so far
  ...processed 7000/10000 files, 6,013,690 segments so far
  ...processed 7500/10000 files, 6,432,548 segments so far
  ...processed 8000/10000 files, 6,846,573 segments so far
  ...processed