I removed several directories prior to this step:

All of Melon's "Did the Internet kill the album review?" videos (since they're not relevant), and his "Elzhi- Elmatic" review (since it doesn't have a thumbnail).

In [1]:
%matplotlib inline

import os
import glob
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# glob.glob returns matches in arbitrary, filesystem-dependent order. Sort the
# result so that list positions (and any row labels derived from them later)
# are reproducible across machines and re-runs.
album_dirs = sorted(glob.glob("../data/album_reviews/*"))
print(len(album_dirs))
print(album_dirs[0:5])

1991
['../data/album_reviews/20100308_The Knife- Tomorrow, In a Year Album Review', '../data/album_reviews/20100505_Flying Lotus- Cosmogramma ALBUM REVIEW', '../data/album_reviews/20100509_Little Women- Throat ALBUM REVIEW', '../data/album_reviews/20100510_Holy Fuck- Latin ALBUM REVIEW', '../data/album_reviews/20100511_The National- High Violet ALBUM REVIEW']


In [3]:
# Pattern for "<digits>_<artist> - <title> ALBUM REVIEW" directory paths.
# Written as a raw string so that `\d` reaches the regex engine verbatim
# instead of being parsed as an (invalid) string escape sequence.
# Both (.*) groups are greedy, so when a name contains several hyphens the
# split happens at the LAST one.
search_string = r"../data/album_reviews/(\d+)_(.*) ?- ?(.*) ALBUM REVIEW"

def extract_date_artist_title(s):
    """Split an album directory path into its date, artist and title parts.

    Parameters
    ----------
    s : str
        Path to match against ``search_string`` (case-insensitively).

    Returns
    -------
    tuple
        ``(date, artist, title)`` as strings when the path matches the
        pattern, otherwise ``(np.nan, np.nan, np.nan)``.
    """
    # Match against the argument itself rather than a global loop variable,
    # so the function gives the right answer no matter where it is called from.
    match = re.search(search_string, s, flags=re.IGNORECASE)
    if match is None:
        return (np.nan, np.nan, np.nan)
    return match.groups()

def extract_score(s):
    """Pull the "<text>/10" fragment out of a video description.

    Returns the captured text (everything on the line up to and including
    "/10") for the first place where it is followed by a newline, a pipe or
    a space. Returns ``np.nan`` when no such fragment exists.
    """
    found = re.search("\n?(.*/10)[\n| ]", s)
    if found is None:
        return np.nan
    return found.group(1)

def get_json_data(info):
    """Return ``[likes, dislikes, views, duration]`` read from ``info``.

    ``info`` must provide the keys "like_count", "dislike_count",
    "view_count" and "duration"; a missing key raises ``KeyError``.
    """
    wanted_keys = ("like_count", "dislike_count", "view_count", "duration")
    return [info[key] for key in wanted_keys]

# Build one metadata record per album directory:
# [filename, date, artist, title, score, likes, dislikes, views, duration]
metadata = []
for album in album_dirs:
    date, artist, title = extract_date_artist_title(album)

    # List the directory once and reuse the listing for both lookups.
    # os.listdir is used instead of glob because album names can contain
    # brackets, which glob would interpret as character classes.
    album_files = os.listdir(album)

    desc_file = f"{album}/{next(x for x in album_files if x.endswith('description'))}"
    # Decode explicitly as UTF-8 so the text read does not depend on the
    # platform's default locale encoding.
    with open(desc_file, "r", encoding="utf-8") as f:
        description = f.read()

    score = extract_score(description)

    json_file = f"{album}/{next(x for x in album_files if x.endswith('json'))}"
    with open(json_file, "r", encoding="utf-8") as f:
        info = json.load(f)

    results = get_json_data(info)

    # The filename column is the directory name with ".jpg" appended.
    filename = album.split("/")[-1] + ".jpg"

    metadata.append([filename, date, artist, title, score] + results)

In [4]:
# Column labels, in the same order as the fields of each record in `metadata`.
metadata_columns = [
    "filename",
    "date",
    "artist",
    "title",
    "score",
    "likes",
    "dislikes",
    "views",
    "duration",
]
metadata_df = pd.DataFrame(metadata, columns=metadata_columns)

# Preview the first few rows.
metadata_df.head()

Unnamed: 0,filename,date,artist,title,score,likes,dislikes,views,duration
0,"20100308_The Knife- Tomorrow, In a Year Album ...",20100308,The Knife,"Tomorrow, In a Year",3/10,210,61,25696,397
1,20100505_Flying Lotus- Cosmogramma ALBUM REVIE...,20100505,Flying Lotus,Cosmogramma,8/10,1517,40,132687,471
2,20100509_Little Women- Throat ALBUM REVIEW.jpg,20100509,Little Women,Throat,9/10,249,7,26120,467
3,20100510_Holy Fuck- Latin ALBUM REVIEW.jpg,20100510,Holy Fuck,Latin,7/10,96,4,10291,339
4,20100511_The National- High Violet ALBUM REVIE...,20100511,The National,High Violet,6/10,270,243,53457,354


In [5]:
# Show every row that has a missing value in at least one column.
metadata_df[metadata_df.isnull().any(axis="columns")]

Unnamed: 0,filename,date,artist,title,score,likes,dislikes,views,duration
84,20101011_Belle and Sebastian Write About Love ...,,,,6/10,139,8,16671,318
96,20101029_Small Black- New Chain ALBUM REVIEW.jpg,20101029,Small Black,New Chain,,411,9,8139,128
103,20101107_Matt & Kim- Sidewalks ALBUM REVIEW.jpg,20101107,Matt & Kim,Sidewalks,,206,6,15901,345
136,20110124_Deerhoof- Deerhoof vs Evil ALBUM REVI...,20110124,Deerhoof,Deerhoof vs Evil,,240,7,21211,324
193,20110511_The Lonely Island- Turtleneck and Cha...,20110511,The Lonely Island,Turtleneck and Chain,,1350,108,134950,343
...,...,...,...,...,...,...,...,...,...
1574,20180114_Aphex Twin - Selected Ambient Works 8...,20180114,Aphex Twin - Selected Ambient Works 85,92,,7473,148,288587,451
1581,20180126_EDEN - vertigo ALBUM REVIEW.jpg,20180126,EDEN,vertigo,,2847,113,82332,206
1633,20180427_Felix Blume - Death In Haiti - Funera...,20180427,Felix Blume - Death In Haiti,Funeral Brass Bands & Sounds From Port Au Prince,,1799,33,65243,209
1862,20190711_Blarf - Cease & Desist ALBUM REVIEW.jpg,20190711,Blarf,Cease & Desist,,13857,1317,232838,249


In [6]:
# Show the rows whose date could not be extracted.
metadata_df[metadata_df["date"].isna()]

Unnamed: 0,filename,date,artist,title,score,likes,dislikes,views,duration
84,20101011_Belle and Sebastian Write About Love ...,,,,6/10,139,8,16671,318
798,20140410_SZA - Z EP_ALBUM REVIEW ft. Cakes Da ...,,,,4/10,1203,267,97086,472


In [7]:
# Print the source directory behind every row whose date is missing.
# Iterate over the filtered index directly: only the row label is needed, so
# there is no reason to build a Series per row with iterrows().
for i in metadata_df.index[metadata_df["date"].isnull()]:
    album = album_dirs[i]
    print(i, album)

84 ../data/album_reviews/20101011_Belle and Sebastian Write About Love ALBUM REVIEW
798 ../data/album_reviews/20140410_SZA - Z EP_ALBUM REVIEW ft. Cakes Da Killa


In [8]:
# Hand-written corrections for the directories whose names do not follow the
# "<artist> - <title> ALBUM REVIEW" pattern.
# Rows are located by filename rather than by hard-coded row label: labels
# shift whenever directories are added or removed, and writing to a stale
# label silently overwrites an unrelated album while leaving the intended
# row empty.
manual_fixes = {
    "20101011_Belle and Sebastian Write About Love ALBUM REVIEW.jpg": (
        "20101011",
        "Belle and Sebastian",
        "Write About Love",
    ),
    "20140410_SZA - Z EP_ALBUM REVIEW ft. Cakes Da Killa.jpg": (
        "20140410",
        "SZA",
        "Z EP",
    ),
}

for fix_filename, fix_values in manual_fixes.items():
    row_mask = metadata_df["filename"] == fix_filename
    # Fail loudly if the target row is absent or ambiguous instead of
    # patching the wrong record.
    if row_mask.sum() != 1:
        raise ValueError(
            f"Expected exactly one row for {fix_filename!r}, found {row_mask.sum()}"
        )
    for column, value in zip(("date", "artist", "title"), fix_values):
        metadata_df.loc[row_mask, column] = value

In [9]:
# Shape of the subset of rows with no extracted score: (row count, column count).
metadata_df[metadata_df["score"].isna()].shape

(97, 9)

In [10]:
# Write the metadata table to disk as tab-separated values, without the row index.
metadata_df.to_csv(
    "../data/album_metadata.tsv",
    sep="\t",
    index=False,
)