## Imports

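In this section, we'll import:
- `os` to build the path to the data file
- `json` to parse the JSON documents
- `re` to split the lyrics into words
- `wordcloud` for the creation of a word cloud
- `matplotlib` to show the word cloud
- `pandas` to compute and tabulate the descriptive statistics
- `lookup_json` (local) to allow the extraction of all lyrics from the tracks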

In [None]:
import json
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import WordCloud

import lookup_json

## Read the text

In [None]:
# Load the track backup file into `obj`. The encoding is pinned to UTF-8 so
# that non-ASCII characters decode the same way on every platform instead of
# depending on the locale's default encoding.
with open(os.curdir + "/../data/backup-tracks.json", 'r', encoding="utf-8") as f:
    obj = json.load(f)

In [None]:
# Collect everything the path */lyrics/*/content selects and merge it into a
# single space-separated string.
lyric_sections = lookup_json.dump(obj, ["*", "lyrics", "*", "content"])
text = " ".join(lyric_sections)

# Configure a 480x480 cloud with no margin, then fill it from the text.
cloud_builder = WordCloud(width=480, height=480, margin=0)
wordcloud = cloud_builder.generate(text)

# Show the cloud as an image, hiding the axes and removing the plot margins.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)
# To export the figure, uncomment:
# plt.savefig("wordcloud.svg")

## Descriptive Statistics

In [None]:
def get_present_values(obj, path):
    """Look up ``path`` in ``obj`` and separate found values from missing ones.

    A fresh list is passed to ``lookup_json.dump`` as its third argument, and
    every result that is that exact object (identity comparison) is treated
    as absent.
    NOTE(review): this presumes ``dump`` yields its third argument in place
    of values it cannot resolve -- confirm against ``lookup_json``.

    Returns:
        A ``(present_values, values)`` tuple. ``values`` holds everything
        ``dump`` produced; ``present_values`` is the same sequence with the
        sentinel entries removed.
    """
    sentinel = []  # unique object, only ever compared by identity
    all_values = list(lookup_json.dump(obj, path, sentinel))
    found = [value for value in all_values if value is not sentinel]
    return (found, all_values)

def print_missing(obj, path, name):
    """Print the percentage of looked-up entries at ``path`` that are missing.

    Args:
        obj: The data structure handed on to ``get_present_values``.
        path: The lookup path handed on to ``get_present_values``.
        name: Label used for the field in the printed line.

    The percentage is ``100 - 100 * present / total``. When the lookup
    produces no entries at all the percentage is undefined, so a placeholder
    line is printed instead of raising ``ZeroDivisionError``.
    """
    present_values, values = get_present_values(obj, path)
    if not values:
        print(f"Missing {name} (%): n/a (no entries found)")
        return
    print(f"Missing {name} (%): {100 - (100 * len(present_values) / len(values))}")
    

In [None]:
# Report, for each field of interest, the percentage of entries without a
# value. Each field is reported exactly once.
print_missing(obj, ["*", "publishedAt"], "publishedAt")
print_missing(obj, ["*", "album"], "album")
print_missing(obj, ["*", "album", "image"], "album.image")
print_missing(obj, ["*", "lyrics", "0", "title"], "lyrics.0.title")
print_missing(obj, ["*", "duration"], "duration")

# Summary statistics over the durations that are actually present.
present_durations = get_present_values(obj, ["*", "duration"])[0]
df_durations = pd.DataFrame(present_durations, columns=["duration"])
df_durations.describe()

In [None]:
# A "word" is a run of non-whitespace characters that both starts and ends on
# a word boundary, so surrounding punctuation is trimmed while inner
# punctuation (e.g. the apostrophe in "don't") is kept.
wc_regex = re.compile(r"\b\S+\b")


def get_words(text):
    """Lazily yield each word of ``text``, in order, as matched by ``wc_regex``."""
    for match in wc_regex.finditer(text):
        yield match.group()

In [None]:

# Per-lyrics tallies, one entry per item yielded by the */lyrics lookup.
word_counts = []
section_counts = []

# Lyrics entries in which no word was found, kept for manual inspection.
weird_songs = []
for index, lyrics in enumerate(lookup_json.dump(obj, ["*", "lyrics"])):
    sections = list(lookup_json.dump(lyrics, ["*", "content"]))

    # Join the sections with newlines and count the words in the result.
    text = "\n".join(sections)
    num_words = sum(1 for _word in get_words(text))
    if not num_words:
        # Fetch the lyrics entry again through its index-based path and keep
        # the first result.
        empty_lyrics = next(lookup_json.dump(obj, [str(index), "lyrics"]))
        weird_songs.append(empty_lyrics)

    section_counts.append(len(sections))
    word_counts.append(num_words)

# To inspect the wordless entries, uncomment:
# print(weird_songs)

# Print the word-count summary statistics as a LaTeX table.
df_word_counts = pd.DataFrame(word_counts, columns=["word_count"])
word_count_summary = df_word_counts.describe()
print(word_count_summary.style.to_latex())

df_section_counts = pd.DataFrame(section_counts, columns=["section_count"])
# To get the section-count summary as LaTeX, uncomment:
# df_section_counts.describe().style.to_latex()

In [None]:
# Count how often each genre value occurs across the */genres/* lookup.
genre_counts = Counter(lookup_json.dump(obj, ["*", "genres", "*"]))

# Keep the 20 most frequent genres, most frequent first.
genre_items = genre_counts.most_common(20)

df_tag_counts = pd.DataFrame(genre_items, columns=["Genre", "Number of tracks"])
# NOTE(review): this is not the last expression of the cell, so its result is
# presumably discarded; wrap it in print()/display() if the summary is wanted.
df_tag_counts.describe()

print(df_tag_counts.to_latex())

# Horizontal bar chart of the top genres, saved as an SVG file.
figure = df_tag_counts.plot.barh(x="Genre", y="Number of tracks").get_figure()
figure.savefig("genre_counts.svg", pad_inches=0.1, bbox_inches="tight")


In [None]:
dates = lookup_json.dump(obj, ["*", "publishedAt"])

# For each date string, take the text before the first comma and keep its last
# space-separated token, then count how often each token occurs.
# NOTE(review): this token is presumably the year; that depends on the
# publishedAt format, and every yielded value must be a string -- confirm.
year_counts = Counter(date.split(",")[0].split(" ")[-1] for date in dates)

# Keys are unique, so sorting the (year, count) pairs orders them by year
# (string comparison, ascending).
year_items = sorted(year_counts.items())
df_year_counts = pd.DataFrame(year_items, columns=["Year", "Number of tracks"])
# NOTE(review): this is not the last expression of the cell, so its result is
# presumably discarded; wrap it in print()/display() if the summary is wanted.
df_year_counts.describe()

# Horizontal bar chart of tracks per year, saved as an SVG file.
figure = df_year_counts.plot.barh(x="Year", y="Number of tracks", figsize=(20, 10)).get_figure()
figure.savefig("wiki_tracks_publish.svg", pad_inches=0.1, bbox_inches="tight")