In [None]:
import calendar
import hashlib
import json
import os
from functools import lru_cache
from pathlib import Path
from time import sleep

import en_core_web_sm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import spacy
import squarify
from matplotlib.collections import PatchCollection
from matplotlib.offsetbox import (TextArea, DrawingArea, OffsetImage,
                                  AnnotationBbox)
from matplotlib.patches import Rectangle
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from spacy.matcher import Matcher
from wordcloud import WordCloud, ImageColorGenerator

from data import read_history
# spaCy pipeline loaded once and reused by the entity-extraction code below.
nlp = en_core_web_sm.load()
# Directory that holds one cached JSON response per queried artist.
kg_caches = Path("kgcaches")
# Bare expression: shows the directory name as the cell output.
kg_caches.name

In [None]:
# Create the cache directory with pathlib instead of a shell call: it is
# portable and creates exactly the path the cache reads from and writes to
# (not just its last component relative to the working directory).
kg_caches.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the listening history via the project's `read_history` helper.
streaming_history = read_history()
print(len(streaming_history))
# Seeded preview so the displayed rows are the same on every re-run; the
# sample size is capped so that short histories do not raise.
streaming_history.sample(min(10, len(streaming_history)), random_state=10)

In [None]:
# Number of distinct artist names (missing values count as one, like `unique()`).
streaming_history["artistName"].nunique(dropna=False)

## The Google KnowledgeGraph API

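Remember to activate your API key and expose it to the notebook as the `GOOGLE_KG_API_KEY` environment variable; `query_kg` below reads the key from there.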

In [None]:
def query_kg(artist_name):
    """Search the Google Knowledge Graph for ``artist_name``, with an on-disk cache.

    Each response is stored as a JSON file inside ``kg_caches``; the file name
    is the MD5 hex digest of the UTF-8 encoded artist name. A cached answer is
    returned without touching the network.

    Parameters
    ----------
    artist_name : str
        Query string sent to the entity search endpoint.

    Returns
    -------
    dict
        The decoded JSON document (from the cache when available).

    Raises
    ------
    KeyError
        If the answer is not cached and ``GOOGLE_KG_API_KEY`` is not set.
    requests.HTTPError
        If the service answers with an error status.
    """
    # MD5 only derives a filesystem-safe cache key here; it is not used for security.
    hash_object = hashlib.md5(artist_name.encode('utf-8'))
    file = kg_caches / f"{hash_object.hexdigest()}.json"
    if file.exists():
        with open(file, encoding="utf-8") as readable:
            return json.load(readable)

    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    params = {
        'query': artist_name,
        'limit': 30,
        'indent': True,
        'key': os.environ["GOOGLE_KG_API_KEY"],
    }
    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(service_url, params=params, timeout=30)
    response.raise_for_status()
    document = response.json()

    # Write to a temporary sibling and rename it into place, so an interrupted
    # run can never leave a truncated file that later fails to parse.
    kg_caches.mkdir(parents=True, exist_ok=True)
    partial = file.with_name(file.name + ".tmp")
    with open(partial, "w", encoding="utf-8") as writable:
        json.dump(document, writable, indent=4)
    partial.replace(file)

    # Small pause after every real request to stay gentle on the API.
    sleep(0.5)
    return document

In [None]:
# Fetch (or load from cache) the Knowledge Graph document for every distinct artist.
unique_artists = streaming_history["artistName"].unique()
artists_info = {}
for i, artist in enumerate(unique_artists):
    if not i % 300:
        # Lightweight progress indicator: one line every 300 artists.
        print(i, artist)
    artists_info[artist] = query_kg(artist)

In [None]:
import collections.abc

def flatten(d, parent_key='', sep='.'):
    """Flatten a nested mapping into a single-level dict with joined keys.

    Parameters
    ----------
    d : mapping
        Possibly nested mapping to flatten.
    parent_key : str
        Prefix prepended (followed by ``sep``) to every key of ``d``.
    sep : str
        Separator placed between the key parts of nested levels.

    Returns
    -------
    dict
        Mapping from joined key paths (e.g. ``"a.b.c"``) to leaf values.
        Values that are not mutable mappings (lists included) are kept as-is;
        empty nested mappings contribute no entries.
    """
    flat = {}
    for k, v in d.items():
        # f-string rather than ``+`` so non-string nested keys do not raise.
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        # The ABC lives in ``collections.abc``; the ``collections.MutableMapping``
        # alias was removed in Python 3.10.
        if isinstance(v, collections.abc.MutableMapping):
            flat.update(flatten(v, new_key, sep=sep))
        else:
            flat[new_key] = v
    return flat
        
# Keep only search hits typed as a music group or a person, remembering which
# artist query produced each kept hit (the two lists stay index-aligned).
allowed_types = {"MusicGroup", "Person"}
artistNames, results = [], []
for artist, info in artists_info.items():
    item_list = info["itemListElement"]
    for item in item_list:
        if "result" not in item:
            continue
        result = item["result"]
        types = set(result.get("@type", []))
        if allowed_types.isdisjoint(types):
            continue
        results.append(flatten(result))
        artistNames.append(artist)

In [None]:
# One row per kept search hit, tagged with the artist that was queried;
# hits without an article body are dropped because the text is needed later.
details = (
    pd.DataFrame(results)
    .assign(artistName=artistNames)
    .dropna(subset=["detailedDescription.articleBody"])
    .reset_index(drop=True)
)
print(len(details))
details.sample(5, random_state=10)

In [None]:
# Entity labels accepted by `extract_nationality`.
labels = {"GPE", "NORP", "LANGUAGE", "LOC"}
# Known nationality words, one per line, normalised to lower case.
with open("nationalities.csv") as readable:
    nationalities = {line.strip().lower() for line in readable}

def extract_nationality(description):
    """Return the first nationality mentioned in ``description``, or ``None``.

    The text is run through the ``nlp`` pipeline; the first entity whose label
    is in ``labels`` and whose lower-cased text is in ``nationalities`` wins,
    and its original text is returned.
    """
    # This one could obviously be improved!
    candidates = (
        entity.text
        for entity in nlp(description).ents
        if entity.label_ in labels and entity.text.lower() in nationalities
    )
    return next(candidates, None)

In [None]:
# For every artist keep the first nationality found across its descriptions;
# artists with no match in any description end up mapped to None.
artist_nationality = {}
for artistName, article_body in zip(
    details["artistName"], details["detailedDescription.articleBody"]
):
    if not artist_nationality.get(artistName):
        artist_nationality[artistName] = extract_nationality(article_body)

In [None]:
# Spot check on one artist; `.get` keeps the cell from raising when this
# artist does not appear in the loaded listening history.
artist_nationality.get("Red Hot Chili Peppers")

## Add nationality

In [None]:
# Attach the extracted nationality to every play (None when the artist is unknown).
streaming_history["nationality"] = streaming_history["artistName"].map(artist_nationality.get)

In [None]:
# Count each artist once (first occurrence), then count rows per nationality.
contry_counts = (
    streaming_history
    .drop_duplicates(subset="artistName")
    .groupby("nationality")
    .count()
)

In [None]:
# The ten nationalities with the highest counts, in ascending order.
top_X = contry_counts.sort_values("msPlayed").iloc[-10:]
# Displayed largest first.
top_X.sort_values("artistName", ascending=False)

In [None]:
# For each top nationality, concatenate the artist name of every play into one
# text blob (frequently played artists therefore appear many times).
# A boolean mask is used instead of a string-built `query`, which would break
# on nationality values that contain a quote character.
nationality_artist_text = {
    nationality: " ".join(
        streaming_history.loc[streaming_history["nationality"] == nationality, "artistName"]
    )
    for nationality in top_X["artistName"].index
}

# Preview; `.get` avoids a KeyError when "British" is not among the top nationalities.
nationality_artist_text.get("British", "")[:100]

In [None]:
# Colormap name used to draw each nationality's word cloud.
nationalities_colors = dict(
    Japanese="Reds",
    English="Oranges",
    American="Blues",
    Mexican="Greens",
    British="RdPu",
    Australian="PuBu",
    Canadian="Purples",
    German="GnBu",
    French="OrRd",
    Spanish="YlOrBr",
)

In [None]:
# Canvas size in pixels for the whole treemap.
width, height = 2000, 2000
# (Use a smaller canvas such as 500 x 500 for a quick preview.)

# squarify expects its sizes sorted from largest to smallest; `top_X` is in
# ascending order, so sort explicitly and reuse the same ordering for the labels.
treemap_sizes = top_X["artistName"].sort_values(ascending=False)
values = squarify.normalize_sizes(treemap_sizes, width, height)
rects = squarify.squarify(values, 0., 0., width, height)

# One word cloud per nationality, rendered at the size of its treemap rectangle.
word_clouds = [
    WordCloud(
        width=int(rect["dx"]),
        height=int(rect["dy"]),
        max_font_size=50,
        max_words=600,
        repeat=True,
        # Fall back to a default colormap for nationalities without a palette entry
        # instead of raising KeyError.
        colormap=nationalities_colors.get(nationality, "viridis"),
        background_color="rgba(0, 0, 0, 0)"
    ).generate(nationality_artist_text[nationality]).to_array()
    for nationality, rect in zip(treemap_sizes.index, rects)
]

In [None]:
# Margin, in pixels, left empty around the treemap.
offset = 50
canvas_shape = (height + 2 * offset, width + 2 * offset, 3)
full_image = np.zeros(canvas_shape)

# Paste every word cloud into its rectangle, shifted by the margin.
for wc, rect in zip(word_clouds, rects):
    x = int(rect["x"]) + offset
    y = int(rect["y"]) + offset
    dx = int(rect["dx"])
    dy = int(rect["dy"])
    full_image[y:y + dy, x:x + dx] = wc

# Render the composed image (scaled to the 0-1 range) and save it to disk.
fig, ax = plt.subplots(figsize=(50, 50))
ax.imshow(full_image / 255)
ax.axis("off")
fig.tight_layout()
fig.savefig("words.png", dpi=300)