In [379]:
import pandas as pd
from pprint import pprint

# Load only the four columns the rest of the notebook reads.
csv = pd.read_csv(
    "anime_cleaned.csv",
    usecols=["studio", "anime_id", "score", "genre"],
)
# Every distinct value found in the "studio" column.
studios = set(csv["studio"])
# Genres tracked by the rest of the notebook.
genres = {"Comedy", "Action", "Fantasy", "Adventure", "Drama"}
# Score buckets 9.0 down to 0.0 (whole numbers stored as floats).
score = {float(bucket) for bucket in range(9, -1, -1)}

In [380]:
# studio_dic maps each studio name to a dict holding:
#   "length" -> number of rows whose "studio" value contains that name
#   <genre>  -> list of anime_id values for rows matching both the studio and the genre
studio_dic = dict()

# A genre mask does not depend on the studio, so build each one once up front
# instead of once per (studio, genre) pair.
genre_masks = {
    genre: csv["genre"].str.contains(genre, regex=False, na=False)
    for genre in genres
}

for s in studios:
    # Literal substring match: names such as "J.C.Staff" contain regex
    # metacharacters, which the default regex=True would interpret as patterns.
    studio_mask = csv["studio"].str.contains(s, regex=False, na=False)

    studio_dic[s] = dict()
    studio_dic[s]["length"] = int(studio_mask.sum())
    for genre in genres:
        # One combined mask through .loc avoids chained boolean indexing
        # (and the reindexing warning it raises).
        studio_dic[s][genre] = list(csv.loc[genre_masks[genre] & studio_mask, "anime_id"])


  import sys


In [178]:
def meanScorePerGenre(animes):
    """Count how many of the given animes fall into each rounded-score bucket.

    Parameters
    ----------
    animes : iterable
        ``anime_id`` values to look up in the module-level ``csv`` frame.

    Returns
    -------
    dict
        One entry per value of the module-level ``score`` set, mapping that
        bucket to the number of animes whose rounded score equals it.
        Ids absent from ``csv``, rows with a missing score, and rounded
        scores that are not a bucket are left out of the counts. If an id
        appears on several rows, its first row is used.
    """
    scores = {bucket: 0 for bucket in score}

    # Build the id -> score lookup once rather than rescanning the whole
    # frame for every anime.
    score_by_id = csv.drop_duplicates("anime_id").set_index("anime_id")["score"]

    # reindex() yields NaN for unknown ids; dropna() discards those together
    # with genuinely missing scores, so no Series is ever coerced with int().
    rounded = score_by_id.reindex(list(animes)).round().dropna()

    for value in rounded:
        bucket = int(value)
        # An int key hashes like the equal float, so 7 updates the 7.0 entry.
        if bucket in scores:
            scores[bucket] += 1
    return scores

In [201]:
def initGenreDico():
    """Return a new dict with one zero-valued entry per item of ``genres``."""
    return dict.fromkeys(genres, 0)

def initNotesDico():
    """Return a new dict with one zero-valued entry per item of ``score``."""
    return {bucket: 0 for bucket in score}

In [332]:
def sortAlphab(x, y, xTo="", yTo=""):
    """Three-way comparison of ``(x, xTo)`` against ``(y, yTo)``.

    The primary keys ``x`` and ``y`` are compared first; ``xTo`` and ``yTo``
    only break ties. Comparing lexicographically keeps the result
    antisymmetric (swapping the two sides flips the sign), which a
    comparator passed to ``cmp_to_key`` needs for a well-defined order.

    Returns
    -------
    int
        -1 if the first pair sorts before the second, 1 if it sorts after,
        0 if both pairs are equal.
    """
    left = (x, xTo)
    right = (y, yTo)
    if left < right:
        return -1
    if left > right:
        return 1
    return 0
    
def checkIfSameComponent(x, y):
    """Order two records that belong to the same group, or return 10.

    Two records sharing a "studioPr", "genre" or "info" key are ordered by
    their "from" value. Two records that both have a "to" key and whose
    "id" both contain "-0" (or both contain "-1") are ordered by "from" and
    "to". In every other case the sentinel 10 is returned.
    """
    for marker in ("studioPr", "genre", "info"):
        if marker in x and marker in y:
            return sortAlphab(x["from"], y["from"])

    for suffix in ("-0", "-1"):
        # Same test order as a plain and-chain: x["id"] is only read once
        # "to" is known to be in x, and y is only inspected after that.
        if "to" in x and suffix in x["id"] and "to" in y and suffix in y["id"]:
            return sortAlphab(x["from"], y["from"], x["to"], y["to"])

    return 10

def sankeyDatasort(x, y):
    """Comparator (for ``cmp_to_key``) that orders the Sankey records.

    ``x`` is given precedence when any of the following holds:
      * it has a "studioPr" key;
      * it has a "genre" key and ``y`` has no "studioPr" key;
      * it has a "note" key and ``y`` has neither "studioPr" nor "genre";
      * it has a "to" key, its "id" contains "-0", and ``y`` has none of
        "studioPr", "genre" or "note".
    In those cases the result is the verdict of ``checkIfSameComponent``,
    or -1 when that verdict is the sentinel 10. Otherwise 1 is returned.
    """
    verdict = checkIfSameComponent(x, y)

    x_has_precedence = (
        "studioPr" in x
        or ("genre" in x and "studioPr" not in y)
        or ("note" in x and "studioPr" not in y and "genre" not in y)
        or (
            "to" in x
            and "-0" in x["id"]
            and "studioPr" not in y
            and "genre" not in y
            and "note" not in y
        )
    )
    if not x_has_precedence:
        return 1
    if verdict == 10:
        return -1
    return verdict

In [395]:
from functools import cmp_to_key

# Records of the Sankey dataset: node headers and links, sorted at the end.
dataAll = []

# The ten studios with the largest "length" value, biggest first.
topStudios = sorted(
    studio_dic.items(),
    key=lambda entry: entry[1]["length"],
    reverse=True,
)[:10]

# Running totals used for the genre and score header records.
countGenre = initGenreDico()
countNotes = initNotesDico()

for studio, rest in topStudios:
    # Header record for the studio itself.
    dataAll.append({
        "from": studio,
        "value": 0,
        "id": "none",
        "info": "Number of animes producted : " + str(rest["length"]),
        "studioPr": "",
    })
    for genre, animes in rest.items():
        if genre == "length":
            continue  # not a genre entry

        # Short id: the studio's first two letters are used as a separator
        # joining all of its letters, the last three characters of that string
        # are kept, and the genre's first two characters are appended.
        letters = ''.join(filter(str.isalpha, studio))
        currId = letters[:2].join(letters)[-3:] + genre[:2]

        nbAnimes = len(animes)
        countGenre[genre] += nbAnimes
        if nbAnimes:
            # studio -> genre link
            dataAll.append({"from": studio, "to": genre, "value": nbAnimes, "id": currId + "-0"})

        scores = meanScorePerGenre(animes)
        for note, mean in scores.items():
            countNotes[note] += mean
            if mean:
                # genre -> score link
                dataAll.append({"from": genre, "to": str(note), "value": mean, "id": currId + "-1"})

# Header records for every genre, then for every score bucket.
for g, count in countGenre.items():
    dataAll.append({
        "from": g,
        "value": 0,
        "id": "none",
        "info": "Number of animes : " + str(count),
        "genre": "",
    })
for n, count in countNotes.items():
    dataAll.append({
        "from": str(n),
        "value": 0,
        "id": "none",
        "info": "Number of animes : " + str(count),
        "note": "",
    })

# Two fixed links from "Fantasy" to the '0.0' and '1.0' scores.
# NOTE(review): presumably here so those score nodes always get a link; confirm.
dataAll.extend([
    {"from": "Fantasy", "to": '0.0', "value": 1, "id": "none"},
    {"from": "Fantasy", "to": '1.0', "value": 1, "id": "none"},
])

dataAll.sort(key=cmp_to_key(sankeyDatasort))

In [396]:
import json
import os

# Where the dataset is written. The original machine-specific location remains
# the default so existing runs behave the same; set the SANKEY_DATASET_PATH
# environment variable to export somewhere else without editing this cell.
outputPath = os.environ.get(
    "SANKEY_DATASET_PATH",
    'C:/Users/Joachim/Documents/Git/repository/com-480-project-worldwideweebz/app/public/data/sankey_dataset.json',
)

# Serialize the sorted records as a single JSON array.
with open(outputPath, 'w') as outfile:
    json.dump(dataAll, outfile)

In [377]:
# Pretty-print every record of dataAll for a visual check.
# NOTE(review): the saved output below shows no 'value' key although the header
# records built above include one, so it appears to predate the current code;
# re-run this cell to refresh it.
pprint(dataAll)

[{'from': 'A-1 Pictures',
  'id': 'none',
  'info': 'Number of animes producted : 174',
  'studioPr': ''},
 {'from': 'AIC',
  'id': 'none',
  'info': 'Number of animes producted : 161',
  'studioPr': ''},
 {'from': 'J.C.Staff',
  'id': 'none',
  'info': 'Number of animes producted : 251',
  'studioPr': ''},
 {'from': 'Madhouse',
  'id': 'none',
  'info': 'Number of animes producted : 277',
  'studioPr': ''},
 {'from': 'Production I.G',
  'id': 'none',
  'info': 'Number of animes producted : 222',
  'studioPr': ''},
 {'from': 'Studio Deen',
  'id': 'none',
  'info': 'Number of animes producted : 223',
  'studioPr': ''},
 {'from': 'Studio Pierrot',
  'id': 'none',
  'info': 'Number of animes producted : 248',
  'studioPr': ''},
 {'from': 'Sunrise',
  'id': 'none',
  'info': 'Number of animes producted : 291',
  'studioPr': ''},
 {'from': 'TMS Entertainment',
  'id': 'none',
  'info': 'Number of animes producted : 172',
  'studioPr': ''},
 {'from': 'Toei Animation',
  'id': 'none',
  'inf