In [35]:
# import data

import json

# Load both datasets through context managers so the file handles are closed
# as soon as parsing finishes (the bare json.load(open(...)) form leaves them
# open until garbage collection).
with open('rating_200k.json') as rating_file:
    rating = json.load(rating_file)

with open('anime.json') as anime_file:
    animedata = json.load(anime_file)

In [36]:
from collections import defaultdict

def get_genre_by_anime(anime_dataset):
    """Group anime ids by the genres they are tagged with.

    Args:
        anime_dataset: iterable of records; each record is indexed with
            'genre' (iterated to obtain genre names -- presumably a list)
            and 'anime_id'.

    Returns:
        defaultdict(list) mapping each genre to the ids of the anime that
        carry it, in the order the records were encountered.
    """
    anime_ids_per_genre = defaultdict(list)

    # Flatten the nested structure into (genre, anime_id) pairs first, then
    # bucket them; 'anime_id' is only looked up for records that actually
    # yield a genre.
    genre_id_pairs = (
        (genre_name, record['anime_id'])
        for record in anime_dataset
        for genre_name in record['genre']
    )
    for genre_name, anime_id in genre_id_pairs:
        anime_ids_per_genre[genre_name].append(anime_id)

    return anime_ids_per_genre

In [37]:
def get_anime_view_dict(rating_dataset):
    """Map each anime id to the users that appear with it in the ratings.

    Args:
        rating_dataset: iterable of rating records; each record is indexed
            with 'anime_id' and 'user_id'.

    Returns:
        defaultdict(list) mapping anime_id to the list of user_ids found for
        that anime, in dataset order (a user is repeated if it occurs in
        several records for the same anime).
    """
    anime_view_dict = defaultdict(list)     # list of users that watched certain anime

    # Iterate the argument, not the module-level `rating`, so the function
    # works on whatever dataset the caller passes in.
    for entry in rating_dataset:
        anime_view_dict[entry['anime_id']].append(entry['user_id'])

    return anime_view_dict

In [38]:
def get_genreCount(anime_dataset):
    """Count how many times each genre occurs across the dataset.

    Args:
        anime_dataset: iterable of records; each record is indexed with
            'genre', which is iterated to obtain genre names.

    Returns:
        defaultdict(int) mapping each genre to its number of occurrences.
    """
    genreCount = defaultdict(int)     # count of each genre

    # Iterate the argument rather than the module-level `animedata` so the
    # counts reflect the dataset that was actually passed in.
    for entry in anime_dataset:
        for genre in entry['genre']:
            genreCount[genre] += 1

    return genreCount

In [39]:
def get_typeCount(anime_dataset):
    """Count how many records carry each value of the 'type' field.

    Args:
        anime_dataset: iterable of records; each record is indexed with
            'type'.

    Returns:
        defaultdict(int) mapping each type value to its number of records.
    """
    typeCount = defaultdict(int)

    # Iterate the argument rather than the module-level `animedata` so the
    # counts reflect the dataset that was actually passed in.
    for entry in anime_dataset:
        typeCount[entry['type']] += 1

    return typeCount

In [40]:
# Romanized Japanese syllables that are skipped when counting title words.
# One consonant row per line.
japanese_stopwords = {
    "a", "i", "u", "e", "o",
    "ka", "ki", "ku", "ke", "ko",
    "ga", "gi", "gu", "ge", "go",
    "sa", "si", "su", "se", "so",
    "za", "zi", "zu", "ze", "zo",
    "ta", "ti", "tsu", "te", "to",
    "da", "di", "du", "de", "do",
    "na", "ni", "nu", "ne", "no",
    "ha", "hi", "hu", "he", "ho",
    "ba", "bi", "bu", "be", "bo",
    "ma", "mi", "mu", "me", "mo",
    "ya", "yu", "yo",
    "ra", "ri", "ru", "re", "ro",
    "wa", "wo", "n",
}

# Single-character digit tokens "1" through "9" ("0" is not in the set).
number = set("123456789")

In [41]:
# Most popular words occurring in anime titles

import string
from nltk.stem.porter import *
from nltk.corpus import stopwords
from collections import defaultdict

def most_popular_word():
    """Rank the words used in anime titles by how often they occur.

    Titles are read from ``entry['name']`` of every record in the
    module-level ``animedata``. Each title is lower-cased, stripped of the
    characters in ``string.punctuation`` and split on whitespace. Words found
    in NLTK's English stop-word list, in ``japanese_stopwords`` or in
    ``number`` are not counted.

    Returns:
        List of ``[count, word]`` pairs sorted in descending order (highest
        count first; equal counts are ordered by word, descending).
    """
    # Build the exclusion set once: calling stopwords.words('english') inside
    # the loop re-reads the corpus list and scans it linearly for every word.
    ignored_words = set(stopwords.words('english')).union(japanese_stopwords, number)

    popWordDict = defaultdict(int)

    for entry in animedata:
        title = ''.join(c for c in entry['name'].lower() if c not in string.punctuation)
        for w in title.split():
            if w not in ignored_words:
                popWordDict[w] += 1

    # Words are unique dict keys, so no two pairs compare equal and a single
    # descending sort yields the same order as sort() followed by reverse().
    return sorted(([count, w] for w, count in popWordDict.items()), reverse=True)

In [42]:
# Average number of members by genres
# Average ratings by genre

# Compute the average member count and the average rating for each genre

def get_average_members_by_genre(anime_dataset=None):
    """Average the 'members' value of the anime in each genre.

    Args:
        anime_dataset: optional iterable of records, each indexed with
            'members' and 'genre'. When omitted, the module-level
            ``animedata`` is used, which matches the original no-argument
            behaviour.

    Returns:
        defaultdict(int) mapping each genre to the sum of 'members' over its
        occurrences divided by the number of occurrences.
    """
    if anime_dataset is None:
        anime_dataset = animedata

    member_totals = defaultdict(int)
    genre_occurrences = defaultdict(int)

    # Accumulate totals and occurrence counts in one pass, so the numerator
    # and denominator always come from the same dataset.
    for entry in anime_dataset:
        members = entry['members']
        for genre in entry['genre']:
            member_totals[genre] += members
            genre_occurrences[genre] += 1

    average_member_by_genres = defaultdict(int)
    for genre, total in member_totals.items():
        # Every genre in member_totals was counted at least once, so the
        # divisor is never zero.
        average_member_by_genres[genre] = total / genre_occurrences[genre]

    return average_member_by_genres


In [43]:
def get_average_rating_by_genre(anime_dataset=None):
    """Average the 'rating' value of the anime in each genre.

    Args:
        anime_dataset: optional iterable of records, each indexed with
            'rating' and 'genre'. When omitted, the module-level
            ``animedata`` is used, which matches the original no-argument
            behaviour.

    Returns:
        defaultdict(int) mapping each genre to the sum of 'rating' over its
        occurrences divided by the number of occurrences.
    """
    if anime_dataset is None:
        anime_dataset = animedata

    rating_totals = defaultdict(int)
    genre_occurrences = defaultdict(int)

    # Accumulate totals and occurrence counts in one pass, so the numerator
    # and denominator always come from the same dataset.
    for entry in anime_dataset:
        # NOTE(review): assumes every 'rating' is numeric; a missing/None
        # rating would raise TypeError here -- confirm against the data.
        ratings = entry['rating']
        for genre in entry['genre']:
            rating_totals[genre] += ratings
            genre_occurrences[genre] += 1

    average_rating_by_genres = defaultdict(int)
    for genre, total in rating_totals.items():
        # Every genre in rating_totals was counted at least once, so the
        # divisor is never zero.
        average_rating_by_genres[genre] = total / genre_occurrences[genre]

    return average_rating_by_genres

In [44]:
def write_average_rating_by_genre():
    """Write the per-genre average ratings to 'average_rating_by_genres.csv'.

    The file is tab-separated: a "Genre\\tAverage Rating" header followed by
    one line per genre returned by get_average_rating_by_genre(). A genre
    whose name is the empty string is skipped.
    """
    # Compute first so a failure here does not leave a truncated output file.
    average_rating_by_genres = get_average_rating_by_genre()

    # The context manager closes the file even if a write raises.
    with open('average_rating_by_genres.csv', 'w') as arbg:
        arbg.write("Genre\tAverage Rating\n")
        for genre, average in average_rating_by_genres.items():
            if genre != '':
                arbg.write(genre + "\t" + str(average) + "\n")

In [45]:
def write_popular_word():
    """Write the title-word frequencies to 'popular_words.csv'.

    The file is tab-separated: a "word\\tcount" header followed by one line
    per [count, word] pair returned by most_popular_word(), in the order
    that function returns them.
    """
    # The helper is named most_popular_word (singular); the previous call to
    # most_popular_words() raised NameError. Compute before opening so a
    # failure does not leave a truncated output file.
    popWord = most_popular_word()

    # The context manager closes the file even if a write raises.
    with open('popular_words.csv', 'w') as pw:
        pw.write("word\tcount\n")
        for count, word in popWord:
            pw.write(str(word) + "\t" + str(count) + "\n")

In [46]:
def write_average_member_by_genre():
    """Write the per-genre average member counts to 'average_member_by_genres.csv'.

    The file is tab-separated: a header line followed by one line per genre
    returned by get_average_members_by_genre(). A genre whose name is the
    empty string is skipped.
    """
    # Compute first so a failure here does not leave a truncated output file.
    average_member_by_genres = get_average_members_by_genre()

    # The context manager closes the file even if a write raises.
    with open('average_member_by_genres.csv', 'w') as ambg:
        # The second column holds member averages; the header used to read
        # "Average Rating", which mislabelled the data.
        # NOTE(review): update any consumer that selects the column by name.
        ambg.write("Genre\tAverage Members\n")
        for genre, average in average_member_by_genres.items():
            if genre != '':
                ambg.write(genre + "\t" + str(average) + "\n")