In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

data = Path("../data")
plot_path = data / "plots"

map_path = data / "graph3_map"
geojson_path = map_path / "geojson"
stat_path = map_path / "stats"
country_stat_path = stat_path / "countries"
studio_stat_path = stat_path / "studios"

# Prepare data

In [2]:
import dask.dataframe as dd

## User-Country

In [3]:
user_country = pd.read_csv(map_path / "user_country.csv")
user_country = dd.from_pandas(user_country[["username", "country"]], npartitions=1).persist()
user_country

Unnamed: 0_level_0,username,country
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,object,object
133910,...,...


## Animes

We load the plain Animes database as we don't need studio information for this analysis.

In [4]:
animes = pd.read_csv(data / "AnimeList.csv")
# sort the animes by title (ascending) and write the anime_id and title to a csv file
animes = animes.sort_values(by=["title"], ascending=True)
animes[["anime_id", "title"]].to_csv(data / "anime_id_title.csv", index=False)

## User-Animes

In [5]:
user_animes = dd.read_csv(data /
    "UserAnimeList.csv",
    dtype={
        "username": "object",
        "anime_id": "int64",
        "my_watched_episodes": "int64",
        "my_start_date": "str",
        "my_finish_date": "str",
        "my_score": "int64",
        "my_status": "int64",
        "my_rewatching": "float64",
        "my_rewatching_ep": "int64",
        "my_last_updated": "int64",
        "my_tags": "object",
    },
    usecols=[
        "username",
        "anime_id",
        "my_watched_episodes",
        "my_start_date",
        "my_finish_date",
        "my_score",
        "my_status",
        "my_rewatching",
        "my_rewatching_ep",
        "my_last_updated",
        "my_tags",
    ],
).persist()

user_animes

Unnamed: 0_level_0,username,anime_id,my_watched_episodes,my_start_date,my_finish_date,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,my_tags
npartitions=78,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
,object,int64,int64,object,object,int64,int64,float64,int64,int64,object
,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...


## User-Country-Animes

In [6]:
user_animes["username"].nunique().compute(), user_country["username"].nunique().compute()

(283044, 133794)

In [7]:
# Merge user_animes with user_country
user_country_animes = user_country.merge(user_animes, on="username", how="inner").persist()
print(user_country_animes["username"].nunique().compute())
user_country_animes

130445


Unnamed: 0_level_0,username,country,anime_id,my_watched_episodes,my_start_date,my_finish_date,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,my_tags
npartitions=78,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
,object,object,int64,int64,object,object,int64,int64,float64,int64,int64,object
,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...


# Statistics

## Country/Animes

### Country-Most popular animes

In [8]:
# Group country_users_animes by country,country_aff, then for each group add columns "anime_id" and "num_users" where num_users is the number of users who have watched anime_id
country_top_animes = (
    user_country_animes.groupby(["country", "anime_id"])
    .agg({"username": "count"})
    .rename(columns={"username": "num_ratings"})
    .sort_values(["country", "num_ratings"], ascending=[True, False])
    .persist()
)
country_top_animes.to_csv(country_stat_path / "country_top_animes.csv")
country_top_animes

Unnamed: 0_level_0,num_ratings
npartitions=1,Unnamed: 1_level_1
,int64
,...


### Country-Most popular animes (3)

In [9]:
# For each country, keep only the first 3 anime_ids
country_top_animes_3 = (
    country_top_animes.reset_index()
    .groupby(["country"])
    .apply(lambda x: x.nlargest(3, "num_ratings"))
)
country_top_animes_3.to_csv(country_stat_path / "country_top_animes_3.csv")

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  .apply(lambda x: x.nlargest(3, "num_ratings"))


['/home/julien/Documents/project-2023-vizmoica/data/graph3_map/stats/countries/country_top_animes_3.csv/0.part']

## Country/Studios

### Prepare data

#### Anime-Studios

We load the cleaned dataset, which duplicates the rows of animes that have multiple studios

In [10]:
anime_studios = dd.read_csv(data / "AnimeList_clean.csv").persist()

#### User-Country-Animes-Studios

In [11]:
user_country_animes_studios = (
    user_country_animes[["username", "country", "anime_id"]]
    .merge(anime_studios[["anime_id", "studio"]], on="anime_id", how="inner")
    .persist()
)
print(user_country_animes_studios["username"].nunique().compute())
user_country_animes_studios

130417


Unnamed: 0_level_0,username,country,anime_id,studio
npartitions=78,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,object,object,int64,object
,...,...,...,...
...,...,...,...,...
,...,...,...,...
,...,...,...,...


### Statistics

#### Studio-Country-Number of ratings for that studio in that country

In [12]:
studio_country_num_ratings = dd.read_csv(studio_stat_path / "studio_country_num_ratings.csv/0.part").persist()
studio_country_num_ratings

Unnamed: 0_level_0,studio,country,num_ratings
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,object,object,int64
,...,...,...


#### Studio-Country-Most popular animes (3)

In [13]:
studio_country_top_animes_3 = (
    user_country_animes_studios.groupby(["studio", "country", "anime_id"])
    .agg({"username": "count"})
    .rename(columns={"username": "num_ratings"})
    .sort_values(["studio", "country", "num_ratings"], ascending=[True, True, False])
    .reset_index()
    .groupby(["studio", "country"])
    .apply(lambda x: x.nlargest(3, "num_ratings"))
    # .reset_index(drop=True)
    .persist()
)

studio_country_top_animes_3.to_csv(studio_stat_path / "studio_country_top_animes_3.csv")
studio_country_top_animes_3

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  .apply(lambda x: x.nlargest(3, "num_ratings"))


Unnamed: 0_level_0,studio,country,anime_id,num_ratings
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,object,object,int64,int64
,...,...,...,...


#### Country-Most popular studios

In [14]:
country_top_studios = (
    user_country_animes_studios.groupby(["country", "studio"])
    .agg({"username": "count"})
    .rename(columns={"username": "num_ratings"})
    .sort_values(["country", "num_ratings", "studio"], ascending=[True, False, True])
    .persist()
)
country_top_studios.to_csv(country_stat_path / "country_top_studios.csv")
country_top_studios

Unnamed: 0_level_0,num_ratings
npartitions=1,Unnamed: 1_level_1
,int64
,...


## Country/Reviews

In [15]:
user_country_animes[
    (user_country_animes["country"] == "France") & 
    (user_country_animes["anime_id"] == 1535) & 
    (user_country_animes["my_tags"].notnull())
].compute()

Unnamed: 0,username,country,anime_id,my_watched_episodes,my_start_date,my_finish_date,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,my_tags
88309,Elyasis,France,1535,20,0000-00-00,0000-00-00,7,4,,0,1445193740,"Mystery, Police, Psychological, Supernatural, ..."
442277,Sinope_K,France,1535,11,2015-11-01,0000-00-00,0,1,0.0,0,1459627501,(<2009)
468329,Myou-Myou,France,1535,37,2011-06-30,2011-07-03,8,2,0.0,0,1317745450,last arc is meh
502863,Mayuri-Nyan,France,1535,37,0000-00-00,0000-00-00,10,2,,0,1328812789,10
521806,Yamichan,France,1535,37,0000-00-00,0000-00-00,8,2,0.0,0,1437060882,"Psychological, Supernatural, Thriller"
...,...,...,...,...,...,...,...,...,...,...,...,...
404403,Levitacus,France,1535,37,0000-00-00,0000-00-00,7,2,0.0,0,1477449618,Mid 7
110998,EternalPhoenix,France,1535,37,0000-00-00,0000-00-00,7,2,0.0,0,1325506271,15.5/20
126872,KloWh,France,1535,37,0000-00-00,0000-00-00,7,2,,0,1379520124,Stop after the 1 st arc
285158,Anjolras,France,1535,37,0000-00-00,0000-00-00,10,2,0.0,0,1357126510,Thrilling story with suspens


In [16]:
user_country_animes[
    (user_country_animes["country"] == "Brazil") & 
    (user_country_animes["anime_id"] == 1535) & 
    (user_country_animes["my_tags"].notnull())
]["my_tags"].compute()

29867     Esse anime no começo não botei muita fé,mas qu...
46335     se não fosse essa parte final broxante, eu ter...
68371                                          L DEAD YEAH!
106872                                     shounen, mystery
282358    Até é bem inteligente, mas o anime perde a gra...
                                ...                        
287831    Mystery, Supernatural, Police, Psychological, ...
319803    Uma das primeiras obras que eu assisti, me apa...
448004    Muito bom! Premissa genial, com uma execução f...
208436                                           Perfeição.
420325                        Kira &gt; L. (não me julguem)
Name: my_tags, Length: 712, dtype: object

In [17]:
user_country_animes[
    (user_country_animes["country"] == "Iran") & 
    (user_country_animes["anime_id"] == 1535) & 
    (user_country_animes["my_tags"].notnull())
]["my_tags"].compute()

47853                        death note,death god,shinigami
219573    Mystery, Psychological thriller, Supernatural ...
501022                                           =|ED&PS|x1
84747                                            Death Note
267034                   legend of the galactic heroes lite
Name: my_tags, dtype: object