In [1]:
import pandas as pd 
import numpy as np
import collections 
import json

from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer

# Helper Functions 

In [2]:
def get_multiple_options(df, field):
    all_options = df[field].apply(eval).dropna().tolist()
    all_options_flat = [item for sublist in all_options for item in sublist]
    counter = collections.Counter(all_options_flat)
    counter_df = pd.DataFrame.from_dict(counter, orient='index').reset_index()
    return counter_df

# Data analysis 

In [3]:
df = pd.read_csv("mal_scrape_reduced.csv")
df = df.drop_duplicates(subset=['mal_id']).set_index("mal_id")

In [4]:
df.head()

Unnamed: 0_level_0,title,type,episodes,members,genres_clean,source,air_year,air_season,airing_start_clean
mal_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
6654,Namakura Gatana,Movie,1.0,5505,"['Comedy', 'Samurai']",Original,1917,spring,1917-06-29
10742,Saru to Kani no Gassen,Movie,1.0,604,['Drama'],Other,1917,spring,1917-05-19
23187,Chamebou Shin Gachou: Nomi Fuufu Shikaeshi no ...,Movie,1.0,276,['Comedy'],Original,1917,spring,1917-04-27
18457,Hanasaka Jijii,Movie,1.0,760,"['Comedy', 'Drama']",Other,1917,summer,1917-08-25
23183,Itazura Post,Movie,1.0,282,['Comedy'],Original,1917,summer,1917-07-27


In [5]:
df.describe()

Unnamed: 0,episodes,members,air_year
count,13172.0,13451.0,13451.0
mean,13.786213,40995.61,2006.572448
std,53.305836,128859.2,12.753138
min,1.0,0.0,1917.0
25%,1.0,655.5,2001.0
50%,3.0,3776.0,2010.0
75%,13.0,21206.5,2016.0
max,3057.0,2341800.0,2021.0


### Create datetime features

In [6]:
# create year + month column
df['airing_start_yearmonth'] = pd.to_datetime(df['airing_start_clean']).dt.to_period('M')
# create year column 
df['airing_start_year'] = pd.to_datetime(df['airing_start_clean']).dt.year
# create month column 
df['airing_start_month'] = pd.to_datetime(df['airing_start_clean']).dt.month

### Inspect the categorical columns

In [7]:
df['type'].value_counts()

TV         4590
OVA        3104
Movie      2151
Special    1933
ONA        1672
-             1
Name: type, dtype: int64

In [8]:
df['source'].value_counts()

Manga            3447
Original         3303
-                2663
Visual novel      957
Game              764
Light novel       697
Other             414
Novel             412
4-koma manga      269
Web manga         230
Picture book      117
Book               71
Card game          57
Music              27
Digital manga      14
Radio               9
Name: source, dtype: int64

### Clean the genres and count by genre 

In [9]:
def get_multiple_options(df, field):
    all_options = df[field].apply(eval).dropna().tolist()
    all_options_flat = [item for sublist in all_options for item in sublist]
    counter = collections.Counter(all_options_flat)
    counter_df = pd.DataFrame.from_dict(counter, orient='index').reset_index()
    counter_df.columns = ['genre', 'number']
    genres = counter_df.sort_values(by='number', ascending=False)
    return genres

In [10]:
genres = get_multiple_options(df, "genres_clean")

In [11]:
genres.head(10)

Unnamed: 0,genre,number
0,Comedy,5427
8,Action,3413
6,Fantasy,2761
9,Adventure,2578
16,Sci-Fi,2251
2,Drama,2094
18,Shounen,1812
12,Romance,1700
15,Slice of Life,1666
23,School,1515


### Count anime per season - avg, min and max
- What is the average number of new anime per month over time? 

In [18]:
def get_min_max_avg(df):
    # find average number of anime per month across all the years 
    # count of anime per month divided by total number of years in the dataset
    avg_anime_per_month = (df.groupby('airing_start_month').count() / len(df['airing_start_year'].unique()))[['title']]
    
    # find the min number of anime per month, i.e. the min value across all years 
    anime_per_month_year = df.groupby(['airing_start_year', 'airing_start_month']).count().reset_index()[['airing_start_year', 'airing_start_month', 'title']]
    maxes = {}
    for i in range(12):
        maxes[i+1] = anime_per_month_year[anime_per_month_year['airing_start_month']==(i+1)].nlargest(1, 'title')['title'].tolist()[0]
    # the min number of anime is actually 1 for all seasons 
    mins = {}
    for i in range(12):
        mins[i+1] = 1
        
    maxes_df = pd.DataFrame.from_dict(maxes, orient='index', columns=['max'])
    mins_df = pd.DataFrame.from_dict(mins, orient='index', columns=['min'])
    maxes_mins_df = maxes_df.join(mins_df)
    
    # create a df with the avg, min and max anime 
    avg_min_max = maxes_mins_df.join(avg_anime_per_month)
    avg_min_max.columns = ['max', 'min', 'avg']
    avg_min_max['month'] = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    
    return avg_min_max

In [19]:
avg_min_max = get_min_max_avg(df)

In [20]:
avg_min_max

Unnamed: 0,max,min,avg,month
1,95,1,15.116279,Jan
2,46,1,8.476744,Feb
3,68,1,15.093023,Mar
4,116,1,23.627907,Apr
5,48,1,7.290698,May
6,48,1,8.848837,Jun
7,108,1,20.023256,Jul
8,58,1,10.0,Aug
9,49,1,9.453488,Sep
10,116,1,21.093023,Oct


Exporting to JSON

In [21]:
avg_min_max.reset_index().to_dict(orient='records')

[{'index': 1, 'max': 95, 'min': 1, 'avg': 15.116279069767442, 'month': 'Jan'},
 {'index': 2, 'max': 46, 'min': 1, 'avg': 8.476744186046512, 'month': 'Feb'},
 {'index': 3, 'max': 68, 'min': 1, 'avg': 15.093023255813954, 'month': 'Mar'},
 {'index': 4, 'max': 116, 'min': 1, 'avg': 23.627906976744185, 'month': 'Apr'},
 {'index': 5, 'max': 48, 'min': 1, 'avg': 7.290697674418604, 'month': 'May'},
 {'index': 6, 'max': 48, 'min': 1, 'avg': 8.848837209302326, 'month': 'Jun'},
 {'index': 7, 'max': 108, 'min': 1, 'avg': 20.023255813953487, 'month': 'Jul'},
 {'index': 8, 'max': 58, 'min': 1, 'avg': 10.0, 'month': 'Aug'},
 {'index': 9, 'max': 49, 'min': 1, 'avg': 9.453488372093023, 'month': 'Sep'},
 {'index': 10,
  'max': 116,
  'min': 1,
  'avg': 21.093023255813954,
  'month': 'Oct'},
 {'index': 11, 'max': 46, 'min': 1, 'avg': 7.174418604651163, 'month': 'Nov'},
 {'index': 12, 'max': 67, 'min': 1, 'avg': 10.209302325581396, 'month': 'Dec'}]

### Genres prevalence over time 
- What proportion of genres of any given year/year season period were of a given genre? 

In [22]:
def get_prop_genres_per_year(year):
    prop_genres = {}
    for genre in genres['genre'].tolist():
        df_year = df[df['air_year']==year]
        prop_genre = len(df_year[df_year['genres_clean'].str.contains(genre)])/len(df_year)
        prop_genres[genre] = prop_genre
    prop_genres_year = pd.DataFrame.from_dict(prop_genres, orient='index')
    prop_genres_year.columns = [str(year)]
    return prop_genres_year

In [23]:
prop_genres_years = []
for year in df['air_year'].unique():
    prop_genres_years.append(get_prop_genres_per_year(year))
prop_genres_years_df = pd.concat(prop_genres_years, axis=1).T

In [24]:
# get number of genres per year, i.e. how many times has the genre appeared in anime for given year 
def get_number_genres_per_year(year):
    number_genres = {}
    for genre in genres['genre'].tolist():
        df_year = df[df['air_year']==year]
        number_genre = len(df_year[df_year['genres_clean'].str.contains(genre)])
        number_genres[genre] = number_genre
    number_genres_year = pd.DataFrame.from_dict(number_genres, orient='index')
    number_genres_year.columns = [str(year)]
    return number_genres_year

In [25]:
def get_number_genres_per_decade():
    number_genres_years = []
    for year in df['air_year'].unique():
        number_genres_years.append(get_number_genres_per_year(year))
    number_genres_years_df = pd.concat(number_genres_years, axis=1).T
    
    sum_genres_decades = {}
    for i in range(12):
        lower_range = str(i*10 + 1910)
        upper_range = str(i*10 + 1920 - 1)
        sum_genres_decade = number_genres_years_df.T.loc[:, lower_range:upper_range].sum(axis=1)
        sum_genres_decades[lower_range] = sum_genres_decade
    return pd.DataFrame(sum_genres_decades).T

In [26]:
genres_per_decade_df = get_number_genres_per_decade()

In [27]:
genres_per_decade_df

Unnamed: 0,Comedy,Action,Fantasy,Adventure,Sci-Fi,Drama,Shounen,Romance,Slice of Life,School,...,Samurai,Dementia,Thriller,Vampire,Cars,Josei,Shounen Ai,Shoujo Ai,Yaoi,Yuri
1910,8,0,1,0,0,7,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1920,2,2,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1930,5,5,3,7,0,1,0,1,0,0,...,3,0,0,0,0,0,0,0,0,0
1940,4,2,2,2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1950,2,0,3,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1960,33,39,20,57,32,15,23,1,10,5,...,4,1,0,0,1,0,0,0,0,0
1970,60,85,39,116,91,74,50,13,31,13,...,2,1,1,0,5,0,0,0,0,0
1980,287,263,149,282,275,191,155,95,66,46,...,6,1,1,4,5,0,4,0,0,2
1990,680,452,391,507,373,316,319,224,124,107,...,15,10,7,10,13,4,10,4,6,8
2000,1402,780,695,706,673,660,515,570,281,331,...,44,29,37,32,27,18,20,28,19,15
