In [1]:
import pandas as pd 
import numpy as np
import collections 

In [2]:
from matplotlib.pyplot import imshow
from PIL import Image
%matplotlib inline
import colorgram
import calmap

from bokeh.io import output_file, show, output_notebook
from bokeh.models import ColumnDataSource, RangeTool
from bokeh.palettes import Category20b, cividis, inferno
from bokeh.plotting import figure
from bokeh.transform import factor_cmap, cumsum
from bokeh.layouts import column
from math import pi

from sklearn.decomposition import LatentDirichletAllocation as LDA
from pyLDAvis import sklearn as sklearn_lda
import pickle 
import pyLDAvis
from sklearn.feature_extraction.text import CountVectorizer

# Helper Functions 

## Viz

### Bar Chart generic

In [3]:
def barchart(counted_data, counts, output_here=True,
             save_filename="plot", plot_height=350, plot_width=800,
             title="Title", bgcolor="#2b2b2b"):
    """Draw a vertical bokeh bar chart with one coloured bar per category.

    Parameters
    ----------
    counted_data : list of str
        Category labels; used as the x-range factors and for the colour map.
    counts : list of numbers
        Bar heights, in the same order as ``counted_data``.
    output_here : bool
        If True, render inside the notebook; otherwise write
        ``save_filename + ".html"``.
    save_filename : str
        File stem used when ``output_here`` is False.
    plot_height, plot_width : int
        Figure size passed straight to ``figure``.
    title : str
        Figure title.
    bgcolor : str
        Background fill colour of the plot area.
    """
    # Pick the render target before any figure is built.
    if not output_here:
        output_file(save_filename + ".html")
    else:
        output_notebook()

    bar_source = ColumnDataSource(data={"counted_data": counted_data, "counts": counts})
    chart = figure(
        x_range=counted_data,
        plot_height=plot_height,
        plot_width=plot_width,
        toolbar_location=None,
        title=title,
    )

    # One cividis shade per category.
    bar_fill = factor_cmap(
        "counted_data",
        palette=cividis(len(counted_data)),
        factors=counted_data,
    )
    chart.vbar(
        x="counted_data",
        top="counts",
        width=0.8,
        source=bar_source,
        line_color="white",
        fill_color=bar_fill,
    )

    # Cosmetics: no grid lines, bars start at zero, slanted category labels.
    chart.xgrid.grid_line_color = None
    chart.ygrid.grid_line_color = None
    chart.y_range.start = 0
    chart.xaxis.major_label_orientation = 1.2
    chart.background_fill_color = bgcolor

    show(chart)

In [4]:
# Smoke test of the helper with three dummy categories (rendered in the notebook by default).
barchart(['word1', 'word2', 'word3'], [10, 50, 100])

### Pie Chart generic

In [5]:
def piechart(df, df_items_col, df_values_col, title="Title"):
    """Show a bokeh pie chart with one wedge per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per slice. The frame is copied before the helper columns are
        added, so the caller's object is left unchanged.
    df_items_col : str
        Column holding the slice names (legend entries and hover text).
    df_values_col : str
        Column holding the slice sizes.
    title : str
        Figure title.

    Notes
    -----
    This function does not call ``output_notebook``/``output_file`` itself,
    so the figure goes to whatever bokeh output mode is already active.
    """
    # Work on a copy: adding 'angle'/'color' to the caller's frame would leak
    # plotting columns into later analysis of the same DataFrame.
    slices = df.copy()

    # Each slice's angle is its share of the total, scaled to a full circle.
    slices['angle'] = slices[df_values_col] / slices[df_values_col].sum() * 2*pi
    slices['color'] = cividis(len(slices))

    p = figure(plot_height=350, title=title, toolbar_location=None,
               tools="hover", tooltips="@{}: @{}".format(df_items_col, df_values_col), x_range=(-0.5, 1.0))
    # Wedges are laid end to end: each starts where the running angle total stood.
    p.wedge(x=0, y=1, radius=0.4,
            start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
            line_color="white", fill_color='color', legend_field=df_items_col, source=slices)

    # A pie chart has no meaningful axes or grid.
    p.axis.axis_label = None
    p.axis.visible = False
    p.grid.grid_line_color = None

    show(p)

In [6]:
# Demo input for the pie-chart helper: three dummy slices, one row each.
data = pd.DataFrame(
    {
        "items_col": ["item1", "item2", "item3"],
        "values_col": [10, 30, 50],
    }
)
piechart(data, "items_col", "values_col")

### Line graph with range 

In [7]:
def line_range(dates, data, start_date_number=0, end_date_number=30, label="Label", line_colour="maroon"):
    """Show a time-series line with a draggable range selector underneath.

    Parameters
    ----------
    dates : sequence of datetime-like
        X values, indexed by position to pick the initial window.
    data : sequence of numbers
        Y values, same length as ``dates``.
    start_date_number, end_date_number : int
        Positions in ``dates`` that bound the initially visible window.
        Positions past the end are clamped to the last element, so the
        default of 30 also works for shorter series.
    label : str
        Y-axis label of the main plot.
    line_colour : str
        Colour of the line in the main plot.
    """
    source = ColumnDataSource(data=dict(date=dates, data=data))

    # Clamp the window so a short series does not raise IndexError.
    # Negative positions are left alone and keep their from-the-end meaning.
    last_position = len(dates) - 1
    window_start = min(start_date_number, last_position)
    window_end = min(end_date_number, last_position)

    p = figure(plot_height=300, plot_width=1000, tools="xpan", toolbar_location=None,
               x_axis_type="datetime", x_axis_location="above",
               background_fill_color="#efefef", x_range=(dates[window_start], dates[window_end]))

    p.line('date', 'data', source=source, color=line_colour, line_width=2)
    p.yaxis.axis_label = label

    # Overview strip: same data and y-range, full x extent.
    select = figure(title="Drag slider",
                    plot_height=130, plot_width=1000, y_range=p.y_range,
                    x_axis_type="datetime", y_axis_type=None,
                    tools="", toolbar_location=None,
                    background_fill_color="#2b2b2b")

    # The range tool is bound to the main plot's x-range.
    range_tool = RangeTool(x_range=p.x_range)
    range_tool.overlay.fill_color = "white"
    range_tool.overlay.fill_alpha = 0.2

    select.line('date', 'data', source=source)
    select.ygrid.grid_line_color = None
    select.add_tools(range_tool)
    select.toolbar.active_multi = range_tool

    show(column(p, select))

In [8]:
# Demo input for the range plot: nine consecutive days with dummy values.
dates = np.arange('2000-03-01', '2000-03-10', dtype='datetime64[D]')
data = np.array([
    100, 300, 100,
    100, 300, 100,
    100, 300, 20,
])
# Initial window covers the first four days (positions 0 to 3).
line_range(dates, data, 0, 3, label="My Label", line_colour="deepskyblue")

  return new == old
  return new == old


## Other 

In [9]:
def get_multiple_options(df, field):
    """Count how often each option occurs in a column of stringified lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``field``.
    field : str
        Column whose cells are strings that evaluate to an iterable of
        options, e.g. ``"['Comedy', 'Action']"``. Missing cells are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per distinct option, with the option in column ``'index'``
        and its count in column ``0``.
    """
    # Drop missing cells before parsing: eval() only accepts str/bytes/code,
    # so a NaN cell would raise TypeError before it could be dropped.
    raw_cells = df[field].dropna()

    # NOTE(security): eval() executes whatever expression the cell contains.
    # Only run this on data you trust; if every cell is a plain literal,
    # ast.literal_eval is the safer choice.
    all_options = raw_cells.apply(eval).dropna().tolist()

    all_options_flat = [item for sublist in all_options for item in sublist]
    counter = collections.Counter(all_options_flat)
    counter_df = pd.DataFrame.from_dict(counter, orient='index').reset_index()
    return counter_df

# Data analysis 

In [10]:
# Load the scrape, keep one row per mal_id and use it as the index.
df = (
    pd.read_csv("mal_scrape.csv", index_col=0)
    .drop_duplicates(subset=['mal_id'])
    .set_index("mal_id")
)

# Derive year-month period, year and month columns from the airing start date.
df = df.assign(
    airing_start_yearmonth=lambda frame: pd.to_datetime(frame['airing_start']).dt.to_period('M'),
    airing_start_year=lambda frame: pd.to_datetime(frame['airing_start']).dt.year,
    airing_start_month=lambda frame: pd.to_datetime(frame['airing_start']).dt.month,
)



In [11]:
# Row count of df, i.e. after de-duplicating on mal_id in the load cell.
print("Number of entries: {}".format(len(df)))

Number of entries: 13451


What types of anime are there? 

In [12]:
# Number of entries per value of the `type` column, most frequent first.
df['type'].value_counts()

TV         4590
OVA        3104
Movie      2151
Special    1933
ONA        1672
-             1
Name: type, dtype: int64

What are the sources of the anime? 

In [13]:
# Number of entries per value of the `source` column, most frequent first.
df['source'].value_counts()

Manga            3447
Original         3303
-                2663
Visual novel      957
Game              764
Light novel       697
Other             414
Novel             412
4-koma manga      269
Web manga         230
Picture book      117
Book               71
Card game          57
Music              27
Digital manga      14
Radio               9
Name: source, dtype: int64

What are all the genres? 

In [14]:
# Count every genre across all entries (a title with several genres counts once per genre).
genres = get_multiple_options(df, "genres_clean")
# The helper returns the option and its count as two unnamed columns; name them.
genres.columns = ['genre', 'number']
# Most common genre first.
genres = genres.sort_values(by='number', ascending=False)

In [15]:
# The five most frequent genres.
genres.head()

Unnamed: 0,genre,number
0,Comedy,5427
8,Action,3413
6,Fantasy,2761
9,Adventure,2578
16,Sci-Fi,2251


## Count anime per season - avg, min and max
- What is the average number of new anime per month over time? 

In [16]:
def get_min_max_avg(df):
    """Summarise how many anime start airing in each calendar month.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``airing_start_year``, ``airing_start_month``
        (1-12) and ``url``. Rows are counted through non-null ``url`` values.

    Returns
    -------
    pandas.DataFrame
        Indexed 1-12 with the columns:

        * ``max``   - highest count seen for that month in any single year
        * ``min``   - lowest count seen for that month among the years in
          which the month occurs at all (a year without the month is not
          counted as zero)
        * ``avg``   - total count for that month divided by the number of
          distinct years in the data
        * ``month`` - three-letter month name

        A month with no rows at all gets 0 for ``max``, ``min`` and ``avg``.
    """
    month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    month_numbers = list(range(1, 13))

    # nunique() ignores NaN; unique() would count a missing year as one more
    # year and deflate every average.
    n_years = df['airing_start_year'].nunique()

    # Total number of anime per calendar month over the whole dataset.
    totals = df.groupby('airing_start_month')['url'].count()
    # int() normalises month keys: they are floats when the column holds NaN.
    total_by_month = {int(month): count for month, count in totals.items()}

    # Anime per (year, month), then the extremes of each month across years.
    per_year_month = df.groupby(['airing_start_year', 'airing_start_month'])['url'].count()
    by_month = per_year_month.groupby(level='airing_start_month')
    max_by_month = {int(month): int(count) for month, count in by_month.max().items()}
    min_by_month = {int(month): int(count) for month, count in by_month.min().items()}

    avg_min_max = pd.DataFrame(
        {
            'max': [max_by_month.get(month, 0) for month in month_numbers],
            'min': [min_by_month.get(month, 0) for month in month_numbers],
            'avg': [
                total_by_month.get(month, 0) / n_years if n_years else 0.0
                for month in month_numbers
            ],
            'month': month_labels,
        },
        index=month_numbers,
    )

    return avg_min_max

In [17]:
# Per-month max / min / average number of anime, computed over the whole dataset.
avg_min_max = get_min_max_avg(df)

In [18]:
# Display the monthly summary table (index is the month number).
avg_min_max

Unnamed: 0,max,min,avg,month
1,95,1,15.116279,Jan
2,46,1,8.476744,Feb
3,68,1,15.093023,Mar
4,116,1,23.627907,Apr
5,48,1,7.290698,May
6,48,1,8.848837,Jun
7,108,1,20.023256,Jul
8,58,1,10.0,Aug
9,49,1,9.453488,Sep
10,116,1,21.093023,Oct


In [19]:
# Export one dict per month; reset_index() keeps the month number under the 'index' key.
# Valid `orient` values: 'dict', 'list', 'series', 'split', 'records', 'index'.
avg_min_max.reset_index().to_dict(orient='records')

[{'index': 1, 'max': 95, 'min': 1, 'avg': 15.116279069767442, 'month': 'Jan'},
 {'index': 2, 'max': 46, 'min': 1, 'avg': 8.476744186046512, 'month': 'Feb'},
 {'index': 3, 'max': 68, 'min': 1, 'avg': 15.093023255813954, 'month': 'Mar'},
 {'index': 4, 'max': 116, 'min': 1, 'avg': 23.627906976744185, 'month': 'Apr'},
 {'index': 5, 'max': 48, 'min': 1, 'avg': 7.290697674418604, 'month': 'May'},
 {'index': 6, 'max': 48, 'min': 1, 'avg': 8.848837209302326, 'month': 'Jun'},
 {'index': 7, 'max': 108, 'min': 1, 'avg': 20.023255813953487, 'month': 'Jul'},
 {'index': 8, 'max': 58, 'min': 1, 'avg': 10.0, 'month': 'Aug'},
 {'index': 9, 'max': 49, 'min': 1, 'avg': 9.453488372093023, 'month': 'Sep'},
 {'index': 10,
  'max': 116,
  'min': 1,
  'avg': 21.093023255813954,
  'month': 'Oct'},
 {'index': 11, 'max': 46, 'min': 1, 'avg': 7.174418604651163, 'month': 'Nov'},
 {'index': 12, 'max': 67, 'min': 1, 'avg': 10.209302325581396, 'month': 'Dec'}]

## Genres prevalence over time 
- What proportion of the anime from a given year (or season) belongs to each genre?

In [20]:
def get_prop_genres_per_year(year):
    """Return, for one year, the share of that year's anime tagged with each genre.

    Reads the notebook-level ``df`` and ``genres`` frames.

    Parameters
    ----------
    year
        Value compared against ``df['air_year']``.

    Returns
    -------
    pandas.DataFrame
        Indexed by genre, with a single column named ``str(year)`` holding
        the proportion (0-1). If no rows match the year, every proportion
        is NaN rather than raising ZeroDivisionError.

    Notes
    -----
    NOTE(review): ``air_year`` is not created in the visible cells (only
    ``airing_start_year`` is); presumably it comes from the CSV - confirm.
    NOTE(review): matching is by substring, so a genre whose name is
    contained in another genre's name is counted for both.
    """
    genre_names = genres['genre'].tolist()

    # The year filter does not depend on the genre, so build it once.
    df_year = df[df['air_year'] == year]
    n_anime = len(df_year)

    if n_anime == 0:
        # E.g. a NaN year: nothing compares equal to it, so the share is undefined.
        prop_genres = {genre: float('nan') for genre in genre_names}
    else:
        prop_genres = {}
        for genre in genre_names:
            # regex=False: genre names are literal text, not patterns.
            # na=False: a missing genre string counts as "does not contain".
            has_genre = df_year['genres_clean'].str.contains(genre, regex=False, na=False)
            prop_genres[genre] = has_genre.sum() / n_anime

    prop_genres_year = pd.DataFrame.from_dict(prop_genres, orient='index')
    prop_genres_year.columns = [str(year)]
    return prop_genres_year

In [None]:
# One single-column frame per year, glued side by side and transposed so that
# rows are years and columns are genres.
# dropna(): unique() would also yield NaN, for which no rows match and the
# proportion is undefined. sorted(): keeps the rows in chronological order
# for the time-series plot.
prop_genres_years = [
    get_prop_genres_per_year(year)
    for year in sorted(df['air_year'].dropna().unique())
]
prop_genres_years_df = pd.concat(prop_genres_years, axis=1).T

In [None]:
def get_number_genres_per_year(year):
    """Return, for one year, how many of that year's anime carry each genre.

    Reads the notebook-level ``df`` and ``genres`` frames.

    Parameters
    ----------
    year
        Value compared against ``df['air_year']``.

    Returns
    -------
    pandas.DataFrame
        Indexed by genre, with a single column named ``str(year)`` holding
        the count. A year with no matching rows yields zeros.

    Notes
    -----
    NOTE(review): ``air_year`` is not created in the visible cells (only
    ``airing_start_year`` is); presumably it comes from the CSV - confirm.
    NOTE(review): matching is by substring, so a genre whose name is
    contained in another genre's name is counted for both.
    """
    # The year filter does not depend on the genre, so build it once.
    df_year = df[df['air_year'] == year]

    number_genres = {}
    for genre in genres['genre'].tolist():
        if len(df_year) == 0:
            number_genres[genre] = 0
            continue
        # regex=False: genre names are literal text, not patterns.
        # na=False: a missing genre string counts as "does not contain".
        has_genre = df_year['genres_clean'].str.contains(genre, regex=False, na=False)
        number_genres[genre] = int(has_genre.sum())

    number_genres_year = pd.DataFrame.from_dict(number_genres, orient='index')
    number_genres_year.columns = [str(year)]
    return number_genres_year

In [None]:
def get_number_genres_per_decade(first_decade=1910, n_decades=12):
    """Sum the per-year genre counts into decades.

    Reads the notebook-level ``df`` and calls ``get_number_genres_per_year``
    for every distinct non-missing ``air_year``.

    Parameters
    ----------
    first_decade : int
        First year of the earliest decade to report (default 1910).
    n_decades : int
        Number of consecutive decades to report (default 12, i.e. up to
        and including the decade starting in 2020).

    Returns
    -------
    pandas.DataFrame
        One row per decade, labelled by its first year as a string
        (``'1910'``, ``'1920'``, ...), one column per genre. Decades with
        no data contain zeros.
    """
    years = df['air_year'].dropna().unique()

    # Rows are genres, columns are years labelled str(year).
    per_year = pd.concat(
        [get_number_genres_per_year(year) for year in years], axis=1
    )

    # Pick decade members numerically, not by slicing string labels:
    # label slices need sorted or present bounds and break on labels such as
    # '1998.0' (what str() gives for a float year).
    year_numbers = np.array([float(label) for label in per_year.columns])

    sum_genres_decades = {}
    for i in range(n_decades):
        lower = first_decade + i * 10
        in_decade = (year_numbers >= lower) & (year_numbers < lower + 10)
        sum_genres_decades[str(lower)] = per_year.loc[:, in_decade].sum(axis=1)
    return pd.DataFrame(sum_genres_decades).T

In [None]:
# Display the year-by-genre proportion table (rows: years, columns: genres).
prop_genres_years_df

In [None]:
# Genre counts summed per decade (rows: decades, columns: genres).
genres_per_decade_df = get_number_genres_per_decade()

In [None]:
# Display the decade-by-genre count table.
genres_per_decade_df

In [None]:
# TODO(review): this cell only displays the table again; the plot of the
# proportions is presumably meant to go here - confirm.
prop_genres_years_df

In [None]:
# Plot the yearly share of anime tagged 'School'.
# The index holds the years as strings; sort both axes the same way so the
# line runs chronologically whatever order the table was built in.
dates = np.array(sorted(prop_genres_years_df.index.tolist()), dtype=np.datetime64)
data = np.array(prop_genres_years_df['School'].sort_index().tolist())
# Initial window: the first four years; the axis label names the plotted series
# instead of the placeholder text from the demo cell.
line_range(dates, data, 0, 3, label="Proportion of anime tagged 'School'", line_colour="deepskyblue")