In [46]:
import pandas as pd 
import numpy as np
import collections
import re

import matplotlib.pyplot as plt
%matplotlib inline

from bokeh.io import output_file, show, output_notebook
from bokeh.models import ColumnDataSource, RangeTool
from bokeh.palettes import Category20b, cividis, inferno
from bokeh.plotting import figure
from bokeh.transform import factor_cmap, cumsum
from bokeh.layouts import column
from math import pi

## Helper Functions

In [None]:
def get_multiple_choice_counts_df(df, fields):
    multiple_choice = []
    for field in fields: 
        #pd.DataFrame(df[field].value_counts()).index.tolist()
        multiple_choice.extend(df[field].dropna().tolist())
    multiple_choice_list = []
    mchoice = [item.replace('。', '').strip() for item in multiple_choice]
    for item in mchoice: 
        multiple_choice_list.extend(item.split(', '))
    unique_values = list(set(multiple_choice_list)) 
    
    counter=collections.Counter(multiple_choice_list)
    mchoice_df = pd.DataFrame.from_dict(counter, orient='index').reset_index()
    mchoice_df.columns = ['field', 'freq']
    return mchoice_df.sort_values(by='freq', ascending=False)

### Graphs 

In [None]:
def line_range(dates, data, start_date_number=0, end_date_number=30, label="Label", line_colour="maroon"):
    source = ColumnDataSource(data=dict(date=dates, data=data))
    
    p = figure(plot_height=300, plot_width=1000, tools="xpan", toolbar_location=None,
           x_axis_type="datetime", x_axis_location="above",
           background_fill_color="#efefef", x_range=(dates[start_date_number], dates[end_date_number]))
    
    p.line('date', 'data', source=source, color=line_colour, line_width=2)
    p.yaxis.axis_label = label
    
    select = figure(title="Drag slider",
                plot_height=130, plot_width=1000, y_range=p.y_range,
                x_axis_type="datetime", y_axis_type=None,
                tools="", toolbar_location=None, 
                background_fill_color= "#2b2b2b")
                #background_fill_color="#efefef")
    
    range_tool = RangeTool(x_range=p.x_range)
    range_tool.overlay.fill_color = "white"
    range_tool.overlay.fill_alpha = 0.2
    
    select.line('date', 'data', source=source)
    select.ygrid.grid_line_color = None
    select.add_tools(range_tool)
    select.toolbar.active_multi = range_tool
    
    show(column(p, select))

In [2]:
data1 = pd.read_csv("anime.csv")

In [3]:
data1.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
data2 = pd.read_csv("dataanime.csv")

In [5]:
data2.head()

Unnamed: 0,Title,Type,Episodes,Status,Start airing,End airing,Starting season,Broadcast time,Producers,Licensors,Studios,Sources,Genres,Duration,Rating,Score,Scored by,Members,Favorites,Description
0,Fullmetal Alchemist: Brotherhood,TV,64,Finished Airing,2009-4-5,2010-7-4,Spring,Sundays at 17:00 (JST),"Aniplex,Square Enix,Mainichi Broadcasting Syst...","Funimation,Aniplex of America",Bones,Manga,"Action,Military,Adventure,Comedy,Drama,Magic,F...",24 min. per ep.,R,9.25,719706,1176368,105387,"""In order for something to be obtained, someth..."
1,Kimi no Na wa.,Movie,1,Finished Airing,2016-8-26,-,-,-,"Kadokawa Shoten,Toho,Sound Team Don Juan,Lawso...","Funimation,NYAV Post",CoMix Wave Films,Original,"Supernatural,Drama,Romance,School",1 hr. 46 min.,PG-13,9.19,454969,705186,33936,"Mitsuha Miyamizu, a high school girl, yearns t..."
2,Gintama°,TV,51,Finished Airing,2015-4-8,2016-3-30,Spring,Wednesdays at 18:00 (JST),"TV Tokyo,Aniplex,Dentsu","Funimation,Crunchyroll",Bandai Namco Pictures,Manga,"Action,Comedy,Historical,Parody,Samurai,Sci-Fi...",24 min. per ep.,R,9.16,70279,194359,5597,"Gintoki, Shinpachi, and Kagura return as the f..."
3,Steins;Gate 0,TV,23,Currently Airing,2018-4-12,-,Spring,Thursdays at 01:35 (JST),Nitroplus,Funimation,White Fox,Visual novel,"Sci-Fi,Thriller",23 min. per ep.,PG-13,9.16,12609,186331,1117,The dark untold story of Steins;Gate that lead...
4,Steins;Gate,TV,24,Finished Airing,2011-4-6,2011-9-14,Spring,Wednesdays at 02:05 (JST),"Frontier Works,Media Factory,Movic,AT-X,Kadoka...",Funimation,White Fox,Visual novel,"Sci-Fi,Thriller",24 min. per ep.,PG-13,9.14,552791,990419,90365,The self-proclaimed mad scientist Rintarou Oka...


In [29]:
len(data2)

1563

In [21]:
# convert to datetimes for start and end 
data2['start_date'] = pd.to_datetime(data2['Start airing'],errors='coerce')
data2['end_date'] = pd.to_datetime(data2['End airing'],errors='coerce')

In [22]:
# remove the duplicates ans join 
data1 = data1.drop_duplicates(subset=["name"]).set_index("name")
data2 = data2.drop_duplicates(subset=["Title"]).set_index("Title")

In [26]:
# join the two dataframes
df = data1.join(data2, how="inner")

In [36]:
# add the year 
df['year'] = df['start_date'].dt.year
# keep only the anime that you have the data for 
df = df.dropna(subset=['year'])
df['year'] = df["year"].astype(int)

### Genres 

In [49]:
genres = get_multiple_choice_counts_df(df, ["genre"])

In [55]:
genres.head(5)

Unnamed: 0,field,freq
10,Comedy,636
4,Action,462
0,Drama,432
9,Shounen,380
5,Adventure,319


In [63]:
df_genres = []
for genre in genres['field'].tolist():
    df_genre = df[df['genre'].str.contains(genre)].groupby("year").count()[['anime_id']]
    df_genre.columns = [genre]
    df_genres.append(df_genre)

In [73]:
df_genres = pd.concat(df_genres, axis="columns")

In [74]:
df_genres.head()

Unnamed: 0_level_0,Comedy,Action,Drama,Shounen,Adventure,Romance,Fantasy,Supernatural,Slice of Life,Sci-Fi,...,Samurai,Game,Vampire,Shounen Ai,Harem,Cars,Shoujo Ai,Dementia,Ecchi,Yaoi
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970,,,1.0,,,,,,,,...,,,,,,,,,,
1971,1.0,1.0,,1.0,1.0,,,,,,...,,,,,,,,,,
1974,,1.0,1.0,,1.0,,,,,1.0,...,,,,,,,,,,
1975,,,,1.0,,,,,,1.0,...,,,,,,,,,,
1976,,,1.0,,,,,,1.0,,...,,,,,,,,,,


In [97]:
def get_line_chart_per_genre(genre):
    porg = df_genres[[genre]].dropna()
    dates = np.array(porg.index.tolist())
    data = np.array(porg[genre])
    line_range(dates, data, 0, 30, label=genre, line_colour="deepskyblue")

In [98]:
for genre in genres.head(10)['field'].tolist():
    get_line_chart_per_genre(genre)

IndexError: index 30 is out of bounds for axis 0 with size 23