In [1]:
import pandas as pd
import numpy as np
import altair as alt
from pandas.api.types import is_numeric_dtype
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
df = pd.read_csv("dataanime.csv")
df.insert(0, "anime_id", df.index)
df = df[~df.isna().any(axis = 1)]
df.head()

df = df.apply(lambda x: x.replace('-',np.nan))
df = df[~df.isna().any(axis = 1)]

df["day"] = df["Broadcast time"].map(lambda x: x.split()[0])
df = df[~(df["day"] == "Not")]

df["time"] = df["Broadcast time"].map(lambda x: x.split()[2])
df = df.apply(lambda x: x.replace('Unknown',np.nan))
df = df[~df.isna().any(axis = 1)]

df.index = range(369)
df.head()

week = {
    "Mondays":1,
    "Tuesdays":2,
    "Wednesdays":3,
    "Thursdays":4,
    "Fridays":5,
    "Saturdays":6,
    "Sundays":7
}

df["day_n"] = df["day"].map(lambda x: week[x])

season = {
    "Spring":1,
    "Fall":2,
    "Summer":3,
    "Winter":4
}

df["season_n"] = df["Starting season"].map(lambda x: season[x])

df["time_n"] = df["time"].map(lambda x: x.replace(":",""))
df["time_n"] = df["time_n"].map(lambda x: np.int64(x))

def categorize_time(time):
    if time >= 1800:
        # it is night
        x = 1
    elif time >= 1200:
        # it is midday
        x = 2
    elif time >= 600:
        # it is morning
        x = 3
    else:
        # it is midnight
        x = 4
    return x

timeframe = {
    1: "6:00PM~11:59PM",
    2:"12:00PM ~ 5:59PM",
    3: "6:00AM ~ 11:59AM",
    4: "12:00AM ~ 6:00AM"
}

df["time_n"] = df["time_n"].map(lambda x: categorize_time(x))
df["time_s"] = df["time_n"].map(lambda x: timeframe[x])

check_season = pd.get_dummies(df['Starting season']).rename(columns=lambda x:'is_' +str(x))
check_season = check_season.applymap(lambda x: np.int8(x))

reversed_week = {value : key for (key, value) in week.items()}
check_day = pd.get_dummies(df['day_n']).rename(columns=lambda x:'is_' + reversed_week[x])
check_day = check_day.applymap(lambda x: np.int8(x))

brod_time = {
    1: "night",
    2: "midday",
    3: "morning",
    4: "midnight"
}
check_time = pd.get_dummies(df['time_n']).rename(columns=lambda x:'is_' + brod_time[x])
check_time = check_time.applymap(lambda x: np.int8(x))

df2 = pd.concat([check_day,check_time,check_season], axis=1)
df = pd.concat([df, df2], axis=1)

df.head()
df.columns

Index(['anime_id', 'Title', 'Type', 'Episodes', 'Status', 'Start airing',
       'End airing', 'Starting season', 'Broadcast time', 'Producers',
       'Licensors', 'Studios', 'Sources', 'Genres', 'Duration', 'Rating',
       'Score', 'Scored by', 'Members', 'Favorites', 'Description', 'day',
       'time', 'day_n', 'season_n', 'time_n', 'time_s', 'is_Mondays',
       'is_Tuesdays', 'is_Wednesdays', 'is_Thursdays', 'is_Fridays',
       'is_Saturdays', 'is_Sundays', 'is_night', 'is_midday', 'is_morning',
       'is_midnight', 'is_Fall', 'is_Spring', 'is_Summer', 'is_Winter'],
      dtype='object')

In [25]:
df["Genres"].map(type)
df["Genres"].map(lambda x: list(x))
df["Genres"].map(lambda x: x.replace(", ",""))

mylist = []
for i in range(len(df["Genres"])):
  mylist.append(df.iloc[i,13].split(","))

df["Genres"] = mylist
genres_list = sorted(list(set(df["Genres"].sum())))

genres_list

['Action',
 'Adventure',
 'Cars',
 'Comedy',
 'Dementia',
 'Demons',
 'Drama',
 'Fantasy',
 'Game',
 'Harem',
 'Historical',
 'Horror',
 'Josei',
 'Kids',
 'Magic',
 'Martial Arts',
 'Mecha',
 'Military',
 'Music',
 'Mystery',
 'Parody',
 'Police',
 'Psychological',
 'Romance',
 'Samurai',
 'School',
 'Sci-Fi',
 'Seinen',
 'Shoujo',
 'Shoujo Ai',
 'Shounen',
 'Shounen Ai',
 'Slice of Life',
 'Space',
 'Sports',
 'Super Power',
 'Supernatural',
 'Thriller',
 'Vampire']

In [26]:
train_data1 = df

week = {
    "Mondays":1,
    "Tuesdays":2,
    "Wednesdays":3,
    "Thursdays":4,
    "Fridays":5,
    "Saturdays":6,
    "Sundays":7
}

train_data1["day"] = train_data1["day"].map(lambda x: week[x])

season = {
    "Spring":1,
    "Fall":2,
    "Summer":3,
    "Winter":4
}

train_data1["season"] = train_data1["Starting season"].map(lambda x: season[x])

train_data1["time"] = train_data1["time"].map(lambda x: x.replace(":",""))
train_data1["time"] = train_data1["time"].map(lambda x: np.int64(x))

def categorize_time(time):
    if time >= 1800:
        # it is night
        x = 1
    elif time >= 1200:
        # it is midday
        x = 2
    elif time >= 600:
        # it is morning
        x = 3
    else:
        # it is midnight
        x = 4
    return x

timeframe = {
    1: "6:00PM~11:59PM",
    2:"12:00PM ~ 5:59PM",
    3: "6:00AM ~ 11:59AM",
    4: "12:00AM ~ 6:00AM"
}

train_data1["time"] = train_data1["time"].map(lambda x: categorize_time(x))
train_data1["time_s"] = train_data1["time"].map(lambda x: timeframe[x])
train_data1.head()

Unnamed: 0,anime_id,Title,Type,Episodes,Status,Start airing,End airing,Starting season,Broadcast time,Producers,...,is_Sundays,is_night,is_midday,is_morning,is_midnight,is_Fall,is_Spring,is_Summer,is_Winter,season
0,0,Fullmetal Alchemist: Brotherhood,TV,64,Finished Airing,2009-4-5,2010-7-4,Spring,Sundays at 17:00 (JST),"Aniplex,Square Enix,Mainichi Broadcasting Syst...",...,1,0,1,0,0,0,1,0,0,1
1,2,Gintama°,TV,51,Finished Airing,2015-4-8,2016-3-30,Spring,Wednesdays at 18:00 (JST),"TV Tokyo,Aniplex,Dentsu",...,0,1,0,0,0,0,1,0,0,1
2,4,Steins;Gate,TV,24,Finished Airing,2011-4-6,2011-9-14,Spring,Wednesdays at 02:05 (JST),"Frontier Works,Media Factory,Movic,AT-X,Kadoka...",...,0,0,0,0,1,0,1,0,0,1
3,7,Hunter x Hunter (2011),TV,148,Finished Airing,2011-10-2,2014-9-24,Fall,Sundays at 10:55 (JST),"VAP,Nippon Television Network,Shueisha",...,1,0,0,1,0,1,0,0,0,2
4,8,3-gatsu no Lion 2nd Season,TV,22,Finished Airing,2017-10-14,2018-3-31,Fall,Saturdays at 23:00 (JST),"Aniplex,Dentsu,NHK,Hakusensha,Asmik Ace Entert...",...,0,1,0,0,0,1,0,0,0,2


In [27]:
find_time = {
    1: "6:00PM~11:59PM",
    2:"12:00PM ~ 5:59PM",
    3: "6:00AM ~ 11:59AM",
    4: "12:00AM ~ 6:00AM"
}
find_time[1]

'6:00PM~11:59PM'

In [28]:
df['time_n']

0      2
1      1
2      4
3      3
4      1
      ..
364    4
365    4
366    4
367    4
368    4
Name: time_n, Length: 369, dtype: int64

In [29]:
reversed_week[5]

'Fridays'

In [30]:
check_season = pd.get_dummies(df['Starting season']).rename(columns=lambda x:'is_' +str(x))
check_season = check_season.applymap(lambda x: np.int8(x))

reversed_week = {value : key for (key, value) in week.items()}
check_day = pd.get_dummies(df['day_n']).rename(columns=lambda x:'is_' + reversed_week[x])
check_day = check_day.applymap(lambda x: np.int8(x))

brod_time = {
    1: "night",
    2: "midday",
    3: "morning",
    4: "midnight"
}
check_time = pd.get_dummies(df['time_n']).rename(columns=lambda x:'is_' + brod_time[x])
check_time = check_time.applymap(lambda x: np.int8(x))

df2 = pd.concat([check_day,check_time,check_season], axis=1)
df = pd.concat([df, df2], axis=1)

df.index = range(369)
df.head()

Unnamed: 0,anime_id,Title,Type,Episodes,Status,Start airing,End airing,Starting season,Broadcast time,Producers,...,is_Saturdays,is_Sundays,is_night,is_midday,is_morning,is_midnight,is_Fall,is_Spring,is_Summer,is_Winter
0,0,Fullmetal Alchemist: Brotherhood,TV,64,Finished Airing,2009-4-5,2010-7-4,Spring,Sundays at 17:00 (JST),"Aniplex,Square Enix,Mainichi Broadcasting Syst...",...,0,1,0,1,0,0,0,1,0,0
1,2,Gintama°,TV,51,Finished Airing,2015-4-8,2016-3-30,Spring,Wednesdays at 18:00 (JST),"TV Tokyo,Aniplex,Dentsu",...,0,0,1,0,0,0,0,1,0,0
2,4,Steins;Gate,TV,24,Finished Airing,2011-4-6,2011-9-14,Spring,Wednesdays at 02:05 (JST),"Frontier Works,Media Factory,Movic,AT-X,Kadoka...",...,0,0,0,0,0,1,0,1,0,0
3,7,Hunter x Hunter (2011),TV,148,Finished Airing,2011-10-2,2014-9-24,Fall,Sundays at 10:55 (JST),"VAP,Nippon Television Network,Shueisha",...,0,1,0,0,1,0,1,0,0,0
4,8,3-gatsu no Lion 2nd Season,TV,22,Finished Airing,2017-10-14,2018-3-31,Fall,Saturdays at 23:00 (JST),"Aniplex,Dentsu,NHK,Hakusensha,Asmik Ace Entert...",...,1,0,1,0,0,0,1,0,0,0


In [31]:
numeric_cols1 = [c for c in train_data1.columns if is_numeric_dtype(train_data1[c])]
train_data1[numeric_cols1].head()

scaler = StandardScaler()
scaler.fit(train_data1[numeric_cols1])
train_data1[numeric_cols1] = scaler.transform(train_data1[numeric_cols1])
train_data1[numeric_cols1]

clf = LogisticRegression()
y = train_data1["Genres"].map(lambda genres_list:"Action" if "Action" in genres_list else "Not Action")
X = train_data1[["day","time","season"]]

clf.fit(X,y)
clf.predict(X)

clf.predict_proba(X.iloc[:5])

array([[0.59079817, 0.40920183],
       [0.42495733, 0.57504267],
       [0.31487018, 0.68512982],
       [0.53498918, 0.46501082],
       [0.56213574, 0.43786426]])

In [32]:
numeric_cols = [c for c in df.columns if is_numeric_dtype(df[c])]
scaler = StandardScaler()
scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

clf = LogisticRegression()
X = df[["is_Mondays","is_Tuesdays","is_Wednesdays","is_Thursdays","is_Fridays","is_Saturdays","is_Sundays","is_night","is_midday","is_morning","is_midnight","is_Fall","is_Spring","is_Summer","is_Winter","Score"]]


def predict_genre(index):
    
    anime_genre_pred = []

    for i in genres_list:
        y = df["Genres"].map(lambda genres_list: i if i in genres_list else "Not " + i)
        clf = LogisticRegression()
        clf.fit(X,y)
        pred = clf.predict(X)
        anime_genre_pred.append(pred[index])
    
    anime_genre_pred = [c for c in anime_genre_pred if "Not" not in c]
    
    return anime_genre_pred


In [33]:
def get_index(title):
    return df[df.Title == title].index[0]

index = get_index("Gintama")

In [34]:
predict_genre(index)

['Comedy', 'Drama']

In [35]:
predict_genre(1)[1]

'Drama'

In [36]:
output = predict_genre(index)[0]
for i in range(1,len(predict_genre(index))):
    output = output + ", " +  predict_genre(index)[i]
    
output

'Comedy, Drama'

In [37]:
actual_genres = df.iloc[1,13]

In [42]:
actual_genres
output2 = actual_genres[0]
for i in range(1,len(actual_genres)):
    output2 = output2 + ", " +  actual_genres[i]
    
output2

'Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen'

In [18]:
"yes " + actual_genres

TypeError: can only concatenate str (not "list") to str

In [333]:
output.replace(" ", ", ")

', Comedy, Drama, Shounen'

In [310]:
print(df.iloc[1,1])
print(df.iloc[1,13])
predict_genre(1)

Gintama°
Action,Comedy,Historical,Parody,Samurai,Sci-Fi,Shounen


['Comedy', 'Drama', 'Shounen']

In [22]:
numeric_cols2 = [c for c in train_data2.columns if is_numeric_dtype(train_data2[c])]

scaler.fit(train_data2[numeric_cols2])
train_data2[numeric_cols2] = scaler.transform(train_data2[numeric_cols2])

clf = LogisticRegression()
X = train_data2[["is_Mondays","is_Tuesdays","is_Wednesdays","is_Thursdays","is_Fridays","is_Saturdays","is_Sundays","is_night","is_midday","is_morning","is_midnight","is_Fall","is_Spring","is_Summer","is_Winter","Score"]]

clf.fit(X,y)
clf.predict(X)

clf.predict_proba(X.iloc[:5])

NameError: name 'train_data2' is not defined

In [306]:
numeric_cols = [c for c in df.columns if is_numeric_dtype(df[c])]

scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

clf = LogisticRegression()
X = df[["is_Mondays","is_Tuesdays","is_Wednesdays","is_Thursdays","is_Fridays","is_Saturdays","is_Sundays","is_night","is_midday","is_morning","is_midnight","is_Fall","is_Spring","is_Summer","is_Winter","Score"]]

clf.fit(X,y)
clf.predict(X)

clf.predict_proba(X.iloc[:5])

array([[0.66791781, 0.33208219],
       [0.20403268, 0.79596732],
       [0.15738772, 0.84261228],
       [0.47810084, 0.52189916],
       [0.64593144, 0.35406856]])

In [230]:
train_data2.loc[15]

anime_id                                                   -1.321375
Title                                Code Geass: Hangyaku no Lelouch
Type                                                              TV
Episodes                                                          25
Status                                               Finished Airing
Start airing                                               2006-10-6
End airing                                                 2007-7-29
Starting season                                                 Fall
Broadcast time                                Fridays at 01:25 (JST)
Producers          Bandai Visual,Mainichi Broadcasting System,Ban...
Licensors                            Funimation,Bandai Entertainment
Studios                                                      Sunrise
Sources                                                     Original
Genres             [Action, Military, Sci-Fi, Super Power, Drama,...
Duration                          

In [23]:
def predict_genre(index):
    
    anime_genre_pred = []
    #function starts
    X = train_data2[["is_Mondays","is_Tuesdays","is_Wednesdays","is_Thursdays","is_Fridays","is_Saturdays","is_Sundays","is_night","is_midday","is_morning","is_midnight","is_Fall","is_Spring","is_Summer","is_Winter","Score"]]
    
    for i in genres_list:
        y = train_data2["Genres"].map(lambda genres_list: i if i in genres_list else "Not " + i)
        clf = LogisticRegression()
        clf.fit(X,y)
        pred = clf.predict(X)
        anime_genre_pred.append(pred[index])
    
    anime_genre_pred = [c for c in anime_genre_pred if "Not" not in c]
    
    return anime_genre_pred

print(train_data2.iloc[1,1])
print(train_data2.iloc[1,13])

predict_genre(1)

NameError: name 'train_data2' is not defined

In [23]:
train_data2[train_data2.Title == "Steins;Gate"].index[0]

2

### Anime Recommendation Engine
https://www.youtube.com/watch?v=XoTwndOgXBM

In [41]:
df.dtypes

anime_id           float64
Title               object
Type                object
Episodes            object
Status              object
Start airing        object
End airing          object
Starting season     object
Broadcast time      object
Producers           object
Licensors           object
Studios             object
Sources             object
Genres              object
Duration            object
Rating              object
Score              float64
Scored by          float64
Members            float64
Favorites          float64
Description         object
day                float64
time               float64
season             float64
Action              object
dtype: object

In [54]:
# recommendation search engine
def get_index(title):
    return df[df.Title == title].index[0]
def get_title(index):
    return df[df.index == index]["Title"].values[0]

# select features
features = ['Episodes','Genres','Starting season','Broadcast time','Producers','Rating','Score']

# combine features
def combine_features(row):
    return str(row["Episodes"]) + " " + str(row["Genres"]) + " " + str(row["Starting season"]) + " " + str(row["Broadcast time"]) + " " + str(row["Producers"]) + " " + str(row["Rating"]) + " " + str(row["Score"])

df["combined_features"] = df.apply(combine_features, axis = 1)


# create count matrix from the combined column
cv = CountVectorizer()
count_matrix = cv.fit_transform(df["combined_features"])

# compute cosine similarity based on the count_matrix
cosine_sim = cosine_similarity(count_matrix)
user_anime = "Steins;Gate"

# get index of the anime from its title
anime_index = get_index(user_anime)
similar_anime = list(enumerate(cosine_sim[anime_index]))

# get a list of similar animes in descending order of similarity score
sorted_similar_anime = sorted(similar_anime,key=lambda x: x[1],reverse=True)

# print titles of first 5 animes
i = 0
for anime in sorted_similar_anime:
    rec = get_title(anime[0])
    index  = get_index(rec)
    print(rec)
    print(df.iloc[index,16])
    print(df.iloc[index,13])
    print(df.iloc[index,20])
    i = i + 1
    if i > 5:
        break

Steins;Gate
2.884483343687529
['Sci-Fi', 'Thriller']
The self-proclaimed mad scientist Rintarou Okabe rents out a room in a rickety old building in Akihabara, where he indulges himself in his hobby of inventing prospective "future gadgets" with fellow lab members: Mayuri Shiina, his air-headed childhood friend, and Hashida Itaru, a perverted hacker nicknamed "Daru." The three pass the time by tinkering with their most promising contraption yet, a machine dubbed the "Phone Microwave," which performs the strange function of morphing bananas into piles of green gel.

Though miraculous in itself, the phenomenon doesn't provide anything concrete in Okabe's search for a scientific breakthrough; that is, until the lab members are spurred into action by a string of mysterious happenings before stumbling upon an unexpected success—the Phone Microwave can send emails to the past, altering the flow of history.

Adapted from the critically acclaimed visual novel by 5pb. and Nitroplus, Steins;Gate 

In [55]:
sorted_similar_anime

[(2, 1.0000000000000007),
 (176, 0.5199469468957452),
 (353, 0.4949747468305831),
 (129, 0.4800000000000002),
 (120, 0.4618802153517007),
 (39, 0.45355736761107274),
 (197, 0.45355736761107274),
 (225, 0.4526019054848144),
 (81, 0.44907311951024936),
 (264, 0.44907311951024936),
 (173, 0.44566881162492444),
 (58, 0.4456688116249244),
 (287, 0.44000000000000017),
 (345, 0.44000000000000017),
 (181, 0.43817804600413296),
 (105, 0.4264014327112209),
 (9, 0.4242640687119284),
 (330, 0.41702882811414965),
 (73, 0.41294832096701123),
 (198, 0.41159660434202117),
 (265, 0.40166320883712187),
 (300, 0.40000000000000013),
 (326, 0.3849001794597506),
 (339, 0.3849001794597506),
 (249, 0.38430756913220915),
 (150, 0.3837612894400988),
 (213, 0.3837612894400988),
 (251, 0.3837612894400988),
 (185, 0.3829708431025352),
 (333, 0.37796447300922725),
 (349, 0.3771236166328254),
 (332, 0.3714285714285715),
 (168, 0.37139067635410367),
 (169, 0.3674234614174768),
 (356, 0.3674234614174768),
 (72, 0.3670

In [86]:
rec_genres_list = df["Genres"][index]
rec_geners = str(rec_genres_list[0])

In [87]:
rec_genres

['Slice of Life', 'Comedy', 'School']

In [88]:
for anime in sorted_similar_anime[1:]:
    rec = get_title(anime[0])
    index  = get_index(rec)

    rec_genres_list = df.iloc[index,13]
    rec_genres = rec_genres_list[0]

    for j in range(1,len(rec_genres_list)):
        rec_genres = rec_genres + ", " +  rec_genres_list[j]
    
    st.markdown("_______")
    st.subheader(rec)
    st.markdown("**Genres:** " + rec_genres)
    st.write("**Score:** " + str(df["Score"][index].round(decimals = 2)))
    st.write(df.iloc[index,20])

    i = i + 1
    if i == 5:
        break

NameError: name 'st' is not defined

In [47]:
df.columns

Index(['anime_id', 'Title', 'Type', 'Episodes', 'Status', 'Start airing',
       'End airing', 'Starting season', 'Broadcast time', 'Producers',
       'Licensors', 'Studios', 'Sources', 'Genres', 'Duration', 'Rating',
       'Score', 'Scored by', 'Members', 'Favorites', 'Description', 'day',
       'time', 'day_n', 'season_n', 'time_n', 'time_s', 'is_Mondays',
       'is_Tuesdays', 'is_Wednesdays', 'is_Thursdays', 'is_Fridays',
       'is_Saturdays', 'is_Sundays', 'is_night', 'is_midday', 'is_morning',
       'is_midnight', 'is_Fall', 'is_Spring', 'is_Summer', 'is_Winter',
       'season', 'is_Mondays', 'is_Tuesdays', 'is_Wednesdays', 'is_Thursdays',
       'is_Fridays', 'is_Saturdays', 'is_Sundays', 'is_night', 'is_midday',
       'is_morning', 'is_midnight', 'is_Fall', 'is_Spring', 'is_Summer',
       'is_Winter', 'combined_features'],
      dtype='object')

## Altair Bargraphs

https://www.geeksforgeeks.org/how-to-make-a-simple-histogram-with-altair-in-python/



In [260]:
X = 'Starting season'
factor = "Drama"

df[factor] = df["Genres"].map(lambda x: factor if factor in x else "Not " + factor)

alt.Chart(df).mark_bar(size = 40).encode(
   x = alt.X(X, sort = ["Winter","Summer","Fall","Spring"]),
    y = 'count()',
    color = factor,
).properties(
    width = 500,
    height = 300
)

In [261]:
X = 'day'
factor = "Drama"

df[factor] = df["Genres"].map(lambda x: factor if factor in x else "Not " + factor)

alt.Chart(df).mark_bar(size = 40).encode(
   x = alt.X(X, sort = ["Sundays","Mondays","Tuesdays","Wednesdays","Thursdays","Fridays","Saturdays"]),
    y = 'count()',
    color = factor,
).properties(
    width = 500,
    height = 300
)

In [249]:
timeframe

{1: '6:00PM~11:59PM',
 2: '12:00PM ~ 5:59PM',
 3: '6:00AM ~ 11:59AM',
 4: '12:00AM ~ 6:00AM'}

In [264]:
X = 'time_s'
factor = "Drama"

train_data1[factor] = train_data1["Genres"].map(lambda x: factor if factor in x else "Not " + factor)

alt.Chart(train_data1).mark_bar(size = 40).encode(
   x = alt.X(X,sort = ['6PM~11:59PM','12PM ~ 5:59PM','6AM ~ 11:59AM','12AM ~ 6:00AM']),
    y = 'count()',
    color = factor,
).properties(
    width = 800,
    height = 300
)

In [265]:
df.head()

Unnamed: 0,anime_id,Title,Type,Episodes,Status,Start airing,End airing,Starting season,Broadcast time,Producers,...,Duration,Rating,Score,Scored by,Members,Favorites,Description,day,time,Drama
0,0,Fullmetal Alchemist: Brotherhood,TV,64,Finished Airing,2009-4-5,2010-7-4,Spring,Sundays at 17:00 (JST),"Aniplex,Square Enix,Mainichi Broadcasting Syst...",...,24 min. per ep.,R,9.25,719706,1176368,105387,"""In order for something to be obtained, someth...",Sundays,17:00,Drama
1,2,Gintama°,TV,51,Finished Airing,2015-4-8,2016-3-30,Spring,Wednesdays at 18:00 (JST),"TV Tokyo,Aniplex,Dentsu",...,24 min. per ep.,R,9.16,70279,194359,5597,"Gintoki, Shinpachi, and Kagura return as the f...",Wednesdays,18:00,Not Drama
2,4,Steins;Gate,TV,24,Finished Airing,2011-4-6,2011-9-14,Spring,Wednesdays at 02:05 (JST),"Frontier Works,Media Factory,Movic,AT-X,Kadoka...",...,24 min. per ep.,PG-13,9.14,552791,990419,90365,The self-proclaimed mad scientist Rintarou Oka...,Wednesdays,02:05,Not Drama
3,7,Hunter x Hunter (2011),TV,148,Finished Airing,2011-10-2,2014-9-24,Fall,Sundays at 10:55 (JST),"VAP,Nippon Television Network,Shueisha",...,23 min. per ep.,PG-13,9.11,395162,705225,63324,Hunter x Hunter is set in a world where Hunter...,Sundays,10:55,Not Drama
4,8,3-gatsu no Lion 2nd Season,TV,22,Finished Airing,2017-10-14,2018-3-31,Fall,Saturdays at 23:00 (JST),"Aniplex,Dentsu,NHK,Hakusensha,Asmik Ace Entert...",...,25 min. per ep.,PG-13,9.1,26284,80166,1961,"Now in his second year of high school, Rei Kir...",Saturdays,23:00,Drama


### put train_data1, and train_data2 all together into one dataframe

In [278]:
df = pd.read_csv("dataanime.csv")
df.insert(0, "anime_id", df.index)
df = df[~df.isna().any(axis = 1)]
df.head()

df = df.apply(lambda x: x.replace('-',np.nan))
df = df[~df.isna().any(axis = 1)]

df["day"] = df["Broadcast time"].map(lambda x: x.split()[0])
df = df[~(df["day"] == "Not")]

df["time"] = df["Broadcast time"].map(lambda x: x.split()[2])
df = df.apply(lambda x: x.replace('Unknown',np.nan))
df = df[~df.isna().any(axis = 1)]

df.index = range(369)
df.head()

week = {
    "Mondays":1,
    "Tuesdays":2,
    "Wednesdays":3,
    "Thursdays":4,
    "Fridays":5,
    "Saturdays":6,
    "Sundays":7
}

df["day_n"] = df["day"].map(lambda x: week[x])

season = {
    "Spring":1,
    "Fall":2,
    "Summer":3,
    "Winter":4
}

df["season_n"] = df["Starting season"].map(lambda x: season[x])

df["time_n"] = df["time"].map(lambda x: x.replace(":",""))
df["time_n"] = df["time_n"].map(lambda x: np.int64(x))

def categorize_time(time):
    if time >= 1800:
        # it is night
        x = 1
    elif time >= 1200:
        # it is midday
        x = 2
    elif time >= 600:
        # it is morning
        x = 3
    else:
        # it is midnight
        x = 4
    return x

timeframe = {
    1: "6:00PM~11:59PM",
    2:"12:00PM ~ 5:59PM",
    3: "6:00AM ~ 11:59AM",
    4: "12:00AM ~ 6:00AM"
}

df["time_n"] = df["time_n"].map(lambda x: categorize_time(x))
df["time_s"] = df["time_n"].map(lambda x: timeframe[x])


df.head()

Unnamed: 0,anime_id,Title,Type,Episodes,Status,Start airing,End airing,Starting season,Broadcast time,Producers,...,Scored by,Members,Favorites,Description,day,time,day_n,season_n,time_n,time_s
0,0,Fullmetal Alchemist: Brotherhood,TV,64,Finished Airing,2009-4-5,2010-7-4,Spring,Sundays at 17:00 (JST),"Aniplex,Square Enix,Mainichi Broadcasting Syst...",...,719706,1176368,105387,"""In order for something to be obtained, someth...",Sundays,17:00,7,1,2,12:00PM ~ 5:59PM
1,2,Gintama°,TV,51,Finished Airing,2015-4-8,2016-3-30,Spring,Wednesdays at 18:00 (JST),"TV Tokyo,Aniplex,Dentsu",...,70279,194359,5597,"Gintoki, Shinpachi, and Kagura return as the f...",Wednesdays,18:00,3,1,1,6:00PM~11:59PM
2,4,Steins;Gate,TV,24,Finished Airing,2011-4-6,2011-9-14,Spring,Wednesdays at 02:05 (JST),"Frontier Works,Media Factory,Movic,AT-X,Kadoka...",...,552791,990419,90365,The self-proclaimed mad scientist Rintarou Oka...,Wednesdays,02:05,3,1,4,12:00AM ~ 6:00AM
3,7,Hunter x Hunter (2011),TV,148,Finished Airing,2011-10-2,2014-9-24,Fall,Sundays at 10:55 (JST),"VAP,Nippon Television Network,Shueisha",...,395162,705225,63324,Hunter x Hunter is set in a world where Hunter...,Sundays,10:55,7,2,3,6:00AM ~ 11:59AM
4,8,3-gatsu no Lion 2nd Season,TV,22,Finished Airing,2017-10-14,2018-3-31,Fall,Saturdays at 23:00 (JST),"Aniplex,Dentsu,NHK,Hakusensha,Asmik Ace Entert...",...,26284,80166,1961,"Now in his second year of high school, Rei Kir...",Saturdays,23:00,6,2,1,6:00PM~11:59PM


In [279]:
X = 'Starting season'
factor = "Drama"

df[factor] = df["Genres"].map(lambda x: factor if factor in x else "Not " + factor)

alt.Chart(df).mark_bar(size = 40).encode(
   x = alt.X(X, sort = ["Winter","Summer","Fall","Spring"]),
    y = 'count()',
    color = factor,
).properties(
    width = 500,
    height = 300
)

In [280]:
X = 'day'
factor = "Drama"

df[factor] = df["Genres"].map(lambda x: factor if factor in x else "Not " + factor)

alt.Chart(df).mark_bar(size = 40).encode(
   x = alt.X(X, sort = ["Sundays","Mondays","Tuesdays","Wednesdays","Thursdays","Fridays","Saturdays"]),
    y = 'count()',
    color = factor,
).properties(
    width = 500,
    height = 300
)

In [284]:
X = 'time_s'
factor = "Drama"

df[factor] = df["Genres"].map(lambda x: factor if factor in x else "Not " + factor)

alt.Chart(df).mark_bar(size = 40).encode(
   x = alt.X(X,sort = ['6PM~11:59PM','12PM ~ 5:59PM','6AM ~ 11:59AM','12AM ~ 6:00AM']),
    y = 'count()',
    color = factor,
).properties(
    width = 800,
    height = 300
)

In [89]:
genres_list

['Action',
 'Adventure',
 'Cars',
 'Comedy',
 'Dementia',
 'Demons',
 'Drama',
 'Fantasy',
 'Game',
 'Harem',
 'Historical',
 'Horror',
 'Josei',
 'Kids',
 'Magic',
 'Martial Arts',
 'Mecha',
 'Military',
 'Music',
 'Mystery',
 'Parody',
 'Police',
 'Psychological',
 'Romance',
 'Samurai',
 'School',
 'Sci-Fi',
 'Seinen',
 'Shoujo',
 'Shoujo Ai',
 'Shounen',
 'Shounen Ai',
 'Slice of Life',
 'Space',
 'Sports',
 'Super Power',
 'Supernatural',
 'Thriller',
 'Vampire']