In [1]:
# Import dependencies
import pandas as pd

In [13]:
# Load anime data
file_anime = 'data/animes.csv'
anime_df = pd.read_csv(file_anime)

In [14]:
# Drop rows that are missing a synoposis
anime_df = anime_df.dropna(axis=0, how='any', subset=['synopsis'])

In [15]:
#Drop genres we are not interested in
anime_df = anime_df[anime_df['genre'].str.contains("Hentai") == False]
anime_df['genre'].value_counts()

['Music']                                                                 708
['Comedy']                                                                598
['Kids']                                                                  364
['Dementia']                                                              191
['Music', 'Kids']                                                         184
                                                                         ... 
['Super Power', 'Martial Arts']                                             1
['Action', 'Horror', 'School', 'Shounen', 'Supernatural']                   1
['Adventure', 'Comedy', 'Horror', 'Supernatural', 'School', 'Shounen']      1
['Fantasy', 'Comedy', 'Shoujo']                                             1
['Music', 'Romance', 'Super Power', 'Supernatural']                         1
Name: genre, Length: 4611, dtype: int64

In [16]:
# Remove columns we won't need
anime_df = anime_df[['uid', 'title', 'synopsis', 'members', 'popularity', 'ranked', 'score']]
anime_df.head()

Unnamed: 0,uid,title,synopsis,members,popularity,ranked,score
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,489888,141,25.0,8.82
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,995473,28,24.0,8.83
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,581663,98,23.0,8.83
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...",1615084,4,1.0,9.23
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,214621,502,22.0,8.83


In [20]:
anime_df.dtypes

uid             int64
title          object
synopsis       object
members         int64
popularity      int64
ranked        float64
score         float64
dtype: object

In [21]:
# Remove animes with user scores less than 7; don't want to reccomend bad shows!
# also gets rid of 13778 rows in anime_df :)
anime_df = anime_df[anime_df['score'] > 6.99]
anime_df.shape

(5111, 7)

In [22]:
# Check for missing values
anime_df.isna().sum()

uid            0
title          0
synopsis       0
members        0
popularity     0
ranked        14
score          0
dtype: int64

In [23]:
# Make sure rows are dropped (ex: 29 and 31 should be missing)
anime_df[27:32]

Unnamed: 0,uid,title,synopsis,members,popularity,ranked,score
27,5680,K-On!,"Hirasawa Yui, a young, carefree girl entering ...",610222,90,792.0,7.86
28,451,InuYasha Movie 3: Tenka Hadou no Ken,"Izayoi and Inu no Taishou, Inuyasha's parents,...",68407,1527,791.0,7.86
30,30364,Haikyuu!! Movie 2: Shousha to Haisha,Second Haikyuu!! recap movie.,27734,2698,789.0,7.86
32,36369,Ginga Eiyuu Densetsu: Die Neue These - Seiran 1,The Free Planets Alliance's invasion of the Ga...,11817,4041,787.0,7.86
33,27633,Gekkan Shoujo Nozaki-kun Specials,Specials bundled with the Blu-ray/DVDs.,87789,1233,786.0,7.86


In [24]:
# Load other show data from CSVs
file_amazon = 'data/amazon_prime_titles.csv'
file_hulu = 'data/hulu_titles.csv'
file_disney = 'data/disney_plus_titles.csv'
file_netflix = 'data/netflix_titles.csv'

In [25]:
# Create df for each one
amazon_df = pd.read_csv(file_amazon)
hulu_df = pd.read_csv(file_hulu)
disney_df = pd.read_csv(file_disney)
netflix_df = pd.read_csv(file_netflix)

In [26]:
# Pare down columns
netflix_df = netflix_df[['show_id', 'title', 'description']]
disney_df = disney_df[['show_id', 'title', 'description']]
amazon_df = amazon_df[['show_id', 'title', 'description']]
hulu_df = hulu_df[['show_id', 'title', 'description']]

In [27]:
netflix_df.head()

Unnamed: 0,show_id,title,description
0,s1,Dick Johnson Is Dead,"As her father nears the end of his life, filmm..."
1,s2,Blood & Water,"After crossing paths at a party, a Cape Town t..."
2,s3,Ganglands,To protect his family from a powerful drug lor...
3,s4,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo..."
4,s5,Kota Factory,In a city of coaching centers known to train I...


In [28]:
# Add source flag to show ids (will be combining all of these into one df)
netflix_df['show_id'] = 'n-' + netflix_df['show_id']
disney_df['show_id'] = 'd-' + disney_df['show_id']
hulu_df['show_id'] = 'h-' + hulu_df['show_id']
amazon_df['show_id'] = 'a-' + amazon_df['show_id']

In [29]:
amazon_df.head()

Unnamed: 0,show_id,title,description
0,a-s1,The Grand Seduction,A small fishing village must procure a local d...
1,a-s2,Take Care Good Night,A Metro Family decides to fight a Cyber Crimin...
2,a-s3,Secrets of Deception,After a man discovers his wife is cheating on ...
3,a-s4,Pink: Staying True,"Pink breaks the mold once again, bringing her ..."
4,a-s5,Monster Maker,Teenage Matt Banting wants to work with a famo...


In [30]:
# Add source column 
netflix_df['source'] = 'Netflix'
disney_df['source'] = 'Disney'
hulu_df['source'] = 'Hulu'
amazon_df['source'] = 'Amazon'

In [31]:
hulu_df.head()

Unnamed: 0,show_id,title,description,source
0,h-s1,Ricky Velez: Here's Everything,​Comedian Ricky Velez bares it all with his ho...,Hulu
1,h-s2,Silent Night,"Mark, a low end South London hitman recently r...",Hulu
2,h-s3,The Marksman,A hardened Arizona rancher tries to protect an...,Hulu
3,h-s4,Gaia,A forest ranger and two survivalists with a cu...,Hulu
4,h-s5,Settlers,Mankind's earliest settlers on the Martian fro...,Hulu


In [35]:
dfs = [netflix_df, hulu_df, disney_df,amazon_df]

In [36]:
live_action_df = pd.concat(dfs)

In [37]:
len(live_action_df)

22998

In [38]:
live_action_df

Unnamed: 0,show_id,title,description,source
0,n-s1,Dick Johnson Is Dead,"As her father nears the end of his life, filmm...",Netflix
1,n-s2,Blood & Water,"After crossing paths at a party, a Cape Town t...",Netflix
2,n-s3,Ganglands,To protect his family from a powerful drug lor...,Netflix
3,n-s4,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo...",Netflix
4,n-s5,Kota Factory,In a city of coaching centers known to train I...,Netflix
...,...,...,...,...
9663,a-s9664,Pride Of The Bowery,New York City street principles get an East Si...,Amazon
9664,a-s9665,Planet Patrol,"This is Earth, 2100AD - and these are the adve...",Amazon
9665,a-s9666,Outpost,"In war-torn Eastern Europe, a world-weary grou...",Amazon
9666,a-s9667,Maradona: Blessed Dream,"The series tells the story of Diego Maradona, ...",Amazon


In [39]:
# Count the number of shows from each source
live_action_df.groupby('source').count()

Unnamed: 0_level_0,show_id,title,description
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Amazon,9668,9668,9668
Disney,1450,1450,1450
Hulu,3073,3073,3069
Netflix,8807,8807,8807


In [40]:
# Check to see if there are any nulls
live_action_df.isna().sum()

show_id        0
title          0
description    4
source         0
dtype: int64

In [41]:
# Drop shows without description
live_action_df = live_action_df.dropna()

In [42]:
live_action_df.shape

(22994, 4)

In [43]:
live_action_df.isna().sum()

show_id        0
title          0
description    0
source         0
dtype: int64

In [44]:
# Create clean CSVs
anime_df.to_csv('data/cleaned_anime.csv')
live_action_df.to_csv('data/cleaned_live_actions.csv')