In [None]:
%matplotlib inline
#dependencies
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Read the raw anime CSV into a DataFrame and preview its first five rows
file_anime = 'Resources/animes.csv'
anime_df = pd.read_csv(filepath_or_buffer=file_anime)
anime_df.head(5)

In [None]:
# Keep only the rows that have a synopsis (row-wise drop on that single column)
anime_df = anime_df.dropna(subset=['synopsis'])

In [None]:
#Drop genres we are not interested in
anime_df = anime_df[anime_df['genre'].str.contains("Hentai") == False]

In [None]:
#confirm bad genres were dropped
anime_df['genre'].value_counts()

In [None]:
#create a cleaned anime dataframe with columns we want
anime_cleaned_df = anime_df[['uid', 'title', 'synopsis', 'members', 'popularity', 'ranked', 'score']]
anime_cleaned_df.head()

In [None]:
#check shape of cleaned database
anime_cleaned_df.shape

In [None]:
#check data types
anime_cleaned_df.dtypes

In [None]:
#remove animes with a score of less than 7
#want to recommend well ranked animes for new users
anime_cleaned_df = anime_cleaned_df[anime_cleaned_df['score'] > 6.99]
anime_cleaned_df.shape

In [None]:
anime_cleaned_df.head()

In [None]:
#load live action csvs
file_amazon = 'Resources/amazon_prime_titles.csv'
file_hulu = 'Resources/hulu_titles.csv'
file_disney = 'Resources/disney_plus_titles.csv'
file_netflix = 'Resources/netflix_titles.csv'

In [None]:
#convert csvs to dataframes
amazon_df = pd.read_csv(file_amazon)
hulu_df = pd.read_csv(file_hulu)
disney_df = pd.read_csv(file_disney)
netflix_df = pd.read_csv(file_netflix)

In [None]:
#clean dataframes to include show id, title, and description
netflix_cleaned_df = netflix_df[['show_id', 'title', 'description']]
disney_cleaned_df = disney_df[['show_id', 'title', 'description']]
amazon_cleaned_df = amazon_df[['show_id', 'title', 'description']]
hulu_cleaned_df = hulu_df[['show_id', 'title', 'description']]

In [None]:
#check netflix dataframe
netflix_cleaned_df.head()

In [None]:
#add identifier to beginning of show id to differentiate between different sources
netflix_cleaned_df['show_id'] = 'n-' + netflix_cleaned_df['show_id']
disney_cleaned_df['show_id'] = 'd-' + disney_cleaned_df['show_id']
hulu_cleaned_df['show_id'] = 'h-' + hulu_cleaned_df['show_id']
amazon_cleaned_df['show_id'] = 'a-' + amazon_cleaned_df['show_id']

In [None]:
#check amazon dataframe
amazon_cleaned_df.head()

In [None]:
#add source column to show what the live action is streaming on
netflix_cleaned_df['source'] = 'Netflix'
disney_cleaned_df['source'] = 'Disney'
hulu_cleaned_df['source'] = 'Hulu'
amazon_cleaned_df['source'] = 'Amazon'

In [None]:
#check hulu dataframe
hulu_cleaned_df.head()

In [None]:
#group live action dataframes to prep for concat
dfs = [netflix_cleaned_df, hulu_cleaned_df, disney_cleaned_df,amazon_cleaned_df]

In [None]:
#create full live action dataframe
live_action_df = pd.concat(dfs)

In [None]:
# check live action df
live_action_df.head()

In [None]:
#check count of live actions from each source
live_action_df.groupby('source').count()

In [None]:
#check na volumes
live_action_df.isna().sum()

In [None]:
#drop na rows
live_action_df = live_action_df.dropna()

In [None]:
#check live action df shape
live_action_df.shape

In [None]:
#verify na values are dropped
live_action_df.isna().sum()

In [None]:
#convert both cleaned dfs to csvs
anime_cleaned_df.to_csv('Resources/cleaned_anime.csv')
live_action_df.to_csv('Resources/cleaned_live_actions.csv')

In [None]:
live_action_df.head()

In [None]:
anime_cleaned_df.head()

In [None]:
# Histogram of the scores in the cleaned anime frame (10 bins, outlined bars)
fig, ax = plt.subplots()
ax.hist(
    anime_cleaned_df['score'],
    bins=10,
    edgecolor='black',
    linewidth=1.2,
    alpha=1,
)
ax.set_xlabel('Score')
ax.set_ylabel('Score Count')
ax.set_title('Anime Scores')

plt.show()

In [None]:
# Count the live-action titles per streaming source.
source_counts = live_action_df['source'].value_counts()
live_action_counts_df = pd.DataFrame(source_counts)

# Read labels and counts positionally. The counts column is named `source` on
# pandas < 2.0 but `count` on pandas >= 2.0, so attribute access by name
# (`.source`) raises AttributeError on newer versions.
x = live_action_counts_df.index
y = live_action_counts_df.iloc[:, 0]

plt.bar(x, y)
plt.xlabel('Live Action Source')
plt.ylabel('Count From Source')
plt.title('Live Action Source Breakdown')

plt.show()

In [None]:
# Scatter of each anime's score against its member count.
x = anime_cleaned_df['score']
y = anime_cleaned_df['members']

plt.scatter(x, y)
plt.xlabel('Anime Score')
# The y values come from the `members` column; the frame also has a separate
# `popularity` column, so the axis is labelled after the column actually plotted.
plt.ylabel('Members')
plt.title('Anime Scores vs Audience size')

plt.show()

In [None]:
# Restrict the (uncleaned-by-score) anime frame to rows with fewer than 500
# episodes, then plot episode count against score.
under_500_episodes = anime_df['episodes'].lt(500)
anime_test_df = anime_df.loc[under_500_episodes]

x = anime_test_df['episodes']
y = anime_test_df['score']

fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set_xlabel('Number of episodes')
ax.set_ylabel('Score')
ax.set_title('Anime Episodes vs Score')

plt.show()