In [1]:

from flask import Flask, render_template
import pandas as pd
from sqlalchemy import create_engine
from bokeh.io import output_file
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool, FactorRange
from bokeh.palettes import Category10
import pprint 
import os

In [2]:
app = Flask(__name__)

In [3]:
df = pd.read_csv('resources/netflix_titles.csv')

# Filter out rows where the 'Director' column is null
df_cleaned = df[df['director'].notnull()]

In [4]:
df_cleaned.head()


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
5,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, H...",,"September 24, 2021",2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...
6,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,"September 24, 2021",2021,PG,91 min,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."


In [5]:
# df_cleaned is dataset with director entries cleaned
df_cleaned = df_cleaned.copy()  


In [6]:
# Split the 'cast' column to get individual actors
df_cleaned = df_cleaned.dropna(subset=['cast']) 
df_cast_split = df_cleaned.assign(cast=df_cleaned['cast'].str.split(', ')).explode('cast')


In [7]:
# Remove entries where actor is null or empty

df_cast_split = df_cast_split[df_cast_split['cast'] != '']

# Calculate the number of movies for each actor 
movie_count = 0

# Count the occurrences of the name in the specified column
#count = df_cast_split['cast'].value_counts()[name_to_count]
movie_count = df_cast_split['cast'].value_counts()


print(movie_count)


cast
Anupam Kher              42
Shah Rukh Khan           35
Naseeruddin Shah         32
Om Puri                  30
Akshay Kumar             30
                         ..
Lesley-Anne Down          1
Niko Foster               1
John Lewis                1
Ellen Hollman             1
Chittaranjan Tripathy     1
Name: count, Length: 26725, dtype: int64


In [8]:
# Count the occurrences of the name in the specified column
dir_movie_count = 0

dir_movie_count = df_cast_split['director'].value_counts()

print(dir_movie_count)

director
Martin Scorsese        139
Cathy Garcia-Molina    125
Steven Spielberg       121
Rajiv Chilaka          119
Youssef Chahine        104
                      ... 
Ari Levinson             1
Kubhaer T. Jethwani      1
Richard Mears            1
Sandy Chronopoulos       1
Robert Townsend          1
Name: count, Length: 4152, dtype: int64


In [9]:
movie_count = movie_count.head(100)
# Convert to a list
x_values1 = movie_count.index.tolist()  # Convert to list for Bokeh
y_values1 = movie_count.values.tolist()


In [10]:
#Have each object have a unique color
colors = Category10[10]  # Category10 has 10 colors. We'll repeat if necessary
color_dict = {name: colors[i % 10] for i, name in enumerate(x_values1)}  # Map names to colors

#Create a ColumnDataSource
source = ColumnDataSource(data=dict(
    x=x_values1,
    y=y_values1,
    color=[color_dict[name] for name in x_values1],  # Assign colors based on cast name
    name=x_values1,
    frequency=y_values1
))

#Create a Bokeh scatter plot
output_file("name_frequency_scatter_with_legend_and_toolbar.html")  # Output the plot to an HTML file
p = figure(title="Frequency of Top 100 Actors on Netflix (Scatter Plot)",
           toolbar_location="above", tools="pan,reset,hover,wheel_zoom", height=1000, width=1000,
           x_range=FactorRange(*x_values1))  # Explicitly set x_range to handle categorical data

#Add scatter renderer with color and legend
p.scatter(x='x', y='y', source=source, size=8, color='color', alpha=0.7, legend_field="name", fill_alpha=0.6)

#Add HoverTool to show the values when hovering
hover = HoverTool()
hover.tooltips = [("Actor Name", "@name"), ("# of Movies on Netflix", "@frequency")]
p.add_tools(hover)

#Customize the plot appearance
p.xaxis.major_label_orientation = 1.2  # Rotate names for better readability
p.xaxis.axis_label = "Top 100 Actor Names"
p.yaxis.axis_label = "Number Of Movies On Netflix"
p.y_range.start = 0

# Step 11: Customize the legend
p.legend.title = 'Actor Names'
p.legend.orientation = 'vertical'  # Set the legend to vertical orientation
p.legend.location = 'top_right'    # Position the legend outside the plot on the right
p.legend.glyph_width = 20          # Adjust the width of the legend glyphs
p.legend.spacing = 5               # Adjust the spacing between legend items
#Show the plot in an HTML file
show(p)

In [11]:
engine = create_engine('sqlite:///mydatabase.db')

df_cast_split.to_sql('mydatabase', con=engine, if_exists='replace', index=False)

print("CSV data has been transferred")

CSV data has been transferred
