In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
from jupyter_dash import JupyterDash

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Create stopword list:
stopwords = set(stopwords.words('english'))

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# import the local Nrl.py module (provides imdb_Dataset and predict, used by the prediction callback)
import Nrl as nrl
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Data-preparation cell: loads the box-office CSV and derives every column the
# dashboard relies on (languages, genre, rating, plot sentiment, company_size_cat,
# release_period). The statements below are order-dependent.
'''Read in data (This is Dylan's dataset sent on Tuesday, Feb 14.)'''
# NOTE(review): absolute, user-specific path -- must be edited before this cell runs on another machine.
data = pd.read_csv("/Users/dylanjorling/UCLA/418proj/box_office_data_final.csv") ## change path to load data

'''Turning languages into a binary variable (English or Multilingual/Foreign)'''
# The first two lines fold the 'English, None' / 'None, English' spellings into 'English'.
# The last line relabels every value that is not exactly 'English' as 'Multilingual_Foreign',
# so the intermediate 'French' and 'Multilingual' labels never survive this block.
data['languages'] = np.where(data['languages'] == 'English, None', 'English', data['languages'])
data['languages'] = np.where(data['languages'] == 'None, English', 'English', data['languages'])
data['languages'] = np.where(data['languages'] == 'None, French', 'French', data['languages'])
data['languages'] = np.where(data['languages'].str.contains(','), 'Multilingual', data['languages'])
data['languages'] = np.where(data['languages'] != 'English', 'Multilingual_Foreign', data['languages'])

'''Splitting genres variable so each row is a unique movie-genre combination'''
# Every column except 'genres' is moved into the index so that only 'genres' is split on
# ', ' and exploded; reset_index() then restores the other columns. From here on a movie
# occupies one row per genre, so per-movie values are repeated across those rows.
data = data.set_index([i for i in data.columns if i != 'genres']).apply(
    lambda x: x.str.split(', ').explode()).reset_index()

# 'genre' keeps the 14 genres listed below and buckets everything else as 'Other';
# 'genres' keeps the original exploded label.
data['genre'] = np.where(data['genres'].isin(['Drama', 'Comedy', 'Action', 'Adventure', 'Crime',
                                                 'Thriller', 'Romance', 'Horror', 'Mystery', 'Sci-Fi', 
                                                 'Fantasy', 'Biography', 'Family', 'Animation']), 
                          data['genres'], 'Other')

'''Cleaning rating'''
# Only PG, PG-13 and R are kept; any other rating value becomes 'Other'.
data['rating'] = np.where(data['rating'].isin(['PG', 'PG-13', 'R']), data['rating'], 'Other')

'''Removing stop words from plot variable'''
# Tokenises each non-missing plot and drops tokens whose lower-cased form is in `stopwords`.
# Punctuation tokens are not stop words, so they remain in the list.
data['plot no stop words'] = data['plot'].dropna().apply(
    lambda x: [x for x in word_tokenize(x) if not x.lower() in stopwords])

'''Get the polarity of the plot'''
# Scores are computed on the exploded frame, i.e. once per movie-genre row.
sia = SentimentIntensityAnalyzer()
data['polarity'] = data['plot'].dropna().apply(lambda x: sia.polarity_scores(x))

'''Each value in data['polarity'] is a dictionary --> separating each sentiment into its own variable'''
# Only the 'neg', 'pos' and 'neu' entries are extracted before the dict column is dropped.
data['negative'] = data['polarity'].dropna().apply(lambda x: x['neg'])
data['positive'] = data['polarity'].dropna().apply(lambda x: x['pos'])
data['neutral'] = data['polarity'].dropna().apply(lambda x: x['neu'])
data = data.drop('polarity', axis=1)

'''Creating categorical company size (this mimics what we did in Spring 2022)'''
# Bins on company_size: 1 = default (anything not matched below), 2 = (8, 44],
# 3 = (44, 179], 4 = above 179.
data['company_size_cat'] = 1
data['company_size_cat'] = np.where((data['company_size'] > 8) & (data['company_size'] <= 44),
                                    2, data['company_size_cat'])

data['company_size_cat'] = np.where((data['company_size'] > 44) & (data['company_size'] <= 179),
                                    3, data['company_size_cat'])

data['company_size_cat'] = np.where((data['company_size'] > 179),
                                    4, data['company_size_cat'])

'''Creating categorical release period (this mimics what we did in Spring 2022)'''
# 'Spring' is the default: it is what remains for any release_month not listed in the
# Summer, Fall or Winter conditions below.
data['release_period'] = 'Spring'
data['release_period'] = np.where((data['release_month'] == 'June') | 
                                     (data['release_month'] == 'July') |
                                     (data['release_month'] == 'August'), 
                                     'Summer', data['release_period'])

data['release_period'] = np.where((data['release_month'] == 'September') | 
                                     (data['release_month'] == 'October') |
                                     (data['release_month'] == 'November'), 
                                     'Fall', data['release_period'])

data['release_period'] = np.where((data['release_month'] == 'December') | 
                                     (data['release_month'] == 'January') |
                                     (data['release_month'] == 'February'), 
                                     'Winter', data['release_period'])

# Last expression of the cell: renders the prepared frame as the cell output.
data

Unnamed: 0,id,title,fullTitle,type,year,date,runtime,plot,awards,directors,...,gross_profit_adj,new_genres,genres,genre,plot no stop words,negative,positive,neutral,company_size_cat,release_period
0,tt0010323,The Cabinet of Dr. Caligari,The Cabinet of Dr. Caligari (1920),Movie,1920,1920-02-27,67.0,"Francis, a young man, recalls in his memory th...","Awards, 1 win & 1 nomination",Robert Wiene,...,-1.552310e+05,Horror,Horror,Horror,"[Francis, ,, young, man, ,, recalls, memory, h...",0.098,0.104,0.798,1,Winter
1,tt0010323,The Cabinet of Dr. Caligari,The Cabinet of Dr. Caligari (1920),Movie,1920,1920-02-27,67.0,"Francis, a young man, recalls in his memory th...","Awards, 1 win & 1 nomination",Robert Wiene,...,-1.552310e+05,Horror,Mystery,Mystery,"[Francis, ,, young, man, ,, recalls, memory, h...",0.098,0.104,0.798,1,Winter
2,tt0010323,The Cabinet of Dr. Caligari,The Cabinet of Dr. Caligari (1920),Movie,1920,1920-02-27,67.0,"Francis, a young man, recalls in his memory th...","Awards, 1 win & 1 nomination",Robert Wiene,...,-1.552310e+05,Horror,Thriller,Thriller,"[Francis, ,, young, man, ,, recalls, memory, h...",0.098,0.104,0.798,1,Winter
3,tt0012349,The Kid,The Kid (1921),Movie,1921,1921-02-06,68.0,The opening title reads: A comedy with a smile...,"Top rated movie #129 | Awards, 2 wins",Charles Chaplin,...,-3.040178e+06,Drama,Comedy,Comedy,"[opening, title, reads, :, comedy, smile, --, ...",0.070,0.133,0.797,1,Winter
4,tt0012349,The Kid,The Kid (1921),Movie,1921,1921-02-06,68.0,The opening title reads: A comedy with a smile...,"Top rated movie #129 | Awards, 2 wins",Charles Chaplin,...,-3.040178e+06,Drama,Drama,Drama,"[opening, title, reads, :, comedy, smile, --, ...",0.070,0.133,0.797,1,Winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10907,tt9784798,Judas and the Black Messiah,Judas and the Black Messiah (2021),Movie,2021,2021-02-12,126.0,"Fred Hampton, a young, charismatic activist, b...","Won 2 Oscars, 44 wins & 80 nominations total",Shaka King,...,-2.167677e+07,Drama,Drama,Drama,"[Fred, Hampton, ,, young, ,, charismatic, acti...",0.089,0.035,0.877,2,Winter
10908,tt9784798,Judas and the Black Messiah,Judas and the Black Messiah (2021),Movie,2021,2021-02-12,126.0,"Fred Hampton, a young, charismatic activist, b...","Won 2 Oscars, 44 wins & 80 nominations total",Shaka King,...,-2.167677e+07,Drama,History,Other,"[Fred, Hampton, ,, young, ,, charismatic, acti...",0.089,0.035,0.877,2,Winter
10909,tt9844522,Escape Room: Tournament of Champions,Escape Room: Tournament of Champions (2021),Movie,2021,2021-07-16,88.0,Six people unwillingly find themselves locked ...,0,Adam Robitel,...,4.174173e+07,Horror,Action,Action,"[Six, people, unwillingly, find, locked, anoth...",0.000,0.155,0.845,4,Summer
10910,tt9844522,Escape Room: Tournament of Champions,Escape Room: Tournament of Champions (2021),Movie,2021,2021-07-16,88.0,Six people unwillingly find themselves locked ...,0,Adam Robitel,...,4.174173e+07,Horror,Adventure,Adventure,"[Six, people, unwillingly, find, locked, anoth...",0.000,0.155,0.845,4,Summer


In [7]:
######## Load Neural Net Related Data###########
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import json
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler

# Read the name -> score lookup tables from disk. The 'st' (stars) and 'dir'
# (directors) entries are the ones the dashboard uses.
with open('input_dict.json', 'r') as fh:
    input_dict = json.load(fh)
        
# Calendar month -> release-window bucket used by the model.
# Key order is the order shown in the "Release Month" dropdown.
month_dict = dict(
    January='nov_to_jan',
    February='feb_to_apr',
    March='feb_to_apr',
    April='feb_to_apr',
    May='may_to_july',
    June='may_to_july',
    July='may_to_july',
    August='aug_to_oct',
    September='aug_to_oct',
    October='aug_to_oct',
    November='nov_to_jan',
    December='nov_to_jan',
)

# Detailed genre -> one of the five coarse genre groups used by the model.
genre_dict = dict([
    ('Action', 'Action/Adventure'),
    ('Adventure', 'Action/Adventure'),
    ('Crime', 'Thriller'),
    ('Comedy', 'Comedy'),
    ('Horror', 'Thriller'),
    ('Thriller', 'Thriller'),
    ('Drama', 'Drama'),
    ('Romance', 'Misc'),
    ('Fantasy', 'Action/Adventure'),
    ('Sci-Fi', 'Action/Adventure'),
    ('Family', 'Misc'),
    ('Mystery', 'Thriller'),
    ('Biography', 'Drama'),
    ('Animation', 'Misc'),
    ('History', 'Drama'),
    ('Music', 'Drama'),
    ('Sport', 'Drama'),
    ('War', 'Drama'),
    ('Musical', 'Misc'),
    ('Western', 'Drama'),
    ('Film-Noir', 'Misc'),
])

# Dropdown label for production-company influence -> company-size bucket.
# Key order is the order shown in the "Company Influence" dropdown.
co_size_dict = dict(zip(
    ('Very Unknown', 'Relatively Unknown', 'Influential', 'Highly Influential'),
    ('small', 'medium', 'large', 'giant'),
))
        

#sort
def sortSur(nameList):
    """Return the names ordered by surname (the last whitespace-separated word).

    Args:
        nameList: iterable of name strings.

    Returns:
        A new list holding the same strings, sorted by their last word.
        Names that share a last word keep their input order.
    """
    def surname(name):
        parts = name.split()
        # A blank name has no words; give it an empty key instead of failing on parts[-1].
        return parts[-1] if parts else ''

    # Sort the original strings rather than re-joined tokens, so every returned name is
    # still character-for-character the string that was passed in (these names are used
    # as dictionary keys by the caller).
    return sorted(nameList, key=surname)
    


In [4]:
'''Creating dataframe with just positive and negative. Rescoring the values so they sum to 1'''
# Divide both score columns by their row-wise sum, then attach the two columns the
# dashboard filters on.
pos_neg = data[['positive', 'negative']].copy()
pos_neg = pos_neg.div(pos_neg.sum(axis=1), axis=0)
pos_neg = pos_neg.assign(genre=data['genre'], year=data['year'])

'''
Separating positive from negative into separate dataframes --> combining the two dataframes into 1,
so we can create a pie chart.
'''
# Long format: one row per (movie-genre row, sentiment) with the share stored in 'score'.
pos = (pos_neg[['genre', 'positive', 'year']]
       .rename(columns={'positive': 'score'})
       .assign(sentiment='Positive'))

neg = (pos_neg[['genre', 'negative', 'year']]
       .rename(columns={'negative': 'score'})
       .assign(sentiment='Negative'))

# Positive rows first, then negative rows, renumbered from 0.
sentiment = pd.concat([pos, neg], ignore_index=True)
sentiment

Unnamed: 0,genre,score,year,sentiment
0,Horror,0.514851,1920,Positive
1,Mystery,0.514851,1920,Positive
2,Thriller,0.514851,1920,Positive
3,Comedy,0.655172,1921,Positive
4,Drama,0.655172,1921,Positive
...,...,...,...,...
21819,Drama,0.717742,2021,Negative
21820,Other,0.717742,2021,Negative
21821,Action,0.000000,2021,Negative
21822,Adventure,0.000000,2021,Negative


In [None]:
'''This is the code for the dashboard. There are 3 tabs: EDA, ML Profit Prediction, and Movie Recommendation'''

# The run guard at the bottom of this cell compares __name__ against 'main',
# so the override below has to stay in step with it.
__name__ = 'main'

app = dash.Dash(__name__)

'''
This creates the header and tabs layout. Depending on which tab you pick, the dashboard layout will update
'''

# Page skeleton: title, authors, the tab selector, and an empty container that the
# tab-switching callback fills in.
app.layout = html.Div([
    html.H1('IMDb Analysis Dashboard', style=dict(textAlign='center')),
    html.H4('Sofia Alcazar, Daniel Kwon, Dylan Jorling, Ajay Patel',
            style=dict(textAlign='center')),
    dcc.Tabs(
        [
            dcc.Tab(label=tab_label, value=tab_value)
            for tab_value, tab_label in (
                ('tab1', 'Exploratory Data Analysis'),
                ('tab2', 'How Much Money Will a New Movie Make?'),
                ('tab3', 'Movie Recommendation'),
            )
        ],
        id='multi-tabs-input',
        value='tab1',
    ),
    html.Div(id='multi-tabs-output'),
])

'''Depending on which tab you pick, the dashboard will update with one of the three following layouts'''
def _eda_tab_layout():
    """Build the EDA tab: three filter dropdowns followed by six graphs.

    All six graphs are refreshed by their own callbacks from the start year,
    end year and genre dropdowns:
      graph1 - histogram of IMDb ratings
      graph2 - pie chart of the sentiment of the 'plot' variable
      graph3 - line graph of IMDb rating over time, split by sentiment
      graph4 - bar chart of the genres
      graph5 - bar chart of the most frequent entries of the 'keywords' variable
      graph6 - scatterplot of IMDb rating against Metacritic rating
    """
    year_choices = sorted(data['year'].unique())
    genre_choices = sorted(data['genre'].unique())

    return html.Div([
        html.Div(
            children = dcc.Dropdown(id = 'dropdown1',
                                    options = ['From'] + year_choices,
                                    value = 'From'),
            style = {'width':'33%', 'display':'inline-block'}
        ),

        html.Div(
            children = dcc.Dropdown(id = 'dropdown2',
                                    options = ['To'] + year_choices,
                                    value = 'To'),
            style = {'width':'34%', 'display':'inline-block'}
        ),

        html.Div(
            children = dcc.Dropdown(id = 'dropdown3',
                                    options = ['All Genres'] + genre_choices,
                                    value = 'All Genres'),
            style = {'width':'33%', 'display':'inline-block'}
        ),

        html.Div(
            children = dcc.Graph(id = 'graph1')
        ),

        html.Div(
            children = [dcc.Graph(id = 'graph2')],
            style = {'width':'50%', 'display':'inline-block'}
        ),

        html.Div(
            children = [dcc.Graph(id = 'graph3')],
            style = {'width':'50%', 'display':'inline-block'}
        ),

        html.Div(
            children = [dcc.Graph(id = 'graph4')],
            style = {'width':'33%', 'display':'inline-block'}
        ),

        html.Div(
            children = [dcc.Graph(id = 'graph5')],
            style = {'width':'34%', 'display':'inline-block'}
        ),

        html.Div(
            children = [dcc.Graph(id = 'graph6')],
            style = {'width':'33%', 'display':'inline-block'}
        )
    ])


def _prediction_tab_layout():
    """Build the prediction tab: eight input dropdowns and the prediction output.

    The user picks budget, runtime, release month, production-company influence,
    genre, two stars and a director; the prediction callback writes its result
    into the 'prediction1_output' heading.
    """
    # Surname-sorted once and shared by both star dropdowns.
    star_choices = sortSur(list(input_dict['st'].keys()))

    return html.Div([
        html.H3(
            'Please select a budget, runtime, release period, production company influence,' +
            ' genre, stars and director.',
            style = {'textAlign':'center'}
        ),

        html.Div(
            children = dcc.Dropdown(id = 'dropdown4',
                                    options = ['${:,.2f}'.format(i) for i in range(10000000, 1100000000, 10000000)],
                                    # Must be spelled exactly like an option (two decimals),
                                    # otherwise the dropdown starts out with no matching entry.
                                    value = '$10,000,000.00',
                                    placeholder = 'Enter Budget'),
            style = {'width':'10%','display':'inline-block'}
        ),

        html.Div(
            children = dcc.Dropdown(id = 'dropdown5',
                                    options = list(range(30, 240, 10)),
                                    value = 120,
                                    placeholder = 'Runtime'),
            style = {'width':'6%','display':'inline-block'}
        ),

        html.Div(
            children = dcc.Dropdown(id = 'dropdown6',
                                    options = list(month_dict.keys()),
                                    value = 'July',
                                    placeholder = 'Release Month'),
            style = {'width':'10%','display':'inline-block'}
        ),

        html.Div(
            children = dcc.Dropdown(id = 'dropdown7',
                                    options = list(co_size_dict.keys()),
                                    value = 'Influential',
                                    placeholder = 'Company Influence'),
            style = {'width':'15%','display':'inline-block'}
        ),

        html.Div(
            children = dcc.Dropdown(id = 'dropdown8',
                                    options = sorted(list(genre_dict.keys())),
                                    value = 'Action',
                                    placeholder = 'Enter Genre'),
            style = {'width':'10%','display':'inline-block'}
        ),

        html.Div(
            children = dcc.Dropdown(id = 'dropdown9',
                                    options = star_choices,
                                    value = 'Leonardo DiCaprio',
                                    placeholder = 'Choose First Star'),
            style = {'width':'15%','display':'inline-block'}
        ),

        html.Div(
            children = dcc.Dropdown(id = 'dropdown10',
                                    options = star_choices,
                                    value = 'Tom Cruise',
                                    placeholder = 'Choose Second Star'),
            style = {'width':'15%','display':'inline-block'}
        ),

        html.Div(
            children = dcc.Dropdown(id = 'dropdown11',
                                    options = sortSur(list(input_dict['dir'].keys())),
                                    value = 'Steven Spielberg',
                                    placeholder = 'Choose Director'),
            style = {'width':'15%','display':'inline-block'}
        ),

        html.H3(
            'We predict your movie will make:\n',
            style = {'textAlign':'center'}
        ),

        html.H3(
            id = 'prediction1_output',
            style = {'textAlign':'center'}
        )
    ])


def _recommendation_tab_layout():
    """Build the recommendation tab; for now a fixed bar chart that only proves the tab renders."""
    return html.Div([
        dcc.Graph(
            figure = {
                'data' : [{
                    'x' : [1, 2, 3],
                    'y' : [3, 1, 2],
                    'type' : 'bar'
                }]
            }
        )
    ])


@app.callback(
    Output('multi-tabs-output', 'children'),
    Input('multi-tabs-input', 'value'))
def render_content(tab):
    """Return the layout for the selected tab.

    Args:
        tab: value of the 'multi-tabs-input' component ('tab1', 'tab2' or 'tab3').

    Returns:
        The EDA layout for 'tab1', the prediction layout for 'tab2', and the
        recommendation layout for any other value.
    """
    if tab == 'tab1':
        return _eda_tab_layout()
    if tab == 'tab2':
        return _prediction_tab_layout()
    return _recommendation_tab_layout()


'''
For all the graphs in the EDA tab, they all take the inputs, filter the original dataset,
and then output the respective graph. Only Graph 2 uses the 'sentiment' dataframe created
at the beginning of the notebook.
'''
@app.callback(
    Output('graph1', 'figure'),
    Input('dropdown1', 'value'),
    Input('dropdown2', 'value'),
    Input('dropdown3', 'value'))
def update_graph1(year_from, year_to, genre):
    """Histogram of IMDb ratings for the selected year range and genre.

    Args:
        year_from: first year to include, or 'From' / None for no lower bound.
        year_to: last year to include, or 'To' / None for no upper bound.
        genre: value of the 'genre' column to keep, or 'All Genres' / None for all.

    Returns:
        A Plotly Express histogram figure of the 'imDbRating' column.
    """
    # Apply each active filter in turn. A cleared dropdown arrives as None and is
    # treated like its placeholder entry, i.e. "do not filter on this".
    filtered_data = data
    if year_from not in (None, 'From'):
        filtered_data = filtered_data[filtered_data['year'] >= year_from]
    if year_to not in (None, 'To'):
        filtered_data = filtered_data[filtered_data['year'] <= year_to]
    if genre not in (None, 'All Genres'):
        filtered_data = filtered_data[filtered_data['genre'] == genre]

    trace = px.histogram(
        x = filtered_data['imDbRating'],
        title = "Distribution of IMDb Ratings",
        labels = {'x':'IMDb Ratings',
                  'count':'Count'}
    )

    return trace


@app.callback(
    Output('graph2', 'figure'),
    Input('dropdown1', 'value'),
    Input('dropdown2', 'value'),
    Input('dropdown3', 'value'))
def update_graph2(year_from, year_to, genre):
    """Pie chart of positive versus negative plot sentiment for the selection.

    Reads the long-format `sentiment` frame (columns genre, score, year, sentiment)
    rather than `data`.

    Args:
        year_from: first year to include, or 'From' / None for no lower bound.
        year_to: last year to include, or 'To' / None for no upper bound.
        genre: value of the 'genre' column to keep, or 'All Genres' / None for all.

    Returns:
        A Plotly Express pie figure with one slice per sentiment label, sized by 'score'.
    """
    # Apply each active filter in turn. A cleared dropdown arrives as None and is
    # treated like its placeholder entry, i.e. "do not filter on this".
    filtered_data = sentiment
    if year_from not in (None, 'From'):
        filtered_data = filtered_data[filtered_data['year'] >= year_from]
    if year_to not in (None, 'To'):
        filtered_data = filtered_data[filtered_data['year'] <= year_to]
    if genre not in (None, 'All Genres'):
        filtered_data = filtered_data[filtered_data['genre'] == genre]

    trace = px.pie(
        filtered_data, values = 'score', names = 'sentiment', color = 'sentiment',
        color_discrete_map = {'Positive':'lightgreen',
                              'Negative':'red'},
        title = "Plot Sentiment"
    )

    return trace


@app.callback(
    Output('graph3', 'figure'),
    Input('dropdown1', 'value'),
    Input('dropdown2', 'value'),
    Input('dropdown3', 'value'))
def update_graph3(year_from, year_to, genre):
    """Line chart of mean IMDb rating per year, one line per plot sentiment.

    Args:
        year_from: first year to include, or 'From' / None for no lower bound.
        year_to: last year to include, or 'To' / None for no upper bound.
        genre: value of the 'genre' column to keep, or 'All Genres' / None for all.

    Returns:
        A Plotly Express line figure with a 'Positive' and a 'Negative' series.
    """
    # Apply each active filter in turn. A cleared dropdown arrives as None and is
    # treated like its placeholder entry, i.e. "do not filter on this".
    filtered_data = data
    if year_from not in (None, 'From'):
        filtered_data = filtered_data[filtered_data['year'] >= year_from]
    if year_to not in (None, 'To'):
        filtered_data = filtered_data[filtered_data['year'] <= year_to]
    if genre not in (None, 'All Genres'):
        filtered_data = filtered_data[filtered_data['genre'] == genre]

    # A row counts as positive when its 'positive' score exceeds its 'negative' score,
    # and as negative in the opposite case. Rows where neither comparison holds (equal
    # or missing scores) are left out of both lines. Each group is averaged per year,
    # then the two results are stacked in long format for px.line.
    positive = filtered_data[filtered_data['positive'] > filtered_data['negative']].groupby(
        ['year'])['imDbRating'].mean().reset_index()

    negative = filtered_data[filtered_data['positive'] < filtered_data['negative']].groupby(
        ['year'])['imDbRating'].mean().reset_index()

    positive['Sentiment'] = 'Positive'
    negative['Sentiment'] = 'Negative'

    positive_negative = pd.concat([positive, negative]).reset_index()

    trace = px.line(
        x = positive_negative['year'],
        y = positive_negative['imDbRating'],
        color = positive_negative['Sentiment'],
        color_discrete_map = {'Positive':'green',
                              'Negative':'red'},
        title = "IMDb Rating Over Time by Sentiment",
        labels = {'x':'Time',
                  'y':'IMDb Rating'}
    )

    return trace


@app.callback(
    Output('graph4', 'figure'),
    Input('dropdown1', 'value'),
    Input('dropdown2', 'value'),
    Input('dropdown3', 'value'))
def update_graph4(year_from, year_to, genre):
    """Horizontal bar chart of the ten most frequent genres in the selection.

    Args:
        year_from: first year to include, or 'From' / None for no lower bound.
        year_to: last year to include, or 'To' / None for no upper bound.
        genre: value of the 'genre' column to keep, or 'All Genres' / None for all.

    Returns:
        A Plotly Express bar figure of row counts per value of the 'genres' column.
    """
    # Apply each active filter in turn. A cleared dropdown arrives as None and is
    # treated like its placeholder entry, i.e. "do not filter on this".
    filtered_data = data
    if year_from not in (None, 'From'):
        filtered_data = filtered_data[filtered_data['year'] >= year_from]
    if year_to not in (None, 'To'):
        filtered_data = filtered_data[filtered_data['year'] <= year_to]
    if genre not in (None, 'All Genres'):
        # The genre dropdown is populated from the bucketed 'genre' column, so filter on
        # that column: an 'Other' selection never equals any detailed 'genres' label.
        filtered_data = filtered_data[filtered_data['genre'] == genre]

    # Counts are taken on the detailed 'genres' labels; ascending order puts the
    # largest bar last, and tail(10) keeps the ten largest.
    genre_counts = filtered_data['genres'].value_counts().sort_values(ascending=True).tail(10)

    trace = px.bar(
        x = genre_counts,
        y = genre_counts.index,
        title = "Most Popular Genres",
        labels = {'x':'Count',
                  'y':'Genre'}
    )

    return trace


@app.callback(
    Output('graph5', 'figure'),
    Input('dropdown1', 'value'),
    Input('dropdown2', 'value'),
    Input('dropdown3', 'value'))
def update_graph5(year_from, year_to, genre):
    """Horizontal bar chart of the ten most frequent keywords in the selection.

    Args:
        year_from: first year to include, or 'From' / None for no lower bound.
        year_to: last year to include, or 'To' / None for no upper bound.
        genre: value of the 'genre' column to keep, or 'All Genres' / None for all.

    Returns:
        A Plotly Express bar figure of occurrence counts per keyword.
    """
    # Apply each active filter in turn. A cleared dropdown arrives as None and is
    # treated like its placeholder entry, i.e. "do not filter on this".
    filtered_data = data
    if year_from not in (None, 'From'):
        filtered_data = filtered_data[filtered_data['year'] >= year_from]
    if year_to not in (None, 'To'):
        filtered_data = filtered_data[filtered_data['year'] <= year_to]
    if genre not in (None, 'All Genres'):
        filtered_data = filtered_data[filtered_data['genre'] == genre]

    # Each row stores its keywords as one comma-separated string. Split and explode so
    # every keyword gets its own entry. Missing values are dropped first so they are not
    # counted as the literal text 'nan', and surrounding whitespace is stripped so the
    # same keyword is not counted under two spellings.
    keywords = (filtered_data['keywords']
                .dropna()
                .astype(str)
                .str.split(',')
                .explode()
                .str.strip())

    # Discard entries that are nothing but a punctuation token. Raw string: the
    # backslash below is a literal character, not the start of an escape sequence.
    punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~.'''
    keywords = keywords[~keywords.isin(word_tokenize(punc))]

    # Ten most frequent keywords, ordered ascending so the largest bar is drawn last.
    top_keywords = keywords.value_counts().head(10).sort_values()

    trace = px.bar(
        x = top_keywords,
        y = top_keywords.index,
        title = "Most Popular Keywords",
        labels = {'x':'Count',
                  'y':'Keywords'}
    )

    return trace


@app.callback(
    Output('graph6', 'figure'),
    Input('dropdown1', 'value'),
    Input('dropdown2', 'value'),
    Input('dropdown3', 'value'))
def update_graph6(year_from, year_to, genre):
    """Scatterplot of IMDb rating against Metacritic rating for the selection.

    Args:
        year_from: first year to include, or 'From' / None for no lower bound.
        year_to: last year to include, or 'To' / None for no upper bound.
        genre: value of the 'genre' column to keep, or 'All Genres' / None for all.

    Returns:
        A Plotly Express scatter figure with 'imDbRating' on x and
        'metacriticRating' divided by 10 on y.
    """
    # Apply each active filter in turn. A cleared dropdown arrives as None and is
    # treated like its placeholder entry, i.e. "do not filter on this".
    filtered_data = data
    if year_from not in (None, 'From'):
        filtered_data = filtered_data[filtered_data['year'] >= year_from]
    if year_to not in (None, 'To'):
        filtered_data = filtered_data[filtered_data['year'] <= year_to]
    if genre not in (None, 'All Genres'):
        filtered_data = filtered_data[filtered_data['genre'] == genre]

    trace = px.scatter(
        x = filtered_data['imDbRating'],
        # Divided by 10 -- presumably to bring Metacritic's scale in line with IMDb's; confirm.
        y = filtered_data['metacriticRating']/10,
        title = "Do IMDb and Metacritic Rate the Same?",
        labels = {'x':'IMDb Rating',
                  'y':'Metacritic Rating'}
    )

    return trace


@app.callback(
    Output('prediction1_output', 'children'),
    Input('dropdown4', 'value'),
    Input('dropdown5', 'value'),
    Input('dropdown6', 'value'),
    Input('dropdown7', 'value'),
    Input('dropdown8', 'value'),
    Input('dropdown9', 'value'),
    Input('dropdown10', 'value'),
    Input('dropdown11', 'value'))
def update_prediction1(budget, runtime, release_month, company_size,
                      genre, star1, star2, director):
    """Turn the eight dropdown selections into a model prediction, formatted as dollars.

    The network output is rescaled with the mean and standard deviation of
    data['gross_profit_adj'] before it is formatted.

    Args:
        budget: budget as a '$1,234.00'-style string.
        runtime: runtime value from the runtime dropdown.
        release_month: a key of `month_dict`.
        company_size: a key of `co_size_dict`.
        genre: a key of `genre_dict`.
        star1: a key of input_dict['st'].
        star2: a key of input_dict['st'].
        director: a key of input_dict['dir'].

    Returns:
        The rescaled prediction as a '$x,xxx.xx' string, or a short prompt when
        at least one dropdown has been cleared.
    """
    # A cleared dropdown delivers None, which would otherwise fail in the string
    # parsing and dictionary look-ups below.
    selections = (budget, runtime, release_month, company_size,
                  genre, star1, star2, director)
    if any(choice is None for choice in selections):
        return 'Please make a selection in every dropdown.'

    numerical_df = pd.DataFrame(np.zeros((1, 5)),
                                columns=['runtime', 'dir_pop', 'wr_pop', 'star_power', 'adj_budget'])

    budget = float(budget.replace('$','').replace(',',''))
    # Note: each numeric input is standardised by hand with hard-coded centre/scale
    # constants, since there is no reference point for them in `data`.
    numerical_df['runtime'] = (float(runtime) - 110.71) / 19.16
    numerical_df['dir_pop'] = (float(input_dict['dir'][director]) - 5.74) / 5.18
    numerical_df['wr_pop'] = 0.01 # this is avg value
    numerical_df['star_power'] = float(input_dict['st'][star1] + input_dict['st'][star2] + 7) # model uses 3 stars avg val 7
    numerical_df['star_power'] = (numerical_df['star_power'] - 20.95) / 13.50
    numerical_df['adj_budget'] = (float(budget) - 6.876731e+07) / 6.578914e+07

    # The values are already scaled above; just convert to a (1, 5) numpy array.
    normalized = numerical_df.values

    # One-hot the rest: ratings, genre, release period, company size, in that order.
    # Ratings: fixed vector, there is no rating input in the UI.
    # NOTE(review): which rating the middle slot stands for is not visible here -- confirm.
    ratings = np.array([0, 1, 0]).reshape(1, 3)

    # genre
    genres = pd.DataFrame(np.zeros((1, 5)), columns = ['Action/Adventure', 'Comedy', 'Drama', 'Misc', 'Thriller'])
    genre_model = genre_dict[genre]
    genres[genre_model] = 1.0

    # release period
    rp = pd.DataFrame(np.zeros((1, 4)), columns = ['aug_to_oct', 'feb_to_apr', 'may_to_july', 'nov_to_jan'])
    release_period = month_dict[release_month]
    rp[release_period] = 1.0

    # company size
    cs = pd.DataFrame(np.zeros((1, 4)), columns = ['giant', 'large', 'medium', 'small'])
    cosize = co_size_dict[company_size]
    cs[cosize] = 1

    # Single feature row: 5 numeric + 3 rating + 5 genre + 4 period + 4 size = 21 columns.
    features = np.concatenate([normalized,
                               ratings,
                               genres.values,
                               rp.values,
                               cs.values],
                             axis=1)

    # Placeholder target handed to the dataset constructor alongside the features.
    targ = np.random.rand(1, 1)

    ### Feed to NN ###

    # NOTE(review): torch.load unpickles the file, so only load checkpoints from a
    # trusted source. The model is also re-read from disk on every callback.
    model = torch.load('neural_net.pth')

    td = nrl.imdb_Dataset(features, targ)
    dloader = DataLoader(td,
                         batch_size=1,
                         shuffle=False)

    output = nrl.predict(dloader, model=model).numpy()[0][0]
    # Undo the target standardisation.
    # NOTE(review): these statistics come from the genre-exploded `data` frame (one row
    # per movie-genre pair) -- confirm they match the ones used when training.
    std = data['gross_profit_adj'].std()
    mean = data['gross_profit_adj'].mean()
    output = (output * std) + mean

    return '${:,.2f}'.format(output)


# `__name__` is overwritten with 'main' near the top of this cell. The conventional
# '__main__' is accepted as well so the server still starts if that override is removed.
if __name__ in ('main', '__main__'):
    app.run_server()

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "main" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [22/Feb/2023 22:18:08] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [22/Feb/2023 22:18:08] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [22/Feb/2023 22:18:08] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [22/Feb/2023 22:18:08] "GET /_favicon.ico?v=2.8.1 HTTP/1.1" 200 -
127.0.0.1 - - [22/Feb/2023 22:18:08] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [22/Feb/2023 22:18:08] "GET /_dash-component-suites/dash/dcc/async-graph.js HTTP/1.1" 200 -
127.0.0.1 - - [22/Feb/2023 22:18:08] "GET /_dash-component-suites/dash/dcc/async-dropdown.js HTTP/1.1" 200 -
127.0.0.1 - - [22/Feb/2023 22:18:08] "GET /_dash-component-suites/dash/dcc/async-plotlyjs.js HTTP/1.1" 200 -
127.0.0.1 - - [22/Feb/2023 22:18:08] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [22/Feb/2023 22:18:08] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [22/Feb/2023 22:18:08] "POST /_dash-update-component HTTP/1.1" 