#  Data Analysis: Microsoft Movie Studios Viability Analysis                                      
***
- Student Name: Tenicka Norwood
- Program Pace: Self-paced
- Scheduled Project Review time: 10/26/2022 
- Instructor name: Joe Comeaux
- Blog post URL: 

# Exploratory Data Analysis Q1

## When is the best time of year to release a movie?

In [None]:
## Investigate the relationship between release_month and worldwide profit for the top 100 grossing films
# Set up theme
plt.style.use('fivethirtyeight')

# Bars at or above this height are highlighted in blue.
# Heights are in units of $10M (see the rescaling below), so 5 means $50M.
MONTH_HIGHLIGHT_MIN_PROFIT = 5

# Create plot variables.
# NOTE: despite their names, `values` holds the x-axis categories (month names)
# and `labels` holds the bar heights (worldwide profit rescaled to units of $10M).
values = np.array(budgets_by_month_df.month_name)
labels = np.array(budgets_by_month_df.worldwide_profit)/10000000

# Set up plot figure size
plt.figure(figsize =(14,10))
# No palette is passed here: every bar is coloured explicitly by the loop at the
# end of the cell, which is the single source of truth for the bar colours.
ax = sns.barplot(x = values, y = labels)
ax.set_title("Figure 6.1: Worldwide Profit vs Release Month for the Top 100 Grossing films", weight = 'bold').set_fontsize('16')
ax.set_xlabel("Release Month", fontsize = '14', weight = 'bold')
ax.set_ylabel("World Wide Profit ($10 M)", fontsize = '14', weight = 'bold');

# Grey out the months below the threshold and highlight the rest in blue
for bar in ax.patches:
    bar.set_color('grey' if bar.get_height() < MONTH_HIGHLIGHT_MIN_PROFIT else 'blue')

In [None]:
# Print out the name of the column and the Pearson correlation of
# the column most positively correlated with ROI
correlation_values = filtered_movie_ROI_df.corr()["ROI"].sort_values()

# ROI always correlates perfectly with itself, so leave it out before
# looking for the strongest positive correlation.
other_correlations = correlation_values.drop("ROI")
top_correlated_column = other_correlations.idxmax()

# Report what the data actually says instead of a hard-coded conclusion
print("The column with the most positive correlation with ROI is {} (Pearson r = {:.3f})".format(
    top_correlated_column, other_correlations[top_correlated_column]))

In [None]:
# Compute the pairwise correlations of filtered_movie_ROI_df and display the
# resulting matrix with each cell shaded on a blue gradient
roi_correlation_matrix = filtered_movie_ROI_df.corr()
roi_correlation_matrix.style.background_gradient(cmap="Blues")

In [None]:
# Lower limit of the rough budget filter: keep only the films whose
# production budget is at least $175M, then summarise the remaining rows
meets_min_budget = filtered_movie_ROI_df["production_budget"] >= 175000000
new_filtered_movie_ROI_df = filtered_movie_ROI_df.loc[meets_min_budget]
new_filtered_movie_ROI_df.describe()

In [None]:
# Look at the filtered_movie_ROI dataframe: print its column names, dtypes
# and non-null counts
filtered_movie_ROI_df.info()

In [None]:
# Rough filter of prior correlation investigating dataframe: keep the films whose
# production budget lies between $175M and $250M (both limits inclusive).
# The result is built directly from filtered_movie_ROI_df instead of from the
# previous value of new_filtered_movie_ROI_df, so this cell yields the same rows
# no matter how often, or in which order, the filter cells are executed.
budget_in_range = filtered_movie_ROI_df["production_budget"].between(175000000, 250000000)
new_filtered_movie_ROI_df = filtered_movie_ROI_df[budget_in_range]
new_filtered_movie_ROI_df.info()

In [None]:
# Preview the first rows of the budget-filtered dataframe
new_filtered_movie_ROI_df.head()

In [None]:
# Scatter plot of worldwide gross revenue against production budget for (at most)
# the first 100 films of the budget-filtered dataframe.
# Set up theme
sns.set_theme(style="darkgrid", palette="Set2")

# Take x and y from the same slice so both axes always have the same number of
# points, even when the filtered dataframe holds more than 100 rows.
scatter_data = new_filtered_movie_ROI_df.head(100)


def dollars_to_millions(amount, pos):
    """Format a raw dollar tick value as a number of millions of dollars.

    `pos` is the tick position passed by matplotlib's FuncFormatter; it is unused.
    """
    return '{:.4g}'.format(amount/1000000)


# Set up plot
fig, ax = plt.subplots(figsize = (15, 10))
ax.scatter(x = "production_budget", y = "worldwide_gross", data = scatter_data, alpha=0.7, color="blue")
ax.set_title("Figure 6.2: Worldwide Gross Revenue vs Production Budget", weight = 'bold').set_fontsize('16')
ax.set_xlabel("Production Budget: (Millions of Dollars)", fontsize = 14, weight = 'bold')
ax.set_ylabel("Worldwide Gross Revenue: (Millions of Dollars)", fontsize = 14, weight = 'bold');

# The plotted data is in raw dollars; show the ticks in millions so they agree
# with the axis labels. Each axis gets its own formatter instance.
ax.xaxis.set_major_formatter(ticker.FuncFormatter(dollars_to_millions))
ax.yaxis.set_major_formatter(ticker.FuncFormatter(dollars_to_millions))

In [None]:
# Plot the relationship between production budget and ROI for films with production budgets between $175 and $250 Million.

# Create plot variables.
# NOTE: despite their names, `values` holds the y data (ROI) and `labels` holds
# the x categories (production budget converted to millions of dollars).
data = new_filtered_movie_ROI_df.head(100)
values = np.array(data.ROI)
labels = np.array(data.production_budget)/1000000

# One box is drawn per distinct budget, so the palette needs one colour per
# budget level (not one per film). Highlight in blue the budget level of the
# film with the highest ROI and draw every other level in grey.
budget_levels = np.unique(labels)  # sorted distinct budgets
top_roi_budget = labels[np.nanargmax(values)]
clrs = ['blue' if level == top_roi_budget else 'grey' for level in budget_levels]

# Set up plot figure size
plt.figure(figsize =(14,10))
# `order` pins the box order to budget_levels so that it lines up with `clrs`
ax = sns.boxplot(x = labels, y = values, order = list(budget_levels), palette = clrs)
ax.set_title("Figure 6.3: ROI vs Production Budget", weight = 'bold').set_fontsize('16')
ax.set_xlabel("Production Budget: (Millions of Dollars)", fontsize = '14', weight = 'bold')
ax.set_ylabel("ROI (% return on production budget)", fontsize = '14', weight = 'bold');

# Exploratory Data Analysis Q2

## Investigating the relationship between directors and Worldwide Profit

In [None]:
# Inspect tn_and_imdb_full, then copy out only the columns needed to relate
# directors to worldwide profit
tn_and_imdb_full.info()
director_profit_columns = [
    "director",
    "primary_title",
    "production_budget",
    "worldwide_profit",
    "ROI",
    "release_month",
    "start_year",
]
worldwide_profit_directors_df = tn_and_imdb_full.loc[:, director_profit_columns]


In [None]:
# Order the films from highest to lowest worldwide profit, drop rows that are
# exact duplicates, and keep the first 100 rows of that ranking
sorted_ww_profit_directors = (
    worldwide_profit_directors_df
    .sort_values(by="worldwide_profit", ascending=False)
    .drop_duplicates()
)
top_100_movies_by_wwprofit = sorted_ww_profit_directors.head(n=100)

In [None]:
# Bar chart of the directors of the ten most profitable films.
# Create plot variables.
# NOTE: despite their names, `values` holds the y-axis categories (director names)
# and `labels` holds the bar widths (worldwide profit in raw dollars).
values = np.array(top_100_movies_by_wwprofit.head(10).director)
labels = np.array(top_100_movies_by_wwprofit.head(10).worldwide_profit)
sns.set(font_scale = 1.75)

# Bars at or above this width are highlighted in blue (raw dollars, i.e. $1.15B)
DIRECTOR_HIGHLIGHT_MIN_PROFIT = 1150000000

# Set up plot figure size
plt.figure(figsize =(16,15))
# No palette is passed here: every bar is coloured explicitly by the loop at the
# end of the cell, which is the single source of truth for the bar colours.
ax = sns.barplot(y = values, x = labels)
ax.set_title("Figure 6.3: Top 10 Directors by Worldwide Profit", weight = 'bold').set_fontsize('30')
ax.set_ylabel("Director Name", fontsize = '25', weight = 'bold')
ax.set_xlabel("Worldwide Profit ($1M)", fontsize = '25', weight = 'bold');
# The bars are in raw dollars; divide by one million so the tick labels agree
# with the "($1M)" unit stated in the axis label.
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos:'{:.4g}'.format(x/1000000)))

# Grey out the bars below the threshold and highlight the rest in blue
for bar in ax.patches:
    bar.set_color('grey' if bar.get_width() < DIRECTOR_HIGHLIGHT_MIN_PROFIT else 'blue')

# Exploratory Data Analysis Q3

## Which genres of movies make the most profit at the box office?

In [None]:
# Investigating the relationship between Genre and Worldwide Profit
# Copy the genre, title, budget, profit, ROI and release-month columns of
# tn_and_imdb into a narrower dataframe and preview its first 20 rows
genre_budget_columns = [
    "multi_genre",
    "primary_title",
    "production_budget",
    "worldwide_profit",
    "ROI",
    "release_month",
]
genre_budget_df = tn_and_imdb.loc[:, genre_budget_columns]
genre_budget_df.head(n=20)


In [None]:
# Visualize correlations
#plt.figure(figsize =(14,10))
#sns.pairplot(tn_and_imdb)

In [None]:
# Draw the correlation matrix of the filtered dataset as an annotated heatmap
filtered_correlations = filtered_movie_ROI_df.corr()
plt.figure(figsize=(14, 10))
ax = sns.heatmap(
    filtered_correlations,
    annot=True,
    cmap='Blues',
)

In [None]:
# Expand the multi_genre column so that each genre of a film gets its own row,
# then preview the first 20 rows
exploded_genre_budget_df = genre_budget_df.explode(column="multi_genre")
exploded_genre_budget_df.head(n=20)

In [None]:
# Median production budget for each genre
budget_by_genre = exploded_genre_budget_df.groupby("multi_genre")["production_budget"]
grouped_genre_budget = budget_by_genre.median()
grouped_genre_budget.head(n=20)

In [None]:
# Turn the per-genre series into a two-column dataframe: the genre index becomes
# a regular column and the medians are stored under "production_budget"
grouped_genre_budget_df = grouped_genre_budget.reset_index(name="production_budget")
grouped_genre_budget_df.head(n=20)

In [None]:
# Display the genres ordered from the highest to the lowest median production
# budget (the sorted copy is only shown, grouped_genre_budget_df is not modified)
grouped_genre_budget_df.sort_values(by="production_budget", ascending=False)

In [None]:
# Count the distinct values in each column of the exploded dataframe
exploded_genre_budget_df.nunique()

In [None]:
# Show the rows with the highest worldwide profit first.
# sort_values returns a new dataframe rather than sorting in place, so the
# sorted result is displayed directly; exploded_genre_budget_df is left unchanged.
exploded_genre_budget_df.sort_values("worldwide_profit", ascending = False).head()

In [None]:
# Median worldwide profit for each genre
profit_by_genre = exploded_genre_budget_df.groupby("multi_genre")["worldwide_profit"]
grouped_genre_ww_profit = profit_by_genre.median()

In [None]:
# Turn the per-genre series into a two-column dataframe: the genre index becomes
# a regular column and the medians are stored under "worldwide_profit"
grouped_genre_ww_profit_df = grouped_genre_ww_profit.reset_index(name="worldwide_profit")
grouped_genre_ww_profit_df.head(n=5)

In [None]:
# Look at the dataframe shape as a (rows, columns) tuple
grouped_genre_ww_profit_df.shape

In [None]:
# Print the column names, dtypes and non-null counts of the per-genre profit dataframe
grouped_genre_ww_profit_df.info()

In [None]:
# Horizontal bar chart of the median worldwide profit of each genre.
# Create plot variables.
# NOTE: despite their names, `values` holds the y-axis categories (genres) and
# `labels` holds the bar widths (median worldwide profit in millions of dollars).
values = np.array(grouped_genre_ww_profit_df.multi_genre)
labels = np.array(grouped_genre_ww_profit_df.worldwide_profit)/1000000

# Bars at or above this width are highlighted in blue.
# Widths are in units of $1M (see the rescaling above), so 45 means $45M.
GENRE_HIGHLIGHT_MIN_PROFIT = 45

# Set up plot figure size
plt.figure(figsize =(14,10))
# No palette is passed here: every bar is coloured explicitly by the loop at the
# end of the cell, which is the single source of truth for the bar colours.
ax = sns.barplot(y = values, x = labels)
ax.set_title("Figure 6.3: Median Worldwide Profit by Genre", weight = 'bold').set_fontsize('16')
ax.set_ylabel("Genre", fontsize = '14', weight = 'bold')
ax.set_xlabel("Median World Wide Profit ($1M)", fontsize = '14', weight = 'bold');

# Grey out the genres below the threshold and highlight the rest in blue
for bar in ax.patches:
    bar.set_color('grey' if bar.get_width() < GENRE_HIGHLIGHT_MIN_PROFIT else 'blue')

In [None]:
# Median worldwide profit for every (release month, genre) pair, pivoted so
# that the genres are the rows and the release months are the columns
median_profit_by_month_and_genre = (
    exploded_genre_budget_df
    .groupby(["release_month", "multi_genre"])["worldwide_profit"]
    .median()
)
month_grouped_genre_median_ww_profit = median_profit_by_month_and_genre.unstack().T

In [None]:
# Preview the first rows of the genre-by-release-month table of median worldwide profit
month_grouped_genre_median_ww_profit.head()

In [None]:

# Mark in cyan the largest value of every column (axis=0), i.e. the genre with
# the highest median worldwide profit for each release month
median_profit_styler = month_grouped_genre_median_ww_profit.style
median_profit_styler.highlight_max(axis=0, color="cyan")


In [None]:

# Median worldwide profit per genre for the column keyed 1
# (presumably release month 1, i.e. January - confirm against release_month)
month_grouped_genre_median_ww_profit[1]

In [None]:
#order = month_grouped_genre_median_ww_profit
#g = sns.FacetGrid(month_grouped_genre_median_ww_profit, col = "month_name")
#g.map(sns.barplot,"worldwide_profit", "multi_genre")

In [None]:
# Horizontal bar chart of the median worldwide profit of each genre for the
# release month keyed 1 (January) in month_grouped_genre_median_ww_profit.
genres = month_grouped_genre_median_ww_profit.index
# Convert raw dollars to millions so the bar widths match the x-axis label
january = month_grouped_genre_median_ww_profit[1]/1000000

# Bars at or above this width are highlighted in blue.
# Widths are in units of $1M (see the rescaling above), so 60 means $60M.
JANUARY_HIGHLIGHT_MIN_PROFIT = 60

# Set up plot figure size
plt.figure(figsize =(14,10))
# No palette is passed here: every bar is coloured explicitly by the loop at the
# end of the cell, so the plot does not rely on variables left over from earlier cells.
ax = sns.barplot(x = january, y = genres)
ax.set_title("Figure 6.3: Median WorldWide Profit By Genre (January Releases)", weight = 'bold').set_fontsize('12')
ax.set_ylabel("Genre", fontsize = '12', weight = 'bold')
ax.set_xlabel("Median World Wide Profit (Millions of Dollars)", fontsize = '12', weight = 'bold');

# Grey out the genres below the threshold and highlight the rest in blue
for bar in ax.patches:
    bar.set_color('grey' if bar.get_width() < JANUARY_HIGHLIGHT_MIN_PROFIT else 'blue')
        