In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from plotly.subplots import make_subplots
from scipy.stats import pearsonr
from scipy.stats import spearmanr

In [8]:
# Read final_v2.csv (path is relative to the notebook's working directory)
# and preview the first five rows.
df = pd.read_csv("../data/final_v2.csv")
df.head()

Unnamed: 0,Name,Countries,Genres,Budget(USD)_Inflated,Domestic(USD)_Inflated,Domestic_Percentage,Foreign(USD)_Inflated,Foreign_Percentage,Worldwide(USD)_Inflated,Runtime(mins),...,animation,teen,film adaptation,musical,history,coming of age,sports,black comedy,war film,Worldwide_profit
0,10 cloverfield lane,united states of america,"drama, horror, sci-fi, thriller",6076746.0,87606020.0,0.65401,46346130.0,0.34599,133952100.0,103,...,0,0,0,0,0,0,0,0,0,22.0434
1,"10,000 bc",united states of america,"thriller, adventure, costume drama, action, ad...",144156300.0,130130900.0,0.351333,240260600.0,0.648667,370391400.0,109,...,0,0,0,0,0,0,0,0,0,2.569373
2,12 rounds,united states of america,"action thrillers, thriller, action, crime fiction",27262750.0,16677570.0,0.708013,6877890.0,0.291987,23555460.0,108,...,0,0,0,0,0,0,0,0,0,0.864016
3,12 strong,united states of america,"action, drama, history, war",39790500.0,52091130.0,0.644274,28761330.0,0.355726,80852460.0,130,...,0,0,0,0,1,0,0,0,0,2.031954
4,12 years a slave,united states of america,"biography, drama, history",25392600.0,71952470.0,0.301875,166399300.0,0.698125,238351700.0,134,...,0,0,0,0,1,0,0,0,0,9.38666


In [9]:
# Number of rows in the loaded dataset.
print(df.shape[0])

2689


In [10]:
# Rows per value of the Year column; value_counts orders the result by
# frequency, most common year first.
df['Year'].value_counts()

Year
2016    167
2015    160
2011    159
2013    149
2012    148
2018    145
2017    143
2014    141
2002    138
2006    135
2003    133
2004    130
2007    129
2010    127
2005    125
2019    118
2008    117
2001    113
2000    111
2009    101
Name: count, dtype: int64

In [11]:
# Per-year mean of Foreign_Percentage and Domestic_Percentage, limited to the
# years 2000-2019 (both ends inclusive) and given display-friendly names.
average_percentages = (
    df.groupby('Year')[['Foreign_Percentage', 'Domestic_Percentage']]
    .mean()
    .reset_index()
    .loc[lambda yearly: yearly['Year'].between(2000, 2019)]
    .rename(
        columns={
            'Foreign_Percentage': 'Foreign Percentage',
            'Domestic_Percentage': 'Domestic Percentage',
        }
    )
)


In [12]:
# Preview the first five yearly averages.
average_percentages.head()

Unnamed: 0,Year,Foreign Percentage,Domestic Percentage
0,2000,0.351877,0.648123
1,2001,0.33818,0.66182
2,2002,0.349131,0.650869
3,2003,0.387312,0.612688
4,2004,0.363138,0.636862


In [13]:

# Line chart of the yearly mean domestic and foreign percentages.
# Passing a list for `y` puts plotly express in wide-form mode, which is why
# the generic 'value'/'variable' keys in `labels` rename the y-axis and the
# legend title here.
fig = px.line(
    average_percentages,
    x='Year',
    y=['Domestic Percentage', 'Foreign Percentage'],
    title='Average Percentages Over Time',
    labels={'value': 'Percentage', 'variable': 'Percentages'},
)
# Centre the title while keeping the text that actually describes this chart
# (update_layout merges into the existing title, so the text is preserved).
fig.update_layout(title=dict(x=0.5, xanchor='center'))
fig.show()


In [14]:
# Export the figure currently bound to `fig` as an HTML file.
fig.write_html("../_includes/year_plot.html")

In [15]:
# Scatter of foreign percentage against worldwide gross, one point per row.
# `labels` must be keyed by column name: with a single y column plotly express
# is in long-form mode and ignores the generic 'value'/'variable' keys, so
# those would leave the axes showing the raw column names.
fig = px.scatter(
    df,
    x='Foreign_Percentage',
    y='Worldwide(USD)_Inflated',
    title='Foreign Percentage vs World Wide Gross (USD)',
    labels={
        'Foreign_Percentage': 'Foreign Percentage',
        'Worldwide(USD)_Inflated': 'World Wide Gross (USD)',
    },
)

fig.show()

In [16]:
# Scatter of foreign percentage against budget with an OLS trendline.
# `labels` is keyed by column name; the generic 'value'/'variable' keys are
# only honoured for wide-form input and would be ignored here.
fig = px.scatter(
    df,
    x='Budget(USD)_Inflated',
    y='Foreign_Percentage',
    title='Foreign Percentage vs Budget(USD)_Inflated',
    labels={
        'Budget(USD)_Inflated': 'Budget (USD)',
        'Foreign_Percentage': 'Foreign Percentage',
    },
    trendline="ols",
)

# Semi-transparent markers keep dense regions readable.
fig.update_traces(marker=dict(opacity=0.5))
# Trace 0 is the scatter; plotly express appends the trendline as trace 1.
fig.data[1].update(line=dict(color='black'))
fig.show()

In [None]:
# Pearson (linear) correlation between budget and foreign percentage.
pearson_inputs = (df['Budget(USD)_Inflated'], df['Foreign_Percentage'])
correlation, p_value = pearsonr(*pearson_inputs)

# Report the coefficient and its p-value, one per line.
print(
    f"Pearson Correlation Coefficient: {correlation:.2f}",
    f"P-value: {p_value:.10f}",
    sep="\n",
)

In [None]:
# Spearman (rank) correlation between budget and foreign percentage.
spearman_inputs = (df['Budget(USD)_Inflated'], df['Foreign_Percentage'])
correlation, p_value = spearmanr(*spearman_inputs)

# Report the coefficient and its p-value, one per line.
print(
    f"Spearman Correlation Coefficient: {correlation:.2f}",
    f"P-value: {p_value:.4f}",
    sep="\n",
)

In [26]:
# Switch two columns to space-separated display names. `df` is rebound, so
# every cell that runs after this one has to use the new names.
display_names = {
    'Budget(USD)_Inflated': 'Budget(USD) Inflated',
    'Foreign_Percentage': 'Foreign Percentage',
}
df = df.rename(columns=display_names)

In [28]:
# Natural log of the budget column, stored on `df` for the cells below.
# NOTE(review): a budget of 0 would become -inf here — confirm none exist.
df['Log Budget'] = np.log(df['Budget(USD) Inflated'])

# x is the log budget and y the foreign percentage, so the OLS trendline
# models foreign percentage as a function of log budget. The title states the
# relationship in that direction, matching the un-logged budget chart.
fig = px.scatter(
    df,
    x='Log Budget',
    y='Foreign Percentage',
    title='Impact of the Log of the Budget on Foreign Percentage',
    trendline="ols",
)

# Semi-transparent markers keep dense regions readable.
fig.update_traces(marker=dict(opacity=0.5))
# Centre the title; the text set above is kept because update_layout merges.
fig.update_layout(title=dict(x=0.5, xanchor='center'))
# Trace 0 is the scatter; plotly express appends the trendline as trace 1.
fig.data[1].update(line=dict(color='black'))
fig.show()

In [19]:
# Spearman (rank) correlation between log budget and foreign percentage.
# `df` was rebound with 'Foreign_Percentage' renamed to 'Foreign Percentage'
# earlier in the notebook, so the underscore name raises KeyError on a
# top-to-bottom run.
correlation, p_value = spearmanr(df['Log Budget'], df['Foreign Percentage'])

# Display the results
print(f"Spearman Correlation Coefficient: {correlation:.2f}")
print(f"P-value: {p_value:.4f}")

Spearman Correlation Coefficient: 0.35
P-value: 0.0000


In [20]:
# Pearson (linear) correlation between log budget and foreign percentage.
# `df` was rebound with 'Foreign_Percentage' renamed to 'Foreign Percentage'
# earlier in the notebook, so the underscore name raises KeyError on a
# top-to-bottom run.
correlation, p_value = pearsonr(df['Log Budget'], df['Foreign Percentage'])

# Display the results
print(f"Pearson Correlation Coefficient: {correlation:.2f}")
print(f"P-value: {p_value:.10f}")

Pearson Correlation Coefficient: 0.30
P-value: 0.0000000000


In [29]:
# Export the figure currently bound to `fig` as an HTML file.
fig.write_html("../_includes/log_budget.html")

In [35]:
def remove_outliers(df, column, multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of ``df[column]`` and
    ``IQR = Q3 - Q1``. Values exactly on a fence are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the numeric column the fences are computed from.
    multiplier : float, default 1.5
        How many IQRs beyond the quartiles a value may lie before it is
        treated as an outlier. The default reproduces the usual 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` inside the fences, with the original index.
        Rows whose ``column`` value is missing are dropped, because a missing
        value never compares as inside the interval.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    """
    values = df[column]
    q1 = values.quantile(0.25)  # first quartile
    q3 = values.quantile(0.75)  # third quartile
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    # `between` is inclusive on both ends by default.
    return df[values.between(lower_bound, upper_bound)]

In [36]:
# Drop budget outliers. `df` was rebound with this column renamed to
# 'Budget(USD) Inflated' earlier in the notebook, so the underscore name
# raises KeyError on a top-to-bottom run.
df_cleaned = remove_outliers(df, 'Budget(USD) Inflated')

In [37]:
# Scatter of foreign percentage against budget on the outlier-filtered frame,
# with an OLS trendline. The final title is passed directly instead of a
# placeholder that was immediately overwritten. The 'value'/'variable' label
# keys are omitted because plotly express ignores them for a single y column.
fig = px.scatter(
    df_cleaned,
    x='Budget(USD) Inflated',
    y='Foreign Percentage',
    title='Impact of Budget on Foreign Percentage',
    trendline="ols",
)

# Semi-transparent markers keep dense regions readable.
fig.update_traces(marker=dict(opacity=0.5))
# Centre the title; the text set above is kept because update_layout merges.
fig.update_layout(title=dict(x=0.5, xanchor='center'))
# Trace 0 is the scatter; plotly express appends the trendline as trace 1.
fig.data[1].update(line=dict(color='black'))
fig.show()

In [38]:
# Pearson (linear) correlation between budget and foreign percentage.
# `df` was rebound with both columns renamed earlier in the notebook, so the
# underscore names raise KeyError on a top-to-bottom run.
# NOTE(review): this still uses the full `df`, not the outlier-filtered
# `df_cleaned` plotted just above — confirm which frame is intended.
correlation, p_value = pearsonr(df['Budget(USD) Inflated'], df['Foreign Percentage'])

# Display the results
print(f"Pearson Correlation Coefficient: {correlation:.2f}")
print(f"P-value: {p_value:.10f}")

Pearson Correlation Coefficient: 0.33
P-value: 0.0000000000


In [39]:
# Export the figure currently bound to `fig` as an HTML file.
fig.write_html("../_includes/budget_foreign.html")

In [40]:
# Number of rows left after the outlier filter.
print(df_cleaned.shape[0])

2491


In [41]:
# Rows whose gross is more than 99% foreign. `df` was rebound with this column
# renamed to 'Foreign Percentage' earlier in the notebook, so the underscore
# name raises KeyError on a top-to-bottom run.
odd = df[df['Foreign Percentage'] > 0.99]
print(len(odd))

14


In [42]:
# Preview the first five of the rows selected into `odd`.
odd.head()

Unnamed: 0,Name,Countries,Genres,Budget(USD)_Inflated,Domestic(USD)_Inflated,Domestic_Percentage,Foreign(USD)_Inflated,Foreign_Percentage,Worldwide(USD)_Inflated,Runtime(mins),...,animation,teen,film adaptation,musical,history,coming of age,sports,black comedy,war film,Worldwide_profit
61,a little bit of heaven,united states of america,"romantic comedy, romantic drama, romance film,...",16644570.0,20472.821852,0.002268,9005396.0,0.997732,9025869.0,107,...,0,0,0,0,0,0,0,0,0,0.542271
238,barely lethal,united states of america,"action, comedy",18248980.0,7390.836177,0.006505,1128726.0,0.993495,1136117.0,96,...,0,0,0,0,0,0,0,0,0,0.062256
317,blonde ambition,united states of america,"romantic comedy, romance film, comedy",14024630.0,9006.615824,0.003945,2274075.0,0.996055,2283082.0,93,...,0,0,0,0,0,0,0,0,0,0.162791
1188,knock knock,united states of america,"crime, thriller",2433197.0,44206.32483,0.006527,6728723.0,0.993473,6772929.0,99,...,0,0,0,0,0,0,0,0,0,2.783552
1255,lol,united states of america,comedy,14396130.0,60201.997887,0.00439,13652120.0,0.99561,13712320.0,97,...,0,0,0,0,0,0,0,0,0,0.952501


In [22]:
# `rename` returns a new frame, so `df` itself is left untouched without an
# explicit deep copy. The 'Foreign_Percentage' entry is simply ignored when
# `df` already carries the space-separated name.
df_profitabilty = df.rename(
    columns={
        'Worldwide_profit': 'Worldwide profit',
        'Foreign_Percentage': 'Foreign Percentage',
    }
)
# NOTE(review): np.log gives -inf for a value of 0 and NaN for negative
# values — confirm 'Worldwide profit' is strictly positive.
df_profitabilty['Log Profit'] = np.log(df_profitabilty['Worldwide profit'])

# Scatter of log profit against foreign percentage with an OLS trendline.
# The final title is passed directly; the 'value'/'variable' label keys are
# omitted because plotly express ignores them for a single y column.
fig = px.scatter(
    df_profitabilty,
    x='Foreign Percentage',
    y='Log Profit',
    title='Impact of Foreign Percentage on Log Worldwide profit',
    trendline="ols",
)

# Semi-transparent markers keep dense regions readable.
fig.update_traces(marker=dict(opacity=0.5))
# Centre the title; the text set above is kept because update_layout merges.
fig.update_layout(title=dict(x=0.5, xanchor='center'))
# Trace 0 is the scatter; plotly express appends the trendline as trace 1.
fig.data[1].update(line=dict(color='black'))
fig.show()

In [24]:
# Replace the 0/1 values of Foreign_higher with readable class labels.
# `replace` already returns a new Series, so no copy of `df` is needed; the
# assignment aligns on the index of df_profitabilty.
df_profitabilty['Foreign_higher'] = df['Foreign_higher'].replace({
    0: "Domestic % > 50%",
    1: "Foreign % > 50%"
})

class_colors = {"Domestic % > 50%": "blue", "Foreign % > 50%": "red"}

# Only the traces of the two express figures below are reused; their own
# layouts are discarded when the traces are copied, so titles and axes are
# configured once on fig_combined instead of on fig_box / fig_bar.

# Box plot of log profit per class.
fig_box = px.box(
    df_profitabilty,
    x='Foreign_higher',
    y='Log Profit',
    color='Foreign_higher',
    color_discrete_map=class_colors,
)

# Blank the box trace names so they carry no label in the legend.
fig_box.for_each_trace(
    lambda t: t.update(name="")
)

# Count of rows per class.
fig_bar = px.histogram(
    df_profitabilty,
    x="Foreign_higher",
    color="Foreign_higher",
    color_discrete_map=class_colors,
)

# Create subplots: 2 rows, 1 column (vertical layout)
fig_combined = make_subplots(
    rows=2, cols=1,
    row_heights=[0.7, 0.3],  # More space for the box plot
    shared_xaxes=True,
    vertical_spacing=0.05,
    subplot_titles=["Class", ""],  # Empty title for second subplot
)

# Add all traces from the box plot to the first row
for trace in fig_box['data']:
    fig_combined.add_trace(trace, row=1, col=1)

# Add bar chart to the second row
for trace in fig_bar['data']:
    fig_combined.add_trace(trace, row=2, col=1)

# Update layout to center the title and maintain consistency in axis titles
fig_combined.update_layout(
    title={
        "text": "Distribution of Profitability and Movie Counts by Class",
        "x": 0.5,
        "xanchor": "center"
    },
    boxmode="group",  # Group the box plots side-by-side
    yaxis_title="Log Profitability",
    showlegend=True,
    xaxis=dict(
        type="category",  # Categorical x-axis
        categoryorder="array",  # Explicitly set order to control spacing
        categoryarray=["Domestic % > 50%", "Foreign % > 50%"],  # Define category sequence
    ),
    yaxis2=dict(
        title="Count",
        autorange="reversed",  # Ensure count axis is reversed for downward bars
    ),
    margin=dict(t=50, b=80),  # Adjust top and bottom margins for proper spacing
    # NOTE(review): this entry has no text of its own. update_layout appears to
    # merge it into the existing "Class" subplot-title annotation, which is
    # what moves that label below the plots — confirm before changing it.
    annotations=[
        dict(
            y=-0.15,  # Position below the x-axis (outside the plot area)
            xref="paper",  # Reference paper (not data)
            yref="paper",
            showarrow=False,
            font=dict(size=14),
            align="center"
        )
    ]
)

fig_combined.show()

In [25]:
# Export the combined profitability / count figure. `fig` still refers to the
# earlier log-profit scatter, so writing `fig` here would save the wrong chart
# under this file name.
fig_combined.write_html("../_includes/log_profitability_count.html")

In [46]:
# Number of rows in the profitability frame.
print(df_profitabilty.shape[0])

2521


In [60]:
# New frame (df is left untouched) whose 0/1 Foreign_higher values are
# replaced by readable class labels. `assign` and `replace` both return new
# objects, so no explicit deep copies are needed.
df_runtime = df.assign(
    Foreign_higher=df['Foreign_higher'].replace({
        0: "Domestic % > 50%",
        1: "Foreign % > 50%"
    })
)

# Box plot of runtime per class. The final title is passed directly; the
# 'value'/'variable' label keys are omitted because plotly express ignores
# them for a single y column.
fig_box = px.box(
    df_runtime,
    x='Foreign_higher',
    y='Runtime(mins)',
    color='Foreign_higher',
    title="Distribution of Runtime by Class",
    color_discrete_map={"Domestic % > 50%": "blue", "Foreign % > 50%": "red"},
)

fig_box.update_layout(
    title={
        "x": 0.5,  # Center the title
        "xanchor": "center",  # Anchor the title to the center
    },
    xaxis_title="Class",
    yaxis_title="Runtime",
    legend_title="Key",
)
fig_box.show()

In [61]:
# Export the runtime box plot held in `fig_box` as an HTML file.
fig_box.write_html("../_includes/runtime_plot.html")