In [2]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from src.utils.helpers import convert_csv
from src.constants import *

In [3]:
# Load the preprocessed movie table from the project's preprocessed-data folder.
movies_csv_path = f"{DATA_FOLDER_PREPROCESSED}preprocessed_movies.csv"
movies = pd.read_csv(movies_csv_path)

# Preview the first rows of the converted frame.
# NOTE(review): the value returned by convert_csv is only displayed, never
# assigned; later cells explode `movies['languages']`, so this presumably
# relies on convert_csv converting `movies` in place -- confirm in
# src/utils/helpers.py.
convert_csv(movies).head()

Unnamed: 0,title,languages,countries,genres,release_date,cold_war_side,character_western_bloc_representation,character_eastern_bloc_representation,western_bloc_values,eastern_bloc_values,theme
0,$,,[Russia],"[Comedy, Crime, Drama]",1971,Western,"[Joe Collins, American bank security consultan...","[Dawn Divine, hooker with a heart of gold, cun...",[None],"[Resourcefulness, cleverness, individualism, h...",[None]
1,"$1,000 on the Black","[Italian, German]","[Germany, Italy]",[Western],1966,Eastern,[None],"[Sartana, villainous, oppressive, cruel, arche...","[Johnny Liston, justice, determination, resili...","[Justice, revenge, oppressed vs. oppressor, re...","[Terror, betrayal, familial conflict, crime, r..."
2,"$10,000 Blood Money",,[Russia],"[Western, Drama]",1967,,[None],[None],[None],[None],"[crime, betrayal, revenge, bounty hunter, heis..."
3,"$100,000 for Ringo",[Italian],[Italy],"[Western, Drama]",1965,,[None],[None],[None],[None],"[Western, Civil War, mistaken identity, treasu..."
4,'Anna' i wampir,,[Russia],[Crime],1982,,[None],[None],[None],[None],"[murder mystery, horror, fog, Poland, 1960s]"


In [4]:
# Tally how often each value of the exploded `languages` column occurs,
# most frequent first.
languages_exploded = movies.explode('languages')
languages_count = (
    languages_exploded['languages']
    .value_counts()
    .rename_axis('language')
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# Bar chart of the 25 most frequent languages; the y axis is logarithmic so
# that rarely used languages stay visible next to the dominant ones.
top_languages = languages_count.head(25)
axis_labels = {'language':'Language', 'count':'Number of Movies'}
fig = px.bar(top_languages, x='language', y='count', labels=axis_labels)
fig.update_traces(marker_color='#2A3F5F')
fig.update_yaxes(type="log")
fig.update_layout(title_text="Top 25 Languages in Movies", title_x=0.5, title_font_weight='bold')

fig.show()

In [5]:
# Slice colour for each value of `cold_war_side`.
colors = {'None': NEUTRAL_COLOR, 'Western': WESTERN_COLOR, 'Eastern': EASTERN_COLOR}

# Number of movies per side over the whole dataset.
global_movies_side_count = (
    movies['cold_war_side']
    .value_counts()
    .rename_axis('side')
    .reset_index(name='count')
)

# Same tally after dropping the movies labelled 'None' (neutral).
cw_movies_side_count = (
    movies.loc[movies['cold_war_side'] != 'None', 'cold_war_side']
    .value_counts()
    .rename_axis('side')
    .reset_index(name='count')
)

fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type': 'pie'}, {'type': 'pie'}]],
    subplot_titles=("Including neutral movies", "Excluding neutral movies"),
)

# One pie per subplot column: (column, tally to draw, horizontal domain).
pie_panels = [
    (1, global_movies_side_count, [0, 0.45]),
    (2, cw_movies_side_count, [0.55, 1]),
]
for col, side_counts, x_domain in pie_panels:
    slice_colors = [colors[side] for side in side_counts['side']]
    fig.add_trace(
        go.Pie(
            labels=side_counts['side'],
            values=side_counts['count'],
            marker=dict(colors=slice_colors),
            hovertemplate='<b>%{label}</b><br>Count: %{value}<br><extra></extra>',
            name='Side',
        ),
        row=1,
        col=col,
    )
    # Shrink the pie's domain so there is space between the chart and its title.
    fig.update_traces(domain=dict(x=x_domain, y=[0, 0.95]), row=1, col=col)

fig.update_layout(title_text="Movies Distribution based on Cold War Side", title_x=0.5, title_font_weight='bold')
fig.show()

In [6]:
def _explode_languages(side_movies, unwanted=('',)):
    """Explode the `languages` column and drop rows equal to any `unwanted` value."""
    exploded = side_movies.explode('languages')
    for placeholder in unwanted:
        exploded = exploded[exploded['languages'] != placeholder]
    return exploded


def _tally_languages(exploded):
    """Return a two-column frame (`language`, `count`) of language frequencies."""
    tally = exploded['languages'].value_counts().reset_index()
    tally.columns = ['language', 'count']
    return tally


# Split the movies by the bloc they are labelled with.
western_movies = movies[movies['cold_war_side'] == 'Western']
eastern_movies = movies[movies['cold_war_side'] == 'Eastern']

# Western bloc: only the empty-string language is discarded.
western_languages = _explode_languages(western_movies)
western_languages_count = _tally_languages(western_languages)

# Eastern bloc: the '??????' value is discarded as well as the empty string.
eastern_languages = _explode_languages(eastern_movies, unwanted=('', '??????'))
eastern_languages_count = _tally_languages(eastern_languages)

In [7]:
# Side-by-side bar charts of the 20 most frequent languages in each bloc.
# Slice each tally once so the x and y arrays of a trace always have the same
# length (previously only x was truncated to 20 while y carried every row).
top_western_languages = western_languages_count.head(20)
top_eastern_languages = eastern_languages_count.head(20)

fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'bar'}, {'type': 'bar'}]], subplot_titles=("in Western Bloc Movies", "in Eastern Bloc Movies"))
fig.add_trace(go.Bar(x=top_western_languages['language'],
                    y=top_western_languages['count'],
                    hovertemplate='%{x}: %{y}<extra></extra>',
                    marker_color='#0F89E6'), row=1, col=1)

fig.add_trace(go.Bar(x=top_eastern_languages['language'],
                    y=top_eastern_languages['count'],
                    hovertemplate='%{x}: %{y}<extra></extra>',
                    marker_color='#DD3C32'), row=1, col=2)

# Without row/col these calls apply to the axes of both subplots:
# log-scaled counts, tilted tick labels, and a common title per axis.
fig.update_xaxes(title_text="Language", tickangle=45)
fig.update_yaxes(title_text="Number of Movies", type="log")

# Single layout call (the same settings used to be applied twice).
fig.update_layout(
    title_text="Top 20 Languages",
    title_x=0.5,
    title_font_weight='bold',
    showlegend=False,
)

fig.show()

In [8]:
# Western bloc: bar chart of the 20 most frequent languages, log-scaled y axis.
western_top20 = western_languages_count.head(20)
western_axis_labels = {'language':'Language', 'count':'Number of Movies (log scale)'}

fig = px.bar(western_top20, x='language', y='count', labels=western_axis_labels)
fig.update_traces(marker_color='#0F89E6')
fig.update_yaxes(type="log")
fig.update_layout(
    title_text="Top 20 Languages in Western Bloc Movies",
    title_x=0.5,
    title_font_weight='bold',
    width=800,
)
fig.show()

In [9]:
# Eastern bloc: bar chart of the 20 most frequent languages, log-scaled y axis.
eastern_top20 = eastern_languages_count.head(20)
eastern_axis_labels = {'language':'Language', 'count':'Number of Movies (log scale)'}

fig = px.bar(eastern_top20, x='language', y='count', labels=eastern_axis_labels)
fig.update_traces(marker_color='#DD3C32')
fig.update_yaxes(type="log")
fig.update_layout(
    title_text="Top 20 Languages in Eastern Bloc Movies",
    title_x=0.5,
    title_font_weight='bold',
    width=800,
)
fig.show()

In [10]:
# Grouped bar chart comparing the 20 most frequent languages of each bloc.
# Slice each tally once so the x and y arrays of a trace always have the same
# length (previously only x was truncated to 20 while y carried every row).
top_western_languages = western_languages_count.head(20)
top_eastern_languages = eastern_languages_count.head(20)

fig = go.Figure()
fig.add_trace(go.Bar(x=top_western_languages['language'],
                    y=top_western_languages['count'],
                    name='Western Bloc Movies',
                    hovertemplate='%{x}: %{y}<extra></extra>',
                    marker_color='#0F89E6'))

fig.add_trace(go.Bar(x=top_eastern_languages['language'],
                    y=top_eastern_languages['count'],
                    name='Eastern Bloc Movies',
                    hovertemplate='%{x}: %{y}<extra></extra>',
                    marker_color='#DD3C32'))

# barmode='group' draws the two blocs' bars next to each other for languages
# that appear in both top-20 lists.
fig.update_layout(
    title_text="Top 20 Languages in Western and Eastern Bloc Movies",
    title_x=0.5,
    title_font_weight='bold',
    xaxis=dict(title_text='Language'),
    yaxis=dict(title_text='Number of Movies (log scale)'),
    barmode='group',
    width=950)

fig.update_xaxes(tickangle=45)
fig.update_yaxes(type="log")
fig.show()