In [57]:
import pandas as pd
import numpy as np
import re

from src.utils.helpers import clean_column_values

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

import warnings
warnings.filterwarnings('ignore')

In [58]:
DATA_FOLDER = "data/"

movies = pd.read_csv(DATA_FOLDER + "v1_movies_cleaned.csv")
movies.head()

Unnamed: 0,title,languages,countries,genres,keywords,release_date,plot_summary,year_release_date,cold_war_side,character_western_bloc_representation,character_eastern_bloc_representation,western_bloc_values,eastern_bloc_values,theme
0,$,['English'],['United States of America'],"['Drama', 'Comedy', 'Action', 'Thriller', 'Hei...",,1971-12-17,"Set in Hamburg, West Germany, several criminal...",1971,"""Western""","['Joe Collins', 'American bank security consul...","['Sarge', 'corrupt U.S. Army sergeant', 'value...","['Cunning', 'heroism', 'cleverness', 'survival...","['Ruthlessness', 'violence', 'greed', 'betraya...","['Heist', 'crime', 'betrayal', 'survival', 'te..."
1,"$1,000 on the Black","['Deutsch', 'Italiano']","['Germany', 'Italy']",['Western'],,1966-12-18,Johnny Liston has just been released from pris...,1966,"""Western""","['Johnny Liston', 'justice', 'redemption', 'he...","['Sartana', 'tyranny', 'betrayal', 'antagonist']","['Justice', 'redemption', 'individualism', 'pe...","['Tyranny', 'fear', 'betrayal', 'oppression']","['Revenge', 'self-discovery', 'moral conflict'..."
2,"$10,000 Blood Money",,['Russia'],"['Drama', 'Western']",,1967-01-01,Hired by a Mexican landowner to rescue his dau...,1967,"""None""",['None'],['None'],['None'],['None'],"['Betrayal', 'Greed', 'Bounty Hunter', 'Heist']"
3,"$100,000 for Ringo",['Italiano'],['Italy'],"['Drama', 'Western']","['spaghetti western', 'whipping']",1965-11-18,A stranger rides into Rainbow Valley where he'...,1965,"""None""",['None'],['None'],['None'],['None'],"['Western', 'Frontier', 'Stranger', 'Rivalry',..."
4,'68,['English'],"['United States of America', 'Hungary']","['Drama', 'Coming of age', 'Family Drama', 'Pe...",,1988-01-01,The father escaped the Soviet invasion of Buda...,1988,"""None""",['None'],['None'],['None'],['None'],"['Gay rights', 'counterculture', 'family confl..."


In [59]:
languages_translation = {
    '广州话/廣州話':'Chinese',
    '广州话 / 廣州話':'Chinese',
    '日本語':'Japanese',
    'Japan':'Japanese',
    '普通话':'Chinese',
    '한국어/조선말':'Korean',
    'ภาษาไทย':'Thai',
    'हिन्दी':'Indian',
    'தமிழ்':'Indian',
    'TiếngViệt':'Vietnamese',
    'Tiếng Việt':'Vietnamese',
    'العربية':'Arabic',
    'اردو':'Indian',
    'българскиезик':'Bulgarian',
    'Pусский':'Russian',
    'беларускаямова':'Belarusian',
    'Український':'Ukrainian',
    'Srpski':'Serbian',
    'Slovenčina':'Slovak',
    'Français':'French',
    'France':'French',
    'Deutsch':'German',
    'Italiano':'Italian',
    'Español':'Spanish',
    'Polski':'Polish',
    'Standard Mandarin':'Chinese',
    'Mandarin Chinese':'Chinese',
    'Mandarin':'Chinese',
    'Português':'Portuguese',
    'Standard Cantonese':'Chinese',
    'Cantonese':'Chinese',
    'suomi':'Finnish',
    'Magyar':'Hungarian',
    'Bosanski':'Bosnian',
    'svenska':'Swedish',
    'ελληνικά':'Greek',
    'Český':'Czech',
    'Dansk':'Danish',
    'Dutch':'Nederlands',
    'עִבְרִית':'Hebrew',
    'American English':'English',
    'Türkçe':'Turkish',
    'Tagalog':'Filipino',
    'Khmer':'Cambodian',
    'Hindi':'Indian',
    'Tamil':'Indian',
    'Telugu':'Indian',
    'Urdu':'Indian',
    'Oriya':'Indian',
    'Eesti':'Estonian',
    'Română':'Romanian',
    'Romani':'Romanian',
    'Norsk':'Norwegian',
    'No':'Norwegian',
    'Íslenska':'Icelandic',
    'Bahasa indonesia':'Indonesian',
    'Català':'Spanish',
    'Inuktitut':'Inuit',
    'Hakka':'Chinese',
    'Sicilian':'Italian',
    'Marathi':'Indian',
    'Hrvatski':'Croatian',
    'shqip':'Albanian',
    'isiZulu':'Zulu', 
    'Latviešu':'Latvian',
    'ქართული':'Georgian',
    'Australian English':'English',
    'Bahasamelayu':'Malay',
    'Lietuvi\\x9akai':'Lithuanian',
}

EASTERN_COLOR = '#DD3C32'
WESTERN_COLOR = '#0F89E6'
NEUTRAL_COLOR = '#C2C7D6'


In [60]:
movies['languages'] = movies['languages'].apply(clean_column_values)
movies['languages'] = movies['languages'].apply(lambda x: 
    set([languages_translation.get(string, string) for string in x]) if isinstance(x, list) else x)

In [61]:
languages_exploded = movies.explode('languages')
languages_exploded = languages_exploded[languages_exploded['languages'] != '']
len(languages_exploded['languages'].unique())

159

In [62]:
# Plot top 20 languages using plotly using log scale on y-axis
languages_count = languages_exploded['languages'].value_counts().reset_index()
languages_count.columns = ['language', 'count']
languages_count = languages_count.sort_values(by='count', ascending=False)

fig = px.bar(languages_count.head(25), x='language', y='count', labels={'language':'Language', 'count':'Number of Movies'})
fig.update_yaxes(type="log")
fig.update_layout(title_text="Top 25 Languages in Movies", title_x=0.5, title_font_weight='bold')
fig.update_traces(marker_color='#2A3F5F')

fig.show()

In [63]:
movies['cold_war_side'] = \
  movies['cold_war_side'].apply(lambda x: re.sub(r'[, " ]+', '', x) if isinstance(x, str) else x)

western_movies = movies[movies['cold_war_side'] == 'Western']
print('Number of "Western side" movies', len(western_movies))
eastern_movies = movies[movies['cold_war_side'] == 'Eastern']
print('Number of "Eastern side" movies', len(eastern_movies))

Number of "Western side" movies 3738
Number of "Eastern side" movies 3263


In [64]:
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'pie'}, {'type': 'pie'}]], subplot_titles=("Including neutral movies", "Excluding neutral movies"))

# prepare data for the pie chart
global_movies_side_count = movies['cold_war_side'].value_counts().reset_index()
global_movies_side_count.columns = ['side', 'count']

cw_movies_side_count = movies[movies['cold_war_side'] != 'None']
cw_movies_side_count = cw_movies_side_count['cold_war_side'].value_counts().reset_index()
cw_movies_side_count.columns = ['side', 'count']

# Define colors for the pie chart slices
colors = {'None': NEUTRAL_COLOR, 'Western': WESTERN_COLOR, 'Eastern': EASTERN_COLOR}

fig.add_trace(go.Pie(labels=global_movies_side_count['side'],
                    values=global_movies_side_count['count'],
                    marker=dict(colors=[colors[side] for side in global_movies_side_count['side']]),
                    hovertemplate='<b>%{label}</b><br>Count: %{value}<br><extra></extra>',
                    name='Side'), row=1, col=1)

fig.add_trace(go.Pie(labels=cw_movies_side_count['side'],
                    values=cw_movies_side_count['count'],
                    marker=dict(colors=[colors[side] for side in cw_movies_side_count['side']]),
                    hovertemplate='<b>%{label}</b><br>Count: %{value}<br><extra></extra>',
                    name='Side'), row=1, col=2)

# Adjust the domain of the pie charts to add space between the charts and the titles
fig.update_traces(domain=dict(x=[0, 0.45], y=[0, 0.95]), row=1, col=1)
fig.update_traces(domain=dict(x=[0.55, 1], y=[0, 0.95]), row=1, col=2)

fig.update_layout(title_text="Movies Distribution based on Cold War Side", title_x=0.5, title_font_weight='bold')
fig.show()

In [65]:
western_languages = western_movies.explode('languages')
western_languages = western_languages[western_languages['languages'] != '']
western_languages_count = western_languages['languages'].value_counts().reset_index()
western_languages_count.columns = ['language', 'count']

eastern_languages = eastern_movies.explode('languages')
eastern_languages = eastern_languages[eastern_languages['languages'] != '']
eastern_languages = eastern_languages[eastern_languages['languages'] != '??????']
eastern_languages_count = eastern_languages['languages'].value_counts().reset_index()
eastern_languages_count.columns = ['language', 'count']

In [67]:
# print all unique languages for western and eastern movies
print('Western Movies Languages:', western_languages['languages'].unique())


Western Movies Languages: ['English' 'Italian' 'German' nan 'French' 'Spanish' 'Japanese' 'Russian'
 'Turkish' 'Portuguese' 'Vietnamese' 'Chinese' 'Polish' 'Latin'
 'Nederlands' 'Swedish' 'Czech' 'Hungarian' 'Arabic' 'Finnish' 'Hebrew'
 'Filipino language' 'Indian' 'Khmer language' 'Greek' 'Irish' 'Navajo'
 'Korean' 'Hungarian language' 'Yiddish' 'Persian' 'Telugu language'
 'Malayalam' 'Maya' 'Yucatán' 'Romani language' 'Romanian'
 'Chinese language' 'Norwegian' 'Swahili' 'Kiswahili' 'Estonian'
 'Scottish Gaelic language' 'Nepali' 'Sinhala' 'Icelandic' 'Indonesian'
 'Afrikaans' 'Serbian language' 'Malti' 'American Sign' 'Thai' 'Inuit'
 'Danish' 'Malay' 'Georgian' 'Tagalog language' 'Quechua'
 'Klingon language' 'Croatian' 'Albanian' 'Ukrainian' 'Swiss German'
 'Slovak' 'Serbian' 'Bosnian' 'Sioux language' 'Bamanankan' 'Zulu']


In [68]:
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'bar'}, {'type': 'bar'}]], subplot_titles=("in Western Bloc Movies", "in Eastern Bloc Movies"))
fig.add_trace(go.Bar(x=western_languages_count['language'].head(20), 
                    y=western_languages_count['count'],
                    hovertemplate='%{x}: %{y}<extra></extra>', 
                    marker_color='#0F89E6'), row=1, col=1)

fig.add_trace(go.Bar(x=eastern_languages_count['language'].head(20),
                    y=eastern_languages_count['count'],
                    hovertemplate='%{x}: %{y}<extra></extra>',
                    marker_color='#DD3C32'), row=1, col=2)

# add log scale to y-axis
fig.update_yaxes(type="log", row=1, col=1)
fig.update_yaxes(type="log", row=1, col=2)
fig.update_xaxes(tickangle=45)
# add x-axis and y-axis labels (set a common label for both subplots)

fig.update_xaxes(title_text="Language")
fig.update_yaxes(title_text="Number of Movies")


fig.update_layout(title_text="Top 20 Languages", title_x=0.5, title_font_weight='bold', showlegend=False)
# Update the layout for better visualization
fig.update_layout(
    title_text="Top 20 Languages",
    title_x=0.5,
    title_font_weight='bold', 
    showlegend=False,
    xaxis=dict(title_text='Language'),
    xaxis2=dict(title_text='Language'),
    yaxis=dict(title_text='Number of Movies'),
    yaxis2=dict(title_text='Number of Movies')
)

fig.show()

In [69]:
fig = px.bar(western_languages_count.head(20), x='language', y='count', labels={'language':'Language', 'count':'Number of Movies (log scale)'})
fig.update_layout(title_text="Top 20 Languages in Western Bloc Movies", title_x=0.5, title_font_weight='bold', width=800)
fig.update_yaxes(type="log")
fig.update_traces(marker_color='#0F89E6')
fig.show()

In [70]:
fig = px.bar(eastern_languages_count.head(20), x='language', y='count', labels={'language':'Language', 'count':'Number of Movies (log scale)'})
fig.update_layout(title_text="Top 20 Languages in Eastern Bloc Movies", title_x=0.5, title_font_weight='bold', width=800)
fig.update_yaxes(type="log")
fig.update_traces(marker_color='#DD3C32')
fig.show()

In [71]:
fig = go.Figure()
fig.add_trace(go.Bar(x=western_languages_count['language'].head(20), 
                    y=western_languages_count['count'], 
                    name='Western Bloc Movies',
                    hovertemplate='%{x}: %{y}<extra></extra>',
                    marker_color='#0F89E6'))

fig.add_trace(go.Bar(x=eastern_languages_count['language'].head(20),
                    y=eastern_languages_count['count'],
                    name='Eastern Bloc Movies',
                    hovertemplate='%{x}: %{y}<extra></extra>',
                    marker_color='#DD3C32'))

fig.update_layout(
    title_text="Top 20 Languages in Western and Eastern Bloc Movies", 
    title_x=0.5, 
    title_font_weight='bold',
    xaxis=dict(title_text='Language'),
    yaxis=dict(title_text='Number of Movies (log scale)'), 
    barmode='group',
    width=950)

fig.update_xaxes(tickangle=45)
fig.update_yaxes(type="log")
fig.show()