In [298]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from ast import literal_eval 
import plotly.express as px
import numpy as np

import spacy
# Loads the full en_core_web_sm pipeline. `nlp` is rebound in the tokenisation
# cell further down (with the parser and NER disabled), and no visible cell
# uses it in between.
# NOTE(review): this first load looks redundant — confirm before removing.
nlp = spacy.load("en_core_web_sm")


In [299]:
# Load the merged movie table and keep only rows that have a plot summary.
# A forward slash keeps the path valid on every OS and avoids the invalid
# "\m" escape sequence that the backslash form contained.
df = pd.read_csv('data/merged_file.csv', low_memory=False).dropna(subset=['PlotSummaries'])

# Working frame for the text analysis: release year + summary, films from 1910 on.
# Rows with a missing release_date fail the comparison, so they are dropped here too.
# .copy() makes df_sum independent of df, so adding columns to it later does not
# raise SettingWithCopyWarning.
df_sum = df.loc[df['release_date'] >= 1910, ['release_date', 'PlotSummaries']].copy()
display(df_sum)

Unnamed: 0,release_date,PlotSummaries
0,2007.0,"Three friends -– Arlene , Shanna , and radio D..."
1,2007.0,{{plot}} The movie opens with the first traile...
5,2007.0,Om Prakash Makhija is a junior artiste in the ...
6,2007.0,Shot in both Latin America and the United Stat...
9,2008.0,The film is presented as found footage from a...
...,...,...
88012,2010.0,"The film is about two friends, Tayyar , a mafi..."
88013,1941.0,{{plot}} The film opens with a Great Western e...
88015,2011.0,Two former National Oceanic Atmospheric Admini...
88017,1992.0,"The story takes place in the year 2092,The Sup..."


In [194]:
# Token filtering below only reads is_punct, pos_ and lemma_, so the
# dependency parser and the entity recogniser are switched off to save time.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)

# Grab spaCy's English stop words once so the per-token check is a plain lookup.
stop_words = nlp.Defaults.stop_words

def is_token_allowed(token):
    """Tell whether a token should be kept for keyword counting.

    A token is kept when it is not punctuation, is tagged as an adjective,
    proper noun or noun, and its lower-cased text is not a stop word.
    """
    if token.is_punct:
        return False
    if token.pos_ not in ("ADJ", "PROPN", "NOUN"):
        return False
    return token.text.lower() not in stop_words

def preprocess_token(token):
    """Return the token's lemma with surrounding whitespace removed, in lower case."""
    lemma = token.lemma_
    trimmed = lemma.strip()
    return trimmed.lower()

def process_text(summary):
    """Turn one plot summary into a list of cleaned lemma strings.

    The text is run through the module-level ``nlp`` pipeline; every token
    accepted by ``is_token_allowed`` contributes its ``preprocess_token``
    result, in document order.
    """
    kept_lemmas = []
    for token in nlp(summary):
        if is_token_allowed(token):
            kept_lemmas.append(preprocess_token(token))
    return kept_lemmas

# Tokenise every plot summary (slow: the original run was noted as ~7 mins).
df_sum['tokens'] = df_sum['PlotSummaries'].apply(process_text)



In [300]:
# Inspect the working frame. The 'tokens' column added by the tokenisation cell
# shows up here only if that cell was run after df_sum was (re)built.
display(df_sum)

Unnamed: 0,release_date,PlotSummaries
0,2007.0,"Three friends -– Arlene , Shanna , and radio D..."
1,2007.0,{{plot}} The movie opens with the first traile...
5,2007.0,Om Prakash Makhija is a junior artiste in the ...
6,2007.0,Shot in both Latin America and the United Stat...
9,2008.0,The film is presented as found footage from a...
...,...,...
88012,2010.0,"The film is about two friends, Tayyar , a mafi..."
88013,1941.0,{{plot}} The film opens with a Great Western e...
88015,2011.0,Two former National Oceanic Atmospheric Admini...
88017,1992.0,"The story takes place in the year 2092,The Sup..."


In [301]:
def plot_occurrences(df, eth, ethnicities, period=5, highlight_periods=None):
    """Plot how often group-related keywords occur in plot summaries over time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``release_date`` column and a ``tokens`` column whose
        cells are iterables of word strings (as produced by ``process_text``).
    eth : pandas.DataFrame
        Lookup table with an ``Ethnicity`` column and a ``Keywords`` column.
        A keyword cell may be a list of words or the string repr of such a list.
    ethnicities : list of str
        Groups to plot; each must occur in ``eth['Ethnicity']``.
    period : int, default 5
        Bin width: release dates are floored to a multiple of ``period``.
    highlight_periods : iterable of (x0, x1, label), optional
        Spans to shade on the x axis, each annotated with ``label``.

    Raises
    ------
    KeyError
        If ``df`` has no ``tokens`` column.
    ValueError
        If one of ``ethnicities`` is absent from ``eth['Ethnicity']``.

    The figure is shown with ``fig.show()``; nothing is returned.
    """
    if 'tokens' not in df.columns:
        raise KeyError(
            "'tokens' column not found: run the tokenisation cell "
            "(df_sum['tokens'] = ...) on this DataFrame before plotting"
        )

    # Resolve every keyword list up front so a bad group name fails fast,
    # before any counting work is done.
    keyword_sets = {}
    for et in ethnicities:
        matches = eth.loc[eth['Ethnicity'] == et, 'Keywords']
        if matches.empty:
            raise ValueError(f"Unknown ethnicity {et!r}: not present in eth['Ethnicity']")
        raw_keywords = matches.iloc[0]
        # The lookup table stores keywords as the string repr of a list; accept a real list too.
        keywords = literal_eval(raw_keywords) if isinstance(raw_keywords, str) else raw_keywords
        # A set makes the per-token membership test constant time.
        keyword_sets[et] = set(keywords)

    # Only these two columns are needed; this avoids copying the summary text.
    data = df[['release_date', 'tokens']].copy()

    # One count column per group: number of tokens in the row that are keywords.
    for et, keywords in keyword_sets.items():
        data[et] = data['tokens'].apply(
            lambda words, kw=keywords: sum(word.lower() in kw for word in words)
        )

    # Floor each release date to the start of its period and sum the counts per period.
    data['release_period'] = data['release_date'] // period * period
    counts_by_period = data.groupby('release_period')[ethnicities].sum().reset_index()

    # Convert the DataFrame to long format for Plotly
    counts_by_period_long = pd.melt(counts_by_period, id_vars='release_period', value_vars=ethnicities)

    # Stacked area chart, one layer per group, in the order given by `ethnicities`.
    fig = px.area(counts_by_period_long, x='release_period', y='value', color='variable',
                  title='Number of Group-related Words Over the Years ',
                  labels={'value': 'Number of Group-related Words', 'variable': 'Ethnicity'},
                  category_orders={'variable': ethnicities})

    if highlight_periods:
        for period_range in highlight_periods:
            # Each entry is (x0, x1, label): shade the span and annotate it.
            fig.add_vrect(
                x0=period_range[0],
                x1=period_range[1],
                annotation_text=period_range[2],
                annotation_position="top left",
                fillcolor='rgba(255, 0, 0, 0.3)',
                layer='below',
                line=None)

    fig.show()

We track the representation of different ethnicities or demographic groups through time by identifying ethnicity-related words in the movie summaries. We decided to include locations as well, since in the case of nationality, location plays a great role in the representation of a group. This method of searching for representation is partially flawed because not all summaries specify the location or ethnicity of their characters, which greatly reduces our sample size. Moreover, the way the words are counted favors historical films, which may have many more nationality-related words in their summaries, even when other films take place in historically relevant places but are not historical.

In [302]:
# Keywords searched for in the summary tokens, keyed by group name.
# A mapping keeps each name next to its keywords, so the two cannot drift out
# of alignment the way two parallel lists can.
# NOTE(review): summaries are tokenised into single words before matching, so
# multi-word entries such as "mexico city" or "chinese people" can never match.
# NOTE(review): "English" has no 'english'/'england' and "American" has no
# 'america'/'usa' — confirm whether that is intended.
NATIONALITY_KEYWORDS = {
    "French": ["french", "france", "paris", "parisian", "parisians", "frenchman", "frenchmen", "frenchwoman", "frenchwomen"],
    "German": ["german", "germany", "berlin", "bavarian", "berliners", "frankfurt", "germans"],
    "Italian": ["italian", "italy", "rome", "venetian", "italians", "romans"],
    "Spanish": ["spanish", "spain", "madrid", "barcelona", "spaniards", "madrilenian", "barcelonians"],
    "Chinese": ["chinese", "china", "beijing", "shanghai", "chinese person", "chinese people"],
    "Japanese": ["japanese", "japan", "tokyo", "osakan", "japanese person", "japanese people"],
    "Indian": ["indian", "india", "delhi", "mumbai", "indians", "delhites", "mumbaikars"],
    "Russian": ["russian", "russia", "moscow", "stpetersburg", "russians", "moscovites", "petersburgers"],
    "Mexican": ["mexican", "mexico", "mexico city", "mexicans", "mexico city dwellers"],
    "Brazilian": ["brazilian", "brazil", "rio", "saopaulo", "brazilians", "cariocas", "paulistanos"],
    "Canadian": ["canadian", "canada", "toronto", "vancouver", "canadians", "torontonians", "vancouverites"],
    "Australian": ["australian", "australia", "sydney", "melbourne", "australians", "sydneysiders", "melburnians"],
    "African": ["african", "africa", "nigerian", "kenyan", "africans", "nigerians", "kenyans"],
    "Middle Eastern": ["middleeastern", "middleeast", "arab", "israeli", "middle eastern", "arabs", "israelis"],
    "Scandinavian": ["scandinavian", "scandinavia", "swedish", "norwegian", "scandinavians", "swedes", "norwegians"],
    "American": ["american", "york", "yorker", "angeles", "chicago", "washington", "philadelphia"],
    "English": ["british", "uk", "britain", "london"],
    "Vietnamese": ["viet", "vietnam", "vietcong"],
    "Afghan": ["afghanistan", "afghan"],
    "Ukrainian": ["ukrainian", "ukraine", "kiev"],
    "Thai": ["thai", "thailand"],
    "Malaysian": ["malaysian", "malaysia"],
}

# Non-national groups (ideology, religion, sexuality, gender, age).
GROUP_KEYWORDS = {
    "Nazi": ["nazi", "nazis"],
    "Jewish": ["jewish", "jews", "jew", "ashkenazy"],
    "Islamic": ["islamic", "muslim", "islam", "muslims"],
    "LGBTQ": ["pansexual", "homo", "homosexual", "queer", "gay", "lesbian", "bisexual", "transgender", "lgbtq"],
    "Man": ["man", "men"],
    "Woman": ["woman", "women"],
    "Boy": ["boy"],
    "Girl": ["girl"],
    "Elderly": ["old", "elderly", "grandma", "grandpa"],
}


def _keyword_frame(keywords_by_group):
    """Build a lookup table with one row per group.

    The result has an ``Ethnicity`` column (group name) and a ``Keywords``
    column holding the string repr of the keyword list, which is the form
    ``plot_occurrences`` parses with ``literal_eval``.
    """
    frame = pd.DataFrame({
        "Ethnicity": list(keywords_by_group),
        "Keywords": list(keywords_by_group.values()),
    })
    frame["Keywords"] = frame["Keywords"].apply(str)
    frame["Ethnicity"] = frame["Ethnicity"].apply(str)
    return frame


eth = _keyword_frame(NATIONALITY_KEYWORDS)
eth2 = _keyword_frame(GROUP_KEYWORDS)

# Single table covering both families of groups, re-indexed from 0.
ethmerged = pd.concat([eth, eth2], ignore_index=True)

ethmerged


Unnamed: 0,Ethnicity,Keywords
0,French,"['french', 'france', 'paris', 'parisian', 'par..."
1,German,"['german', 'germany', 'berlin', 'bavarian', 'b..."
2,Italian,"['italian', 'italy', 'rome', 'venetian', 'ital..."
3,Spanish,"['spanish', 'spain', 'madrid', 'barcelona', 's..."
4,Chinese,"['chinese', 'china', 'beijing', 'shanghai', 'c..."
5,Japanese,"['japanese', 'japan', 'tokyo', 'osakan', 'japa..."
6,Indian,"['indian', 'india', 'delhi', 'mumbai', 'indian..."
7,Russian,"['russian', 'russia', 'moscow', 'stpetersburg'..."
8,Mexican,"['mexican', 'mexico', 'mexico city', 'mexicans..."
9,Brazilian,"['brazilian', 'brazil', 'rio', 'saopaulo', 'br..."


In [303]:
# Major WWII actors on a yearly timeline, with both world wars shaded and the
# fall of the Berlin Wall marked.
ethnicities = ['German', 'Nazi', 'Japanese', 'English', 'French', 'Spanish']
highlight_periods = [
    ('1914', '1918', 'WWI'),
    ('1939', '1945', 'WWII'),
    ('1989', '1989', 'Fall of the Berlin Wall'),
]
plot_occurrences(df_sum, ethmerged, ethnicities, period=1, highlight_periods=highlight_periods)

KeyError: 'tokens'

Looking at some major actors of WWII, the prevalence of German representation can be seen a few years after its beginning as well, but it fades out rapidly; the same can be observed to a lesser extent for Japanese representation. It also seems that big events trigger an increase in representation even after the main spike has gone down. German and Japanese representation was mostly nonexistent before WWII compared to France, for example, but became relatively well represented afterwards, up until a global jump in representation due to film abundance.

In [None]:
# Russian representation against two comparison groups, Cold War span shaded.
ethnicities = ['Russian', 'Scandinavian', 'Ukrainian']
highlight_periods = [('1947', '1991', 'Cold War')]
plot_occurrences(df_sum, ethmerged, ethnicities, period=1, highlight_periods=highlight_periods)

The Cold War period seems to show a delayed increase in representation after 1960, which sustains itself throughout the conflict. While Russian representation keeps rising afterward, the comparison with a few Eastern European countries suggests that this is not only due to the rise in the total number of films.

In [None]:
# South-East Asian groups and Afghanistan, with the two wars shaded.
ethnicities = ['Vietnamese', 'Afghan', 'Thai', 'Malaysian']
highlight_periods = [
    ('1955', '1975', 'Vietnam War'),
    ('2001', '2021', 'Afghanistan War'),
]
plot_occurrences(df_sum, ethmerged, ethnicities, period=1, highlight_periods=highlight_periods)

The Vietnam War displays similar behavior to the Cold War, but we can observe a decrease in representation after some time; the War in Afghanistan seems to display a surge of interest as well. The data does not stop abruptly in the dataset, so the low representation in the last years is mainly due to a lack of data.

In [None]:
# Afghanistan on its own, so its counts are not dwarfed by the other groups.
ethnicities = ['Afghan']
highlight_periods = [('2001', '2021', 'Afghanistan War')]
plot_occurrences(df_sum, ethmerged, ethnicities, period=1, highlight_periods=highlight_periods)

In [None]:
# Middle Eastern and Islamic keywords, with 2001 marked.
ethnicities = ["Middle Eastern", 'Islamic']
highlight_periods = [('2001', '2001', '9/11 Attacks')]
plot_occurrences(df_sum, ethmerged, ethnicities, period=1, highlight_periods=highlight_periods)

This is similar to what we saw for other events.

In [None]:
# Chinese keywords, with the Cultural Revolution years shaded.
ethnicities = ['Chinese']
highlight_periods = [('1966', '1976', 'Chinese Cultural Revolution')]
plot_occurrences(df_sum, ethmerged, ethnicities, period=1, highlight_periods=highlight_periods)

We chose to focus on historical events involving the USA because, over the time period of our dataset, the USA was the main producer of movies. Indeed, it seems that big events for other countries do not create the same spike in reaction, as this example shows.

Overall, we would like to dive deeper into the time lag of representation.