In [None]:
# === Imports ===
import pandas as pd
import numpy as np
from nltk import word_tokenize, download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from wordcloud import WordCloud
from PIL import Image
import matplotlib.pyplot as plt
import gzip
import xml.etree.ElementTree as ET
from pathlib import Path
from tqdm import tqdm
import nltk

# Fetch the NLTK resources used further down: the English stop-word list
# (stopwords.words), WordNet (WordNetLemmatizer) and, presumably, the Punkt
# tokenizer tables that word_tokenize relies on.
nltk.download(['stopwords', 'wordnet', 'punkt_tab'])

In [None]:
# === Load Data and Plot Summaries ===
# Read the merged metadata and keep a single row per Wikipedia_ID.
df = pd.read_csv("data/merged_movie_metadata.csv").drop_duplicates(subset=['Wikipedia_ID'])

# Build {Wikipedia_ID (int): summary text} from the tab-separated summaries file.
plot_summaries_path = 'data/plot_summaries.txt'
plot_summaries = {}

with open(plot_summaries_path, 'r', encoding='utf-8') as file:
    for line in file:
        wiki_id, tab, summary = line.strip().partition('\t')
        if not tab:
            # No tab on this line, so there is no "<id>\t<summary>" pair to store.
            continue
        plot_summaries[int(wiki_id)] = summary

# Attach each movie's summary; IDs absent from the file map to NaN.
df['Plot_Summary_Base'] = df['Wikipedia_ID'].map(plot_summaries)

### Loading and Preparing Movie Plot Summaries

Our dataset includes two main sources for movie plot summaries:

1. **OMDB Data (Plot column)**: This source provides a brief summary of the movie plot in a few sentences.
2. **CMU Movie Summary Corpus (plot_summaries.txt)**: This file, obtained from the [CMU Movie Summary Corpus website](https://www.cs.cmu.edu/~ark/personas/), contains detailed plot summaries for a range of movies.

Since both sources may contain missing values for certain movies, we will combine the two to maximize coverage and ensure we have plot information for as many movies as possible. Below is a check to see how many values are missing in each plot source:

In [None]:
# Check missing values in both plot sources
# Boolean masks, one per source, so each is computed once and reused below.
omdb_missing = df['Plot'].isna()
cmu_missing = df['Plot_Summary_Base'].isna()

print(f"Total movies: {len(df)}")
print(f"Missing values in OMDB Plot column: {omdb_missing.sum()}")
print(f"Missing values in CMU Plot Summary column: {cmu_missing.sum()}")

# Movies missing both plot summaries
print(f"Movies missing both summaries: {(omdb_missing & cmu_missing).sum()}")

### Text Preprocessing for Movie Plots

To analyze the language used in movie plots, we need to process the text data to standardize and clean it. We perform several key steps in this process:

1. For each movie, we prioritize plot summaries from the [CMU Movie Summary Corpus](https://www.cs.cmu.edu/~ark/personas/) because they tend to be richer in content. If a summary from this source is unavailable (i.e., a missing value), we use the summary from OMDB. If both sources are missing, the result will be marked as `NaN`.
   
2. We split each plot summary into individual words (tokens), to analyze the frequency and type of words used.

3. Commonly used words that don’t contribute meaningful information (known as stop words) are removed. Examples of stop words include "the," "is," "and," "in," "to," etc.

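4. Each word is reduced to its base or root form, a process called **lemmatization**. Ideally, words like "running," "runs," and "ran" would all be converted to "run," which improves the accuracy of our analysis by avoiding duplicates in different forms. Note that the code below calls the WordNet lemmatizer without a part-of-speech hint, so every word is lemmatized as a noun: plurals are reduced to their singular (e.g. "movies" → "movie"), but verb forms such as "running" or "ran" are left unchanged.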

Below is the code that implements this preprocessing:

In [None]:
# === Text Preprocessing ===
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def tok(sentence):
    """Turn raw text into a list of lower-cased, lemmatized content words.

    The text is split with ``word_tokenize``. Tokens that are not purely
    alphabetic, or whose lower-cased form is in ``stop_words``, are dropped.
    Every remaining token is lower-cased and passed to ``lemmatizer.lemmatize``.

    Args:
        sentence: The text to process (a plot summary in this notebook).

    Returns:
        list[str]: The surviving lemmas, in their original order.
    """
    lemmas = []
    for token in word_tokenize(sentence):
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(lowered))
    return lemmas

# Tokenize plots by region
def _tokenize_region_plots(region):
    """Tokenize the best available plot text of every movie in one region.

    A movie belongs to ``region`` when its ``Continents`` value contains that
    string. Rows with a missing ``Continents`` value are treated as not
    matching (``na=False``) instead of breaking the boolean mask.

    For each selected movie the CMU summary (``Plot_Summary_Base``) is used
    when present, otherwise the OMDB ``Plot``; if both are missing the result
    for that movie is NaN.

    Args:
        region: Substring to look for in ``Continents`` (e.g. ``'America'``).

    Returns:
        The result of the row-wise ``apply``: one token list (or NaN) per
        selected movie, indexed like ``df``.
    """
    in_region = df['Continents'].str.contains(region, na=False)

    def _tokens_for(row):
        # Prefer the CMU summary, fall back to the OMDB plot.
        if pd.notna(row['Plot_Summary_Base']):
            return tok(row['Plot_Summary_Base'])
        if pd.notna(row['Plot']):
            return tok(row['Plot'])
        return np.nan

    return df[in_region].apply(_tokens_for, axis=1)


tokenized_plots_america = _tokenize_region_plots('America')
tokenized_plots_europe = _tokenize_region_plots('Europe')
tokenized_plots_both = _tokenize_region_plots('Both')


We now want to see the most frequently used words in movie plots across our three regions: **America**, **Europe**, and **Both**. We use a `Counter` to count occurrences of each word in the tokenized plots for each region.

In [None]:
# === Word Counts ===
def get_word_count(tokenized_plots):
    """Count how often each word occurs across a Series of token lists.

    Args:
        tokenized_plots: pandas Series whose values are lists of tokens, or
            NaN for movies without any plot text.

    Returns:
        collections.Counter: Mapping of word to number of occurrences.
    """
    # Drop movies without a plot, then flatten the lists to one word per row.
    words = tokenized_plots.dropna().explode()
    # explode() turns an empty token list into a NaN row; drop those so that
    # NaN is never counted as if it were a word.
    return Counter(words.dropna())

# One Counter per region, built from the tokenized plots of that region.
word_count_america, word_count_europe, word_count_both = (
    get_word_count(plots)
    for plots in (tokenized_plots_america, tokenized_plots_europe, tokenized_plots_both)
)

# Show the ten most frequent words of each region.
for label, counts in (
    ("Top 10 words in America:", word_count_america),
    ("\nTop 10 words in Europe:", word_count_europe),
    ("\nTop 10 words in Both:", word_count_both),
):
    print(label, counts.most_common(10))

We now have the most frequently used words in each region’s plot summaries, but these results feel somewhat limited. To gain deeper insights, we’ll use the Stanford CoreNLP-processed summaries, which provide richer linguistic information.

These summaries, derived from `plot_summaries.txt`, have been processed with the [Stanford CoreNLP pipeline](https://stanfordnlp.github.io/CoreNLP/), a tool that performs advanced language processing tasks such as part-of-speech tagging, syntactic parsing, named entity recognition (NER), and coreference resolution. This additional information will allow us to analyze not only word frequency but also the context and roles of words within each summary. We will first extract the **POS tags**.


### Filter using the POS fields

**POS (Part Of Speech) fields contain word metadata: they indicate whether the current word is a noun, a verb, an adjective, etc.**

[An example POS definition can be found here](https://en.wikipedia.org/wiki/Brown_Corpus#Part-of-speech_tags_used) :

![POS example definition](data/POS_tokens_BrownCorpusWikipedia.png)

We need to know what kind of fields we have in our files to extract the most useful ones for our analysis. To do so, and to avoid unzipping all the xml files to the computer, we can leverage bash tools and use this (non-optimized) command :

```bash
    zcat *.xml.gz | grep -oP "(?<=<POS>)[^<]+" | sort | uniq -c | sort -rn
```

How it works :
 - `zcat` displays the content of a .gz (gzipped) file as text
 - The output is piped to `grep` which extracts all the POS fields as strings : "(...) <POS>NN</POS> (...)" => "NN" 
 - The output is piped to `sort` which sorts this sequence of strings
 - The output is piped to `uniq` (with `-c`), which prints each distinct string only once, preceded by its number of occurrences
 - The output is piped to `sort`, which sorts the list by number of occurrences in descending order.


However, even if this method is quite efficient, it would still take a long time to process all the files. Since we use this command only to get a grasp of the POS fields, we can execute it on a randomly chosen subset of files. We just replace `*.xml.gz` with the first 5000 files of a randomly shuffled listing.

```bash
    zcat  $(ls *.xml.gz | sort -R | head -n5000) | grep -oP "(?<=<POS>)[^<]+" | sort | uniq -c | sort -rn
```

Which gives as output :

```text
     250061 NN
     185484 IN
     164548 NNP
     162675 DT
     116598 VBZ
      93384 ,
      83306 JJ
      78609 .
      71877 PRP
      64993 CC
      59780 RB
      58662 NNS
      58193 TO
      55793 VB
      45140 PRP$
      39655 VBN
      37985 VBG
      23029 VBP
      18158 VBD
      17188 POS
      12434 CD
      10615 RP
      10395 WP
       9718 WRB
       7947 MD
       5856 WDT
       5332 ''
       5325 ``
       4710 :
       3115 -LRB-
       2315 JJR
       2197 NNPS
       1298 -RRB-
       1211 JJS
       1121 RBR
        989 EX
        775 PDT
        696 FW
        590 SYM
        394 WP$
        304 $
        240 RBS
        125 UH
         36 #
         17 LS
```

We redirect the output to a file `data/POS_tokens.csv` and read it with pandas :



In [None]:
# Load the "<count> <tag>" pairs produced by the shell pipeline.
# NOTE(review): sep=" " expects exactly one space between the two fields and
# no leading padding; raw `uniq -c` output is left-padded, so the file is
# presumably cleaned beforehand -- confirm.
df_pos = pd.read_csv("data/POS_tokens.csv", sep=" ", header=None)
df_pos.columns = ["occurence", "token"]

# Create the axes explicitly and hand them to pandas. DataFrame.plot opens its
# own figure when no `ax` is given, which would leave a sized-but-empty figure
# behind and ignore the figsize requested here.
fig, ax = plt.subplots(figsize=(10, 6))
df_pos.plot(x='token', y='occurence', kind='bar', color='skyblue', ax=ax)
ax.set_title('Repartition of the POS field content in 5000 files.', fontdict={'fontsize': 16, 'fontweight': 'bold'})
ax.set_xlabel('POS', fontdict={'fontsize': 14, 'fontweight': 'bold'})
ax.set_ylabel('Frequency', fontdict={'fontsize': 14, 'fontweight': 'bold'})
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# === Helper Functions for Metadata Extraction ===

# Directory holding one "<Wikipedia_ID>.xml.gz" CoreNLP file per processed movie.
folder_path = Path("../corenlp_plot_summaries")

assert folder_path.exists(), "Must configure the correct path to corenlp_plot_summaries"

# count matching folders/files
# Path.stem only strips the final ".gz"; removing ".xml" leaves the bare ID.
processed_ids = set()
for gz_path in folder_path.glob("*.xml.gz"):
    processed_ids.add(gz_path.stem.replace(".xml", ""))

# IDs are compared as strings because they come from file names on one side.
available_ids = set(df["Wikipedia_ID"].astype(str))
shared_ids = processed_ids.intersection(available_ids)
matching_files = [folder_path / f"{filename}.xml.gz" for filename in shared_ids]

print(f"Number of matching folders/files: {len(matching_files)}")
print(f"Number of unique Wiki id's: {df['Wikipedia_ID'].nunique()}")

7605 out of the 11681 films have gone through the Stanford CoreNLP pipeline.


In [None]:
def get_sentence_word_metadata(id, print_output=False):
    """Retrieve Stanford CoreNLP token metadata for one movie.

    Reads ``<folder_path>/<id>.xml.gz`` and collects, for every token of every
    sentence, a dict mapping each child tag of the token element to its text
    (the code below relies on the ``word``, ``lemma`` and ``POS`` tags).

    Args:
        id: Identifier used to build the file name ``<id>.xml.gz``.
        print_output: When True, print a header per sentence and one
            ``word (lemma) => POS`` line per token while parsing.

    Returns:
        list[dict]: One dict per token, in document order. An empty list is
        returned when the file does not exist or when anything goes wrong
        while reading or parsing it (the error is printed, not raised).
    """
    gz_file_path = folder_path / f"{id}.xml.gz"
    if not gz_file_path.is_file():
        return []

    try:
        with gzip.open(gz_file_path, 'rb') as gz_file:
            # Give the parser the raw bytes so it applies the encoding declared
            # in the XML prolog, instead of assuming UTF-8 via bytes.decode().
            root = ET.parse(gz_file).getroot()

        summary_word_metadata = []
        # root[0][0] is the first grandchild of the root element; it is iterated
        # as the list of sentences (presumably <document>/<sentences> -- confirm).
        for sentence in root[0][0]:
            if print_output:
                print(f"\n\n=================== Sentence n°{sentence.attrib['id']} ===================")
            # Only <tokens> children carry word-level data; other children are skipped.
            for tokens_node in sentence.findall("tokens"):
                for token in tokens_node:
                    attribs = {c.tag: c.text for c in token}
                    if print_output:
                        print(f"{attribs['word']} ({attribs['lemma']}) => {attribs['POS']}")
                    summary_word_metadata.append(attribs)
        return summary_word_metadata
    except Exception as e:
        # Deliberately best-effort: report the failing file and keep going so
        # one unreadable file does not abort a run over thousands of movies.
        print(f"Error processing {gz_file_path}: {e}")
        return []
    

# === Filter Functions ===
def filter_words_by_pos(tokens_metadata, pos_tags):
    """Keep the words whose POS tag is one of ``pos_tags``.

    Args:
        tokens_metadata: Iterable of token dicts with ``'word'`` and ``'POS'`` keys.
        pos_tags: Collection of POS tags to keep.

    Returns:
        list[str]: The ``'word'`` of every matching token, in input order.
    """
    matching_words = []
    for entry in tokens_metadata:
        if entry['POS'] in pos_tags:
            matching_words.append(entry['word'])
    return matching_words

def filter_words_by_pos_ngram(tokens_metadata, pos_tags, ngram):
    """Build n-grams whose first token has one of the given POS tags.

    Every window of ``ngram`` consecutive tokens is considered. A window is
    kept when the POS of its *first* token is in ``pos_tags``; the POS of the
    following tokens is not checked.

    Args:
        tokens_metadata: Sequence of token dicts with ``'word'`` and ``'POS'`` keys.
        pos_tags: Collection of POS tags accepted for the first token.
        ngram: Number of consecutive tokens per n-gram.

    Returns:
        list[str]: The kept n-grams, each as its words joined by single spaces.
    """
    return [
        ' '.join(entry['word'] for entry in window)
        for window in nltk.ngrams(tokens_metadata, ngram)
        if window[0]['POS'] in pos_tags
    ]

In [None]:
# === Word Cloud Generation ===
def generate_word_cloud(region, pos_tags, title, mask_image=None, sample_output=True, ngrams=1):
    """Build and display a word cloud of POS-filtered words for one region.

    For every ID in ``region_ids[region]`` the CoreNLP token metadata is
    loaded, filtered by POS tag (single words when ``ngrams == 1``, otherwise
    n-grams of that size) and accumulated into a frequency counter, which is
    then rendered as a word cloud.

    Args:
        region: Key of ``region_ids`` selecting the movies to process.
        pos_tags: POS tags to keep.
        title: Label used in the sample printout and in the figure title.
        mask_image: Optional array passed to ``WordCloud`` as ``mask``.
        sample_output: When True, print the 10 most common entries.
        ngrams: Size of the n-grams to count; 1 counts single words.

    Returns:
        None. The figure is shown with ``plt.show()``. When no word matches,
        a message is printed and no figure is drawn.
    """
    word_counter = Counter()
    for wiki_id in tqdm(region_ids[region]):
        tokens_metadata = get_sentence_word_metadata(wiki_id)
        if ngrams == 1:
            filtered_words = filter_words_by_pos(tokens_metadata, pos_tags)
        else:
            filtered_words = filter_words_by_pos_ngram(tokens_metadata, pos_tags, ngrams)
        word_counter.update(filtered_words)

    if sample_output:
        # print sample output for verification
        print(f"\nSample of filtered words for {region} - {title}:")
        print(word_counter.most_common(10))

    if not word_counter:
        # WordCloud cannot lay out an empty frequency table; bail out with a
        # clear message instead of failing inside the plotting call.
        print(f"No words to plot for {region} - {title}; skipping word cloud.")
        return

    # generate and display the word cloud with mask if provided
    # TODO omit the most common ?
    wordcloud = WordCloud(width=800, height=400, background_color='white', mask=mask_image,
                          contour_color='black', contour_width=1).generate_from_frequencies(word_counter)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"{title} Word Cloud for {region}")
    plt.show()

Using the POS tags listed above, we can group the tags we are interested in into lists, so that only tokens whose POS matches one of them appear in our word cloud:


In [None]:
# === Define Regions ===
# Wikipedia IDs (as strings, to match the CoreNLP file names) of the movies in
# each region. A movie belongs to a region when its 'Continents' value contains
# the region name; rows with a missing 'Continents' value are counted as not
# matching (na=False) rather than breaking the boolean mask.
region_ids = {
    region: set(
        df.loc[df['Continents'].str.contains(region, na=False), 'Wikipedia_ID'].astype(str)
    )
    for region in ("America", "Europe", "Both")
}

# === POS Tags Groups ===
# Tags accepted for each word class. Only some of them (e.g. NN, NNS, NNP, NNPS,
# VB*, JJ, JJR, JJS) appear in the sampled tag counts shown earlier; the others
# simply never match.
noun_tags = [
    'NN', 'NNA', 'NNC', 'NNS', 'NNP', 'NNPS',
]
verb_tags = [
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBS', 'VBZ',
]
adjective_tags = [
    'JJ', 'JJR', 'JJS', 'JJC', 'JJA', 'JJF', 'JJM',
]

# === Word Cloud Mask Images ===
# Each mask image is loaded as a numpy array, the form passed to WordCloud(mask=...).
americas_mask, europe_mask, both_mask = (
    np.array(Image.open(mask_path))
    for mask_path in (
        "data/Location_North_America.png",
        "data/Location_Europe.png",
        "data/Location_Both.png",
    )
)

In [None]:
# === Generate Word Clouds ===

# Generate example word clouds
# Each entry: (region, POS tags, title, mask); bigrams are used for both.
for cloud_region, cloud_tags, cloud_title, cloud_mask in (
    ("America", noun_tags, "Nouns in America", americas_mask),
    ("Europe", verb_tags, "Verbs in Europe", europe_mask),
):
    generate_word_cloud(cloud_region, cloud_tags, cloud_title, mask_image=cloud_mask, ngrams=2)