In [None]:
import pandas as pd 
import numpy as np

In [None]:
# Load the merged movie metadata and preview its first two rows
df = pd.read_csv("data/merged_movie_metadata.csv")
df.head(2)

In [None]:
plot_summaries_path = 'data/plot_summaries.txt'

# Build a {wiki_id (int): summary} lookup from the summaries file.
# Each usable line is "<wiki_id>\t<summary>"; lines without a tab are skipped.
plot_summaries = {}
with open(plot_summaries_path, 'r', encoding='utf-8') as file:
    for line in file:
        parts = line.strip().split('\t', 1)
        if len(parts) == 2:
            wiki_id, summary = parts
            plot_summaries[int(wiki_id)] = summary

# Attach the summary to each film; films without an entry get NaN
df['Plot_Summary_Base'] = df['Wikipedia_ID'].map(plot_summaries)

# Report the number of distinct Wikipedia_ID values (no rows are dropped here).
# A bare expression in the middle of a cell is not displayed, hence the print.
print(f"Unique Wikipedia_ID values: {df['Wikipedia_ID'].nunique()}")

df.head()

In [None]:
import nltk
# Fetch the NLTK resources used below: stop-word lists, WordNet (for the lemmatizer)
# and the punkt_tab tokenizer data
nltk.download(['stopwords', 'wordnet', 'punkt_tab'])

In [None]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def tok(sentence):
    """Tokenize ``sentence`` and return the lower-cased lemmas of its alphabetic tokens.

    Tokens for which ``str.isalpha()`` is false (punctuation, numbers, tokens
    containing apostrophes or hyphens, ...) are dropped.
    """
    lemmas = []
    for token in word_tokenize(sentence):
        if not token.isalpha():
            continue
        lemmas.append(lemmatizer.lemmatize(token.lower()))
    return lemmas

In [None]:
# Tokenize one plot per film whose 'Continents' value contains "America":
# 'Plot_Summary_Base' is preferred, 'Plot' is the fallback, NaN when both are missing.
tokenized_plots_america = df.loc[df['Continents'].str.contains('America')].apply(
    lambda film: (
        tok(film['Plot_Summary_Base'])
        if pd.notna(film['Plot_Summary_Base'])
        else (tok(film['Plot']) if pd.notna(film['Plot']) else np.nan)
    ),
    axis=1,
)

In [None]:
# Tokenize one plot per film whose 'Continents' value contains "Europe":
# 'Plot_Summary_Base' is preferred, 'Plot' is the fallback, NaN when both are missing.
tokenized_plots_europe = df.loc[df['Continents'].str.contains('Europe')].apply(
    lambda film: (
        tok(film['Plot_Summary_Base'])
        if pd.notna(film['Plot_Summary_Base'])
        else (tok(film['Plot']) if pd.notna(film['Plot']) else np.nan)
    ),
    axis=1,
)

In [None]:
# Tokenize one plot per film whose 'Continents' value contains "Both":
# 'Plot_Summary_Base' is preferred, 'Plot' is the fallback, NaN when both are missing.
tokenized_plots_both = df.loc[df['Continents'].str.contains('Both')].apply(
    lambda film: (
        tok(film['Plot_Summary_Base'])
        if pd.notna(film['Plot_Summary_Base'])
        else (tok(film['Plot']) if pd.notna(film['Plot']) else np.nan)
    ),
    axis=1,
)

In [None]:
from collections import Counter

# One Counter per region: drop films without a plot, flatten the token lists, count the words
word_count_america, word_count_europe, word_count_both = (
    Counter(token_lists.dropna().explode())
    for token_lists in (tokenized_plots_america, tokenized_plots_europe, tokenized_plots_both)
)

In [None]:
# Show the ten most frequent words of each region (stop words still included)
for heading, counts in (
    ("Top 10 words in America:", word_count_america),
    ("\nTop 10 words in Europe:", word_count_europe),
    ("\nTop 10 words in Both:", word_count_both),
):
    print(heading)
    print(counts.most_common(10))

In [None]:
from nltk.corpus import stopwords

# English stop words held in a set, looked up by filter_stop_words below
stop_words = set(stopwords.words('english'))

def filter_stop_words(word_list):
    """Return, in their original order, the items of ``word_list`` that are not in ``stop_words``."""
    kept = []
    for candidate in word_list:
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

# Same counting as before, with stop words removed from the flattened token stream
(
    filtered_word_count_america,
    filtered_word_count_europe,
    filtered_word_count_both,
) = (
    Counter(filter_stop_words(token_lists.dropna().explode()))
    for token_lists in (tokenized_plots_america, tokenized_plots_europe, tokenized_plots_both)
)

for heading, counts in (
    ("\nTop 10 interesting words in America:", filtered_word_count_america),
    ("\nTop 10 interesting words in Europe:", filtered_word_count_europe),
    ("\nTop 10 interesting words in Both:", filtered_word_count_both),
):
    print(heading)
    print(counts.most_common(10))

In [None]:
from wordcloud import WordCloud
from PIL import Image
import matplotlib.pyplot as plt

# Load the three region images as arrays; they are passed to WordCloud as masks
americas_mask, europe_mask, both_mask = (
    np.array(Image.open(mask_path))
    for mask_path in (
        "data/Location_North_America.png",
        "data/Location_Europe.png",
        "data/Location_Both.png",
    )
)


def generate_word_cloud(word_count, title, mask_image):
    """Draw and show a word cloud built from a word -> frequency mapping.

    word_count: mapping handed to ``WordCloud.generate_from_frequencies``.
    title: text displayed above the figure.
    mask_image: array passed to ``WordCloud`` as ``mask``.

    Note: a later cell of this notebook defines another ``generate_word_cloud``
    with a different signature; once that cell has run, it replaces this one.
    """
    cloud = WordCloud(
        mask=mask_image,
        background_color='white',
        width=800,
        height=400,
    )
    cloud.generate_from_frequencies(word_count)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()


# One cloud per region, built from the stop-word-filtered counts and shaped by the region mask
generate_word_cloud(
    word_count=filtered_word_count_america,
    title="Word Cloud for America",
    mask_image=americas_mask,
)
generate_word_cloud(
    word_count=filtered_word_count_europe,
    title="Word Cloud for Europe",
    mask_image=europe_mask,
)
generate_word_cloud(
    word_count=filtered_word_count_both,
    title="Word Cloud for Both",
    mask_image=both_mask,
)

### Filter using the POS fields

**POS (Part Of Speech) fields contain word metadata: they indicate whether a given word is a noun, a verb, an adjective, etc.**

[An example POS definition can be found here](https://en.wikipedia.org/wiki/Brown_Corpus#Part-of-speech_tags_used) :

![POS example definition](data/POS_tokens_BrownCorpusWikipedia.png)

We need to know what kind of fields we have in our files to extract the most useful ones for our analysis. To do so, and to avoid unzipping all the xml files to the computer, we can leverage bash tools and use this (non-optimized) command :

```bash
    zcat *.xml.gz | grep -oP "(?<=<POS>)[^<]+" | sort | uniq -c | sort -rn
```

How it works :
 - `zcat` displays the content of a .gz (gzipped) file as text
 - The output is piped to `grep` which extracts all the POS fields as strings : "(...) <POS>NN</POS> (...)" => "NN" 
 - The output is piped to `sort` which sorts this sequence of strings
 - The output is piped to `uniq -c` which prints each distinct string only once, preceded by its number of occurrences
 - The output is piped to `sort -rn` which sorts these lines by number of occurrences, in descending order.


However, even if this method is quite efficient, it would still take a long time to process all the files. Since we use this command only to get a grasp of the POS fields, we can run it on a random subset of the files: we replace `*.xml.gz` with 5000 files picked at random (the file list is shuffled with `sort -R` and only its first 5000 entries are kept).

```bash
    zcat  $(ls *.xml.gz | sort -R | head -n5000) | grep -oP "(?<=<POS>)[^<]+" | sort | uniq -c | sort -rn
```

Which gives as output :

```text
     250061 NN
     185484 IN
     164548 NNP
     162675 DT
     116598 VBZ
      93384 ,
      83306 JJ
      78609 .
      71877 PRP
      64993 CC
      59780 RB
      58662 NNS
      58193 TO
      55793 VB
      45140 PRP$
      39655 VBN
      37985 VBG
      23029 VBP
      18158 VBD
      17188 POS
      12434 CD
      10615 RP
      10395 WP
       9718 WRB
       7947 MD
       5856 WDT
       5332 ''
       5325 ``
       4710 :
       3115 -LRB-
       2315 JJR
       2197 NNPS
       1298 -RRB-
       1211 JJS
       1121 RBR
        989 EX
        775 PDT
        696 FW
        590 SYM
        394 WP$
        304 $
        240 RBS
        125 UH
         36 #
         17 LS
```

We redirect the output to a file `data/POS_tokens.csv` and read it with pandas :



In [None]:
# Read the "<count> <tag>" pairs produced by the shell pipeline above
df_pos = pd.read_csv("data/POS_tokens.csv", sep=" ", header=None)
df_pos.columns = ["occurence", "token"]

# Create the figure/axes explicitly and hand the axes to pandas: without `ax`,
# DataFrame.plot opens its own figure, which leaves a separately created
# plt.figure(...) empty and its figsize unused.
fig, ax = plt.subplots(figsize=(10, 6))
df_pos.plot(x='token', y='occurence', kind='bar', color='skyblue', ax=ax)
ax.set_title('Repartition of the POS field content in 5000 files.', fontdict={'fontsize': 16, 'fontweight': 'bold'})
ax.set_xlabel('POS', fontdict={'fontsize': 14, 'fontweight': 'bold'})
ax.set_ylabel('Frequency', fontdict={'fontsize': 14, 'fontweight': 'bold'})
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
import gzip
from tqdm import tqdm
import xml.etree.ElementTree as ET
from pathlib import Path

In [None]:
# path where the compressed files are
# Directory containing the compressed "<wiki_id>.xml.gz" files
folder_path = Path("../corenlp_plot_summaries")

assert folder_path.exists(), "Must configure the correct path to corenlp_plot_summaries"

# IDs that have a file in the folder vs. IDs present in the metadata frame
processed_ids = {
    gz_path.stem.replace(".xml", "")  # "123.xml.gz" -> stem "123.xml" -> "123"
    for gz_path in folder_path.glob("*.xml.gz")
}
available_ids = set(df["Wikipedia_ID"].astype(str))
matching_files = [
    folder_path / f"{wiki_id}.xml.gz"
    for wiki_id in processed_ids & available_ids
]

print(f"Number of matching folders/files: {len(matching_files)}")
print(f"Number of unique Wiki id's: {df['Wikipedia_ID'].nunique()}")

7605 out of the 11681 films have gone through the Stanford pipeline.

In [None]:
def get_sentence_word_metadata(id, print_output=False):
    """Return the token metadata stored in the Stanford pipeline output for ``id``.

    Reads ``folder_path / f"{id}.xml.gz"`` and returns a flat list with one dict
    per token, in document order. Each dict maps the tag of every child element
    of the token (e.g. ``word``, ``lemma``, ``POS``) to that element's text.
    Tokens of all sentences are concatenated: sentence boundaries are not kept
    in the returned list.

    Parameters
    ----------
    id : str or int
        Identifier used to build the file name.
    print_output : bool, default False
        When True, print a header per sentence and one line per token.

    Returns
    -------
    list of dict
        The token metadata, or ``[]`` when the file does not exist or when any
        error occurs while reading it (the error is printed, not raised).
    """
    gz_file_path = folder_path / f"{id}.xml.gz"

    if not gz_file_path.is_file():
        return []

    try:
        # Hand the decompressed byte stream to the parser so that it applies the
        # encoding declared in the XML itself, instead of assuming UTF-8.
        with gzip.open(gz_file_path, 'rb') as gz_file:
            root = ET.parse(gz_file).getroot()

        # The list of sentences is read from the first child of the root's first child
        sentences = root[0][0]
        summary_word_metadata = []
        for sentence in sentences:
            if print_output:
                print(f"\n\n====================== Sentence n°{sentence.attrib['id']} ======================")
            # Only the <tokens> children of a sentence are of interest here
            for tokens in sentence.findall("tokens"):
                for token in tokens:
                    attribs = {c.tag: c.text for c in token}
                    if print_output:
                        # .get(): a token lacking one of these tags must not abort the whole document
                        print(f"{attribs.get('word')} ({attribs.get('lemma')}) => {attribs.get('POS')}")
                    summary_word_metadata.append(attribs)
        return summary_word_metadata

    except Exception as e:
        # Best effort: a broken file is reported and treated as having no tokens
        print(f"An error occurred while processing {gz_file_path}: {e}")
        return []

def filter_words_by_pos(tokens_metadata, pos_tags):
    """Return the ``'word'`` of every token dict whose ``'POS'`` value is in ``pos_tags``, in input order."""
    selected = []
    for token in tokens_metadata:
        if token['POS'] in pos_tags:
            selected.append(token['word'])
    return selected

def filter_words_by_pos_ngram(tokens_metadata, pos_tags, ngram):
    """Return space-joined n-grams whose FIRST token has a ``'POS'`` value in ``pos_tags``.

    The n-grams are the windows of ``ngram`` consecutive token dicts produced by
    ``nltk.ngrams``; only the first token of a window is checked against
    ``pos_tags``, and the ``'word'`` values of the whole window are joined.
    """
    return [
        ' '.join(token['word'] for token in window)
        for window in nltk.ngrams(tokens_metadata, ngram)
        if window[0]['POS'] in pos_tags
    ]

def generate_word_cloud(region, pos_tags, title, mask_image=None, sample_output=True, ngrams=1):
    """Count POS-filtered words (or n-grams) for a region and show them as a word cloud.

    Parameters
    ----------
    region : str
        Key of ``region_ids`` selecting the Wikipedia IDs to process.
    pos_tags : collection of str
        POS tags to keep. For ``ngrams > 1`` only the first token of each
        n-gram is checked against these tags.
    title : str
        Prefix of the figure title, also used in the printed sample.
    mask_image : array or None, default None
        Passed to ``WordCloud`` as ``mask``.
    sample_output : bool, default True
        When True, print the ten most common entries before drawing.
    ngrams : int, default 1
        1 counts single words; any other value counts n-grams of that size.

    Returns
    -------
    None
        The figure is displayed; nothing is drawn when no word was counted.
    """
    word_counter = Counter()
    # sorted(): process the IDs in a fixed order so that the order of equally
    # frequent entries does not depend on set iteration order
    for wiki_id in tqdm(sorted(region_ids[region])):
        tokens_metadata = get_sentence_word_metadata(wiki_id)
        if ngrams == 1:
            filtered_words = filter_words_by_pos(tokens_metadata, pos_tags)
        else:
            filtered_words = filter_words_by_pos_ngram(tokens_metadata, pos_tags, ngrams)
        word_counter.update(filtered_words)

    if sample_output:
        # print sample output for verification
        print(f"\nSample of filtered words for {region} - {title}:")
        print(word_counter.most_common(10))

    if not word_counter:
        # WordCloud cannot lay out an empty frequency mapping (it raises ValueError)
        print(f"No words found for {region} - {title}: word cloud skipped.")
        return

    # TODO: decide whether the few most common entries should be left out of the cloud

    # generate and display the word cloud, with the mask if provided
    wordcloud = WordCloud(width=800, height=400, background_color='white', mask=mask_image,
                          contour_color='black', contour_width=1).generate_from_frequencies(word_counter)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"{title} Word Cloud for {region}")
    plt.show()

Using the above POS denomination, we can group the tags we are interested in into lists, so that only the tokens that match those POS tags appear in our word cloud:

In [None]:
# Wikipedia IDs (as strings, to match the file names) of the films of each region.
# A film belongs to a region when its 'Continents' value contains the region name.
region_ids = {
    region_name: set(
        df.loc[df['Continents'].str.contains(region_name), 'Wikipedia_ID'].astype(str)
    )
    for region_name in ("America", "Europe", "Both")
}

# POS tags for nouns, verbs and adjectives
# !! to be completed
# NOTE(review): NNA, NNC, VBS, JJC, JJA, JJF and JJM do not appear in the POS
# frequency listing obtained from the 5000-file sample earlier in this notebook.
noun_tags = ['NN', 'NNA', 'NNC', 'NNS', 'NNP', 'NNPS']
verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBS', 'VBZ']
adjective_tags = ['JJ', 'JJR', 'JJS', 'JJC', 'JJA', 'JJF', 'JJM']


In [None]:
# Bigram clouds (ngrams=2): each entry is a two-word sequence whose first word
# carries one of the given POS tags. The America/nouns variant is left disabled:
# generate_word_cloud("America", noun_tags, "Nouns in America", mask_image=americas_mask, ngrams=2)
generate_word_cloud("Europe", verb_tags, "Verbs in Europe", mask_image=europe_mask, ngrams=2)

In [None]:
# Single-word clouds restricted to nouns, one per region
generate_word_cloud(
    region="America",
    pos_tags=noun_tags,
    title="Nouns in America",
    mask_image=americas_mask,
)
generate_word_cloud(
    region="Europe",
    pos_tags=noun_tags,
    title="Nouns in Europe",
    mask_image=europe_mask,
)

In [None]:
# Single-word clouds restricted to verbs, one per region
generate_word_cloud(
    region="America",
    pos_tags=verb_tags,
    title="Verbs in America",
    mask_image=americas_mask,
)
generate_word_cloud(
    region="Europe",
    pos_tags=verb_tags,
    title="Verbs in Europe",
    mask_image=europe_mask,
)

In [None]:
# Single-word clouds restricted to adjectives, one per region
generate_word_cloud(
    region="America",
    pos_tags=adjective_tags,
    title="Adjectives in America",
    mask_image=americas_mask,
)
generate_word_cloud(
    region="Europe",
    pos_tags=adjective_tags,
    title="Adjectives in Europe",
    mask_image=europe_mask,
)