In [None]:
import pandas as pd 
import numpy as np

In [None]:
# Movie metadata table that the rest of the notebook works on
merged_metadata_path = "data/merged_movie_metadata.csv"
df = pd.read_csv(merged_metadata_path)
df.head(2)

In [None]:
plot_summaries_path = 'data/plot_summaries.txt'

# Build {wiki_id (int): summary} from lines shaped like "<wiki_id>\t<summary>".
# Lines that contain no tab after stripping are skipped.
plot_summaries = {}
with open(plot_summaries_path, 'r', encoding='utf-8') as file:
    for line in file:
        wiki_id, separator, summary = line.strip().partition('\t')
        if separator:
            plot_summaries[int(wiki_id)] = summary


# add plot_summaries.txt (rows whose Wikipedia_ID has no summary get NaN)
df['Plot_Summary_Base'] = df['Wikipedia_ID'].map(plot_summaries)

# number of distinct Wikipedia_IDs; the value is not shown because only the
# last expression of a cell is displayed, and no rows are dropped here
df['Wikipedia_ID'].nunique()

df.head()

In [None]:
import nltk
# NLTK resources downloaded for this notebook
nltk_resources = ['stopwords', 'wordnet', 'punkt_tab']
nltk.download(nltk_resources)

In [None]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance used by tok()
lemmatizer = WordNetLemmatizer()


def tok(sentence):
    """Tokenize `sentence` and return its lower-cased, lemmatized alphabetic tokens.

    Tokens for which `str.isalpha()` is False are dropped. The remaining tokens are
    lower-cased and passed through `lemmatizer.lemmatize`.
    """
    lemmas = []
    for token in word_tokenize(sentence):
        if not token.isalpha():
            continue
        lemmas.append(lemmatizer.lemmatize(token.lower()))
    return lemmas

In [None]:
# Films whose 'Continents' label mentions America. na=False treats a missing label as
# "no match"; without it a NaN in the column yields a mask that cannot be used for indexing.
america_films = df[df['Continents'].str.contains('America', na=False)]

# Prefer the summary loaded from plot_summaries.txt, fall back to the 'Plot' column.
america_plot_text = america_films['Plot_Summary_Base'].fillna(america_films['Plot'])

# One token list per film; films with no text in either column stay NaN.
# Column-wise map avoids the slow row-wise DataFrame.apply(axis=1).
tokenized_plots_america = america_plot_text.map(tok, na_action='ignore')

In [None]:
# Films whose 'Continents' label mentions Europe. na=False treats a missing label as
# "no match"; without it a NaN in the column yields a mask that cannot be used for indexing.
europe_films = df[df['Continents'].str.contains('Europe', na=False)]

# Prefer the summary loaded from plot_summaries.txt, fall back to the 'Plot' column.
europe_plot_text = europe_films['Plot_Summary_Base'].fillna(europe_films['Plot'])

# One token list per film; films with no text in either column stay NaN.
# Column-wise map avoids the slow row-wise DataFrame.apply(axis=1).
tokenized_plots_europe = europe_plot_text.map(tok, na_action='ignore')

In [None]:
# Films whose 'Continents' label mentions Both. na=False treats a missing label as
# "no match"; without it a NaN in the column yields a mask that cannot be used for indexing.
both_films = df[df['Continents'].str.contains('Both', na=False)]

# Prefer the summary loaded from plot_summaries.txt, fall back to the 'Plot' column.
both_plot_text = both_films['Plot_Summary_Base'].fillna(both_films['Plot'])

# One token list per film; films with no text in either column stay NaN.
# Column-wise map avoids the slow row-wise DataFrame.apply(axis=1).
tokenized_plots_both = both_plot_text.map(tok, na_action='ignore')

In [None]:
from collections import Counter

# Count every token over all plots of a region.
# The token lists are flattened with a generator instead of Series.explode(): explode()
# turns an empty token list into a NaN row, and that NaN would be counted as a word.
word_count_america = Counter(
    word for tokens in tokenized_plots_america.dropna() for word in tokens
)
word_count_europe = Counter(
    word for tokens in tokenized_plots_europe.dropna() for word in tokens
)
word_count_both = Counter(
    word for tokens in tokenized_plots_both.dropna() for word in tokens
)

In [None]:
# Heading / counter pairs, printed in the same order for each region
for heading, counts in (
    ("Top 10 words in America:", word_count_america),
    ("\nTop 10 words in Europe:", word_count_europe),
    ("\nTop 10 words in Both:", word_count_both),
):
    print(heading)
    print(counts.most_common(10))

In [None]:
from nltk.corpus import stopwords

# English stop words as a set, for fast membership tests
stop_words = set(stopwords.words('english'))


def filter_stop_words(word_list):
    """Return the items of `word_list` that are not in `stop_words`, keeping their order."""
    kept_words = []
    for word in word_list:
        if word in stop_words:
            continue
        kept_words.append(word)
    return kept_words

# Same counts as before, with stop words removed.
# The token lists are flattened with a generator instead of Series.explode(): explode()
# turns an empty token list into NaN, which is not a stop word and would end up in the counts.
filtered_word_count_america = Counter(
    filter_stop_words(word for tokens in tokenized_plots_america.dropna() for word in tokens)
)
filtered_word_count_europe = Counter(
    filter_stop_words(word for tokens in tokenized_plots_europe.dropna() for word in tokens)
)
filtered_word_count_both = Counter(
    filter_stop_words(word for tokens in tokenized_plots_both.dropna() for word in tokens)
)

# Heading / counter pairs, printed in the same order for each region
for heading, counts in (
    ("\nTop 10 interesting words in America:", filtered_word_count_america),
    ("\nTop 10 interesting words in Europe:", filtered_word_count_europe),
    ("\nTop 10 interesting words in Both:", filtered_word_count_both),
):
    print(heading)
    print(counts.most_common(10))

In [None]:
from wordcloud import WordCloud
from PIL import Image
import matplotlib.pyplot as plt

# Mask images for the word clouds, loaded as numpy arrays (one per region)
americas_mask, europe_mask, both_mask = (
    np.array(Image.open(mask_path))
    for mask_path in (
        "data/Location_North_America.png",
        "data/Location_Europe.png",
        "data/Location_Both.png",
    )
)


def generate_word_cloud(word_count, title, mask_image):
    """Build a word cloud from a word -> frequency mapping and display it.

    Args:
        word_count: Mapping of word to frequency, passed to `generate_from_frequencies`.
        title: Title shown above the figure.
        mask_image: Array passed to WordCloud as `mask`.

    NOTE(review): a later cell in this notebook defines another `generate_word_cloud`
    with a different signature; once that cell has run, this version is no longer
    reachable under this name.
    """
    cloud_options = dict(width=800, height=400, background_color='white', mask=mask_image)
    cloud = WordCloud(**cloud_options).generate_from_frequencies(word_count)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()


# (frequencies, title, mask) for each region, drawn in this order
for region_word_count, cloud_title, region_mask in (
    (filtered_word_count_america, "Word Cloud for America", americas_mask),
    (filtered_word_count_europe, "Word Cloud for Europe", europe_mask),
    (filtered_word_count_both, "Word Cloud for Both", both_mask),
):
    generate_word_cloud(region_word_count, cloud_title, region_mask)

In [None]:
import gzip
import xml.etree.ElementTree as ET
from pathlib import Path

In [None]:
# path where the compressed files are
folder_path = Path("your_path/corenlp_plot_summaries")

# Build the ID lookup once; constructing the set inside the comprehension condition
# would rebuild it from the whole column for every file in the folder.
wiki_ids = set(df['Wikipedia_ID'].astype(str))

# count matching folders/files ("<id>.xml.gz" -> stem "<id>.xml" -> "<id>")
matching_files = [
    file for file in folder_path.glob("*.xml.gz")
    if file.stem.replace('.xml', '') in wiki_ids
]


print(f"Number of matching folders/files: {len(matching_files)}")
print(f"Number of unique Wiki id's: {df['Wikipedia_ID'].nunique()}")

7605 out of 11681 films have gone through the Stanford pipeline.

In [None]:
def get_sentence_word_metadata(id, print_output=False, base_dir=None):
    """Return the Stanford pipeline token metadata stored for one plot summary.

    Reads `<id>.xml.gz` from `base_dir` (or from the notebook-level `folder_path` when
    `base_dir` is None), parses the XML and collects one dict per `<token>` element found
    under a `<tokens>` child of each sentence. Each dict maps the token's child tag names
    to their text; the rest of the notebook relies on the keys 'word', 'lemma' and 'POS'.

    Args:
        id: Identifier used to build the file name `<id>.xml.gz`. The name shadows the
            builtin `id` and is kept only so existing callers keep working.
        print_output: If True, print a header per sentence and `word (lemma) => POS`
            for every token.
        base_dir: Optional directory that holds the `.xml.gz` files. Defaults to the
            global `folder_path`.

    Returns:
        A flat list of token dicts in document order (tokens are not grouped by sentence).
        An empty list if the file does not exist or if reading/parsing it fails.
    """
    directory = folder_path if base_dir is None else Path(base_dir)
    gz_file_path = directory / f"{id}.xml.gz"

    if not gz_file_path.is_file():
        #print(f"Unable to find the corresponding .gz file for {id}.")
        return []

    try:
        # Give the parser the raw bytes so it honours the encoding declared in the XML
        # prolog, instead of decoding everything as UTF-8 beforehand.
        with gzip.open(gz_file_path, 'rb') as gz_file:
            root = ET.fromstring(gz_file.read())

        # root[0][0] is expected to be the element that contains the sentence elements
        sentences = root[0][0]
        summary_word_metadata = []
        for sentence in sentences:
            if print_output:
                print(f"\n\n====================== Sentence n°{sentence.attrib['id']} ======================")
            # only the direct <tokens> children carry token data; other children are skipped
            for tokens in sentence.findall("tokens"):
                for token in tokens:
                    attribs = {c.tag: c.text for c in token}
                    if print_output:
                        print(f"{attribs['word']} ({attribs['lemma']}) => {attribs['POS']}")
                    summary_word_metadata.append(attribs)
        return summary_word_metadata

    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort a loop over many IDs.
        print(f"An error occurred while processing {gz_file_path}: {e}")
        return []

def filter_words_by_pos(tokens_metadata, pos_tags):
    """Return the 'word' of every token dict whose 'POS' value is in `pos_tags`.

    The order of `tokens_metadata` is preserved. Each entry must have a 'POS' key,
    and matching entries must also have a 'word' key.
    """
    selected_words = []
    for token_info in tokens_metadata:
        if token_info['POS'] not in pos_tags:
            continue
        selected_words.append(token_info['word'])
    return selected_words


def generate_word_cloud(region, pos_tags, title, mask_image=None, sample_output=True):
    """Draw a word cloud of the words with the given POS tags for one region.

    For every ID in `region_ids[region]`, the token metadata is loaded with
    `get_sentence_word_metadata`, reduced to the words whose POS tag is in `pos_tags`,
    and the words are counted. The counts are then rendered as a word cloud.

    Args:
        region: Key of the global `region_ids` dict, which must be defined before calling.
        pos_tags: Collection of POS tags to keep.
        title: Label used in the printed sample and in the figure title.
        mask_image: Optional array passed to WordCloud as `mask`.
        sample_output: If True, print the 10 most common words before plotting.

    NOTE(review): an earlier cell in this notebook defines `generate_word_cloud` with a
    different signature (word_count, title, mask_image); this definition replaces it.
    """
    word_counter = Counter()
    for wiki_id in region_ids[region]:
        tokens_metadata = get_sentence_word_metadata(wiki_id)
        word_counter.update(filter_words_by_pos(tokens_metadata, pos_tags))

    if sample_output:
        # print sample output for verification
        print(f"\nSample of filtered words for {region} - {title}:")
        print(word_counter.most_common(10))

    if not word_counter:
        # generate_from_frequencies() raises on an empty mapping, e.g. when no .xml.gz
        # file was found for this region. Report it instead of failing in the plot code.
        print(f"No words found for {region} - {title}; skipping the word cloud.")
        return

    # generate and display the word cloud with mask if provided
    wordcloud = WordCloud(width=800, height=400, background_color='white', mask=mask_image,
                          contour_color='black', contour_width=1).generate_from_frequencies(word_counter)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"{title} Word Cloud for {region}")
    plt.show()

In [None]:
# define path

# get IDs by region: {region name: set of Wikipedia_IDs as strings}
# na=False treats a missing 'Continents' label as "no match"; without it a NaN in the
# column yields a mask that cannot be used for boolean indexing.
region_ids = {
    region: set(
        df.loc[df['Continents'].str.contains(region, na=False), 'Wikipedia_ID'].astype(str)
    )
    for region in ("America", "Europe", "Both")
}

# POS tags for nouns or verbs
# !! to be completed
noun_tags = [
    'NN',
    'NNS',
    'NNP',
    'NNPS',
]
verb_tags = [
    'VB',
    'VBD',
    'VBG',
    'VBN',
    'VBP',
    'VBZ',
]

# Noun word cloud for the America region, drawn inside the America mask
generate_word_cloud(
    region="America",
    pos_tags=noun_tags,
    title="Nouns in America",
    mask_image=americas_mask,
)