In [None]:
from collections import Counter
from pathlib import Path
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from wordcloud import WordCloud
from ipywidgets import interact, Dropdown, VBox

%load_ext autoreload
%autoreload 2

# Fetch the NLTK resources this notebook relies on; quiet=True keeps the
# download log out of the cell output.
for resource_name in ('stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource_name, quiet=True)

In [2]:
# Load Data and Plot Summaries
DATASET_FILEPATH = Path("data/merged_movie_metadata.csv")
PLOT_SUMMARIES_FILEPATH = Path("../MovieSummaries/plot_summaries.txt")

# Read the merged metadata and keep a single row per Wikipedia_ID
df = pd.read_csv(DATASET_FILEPATH).drop_duplicates(subset=['Wikipedia_ID'])

# Each usable line of the summaries file is "<wiki_id>\t<summary>";
# lines without a tab separator (after stripping) are skipped.
plot_summaries = {}
with open(PLOT_SUMMARIES_FILEPATH, 'r', encoding='utf-8') as summaries_file:
    for raw_line in summaries_file:
        movie_id, tab, summary_text = raw_line.strip().partition('\t')
        if tab:
            plot_summaries[int(movie_id)] = summary_text

# Attach the plot summary of each movie, looked up by its Wikipedia_ID
df['Plot_Summary_Base'] = df['Wikipedia_ID'].map(plot_summaries)


# Loading the datasets for the splits
dataset_path = "data/data_p3"

# Split name -> CSV file name inside `dataset_path`
# (the Europe file is spelled with a lowercase "europe" on disk)
split_csv_names = {
    "Biggest_America": "Data_Biggest_America.csv",
    "Biggest_Both": "Data_Biggest_Both.csv",
    "Biggest_Europe": "Data_Biggest_europe.csv",
    "Biggest_Gap_diff_Eu_Am": "Data_Biggest_Gap_diff_Eu_Am.csv",
    "Smallest_Gap_diff_Eu_Am": "Data_Smallest_Gap_diff_Eu_Am.csv",
}

# Dictionary for the splits
dataset_subsets = {
    split_name: pd.read_csv(f"{dataset_path}/{csv_name}")
    for split_name, csv_name in split_csv_names.items()
}

# Individual handles on each split, sharing the frames stored in the dictionary
Data_Biggest_America = dataset_subsets["Biggest_America"]
Data_Biggest_Both = dataset_subsets["Biggest_Both"]
Data_Biggest_Europe = dataset_subsets["Biggest_Europe"]
Data_Biggest_Gap_diff_Eu_Am = dataset_subsets["Biggest_Gap_diff_Eu_Am"]
Data_Smallest_Gap_diff_Eu_Am = dataset_subsets["Smallest_Gap_diff_Eu_Am"]

In [3]:
# Folder containing the processed plot summaries ("*.xml.gz" files)
folder_path = Path("../corenlp_plot_summaries_augmented")

# Stop right here when the folder is not where the notebook expects it
folder_is_present = folder_path.exists()
assert folder_is_present, "Must configure the correct path to corenlp_plot_summaries"

In [None]:
# count matching folders/files
# IDs for which a processed "<id>.xml.gz" file exists in folder_path
processed_ids = set()
for archive_path in folder_path.glob("*.xml.gz"):
    processed_ids.add(archive_path.stem.replace(".xml", ""))

# IDs present in the metadata, cast to str so they compare with file names
available_ids = set(df["Wikipedia_ID"].astype(str))

# Processed files that belong to a movie of the metadata table
matching_files = [
    folder_path / f"{movie_id}.xml.gz"
    for movie_id in processed_ids & available_ids
]

print(f"Number of matching folders/files: {len(matching_files)}")
print(f"Number of unique Wiki id's: {df['Wikipedia_ID'].nunique()}")

In [5]:
from src.utils.data_utils import filter_words_by_pos, filter_words_by_pos_ngram, get_sentence_word_metadata

# Define Regions
# Map each region label to the set of Wikipedia IDs (as strings) of the movies
# whose 'Continents' value contains that label.
# na=False makes rows with a missing 'Continents' value count as "no match";
# without it the mask would contain NaN and the boolean indexing would raise.
# regex=False matches the label as plain text.
region_ids = {
    region_label: set(
        df.loc[
            df['Continents'].str.contains(region_label, na=False, regex=False),
            'Wikipedia_ID',
        ].astype(str)
    )
    for region_label in ("America", "Europe", "Both")
}

After we've successfully extracted the words we wanted, the next step is to visualize them. To do this, we will use a word cloud to display the most frequent words. In a word cloud, the size of each word represents its frequency of occurrence, which gives an immediate grasp of which words occur most often.

In [6]:
def generate_word_cloud(region, pos_tags, title, mask_image=None, sample_output=True, ngrams=1, subset_name=None, store_to=None, dpi=300):
    """Generate a word cloud for a region (optionally restricted to a subset) and POS tags.

    Word (or n-gram) counts are accumulated over every movie ID selected for
    `region`, then rendered with `WordCloud` and either saved or shown.

    Args:
        region: Key of `region_ids` selecting the movie IDs to process.
        pos_tags: POS tags forwarded to `filter_words_by_pos` /
            `filter_words_by_pos_ngram`.
        title: Title drawn above the word cloud (also used in printed messages).
        mask_image: Optional mask passed to `WordCloud(mask=...)`.
        sample_output: When True, print the 10 most common entries.
        ngrams: 1 counts single words via `filter_words_by_pos`; any other
            value is forwarded to `filter_words_by_pos_ngram`.
        subset_name: Optional key of `dataset_subsets`. When given, only IDs
            present in both the region and the subset are processed; a name
            missing from `dataset_subsets` selects no movie at all.
        store_to: Optional output path. When given, the figure is saved there
            and then closed instead of being shown.
        dpi: Resolution used for the figure and for the saved file.

    Returns:
        None. When nothing was counted, a message is printed and no figure is
        created.
    """
    # Restrict the region's movies to the requested subset, if any
    region_subset_ids = region_ids[region]
    if subset_name:
        subset_ids = set()
        if subset_name in dataset_subsets:
            subset_ids = set(dataset_subsets[subset_name]['Wikipedia_ID'].astype(str))
        region_subset_ids = region_subset_ids.intersection(subset_ids)

    word_counter = Counter()
    for wiki_id in tqdm(region_subset_ids):
        tokens_metadata = get_sentence_word_metadata(folder_path, wiki_id)
        if ngrams == 1:
            filtered_words = filter_words_by_pos(tokens_metadata, pos_tags)
        else:
            filtered_words = filter_words_by_pos_ngram(tokens_metadata, pos_tags, ngrams)
        word_counter.update(filtered_words)

    # WordCloud cannot render an empty frequency table; skip instead of
    # aborting a long batch run on an empty region/subset combination.
    if not word_counter:
        print(f"\nNo words found for {region} - {title}; skipping word cloud.")
        return

    if sample_output:
        # Print sample output for verification
        print(f"\nSample of filtered words for {region} - {title}:")
        print(word_counter.most_common(10))

    # Generate the word cloud with mask if provided
    wordcloud = WordCloud(width=1200, height=1200, background_color='white', mask=mask_image,
                          contour_color='black', contour_width=1).generate_from_frequencies(word_counter)

    # Save or show the word cloud with higher DPI
    fig = plt.figure(figsize=(10, 10), dpi=dpi)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    if store_to:
        try:
            plt.savefig(store_to, dpi=dpi, bbox_inches='tight', pad_inches=0.1)
        finally:
            # pyplot keeps every figure alive until it is closed; release it so
            # that batch generation does not accumulate high-resolution figures.
            plt.close(fig)
    else:
        plt.show()


In [7]:
# POS Tags Groups
noun_tags = [
    'NN', 'NNA', 'NNC', 'NNS', 'NNP', 'NNPS',
]
verb_tags = [
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBS', 'VBZ',
]
adjective_tags = [
    'JJ', 'JJR', 'JJS', 'JJC', 'JJA', 'JJF', 'JJM',
]

# Word Cloud Mask Images
# Each mask image is opened and converted to a NumPy array, in the same order
# as the names on the left-hand side.
americas_mask, europe_mask, both_mask = (
    np.array(Image.open(mask_file))
    for mask_file in (
        "src/ressource/Location_North_America.png",
        "src/ressource/Location_Europe.png",
        "src/ressource/Location_Both.png",
    )
)

In [8]:
from ipywidgets import interact, Dropdown, IntSlider

# Dropdown for Region (one entry per key of region_ids, Europe preselected)
region_dropdown = Dropdown(
    options=list(region_ids),
    value="Europe",
    description="Region:",
)

# Dropdown for Subset (None = no subset restriction, Biggest_Both preselected)
subset_dropdown = Dropdown(
    options=[None, *dataset_subsets],
    value="Biggest_Both",
    description="Subset:",
)

# Dropdown for POS Tags (NN preselected)
pos_tags_dropdown = Dropdown(
    options=["NN", "VB", "JJ"],
    value="NN",
    description="POS Tags:",
)

# Slider for N-grams (1 to 5 in steps of 1, starting at 1)
ngrams_slider = IntSlider(
    value=1,
    min=1,
    max=5,
    step=1,
    description="N-grams:",
)

# Region to Mask Mapping
region_to_mask = dict(
    zip(
        ("America", "Europe", "Both"),
        (americas_mask, europe_mask, both_mask),
    )
)


In [9]:
# Output folder for the generated word-cloud images
store = Path("docs/assets/img/wordclouds/")
# parents=True also creates missing intermediate folders (e.g. on a fresh
# checkout); exist_ok=True keeps re-running this cell harmless.
store.mkdir(parents=True, exist_ok=True)

In [None]:
# WARNING this took 90min on my laptop.

# Values to sweep over when pre-rendering the word clouds
regions = list(region_ids)
pos_tags = ["NN", "VB", "JJ"]
ngrams = [1, 2, 3]
subsets = [None, *dataset_subsets]

# Every (region, POS tag, n-gram size, subset) combination; subsets vary
# fastest, regions slowest.
combinations = (
    (region, pos_tag, ngram, subset)
    for region in regions
    for pos_tag in pos_tags
    for ngram in ngrams
    for subset in subsets
)

for region, pos_tag, ngram, subset in combinations:
    mask = region_to_mask.get(region)
    # One image per combination; the file name encodes all four parameters
    filename = f"{region}__{subset}__{pos_tag}__{ngram}.jpg"
    subset_label = subset or 'All available'
    generate_word_cloud(
        region=region,
        pos_tags=[pos_tag],
        title=f"[POS={pos_tag}] - '{subset_label}' in {region.title()}",
        subset_name=subset,
        sample_output=False,
        mask_image=mask,
        ngrams=ngram,
        store_to=store / filename,
        dpi=300,
    )
    print(filename)

In [None]:

# Interactive Word Cloud Function
def interactive_word_cloud(region, subset_name, pos_tag, ngrams):
    """Draw the word cloud matching the current widget selections.

    Args:
        region: Region name; also selects the mask from `region_to_mask`.
        subset_name: Subset name forwarded to `generate_word_cloud` (may be None).
        pos_tag: Single POS tag, wrapped in a list for `generate_word_cloud`.
        ngrams: N-gram size forwarded to `generate_word_cloud`.
    """
    cloud_options = dict(
        region=region,
        pos_tags=[pos_tag],
        title=f"{pos_tag} Word Cloud with {ngrams}-grams",
        mask_image=region_to_mask.get(region),  # None when the region has no mask
        subset_name=subset_name,
        ngrams=ngrams,
    )
    generate_word_cloud(**cloud_options)

# Bind Widgets to Function
# Each key is the name of the interactive_word_cloud parameter that the widget drives
widget_bindings = {
    "region": region_dropdown,
    "subset_name": subset_dropdown,
    "pos_tag": pos_tags_dropdown,
    "ngrams": ngrams_slider,
}
interact(interactive_word_cloud, **widget_bindings)