# PLOT VISUALISATIONS

In [1]:
import sys
import os

import itertools

import dask
import numpy as np
import pandas as pd
from dask.dataframe import from_pandas
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
# set Dask configuration to avoid automatic string conversion in DataFrames
dask.config.set({"dataframe.convert-string": False})

import requests
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# add the path to the parent directory of `src` to the system path
sys.path.insert(0, os.path.abspath('../..'))
# import custom utility functions from the `src` package
from src.utils.data_preparation import *

# reload all modules (except those excluded by %aimport) before executing the Python code
%load_ext autoreload
%autoreload 2

Perhaps you already have a cluster running?
Hosting the HTTP server on port 46693 instead


In [2]:
from datetime import datetime


def get_journal_date_issue_from_page(x_str):
    try:
        components = x_str.split("-")

        # validate minimum number of components
        if len(components) < 4:
            raise ValueError(f"Invalid input format: {x_str}")

        # extract parts
        issue_id = "-".join(components[:-1])
        journal = components[0]
        date_str = "-".join(np.array(components)[[1, 2, 3]])
        date = datetime.strptime(date_str, "%Y-%m-%d")

        return (journal, issue_id, date)

    except Exception as e:
        # return fallback values
        print(f"Error processing input '{x_str}': {e}")
        return None, None, None

In [3]:
issues_df = pd.read_parquet("../../data/data_preparation/samples/issues_20241112-135609.parquet.gzip")
issues_df.sample(n=5)

Unnamed: 0,id,cdt,pp,iiif_manifest_uri,ci_id,ci_tp,ci_t,ci_l,ci_ro,journal,date
18544,jdpl-1820-12-16-a,2024-03-27 16:37:13,jdpl-1820-12-16-a-p0002,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k42...,jdpl-1820-12-16-a-i0004,article,"... – . , . FitAmiE.",fr,6,jdpl,1820-12-16
95660,lepetitparisien-1938-12-01-a,2024-03-28 13:23:19,lepetitparisien-1938-12-01-a-p0010,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k66...,lepetitparisien-1938-12-01-a-i0181,ad,Adv. 46 Page 10,,181,lepetitparisien,1938-12-01
143491,oeuvre-1927-06-05-a,2024-02-25 23:19:54,oeuvre-1927-06-05-a-p0003,https://gallica.bnf.fr/iiif/ark:/12148/bpt6k46...,oeuvre-1927-06-05-a-i0129,ad,Publicité,,55,oeuvre,1927-06-05
103007,obermosel-1917-04-11-a,2024-03-01 14:56:59,obermosel-1917-04-11-a-p0002,,obermosel-1917-04-11-a-i0026,article,Prinzeß Lolos Verzicht. 1. Fortsetzung.,de,23,obermosel,1917-04-11
115041,luxwort-1936-05-04-a,2024-03-01 20:36:53,luxwort-1936-05-04-a-p0002,,luxwort-1936-05-04-a-i0047,article,Einzelergebnisfe.,de,43,luxwort,1936-05-04


In [4]:
journals = list(set(issues_df.sort_values(by=["journal"])["journal"].drop_duplicates()))

In [5]:
journals = [j for j in journals if j not in ["luxwort"]]

In [None]:
k = 12000

i = 0
nb_journals = len(journals)
for journal in journals:
    print(f"Processing journal: {journal}")
    i += 1
    print(f"{i}/{nb_journals}")
    journal_issues_df = issues_df[issues_df["journal"] == journal]
    
    # get canonical data
    print(f"Reading bbxs...")
    pages_df = pd.read_parquet(f"../../data/data_preparation/samples/journals/{journal}_sample_{k}_pages.parquet.gzip")
    
    journal_issues_df = journal_issues_df.rename(
        columns={"id": "issue_id", "cdt": "issue_cdt", "pp": "page_id"}
    )
    pages_df = pages_df.rename(columns={"id": "page_id", "cdt": "page_cdt"})
    
    print(f"Merging pages and issues...")
    data = pd.merge(
        pages_df,
        journal_issues_df,
        left_on=["page_id", "pOf"],
        right_on=["page_id", "ci_id"],
        how="left",
    )
    
    # select only rows where 'journal' is NaN
    na_mask = data["journal"].isna()

    print(f"Dealing with NAs...")
    # apply the function only to rows where 'journal' is NaN
    results = data.loc[na_mask, "page_id"].apply(
        lambda x: pd.Series(get_journal_date_issue_from_page(x))
    )
    
    # update only relevant columns (those that have nan values)
    data.loc[na_mask, ["journal", "issue_id", "date"]] = results.values
    data.sample(5)
    
    print(f"Get page urls...")
    page_url_df = data[["page_id", "iiif_img_base_uri"]].drop_duplicates()
    
    def get_page_url(df, page_id, full=True):
        journal = page_id.split("-")[0]
        url = df[df["page_id"] == page_id]["iiif_img_base_uri"].to_list()[0]

        # diff across journals
        # if journal == ... do
        if full:
            url = url + "/full/full/0/default.jpg"
        else:
            url = url + "/10,10,5000,8000/full/0/default.jpg"  # modify

        return url
    
    ########################################
    ##### number of pages across issues#####
    ########################################
    df = data[["issue_id", "page_id", "date"]].copy().drop_duplicates()
    df["year"] = df["date"].dt.year
    df["decade"] = (df["year"] // 10) * 10

    # set time period to get time windows for each issue, e.g. Y, M, W, D, ...
    time_period = "Y"
    df["timewindow"] = df["date"].dt.to_period(time_period).astype(str)
    df["timewindow"] = df["decade"]  # TO RM !!!
    
    df = (
        df.groupby(["timewindow", "issue_id"])["page_id"]
        .nunique()
        .reset_index(name="nb_pages")
    )
    # df = df.groupby(["timewindow"])["nb_pp"].mean().reset_index(name = "avg_nb_pp")

    sns.boxplot(
        data=df, x="timewindow", y="nb_pages", hue="timewindow", fill=False, gap=0.1
    )
    plt.title(f"{journal}: Number of Pages across Time", fontsize=16)
    plt.xticks(rotation = 30)
    plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_nb_pages_through_time.png")
    plt.close()
    
    ########################################
    ### Number of Content Items (CIs) per page across issues
    ########################################
    
    df = data[["issue_id", "page_id", "date", "ci_id"]].copy().drop_duplicates()
    df["year"] = df["date"].dt.year
    df["decade"] = (df["year"] // 10) * 10

    # set time period to get time windows for each issue, e.g. Y, M, W, D, ...
    time_period = "Y"
    df["timewindow"] = df["date"].dt.to_period(time_period).astype(str)
    df["timewindow"] = df["decade"]  # TO RM !!!

    # get nb of CIs per page
    df = df.groupby(["timewindow", "page_id"])["ci_id"].nunique().reset_index(name="nb_cis")
    # df = df.groupby(["timewindow"])["nb_pp"].mean().reset_index(name = "avg_nb_pp")

    sns.boxplot(data=df, x="timewindow", y="nb_cis", hue="timewindow", fill=False, gap=0.1)
    plt.title(f"{journal}:Number of Content Items per Page across Time", fontsize=16)
    plt.xticks(rotation = 30)
    plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_nb_cis_per_page_through_time_boxplot.png")
    plt.close()
    
    f, ax = plt.subplots(figsize=(11, 6))

    # daw a violinplot
    sns.violinplot(
        data=df,
        x="timewindow",
        y="nb_cis",
        hue="timewindow",
        cut=1,
        linewidth=0.8,
        palette="Set3",
    )
    plt.title(f"{journal}:Number of Content Items per Page across Time", fontsize=16)
    plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_nb_cis_per_page_through_time_violin.png")
    plt.close()
    
    ########################################
    ##### Proportion of Content Items Types (CI types) per page across issues
    ########################################
    
    df = data[["issue_id", "page_id", "date", "ci_id", "ci_tp"]].copy().drop_duplicates()
    df["year"] = df["date"].dt.year
    df["decade"] = (df["year"] // 10) * 10

    # set time period to get time windows for each issue, e.g. Y, M, W, D, ...
    time_period = "Y"
    df["timewindow"] = df["date"].dt.to_period(time_period).astype(str)
    df["timewindow"] = df["decade"]  # TO RM !!!

    # FILL NAs
    # df["ci_tp"] = df["ci_tp"].fillna("NaN")

    # calculate the count of each ci_tp per journal and decade
    df1 = df.groupby(["timewindow", "ci_tp"]).size().reset_index(name="ci_type_count")
    df2 = df.groupby(["timewindow"]).size().reset_index(name="total_count")
    df = pd.merge(df1, df2, on=["timewindow"])
    df["ci_proportion"] = df["ci_type_count"] / df["total_count"]
    
    # pivot the data so that each ci_tp becomes a column and the timewindow is the index
    df_pivot = df.pivot_table(
        index="timewindow",
        columns="ci_tp",
        values="ci_proportion",
        aggfunc="sum",
        fill_value=0,
    )

    # Now, plot the stacked bar plot
    plt.figure(figsize=(12, 8))
    df_pivot.plot(kind="bar", stacked=True, figsize=(12, 8))

    # Add labels and title
    plt.title(f"{journal}: Proportion of CI Types per Page across Time", fontsize=16)
    plt.xlabel("Time Window", fontsize=12)
    plt.ylabel("Proportion", fontsize=12)
    plt.xticks(rotation=45, ha="right")
    plt.legend(title="CI Type", bbox_to_anchor=(1.05, 1), loc="upper left")

    # Show the plot
    plt.tight_layout()
    plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_proportion_ci_tp_per_page.png")
    plt.close()
    
    def get_bbx_width(bbx):
        return bbx[2]


    def get_bbx_height(bbx):
        return bbx[3]


    def get_bbx_area(bbx):
        return bbx[2] * bbx[3]
    
    
    
    
    #########################
    ##### BY CI TYPE #####
    #########################
    
    ci_types = list(set(data["ci_tp"]))
    for ci_type in ci_types:
        print(ci_type)
        df = data[data["ci_tp"] == ci_type].copy()
        if len(df) == 0:
            continue
        bbx_cols = ["bbx_region", "bbx_paragraph"]
        for col in bbx_cols:
            if col in df.columns:
                df[col + "_str"] = df[col].apply(lambda x: str(x))

        df["year"] = df["date"].dt.year
        df["decade"] = (df["year"] // 10) * 10

        # set time period to get time windows for each issue, e.g. Y, M, W, D, ...
        time_period = "Y"
        df["timewindow"] = df["date"].dt.to_period(time_period).astype(str)
        df["timewindow"] = df["decade"]  # TO RM !!!
        
        
        #########################
        ##### NB PARAGRAPHS PER CI #####
        #########################
    
        # get nb of paragraphs per CI per page
        sub_df = (
            df[["timewindow", "page_id", "ci_id", "bbx_paragraph_str"]]
            .dropna(subset=["ci_id"])
            .drop_duplicates()
        )

        sub_df = (
            sub_df.groupby(["timewindow", "page_id", "ci_id"])["bbx_paragraph_str"]
            .nunique()
            .reset_index(name="nb_paragraphs")
        )

        f, ax = plt.subplots(figsize=(11, 6))

        # daw a violinplot
        sns.violinplot(
            data=sub_df,
            x="timewindow",
            y="nb_paragraphs",
            hue="timewindow",
            cut=1,
            linewidth=0.8,
            palette="Set3",
        )
        # Add labels and title
        plt.title(f"{journal}: Number of Paragraphs per CI", fontsize=16)
        plt.xlabel("Time Window", fontsize=12)
        plt.xticks(rotation=45, ha="right")
        plt.legend(title="CI Type", bbox_to_anchor=(1.05, 1), loc="upper left")

        # Show the plot
        plt.tight_layout()
        plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_{ci_type}_nb_paragraphs_per_ci.png")
        plt.close()
        
        #########################
        ##### #### Is one Paragraph spanning over the whole CI? #####
        #########################
        
        # get the coordinates of CI bounding boxes
        def get_superbox_bbx(boxes):
            if not boxes:
                return None

            min_x = float("inf")
            min_y = float("inf")
            max_x2 = float("-inf")
            max_y2 = float("-inf")

            # loop through each box
            for x, y, w, h in boxes:
                min_x = min(min_x, x)
                min_y = min(min_y, y)
                max_x2 = max(max_x2, x + w)
                max_y2 = max(max_y2, y + h)

            # calculate width and height of the containing box
            width = max_x2 - min_x
            height = max_y2 - min_y

            return [min_x, min_y, width, height]
        
        # compute the CI bounding boxes
        sub_df = df[
            ["page_id", "ci_id", "bbx_paragraph_str", "bbx_paragraph", "decade", "timewindow"]
        ].drop_duplicates(subset=["page_id", "ci_id", "bbx_paragraph_str"])

        ci_bbxs = (
            sub_df.groupby(["page_id", "ci_id"])["bbx_paragraph"]
            .apply(lambda group: get_superbox_bbx(list(group)))
            .reset_index(name="bbx_ci")
        )
        
        # get area of CI from coordinates
        ci_bbxs["ci_area"] = ci_bbxs["bbx_ci"].apply(get_bbx_area)
        
        sub_df = sub_df.merge(ci_bbxs, on=["page_id", "ci_id"])
        
        # get area of paragraphs bounding boxes
        bbx_cols = ["bbx_paragraph"]  # "bbx_region"
        for col in bbx_cols:
            sub_df[col + "_area"] = sub_df[col].apply(lambda x: x[2] * x[3])
            
        # compute the ratio
        sub_df["paragraph_ci_ratio"] = sub_df["bbx_paragraph_area"] / sub_df["ci_area"]
        # set time period to get time windows for each issue, e.g. Y, M, W, D, ...
        time_period = "Y"
        # df["timewindow"] = df["date"].dt.to_period(time_period).astype(str)
        sub_df["timewindow"] = sub_df["decade"]  # TO RM !!!
        
        sns.histplot(data=sub_df, x="paragraph_ci_ratio")
        
        plt.title(f"{journal}: Proportion of Paragraph spanning over CI", fontsize=16)

        # Show the plot
        plt.tight_layout()
        plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_{ci_type}_proportion_paragraph_span_over_ci.png")
        plt.close()
        
        # draw ridge plot
        # initialize the FacetGrid object
        pal = sns.cubehelix_palette(10, rot=-0.25, light=0.7)
        g = sns.FacetGrid(
            sub_df, row="decade", hue="decade", aspect=15, height=0.5, palette=pal
        )

        # draw the densities
        g.map(
            sns.kdeplot,
            "paragraph_ci_ratio",
            bw_adjust=0.5,
            clip_on=False,
            fill=True,
            alpha=1,
            linewidth=1.5,
        )
        g.map(
            sns.kdeplot, "paragraph_ci_ratio", clip_on=False, color="w", lw=1.5, bw_adjust=0.5
        )

        # passing color=None to refline() uses the hue mapping
        g.refline(y=0, linewidth=2, linestyle="-", color=None, clip_on=False)


        # label the plot in axes coordinates
        def label(x, color, label):
            ax = plt.gca()
            ax.text(
                0,
                0.2,
                label,
                fontweight="bold",
                color=color,
                ha="left",
                va="center",
                transform=ax.transAxes,
            )


        g.map(label, "decade")

        # Set the subplots to overlap
        g.figure.subplots_adjust(hspace=0.5)

        # rm axes details that don't play well with overlap
        g.set_titles("")
        g.set(yticks=[], ylabel="")
        g.despine(bottom=True, left=True)
        
        plt.title(f"{journal}: Proportion of Paragraph spanning over CI", fontsize=16)

        # Show the plot
        plt.tight_layout()
        plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_{ci_type}_proportion_paragraph_span_over_ci_ridge.png")
        plt.close()
        
        df = df.merge(
            sub_df[
                [
                    "page_id",
                    "ci_id",
                    "bbx_paragraph_str",
                    "bbx_ci",
                    "ci_area",
                    "bbx_paragraph_area",
                    "paragraph_ci_ratio",
                ]
            ],
            on=["page_id", "ci_id", "bbx_paragraph_str"],
        )
        
        #########################
        ######## Is one CI spanning over the whole Page?
        #########################
        def get_page_dimensions(bboxes):
            bboxes = bboxes.drop_duplicates()

            unit_width, unit_height = [0, 0]
            for x, y, w, h in bboxes:
                width = x + w
                height = y + h
                if width > unit_width:
                    unit_width = width
                if height > unit_height:
                    unit_height = height
            area = unit_width * unit_height

            return unit_width, unit_height, area
        
        
        # get the dimensions of each page
        dims = df.groupby(["page_id"])["bbx_paragraph"].apply(get_page_dimensions)
        dims = pd.DataFrame(
            dims.tolist(), index=dims.index, columns=["page_width", "page_height", "page_area"]
        ).reset_index()
        
        df = df.merge(dims, on=["page_id"])
        
        # compute ratio
        df["ci_page_ratio"] = df["ci_area"] / df["page_area"]
        
        sns.histplot(data=df, x="ci_page_ratio")
        plt.title(f"{journal}: Proportion of CI spanning over page", fontsize=16)
        plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_{ci_type}_proportion_ci_span_over_page.png")
        plt.close()
                
        #########################
        ##### How thicks are paragraphs (width paragraph / width page)? 
        #########################
        
        # compute paragraph width from paragraph bounding boxe coordinates
        df["paragraph_width"] = df["bbx_paragraph"].apply(get_bbx_width)
        # compute ratio
        df["paragraph_page_width_ratio"] = df["paragraph_width"] / df["page_width"]
        
        sns.histplot(data=df, x="paragraph_page_width_ratio")
        plt.title(f"{journal}: Width of paragraph", fontsize=16)
        plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_{ci_type}_width_paragraph.png")
        plt.close()
        
        plt.figure(figsize=(12, 6))

        # daw a violinplot
        sns.violinplot(
            data=df,
            x="timewindow",
            y="paragraph_page_width_ratio",
            hue="timewindow",
            legend=False,
            cut=1,
            linewidth=0.8,
            palette="Set3",
        )

        # plt.legend(loc='upper right')
        plt.tight_layout()
        plt.title(f"{journal}: Width of paragraph across time", fontsize=16)
        plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_{ci_type}_width_paragraph_across_time.png")
        plt.close()
        
        #########################
        ##### #### How heigh are paragraphs
        #########################
        # compute paragraph height from paragraph bounding boxe coordinates
        df["paragraph_height"] = df["bbx_paragraph"].apply(get_bbx_height)
        # compute ratio
        df["paragraph_page_height_ratio"] = df["paragraph_height"] / df["page_height"]
        
        sns.histplot(data=df, x="paragraph_page_height_ratio")
        plt.title(f"{journal}: Height of paragraph", fontsize=16)
        plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_{ci_type}_height_paragraph.png")
        plt.close()
        
        plt.figure(figsize=(12, 6))

        # daw a violinplot
        sns.violinplot(
            data=df,
            x="timewindow",
            y="paragraph_page_height_ratio",
            hue="timewindow",
            legend=False,
            cut=1,
            linewidth=0.8,
            palette="Set3",
        )

        # plt.legend(loc='upper right')
        plt.tight_layout()
        
        plt.title(f"{journal}: Height of paragraph across time", fontsize=16)
        plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_{ci_type}_height_paragraph_across_time.png")
        plt.close()
        
        #########################
        ##### #### How many lines are they in a paragraph? 
        #########################
        
        sub_df = df[
            ["page_id", "ci_id", "bbx_paragraph_str", "lines_bbxs", "decade", "timewindow"]
        ].drop_duplicates(subset=["page_id", "bbx_paragraph_str"])
        sub_df = sub_df.explode("lines_bbxs")

        sub_df["lines_bbxs"] = sub_df["lines_bbxs"].apply(lambda x: str(x))

        sub_df = sub_df.groupby(["page_id", "bbx_paragraph_str"])["lines_bbxs"].nunique().reset_index(name="nb_lines")
        
        sub_df = sub_df.merge(
            df[["page_id", "ci_id", "bbx_paragraph_str", "lines_bbxs", "decade", "timewindow"]],
            on=["page_id", "bbx_paragraph_str"],
        )
        
        plt.figure(figsize=(12, 6))
        sns.histplot(data=sub_df, x="nb_lines")
        plt.title(f"{journal}: Nb of lines in a paragraph", fontsize=16)
        plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_{ci_type}_nb_lines_paragraph.png")
        plt.close()
        
        f, ax = plt.subplots(figsize=(11, 6))

        # daw a violinplot
        sns.violinplot(
            data=sub_df,
            x="timewindow",
            y="nb_lines",
            hue="timewindow",
            cut=1,
            linewidth=0.8,
            palette="Set3",
        )
        plt.title(f"{journal}: Nb of lines in a paragraph across time", fontsize=16)
        plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_{ci_type}_nb_lines_paragraph_across_time.png")
        plt.close()
        
        #########################
        ######### Check overlaps between CIs in pages
        #########################
        
        # compute the CI bounding boxes
        sub_df = df[
            ["page_id", "ci_id", "bbx_paragraph_str", "bbx_paragraph", "decade", "timewindow"]
        ].drop_duplicates(subset=["page_id", "ci_id", "bbx_paragraph_str"])

        ci_bbxs = (
            sub_df.groupby(["page_id", "ci_id"])["bbx_paragraph"]
            .apply(lambda group: get_superbox_bbx(list(group)))
            .reset_index(name="bbx_ci")
        )
        
        def get_points_from_bbx(bbx):
            (x, y, w, h) = bbx

            bbx = {}
            bbx["x1"] = x
            bbx["y1"] = y
            bbx["x2"] = x + w
            bbx["y2"] = y + h

            return bbx


        def get_intersection_coordinates(bbx1, bbx2):
            assert bbx1["x1"] < bbx1["x2"]
            assert bbx1["y1"] < bbx1["y2"]
            assert bbx2["x1"] < bbx2["x2"]
            assert bbx2["y1"] < bbx2["y2"]

            # determine the coordinates of the intersection rectangle
            bbx = {}
            bbx["x1"] = max(bbx1["x1"], bbx2["x1"])
            bbx["y1"] = max(bbx1["y1"], bbx2["y1"])
            bbx["x2"] = min(bbx1["x2"], bbx2["x2"])
            bbx["y2"] = min(bbx1["y2"], bbx2["y2"])

            if bbx["x2"] < bbx["x1"] or bbx["y2"] < bbx["y1"]:
                return None

            return bbx


        def get_intersection_area(inter_bbx):
            if inter_bbx is None:
                return 0
            return (inter_bbx["x2"] - inter_bbx["x1"]) * (inter_bbx["y2"] - inter_bbx["y1"])
        
        
        # processing function for each group
        def process_group(bbx_list):
            # check if only one element in list
            if len(bbx_list) == 1:
                return 0

            # convert bounding box tuples to dictionaries
            points_list = [get_points_from_bbx(bbx) for bbx in bbx_list]

            # calculate all pairs
            intersection_areas = []
            for bbx1, bbx2 in itertools.combinations(points_list, 2):
                inter_bbx = get_intersection_coordinates(bbx1, bbx2)
                area = get_intersection_area(inter_bbx)
                intersection_areas.append(area)

            return intersection_areas
        
        # get intersection areas for each page
        ci_bbxs = (
            ci_bbxs.groupby("page_id")["bbx_ci"]
            .apply(list)
            .apply(process_group)
            .reset_index(name="intersection_area")
            .explode("intersection_area")
        )
        
        # normalize intersection area by page area
        ci_bbxs = ci_bbxs.merge(df[["page_id", "page_area"]].drop_duplicates(), on=["page_id"])
        ci_bbxs["ratio"] = ci_bbxs["intersection_area"] / ci_bbxs["page_area"]
        
        sns.histplot(data=ci_bbxs, x="ratio")
        plt.title(f"{journal}: CI overlaps", fontsize=16)
        plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_{ci_type}_ci_overlaps.png")
        plt.close()
        
        sns.histplot(data=ci_bbxs[ci_bbxs["ratio"] > 0.1], x="ratio")
        plt.title(f"{journal}: CI overlaps", fontsize=16)
        plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_{ci_type}_ci_overlaps_bigger.png")
        plt.close()
        #########################
        ######### Check overlaps between paragraphs in CI
        #########################
        
        # get intersection areas for each paragraph of a same CI in each page

        sub_df = df[
            ["page_id", "ci_id", "bbx_paragraph_str", "bbx_paragraph", "decade", "timewindow"]
        ].drop_duplicates(subset=["page_id", "ci_id", "bbx_paragraph_str"])

        sub_df = (
            sub_df.groupby(["page_id", "ci_id"])["bbx_paragraph"]
            .apply(list)
            .apply(process_group)
            .reset_index(name="intersection_area")
            .explode("intersection_area")
        )
        
        # normalize intersection area by ci area or page area
        sub_df = sub_df.merge(
            df[["page_id", "ci_id", "page_area", "ci_area"]].drop_duplicates(),
            on=["page_id", "ci_id"],
        )
        
        sub_df["ratio"] = sub_df["intersection_area"] / sub_df["ci_area"]
        sns.histplot(data=sub_df[sub_df["ratio"] > 0.001], x="ratio")
        plt.title(f"{journal}: Paragraph overlaps", fontsize=16)
        plt.savefig(f"/scratch/students/danae/data/data_preparation/eda_vizualisation/journals/{journal}_{ci_type}_paragraph_overlaps_bigger.png")
        plt.close()
        
                
        
        
        

Processing journal: lepetitparisien
1/36
Reading bbxs...
Merging pages and issues...
Dealing with NAs...
Get page urls...
ad


  self._figure.tight_layout(*args, **kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  self._figure.tight_layout(*args, **kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()


image


  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()


table


  self._figure.tight_layout(*args, **kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  self._figure.tight_layout(*args, **kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()


article


  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()


<NA>
Processing journal: luxzeit1858
2/36
Reading bbxs...
Merging pages and issues...
Dealing with NAs...
Get page urls...
ad


  self._figure.tight_layout(*args, **kwargs)
  func(*plot_args, **plot_kwargs)
  self._figure.tight_layout(*args, **kwargs)
  func(*plot_args, **plot_kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()


table


  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()


article


  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()


<NA>
Processing journal: indeplux
3/36
Reading bbxs...
Merging pages and issues...
Dealing with NAs...
Get page urls...
ad


  self._figure.tight_layout(*args, **kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  self._figure.tight_layout(*args, **kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  func(*plot_args, **plot_kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()


death_notice


  self._figure.tight_layout(*args, **kwargs)
  func(*plot_args, **plot_kwargs)
  self._figure.tight_layout(*args, **kwargs)
  func(*plot_args, **plot_kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()


article


  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()


<NA>
Processing journal: jdpl
4/36
Reading bbxs...
Merging pages and issues...
Dealing with NAs...


KeyboardInterrupt: 