# OID Semantic Landscape – Full Pipeline


In [None]:
# DF, PDFs, Numpy, and regular expressions
import pandas as pd
import re
import numpy as np
import fitz 

# Text Splitting
import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embeddings, dimensionality reduction, and clustering
from sentence_transformers import SentenceTransformer
import umap.umap_ as umap
import hdbscan

# Interpretation
from LengthSafeClusterThemeExtractor import ClusterThemeExtractor

# Visualization
import datamapplot

# Step 1 - Cleaning and Chunking

Import DataFrame and Define Paths

In [None]:
# Load the master bibliography table. This dataset includes all PDFs from
# Zotero and all the additional ones we gathered.
# The cells below read these columns from it: 'File Attachments',
# 'Publication Year', 'Author', 'Title', 'Journal Abbreviation' and 'region'.
df=pd.read_csv('.../map_df_FINAL.csv') 

In [None]:
# Build the flat list of PDF paths referenced by the bibliography.
# 'File Attachments' holds one or more ';'-separated paths per entry, so split
# each cell and give every attachment its own row.
attachments = df['File Attachments'].dropna().str.split(';').explode()

# Keep only the attachments that refer to PDFs. regex=False matches '.pdf'
# literally; as a regular expression the dot would match any character
# (e.g. 'notes_pdf.html' would slip through).
attachments = attachments[attachments.str.contains('.pdf', regex=False)]

# Some paths carry a single leading space left over from the '; ' separator;
# drop it when present.
paths = [s[1:] if s.startswith(" ") else s for s in attachments]

Trim Footnotes and Trim Bibliography Functions

In [None]:
def trim_bibliography(page_content):
    """Cut a text off at its reference-list heading.

    A heading is a line holding nothing but the word "References" or
    "Bibliography" (any letter case), optionally preceded by a section
    number and optionally followed by a colon. Matching a whole line, rather
    than the bare word anywhere, keeps ordinary sentences such as "this
    references earlier work" from truncating the article.

    When several heading lines exist, the text is cut at the last one so that
    as little body text as possible is discarded.

    Parameters
    ----------
    page_content : str
        Text to trim (a single page or a whole article).

    Returns
    -------
    str
        The text before the heading with surrounding whitespace stripped, or
        the input unchanged when no heading line is found.
    """
    heading_pattern = (
        r'^[ \t]*(?:\d+\.?[ \t]*)?(?:references|bibliography)[ \t]*:?[ \t]*\r?$'
    )
    headings = list(
        re.finditer(heading_pattern, page_content, re.IGNORECASE | re.MULTILINE)
    )

    if not headings:
        return page_content

    return page_content[:headings[-1].start()].strip()

###########################
def remove_footnotes(text):
    """Strip numeric footnote markers from a piece of text.

    Three marker shapes are removed:

    * bracketed numbers, e.g. ``[12]``
    * parenthesised numbers, e.g. ``(3)``
    * a number followed by a period at the start of a line, e.g. the ``12.``
      opening a footnote line

    The third shape is matched only at the start of a line and only when
    whitespace (or the end of the line) follows. Matching ``\\d+\\.``
    anywhere would also eat the integer part of decimals ("0.05" -> "05")
    and numbers that end a sentence ("in 2020." -> "in ").

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The text without the markers, stripped of leading and trailing
        whitespace. Only the markers are removed; the footnote wording
        itself is left in place.
    """
    footnote_pattern = r'\[\d+\]|\(\d+\)|^[ \t]*\d+\.(?=\s|$)'

    text_without_footnotes = re.sub(
        footnote_pattern, '', text, flags=re.MULTILINE
    )

    return text_without_footnotes.strip()

### Read and Chunk PDFs

In [None]:
# LangChain recursive splitter used to cut each article into chunks:
# chunk size 1300 as measured by len() (i.e. characters), with no overlap
# between consecutive chunks.
splitter_settings = dict(chunk_size=1300, chunk_overlap=0, length_function=len)
rec_text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

In [None]:
# Read every PDF, clean it, split it into chunks and collect one record per
# chunk. Records are gathered in a plain list and turned into a dataframe once
# at the end; concatenating a one-row dataframe per chunk re-copies the whole
# table on every iteration.
chunk_columns = ['Publication Year', 'Authors', 'Title', 'Source', 'chunk_number', 'text']
chunk_rows = []

# Paths that could not be matched to any entry of the main df
failed_paths = []

# Iterate over each path we have in our Zotero Library
for path in paths:
    try:
        # Find the bibliography entry this PDF belongs to. regex=False makes
        # the path a literal substring: file paths routinely contain regex
        # metacharacters such as '(', ')', '.', '+' or '['.
        matching_rows = df[df['File Attachments'].str.contains(path, na=False, regex=False)]

        if len(matching_rows) == 0:
            print(f"No matching entry found for path: {path}")
            print()
            # list.append works in place and returns None, so its result must
            # not be assigned back to failed_paths.
            failed_paths.append(path)
            continue

        # Metadata that ties each chunk back to its article (first match wins)
        year = matching_rows['Publication Year'].values[0]
        author = matching_rows['Author'].values[0]
        title = matching_rows['Title'].values[0]
        source = matching_rows['Journal Abbreviation'].values[0]

        # Extract the text page by page, removing footnote markers as we go
        with fitz.open(path) as pdf_document:
            print(f'Processing {title} with {pdf_document.page_count} pages...')
            page_texts = [
                remove_footnotes(pdf_document[page_num].get_text())
                for page_num in range(pdf_document.page_count)
            ]

        # One big string for the full article, each page followed by a space
        full_text = "".join(page_text + " " for page_text in page_texts)

        # Remove bibliography
        full_text = trim_bibliography(full_text)
        print('Full text acquired, splitting into chunks...')

        # Apply the text splitter from langchain
        chunks = rec_text_splitter.split_text(full_text)
        print('The Article produced a total of ', len(chunks), ' chunks.')

        # Average chunk length, reported for information only
        chunk_lengths = [len(chunk) for chunk in chunks]
        res = 0 if len(chunk_lengths) == 0 else (float(sum(chunk_lengths)) / len(chunk_lengths))
        print(f'The average lenth of a chunk is of {res} characters')

        # One record per chunk, numbered from 0 within the article
        chunk_rows.extend(
            {
                'Publication Year': year,
                'Authors': author,
                'Title': title,
                'Source': source,
                'chunk_number': chunk_number,
                'text': chunk_text,
            }
            for chunk_number, chunk_text in enumerate(chunks)
        )

        print()

    except Exception as e:
        # Best effort: report the problem and move on to the next PDF
        print(f"Error processing path {path}: {e}")
        continue

# Main output dataframe: one row per chunk
chunk_df = pd.DataFrame(chunk_rows, columns=chunk_columns)

In [None]:
print(f'We have a total of {len(chunk_df)} chunks from {len(chunk_df.Title.unique())} articles')
chunk_df.sample(5)

In [None]:
chunk_df.to_csv('report_literature_chunked.zip', compression='zip')

In [None]:
# Report the PDFs that could not be matched to an entry of the main df.
# The first fragment ends with a space: adjacent string literals are joined
# with nothing in between, which used to print "pathswhich".
print(f'In total we failed to process {len(failed_paths)} PDF files for the following paths '
      f'which resulted like having no entries in the main df: {failed_paths}')

# Step 2 - Embedding

## First Embedding Step
In this first embedding step we calculate **an embedding vector for each chunk** derived from the text; we then reduce dimensionality with UMAP and cluster with HDBSCAN.

#### Text Embedding

In [None]:
# Texts to embed: every chunk whose 'text' is not missing, as a plain list
has_text = ~chunk_df['text'].isna()
documents = list(chunk_df.loc[has_text, 'text'].reset_index(drop=True))

In [None]:
# Load the "all-MiniLM-L6-v2" sentence-embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Encode every entry of `documents`
embeddings = model.encode(documents)

In [None]:
print(embeddings.shape)
# Similarity of the embeddings against themselves, printed for inspection.
# NOTE(review): presumably an N x N matrix over all chunks, which can be large;
# nothing else in the visible cells reads this first `similarities` value.
similarities = model.similarity(embeddings, embeddings)
print(similarities)

In [None]:
# Map each chunk_df row label to its embedding vector.
# `embeddings` holds one vector per chunk with non-missing text (that is how
# `documents` is built), so pair the vectors with the labels of exactly those
# rows. Zipping against the full index would shift every vector after a
# missing text onto the wrong row.
embedded_index = chunk_df.index[chunk_df['text'].notna()]
embeddings_dict = dict(zip(embedded_index, embeddings))

#### Dimensionality reduction

In [None]:
reducer = umap.UMAP(n_components = 2, n_jobs=1)

In [None]:
reduced_embedding = reducer.fit_transform(embeddings)

In [None]:
coordinates_dict = dict(zip(chunk_df.index, reduced_embedding))

#### Clustering

In [None]:
clusterer = hdbscan.HDBSCAN(min_cluster_size=10)

In [None]:
cluster_labels = clusterer.fit_predict(reduced_embedding)

In [None]:
clusters_dict = dict(zip(chunk_df.index, cluster_labels))
string_labels = cluster_labels.astype(str)

#### Match with chunk_df

In [None]:
# Attach cluster label, reduced coordinates and embedding vector to every
# chunk by looking its row label up in the three dictionaries built above.
for column, lookup in (
    ('map_cluster', clusters_dict),
    ('map_coordinates', coordinates_dict),
    ('embedding_vector', embeddings_dict),
):
    chunk_df[column] = chunk_df.index.map(lookup)

## Second Embedding Step
In the second embedding step we leverage the clustering from step one to calculate **average vectors for each paper that has multiple paragraphs in the same cluster**. 

Specifically: for papers with multiple paragraphs in the same cluster, we average their positions (in the original embedding space), while paragraphs of the same paper that fall in different clusters remain distinct. We then run UMAP again on the "averaged paragraphs" to derive their position in a reduced-dimensionality space and re-cluster using HDBSCAN.

After evaluating a first run, which still yielded many paper-specific clusters, **we run this section a second time**.

First Time
~~~
averaged_df = chunk_df.groupby(['Title', 'map_cluster']).agg({
    'embedding_vector': 'mean',
    'chunk_number': list,
    'Publication Year': 'first',
    'Authors': 'first'
}).reset_index()
~~~
$+$ we run the rest of the section until "Interpretation"

In [None]:
### Second Time
averaged_df = averaged_df.groupby(['Title', 'map_cluster']).agg({
    'embedding_vector': 'mean',
    'chunk_number': list,
    'Publication Year': 'first',
    'Authors': 'first'
}).reset_index()

In [None]:
averaged_df.head()

Step 1 - Create a novel array with the averaged vectors

In [None]:
embeddings_averages = np.stack(averaged_df.embedding_vector)

In [None]:
np.save('/Users/giovanni/Desktop/Forum Info and Democracy /Zotero Data/GarganText/embeddings/FINAL_01_pipeline_embeddings_step2.2.npy',
       embeddings_averages)

In [None]:
print(embeddings_averages.shape)
similarities = model.similarity(embeddings_averages, embeddings_averages)
print(similarities)

Step 2 - Project the vectors on a bidimensional space

In [None]:
reduced_embedding_avg = reducer.fit_transform(embeddings_averages)

In [None]:
reduced_embedding_avg

In [None]:
coordinates_avg_dict = dict(zip(averaged_df.index, reduced_embedding_avg))

In [None]:
np.save('/Users/giovanni/Desktop/Forum Info and Democracy /Zotero Data/GarganText/embeddings/FINAL_01_pipeline_umap_step2.2.npy',
       reduced_embedding_avg)

Step 3 - Run HDBSCAN Clustering

In [None]:
cluster_labels_avg = clusterer.fit_predict(reduced_embedding_avg)

In [None]:
clusters_dict_avg = dict(zip(averaged_df.index, cluster_labels_avg))
string_labels_avg = cluster_labels_avg.astype(str)

In [None]:
np.save('/Users/giovanni/Desktop/Forum Info and Democracy /Zotero Data/GarganText/embeddings/FINAL_01_pipeline_cluster_labels_step2.2.npy',
        clusters_dict_avg)

Step 4 - Map onto averaged_df and re-order columns

In [None]:
averaged_df['map_cluster']=averaged_df.index.map(clusters_dict_avg)
averaged_df['map_coordinates']=averaged_df.index.map(coordinates_avg_dict)

In [None]:
averaged_df= averaged_df[['Publication Year', 'Authors', 'Title', 'chunk_number',
                          'map_cluster', 'map_coordinates', 'embedding_vector']]
averaged_df.head()

# Step 3 - Interpretation
Interpretation was conducted quantitatively and qualitatively. We started by creating a dataset for interpretation.

In [None]:
def _flatten_one_level(values):
    """Return `values` with every list element expanded in place, one level deep."""
    flat = []
    for entry in values:
        if isinstance(entry, list):
            flat.extend(entry)
        else:
            flat.append(entry)
    return flat


# 'chunk_number' may hold lists of lists after the second averaging pass, so
# flatten it one level into a temporary column.
averaged_df['chunk_number_flat'] = averaged_df['chunk_number'].apply(_flatten_one_level)

# (Title, chunk_number) -> chunk text, built from chunk_df in row order
chunk_text_mapping = dict(
    zip(zip(chunk_df['Title'], chunk_df['chunk_number']), chunk_df['text'])
)

# Map and merge texts by matching both Title and chunk_number
def merge_texts(row):
    """Join, with single spaces, the texts of all chunks listed for this row.

    Chunks missing from chunk_text_mapping and empty texts are skipped.
    """
    title = row['Title']
    pieces = (
        chunk_text_mapping.get((title, number), '')
        for number in row['chunk_number_flat']
    )
    return ' '.join(piece for piece in pieces if piece)

averaged_df['merged_text'] = averaged_df.apply(merge_texts, axis=1)

# The flattened helper column is no longer needed
averaged_df = averaged_df.drop(columns=['chunk_number_flat'])

In [None]:
# Save the interpretation dataset (averaged rows plus their merged text) as CSV
# and preview the first rows
averaged_df.to_csv('interpretation_df.csv')
averaged_df.head()

### Quantitative Analysis

**WARNING**: This section is highly computationally intensive as it leverages BERT to extract the theme of each cluster based on the processing of its text. 

In [None]:
# Theme extractor imported from the local LengthSafeClusterThemeExtractor module
extractor = ClusterThemeExtractor()

In [None]:
# For around 3,300 different chunks it takes about 1 hour and a half to run
# NOTE(review): `texts_by_cluster` is not defined in any of the cells shown in
# this notebook, so on a fresh run this line raises NameError. Presumably it
# groups averaged_df['merged_text'] by 'map_cluster' — confirm the input
# shape extract_cluster_themes expects and build it before this call.
themes = extractor.extract_cluster_themes(texts_by_cluster)

### Qualitative Analysis
We then analysed the output qualitatively, reading representative paragraphs and full documents where LLM-based labelling was unclear. Random samples of clearly defined clusters were also analysed to check BERT's output. We then developed a **dictionary of interpretation** to use for plotting.

In [None]:
# First-level (micro) labels: cluster id as a string -> topic name.
# Keys are strings because map_cluster is cast to str before the lookup in the
# visualization step. "-1" is the key for the "-1" cluster label, kept
# "Unlabelled" here. Several cluster ids deliberately share the same topic name.
dictionary = {
    "-1": "Unlabelled",
    "0": "Zero Rating and Net Neutrality",
    "1": "AI and Data Governance Objectives",
    "2": "Unlabelled",
    "3": "(Digital) Media Literacy",
    "4": "(Digital) Media Literacy",
    "5": "Unlabelled",
    "6": "Unlabelled",
    "7": "Journalists Online Harassment",
    "8": "AI Fairness and AI Bias",
    "9": "AI Fairness and AI Bias",
    "10": "AI Fairness and AI Bias",
    "11": "Children Internet Use",
    "12": "Unlabelled",
    "13": "Mis- and Disinformation",
    "14": "AI News and Journalism",
    "15": "Fact Checking",
    "16": "Fake News and Politicization",
    "17": "Covid-19 Misinformation",
    "18": "Political News Consumption",
    "19": "Socio-demographic Indicators",
    "20": "Media Trust",
    "21": "Comparative Studies",
    "22": "Content Analyses",
    "23": "Policy and Research Reviews",
    "24": "Profiling and Micro-Targeting",
    "25": "Advertising",
    "26": "Research Access",
    "27": "Chinese Surveillance",
    "28": "Data Trade",
    "29": "Deepfakes",
    "30": "Comparative Studies",
    "31": "Data Competition",
    "32": "Platform Regulation",
    "33": "Data Asymmetries",
    "34": "(Digital) Human Rights",
    "35": "Data Governance",
    "36": "Hate Speech",
    "37": "Privacy Regulation",
    "38": "Privacy Regulation",
    "39": "Information Freedom",
    "40": "Governance Gap",
    "41": "AI Disinformation Risk",
    "42": "Middle East Disinformation",
    "43": "Digital Inclusion",
    "44": "Governance Gap",
    "45": "Algorithmic Politics",
    "46": "Algorithmic Content Moderation",
    "47": "Content Moderation",
    "48": "LLMs",
    "49": "Gen-AI News and Journalism",
    "50": "Disinformation Campaigns",
    "51": "Political Communication",
    "52": "AI Governance",
    "53": "Democratizing AI",
    "54": "AI Definitions",
    "55": "Disinformation Campaigns",
    "56": "Disinformation Campaigns",
    "57": "News Media Bias",
    "58": "Decolonisation",
    "59": "Twitter Research",
    "60": "WhatsApp Research",
    "61": "Media Regulation",
    "62": "African Media",
    "63": "Digital News and Journalism",
    "64": "Digital News and Journalism",
    "65": "Digital News and Journalism",
    "66": "Digital News and Journalism",
    "67": "Russian Trolls",
    "68": "Political Campaigns and Elections",
    "69": "Gender Media and Politics",
    "70": "Populism and Authoritarianism",
    "71": "Polarization"
}

# Second-level (macro) labels: first-level topic name -> macro cluster name.
# Every key appears exactly once; a repeated key in a dict literal silently
# overrides the earlier entry, which makes later edits easy to get wrong.
dictionary_second_level = {
    "Unlabelled": "Unlabelled",
    "AI Fairness and AI Bias": "Artificial Intelligence",
    "AI News and Journalism": "Artificial Intelligence",
    "Deepfakes": "Artificial Intelligence",
    "AI and Data Governance Objectives": "Artificial Intelligence",
    "AI Disinformation Risk": "Artificial Intelligence",
    "Algorithmic Politics": "Artificial Intelligence",
    "LLMs": "Artificial Intelligence",
    "Gen-AI News and Journalism": "Artificial Intelligence",
    "AI Governance": "Artificial Intelligence",
    "Democratizing AI": "Artificial Intelligence",
    "AI Definitions": "Artificial Intelligence",
    "Journalists Online Harassment": "Content Moderation",
    "Platform Regulation": "Content Moderation",
    "Hate Speech": "Content Moderation",
    "Information Freedom": "Content Moderation",
    "Algorithmic Content Moderation": "Content Moderation",
    "Chinese Surveillance": "Data",
    "Data Competition": "Data",
    "Data Asymmetries": "Data",
    "Data Governance": "Data",
    "Privacy Regulation": "Data",
    "Governance Gap": "Data",
    "Digital Inclusion": "Data",
    "(Digital) Media Literacy": "News Media",
    "Digital News and Journalism": "News Media",
    "Children Internet Use": "News Media",
    "Political News Consumption": "News Media",
    "Media Trust": "News Media",
    "News Media Bias": "News Media",
    "Socio-demographic Indicators": "News Media",
    "Media Regulation": "News Media",
    "African Media": "News Media",
    "Comparative Studies": "Research and Methodologies",
    "Content Analyses": "Research and Methodologies",
    "Policy and Research Reviews": "Research and Methodologies",
    "Mis- and Disinformation": "Mis- and Disinformation",
    "Fact Checking": "Mis- and Disinformation",
    "Fake News and Politicization": "Mis- and Disinformation",
    "Covid-19 Misinformation": "Mis- and Disinformation",
    "Middle East Disinformation": "Mis- and Disinformation",
    "Disinformation Campaigns": "Mis- and Disinformation",
    "Russian Trolls": "Mis- and Disinformation",
    "Political Communication": "Social Media and Politics",
    "Political Campaigns and Elections": "Social Media and Politics",
    "Gender Media and Politics": "Social Media and Politics",
    "Populism and Authoritarianism": "Social Media and Politics",
    "Polarization": "Social Media and Politics",
    "Twitter Research": "Social Media and Politics",
    # NOTE(review): the only entry mapped to "Social Media" rather than
    # "Social Media and Politics" — confirm this separate macro label is intended.
    "WhatsApp Research": "Social Media",
    "Profiling and Micro-Targeting": "Content Moderation",
    "Advertising": "Unlabelled",
    "Research Access": "Research and Methodologies",
    "(Digital) Human Rights": "Unlabelled",
    "Zero Rating and Net Neutrality": "Zero Rating and Net Neutrality",
    "Decolonisation": "Research and Methodologies",
    "Content Moderation": "Content Moderation",
    "Data Trade": "Data",
}

# Step 4 - Visualization

### Prep data
We first map all cluster names (defined on two levels – macro and micro clusters) to their respective chunks, and we add the regional information.

In [None]:
# Work on a copy so averaged_df itself is left untouched
visualization_df = averaged_df.copy()

# Cluster ids become strings to match the keys of `dictionary`; the micro
# label (labels_1) is then mapped to its macro label (labels_2).
visualization_df['map_cluster'] = visualization_df['map_cluster'].astype(str)
visualization_df['labels_1'] = visualization_df['map_cluster'].map(dictionary)
visualization_df['labels_2'] = visualization_df['labels_1'].map(dictionary_second_level)

# Check that no label is na – both outputs should be 0
print('We have the following number of rows containing na labels:')
for label_column in ('labels_1', 'labels_2'):
    print(int(visualization_df[label_column].isna().sum()))

# add regional information from the main df
# NOTE(review): this is an inner merge on 'Title' — rows whose title is
# absent from df would be dropped and titles repeated in df would duplicate
# rows; confirm that is acceptable for the map.
visualization_df = visualization_df.merge(df[['Title', 'region']], on='Title')

We then prepare all the variables in the right format we need for the plotting.

In [None]:
# coordinates 
reduced_embedding=np.array(visualization_df.map_coordinates)
reduced_embedding = np.array([list(map(float, re.sub(r"(\d)\s+(-?\d)", r"\1, \2", item)
                                .strip("[]")
                                .split(", "))) for item in reduced_embedding])

#Topic labels
labels = np.array(visualization_df['labels_1'], dtype=object)
labels_big = np.array(visualization_df['labels_2'], dtype=object)

#Hover Data and extra points data
titles = np.array(visualization_df.Title)
author = np.array(visualization_df.Authors.fillna('Unknown'))
region = np.array(visualization_df.region.fillna('Other'))
extra_data = pd.DataFrame(
    {"author":author, "region":region}
)

#Hover color data
color_mapping = {}
color_mapping["Global"] = "#a64531"
color_mapping["Global North"] = "#ff0000"
color_mapping["Global Majority"] = "#59bace"
color_mapping["Other"] = "#6c6c6c"
extra_data["color"] = extra_data.region.map(color_mapping)
marker_color_array = extra_data.region.map(color_mapping)

#Histogram Variables
visualization_df["Publication Year"]=pd.to_datetime(visualization_df["Publication Year"].fillna('2024').astype(int), format='%Y', errors='coerce')

In [None]:
visualization_df["Publication Year"].unique() # Check that dates are in the right format

### Prep additional HTML, CSS and JavaScript Components
Below we create the custom HTML for the **hover text** so that we can include more info when we hover over the points. 

In [None]:
# Inline CSS shared by the coloured region badge of the hover card
badge_css = """
    border-radius:6px;
    width:fit-content;
    max-width:75%;
    margin:2px;
    padding: 2px 10px 2px 10px;
    font-size: 10pt;
"""

# Hover card markup. {color}, {region}, {hover_text} and {author} are left as
# literal placeholders in the resulting string; only badge_css is spliced in here.
hover_text_template = "".join([
    "\n<div>\n",
    '    <div style="background-color:{color};color:#fff;',
    badge_css,
    '">{region}</div>\n',
    '    <div style="font-size:small;padding:2px;">{hover_text}</div>\n',
    '    <div style="font-size:small;padding:2px;"><b>Author</b>: {author}</div>\n',
    "</div>\n",
])

We hereby construct the **global north - global majority** selection button.

In [None]:
#### Region Button – css layout
# Styles for the region selector: `.row` lays out one line per region, `.box`
# is the small clickable coloured square, `#region_button` positions the
# selector at the bottom-left, and `#title-container` gets a max-width of 85%.
custom_css="""
.row {
    display : flex;
    align-items : center;
}
.box {
    height:10px;
    width:10px;
    border-radius:2px;
    margin-right:5px;
    padding:0px 0 1px 0;
    text-align:center;
    color: white;
    font-size: 4px;
    cursor: pointer;
}
#region_button {
    position: absolute;
    bottom: 0;
    left: 0;
}
#title-container {
    max-width: 85%;
}
"""

#### HTML Content
# Markup of the region selector: a titled container followed by one row per
# entry of color_mapping. Each row holds a coloured box whose id is the region
# name, followed by the region name as text.
custom_html = """
<div id="region_button" class="container-box">
    <div style="font-size:11pt;padding:2px;"><b>Select Region</b></div>
"""
# The loop variables are deliberately not called `region` / `color`: at cell
# level they would overwrite the `region` array built in the data-prep cell.
for region_name, region_color in color_mapping.items():
    custom_html += f'    <div class="row" ><div id="{region_name}" class="box" style="background-color:{region_color};"></div>{region_name}</div>\n'
custom_html += """
</div>
"""

#### JavaScript Activation
# Click handling for the region selector. Clicking a coloured box toggles its
# region in `selectedRegions`, shows/hides a check mark inside the box and
# re-applies the selection on the data map.
# Only elements with the "box" class are treated as toggles. The container
# itself also has an id ("region_button"), so without that guard a click on the
# container would be taken for a region and its whole content replaced by "✓".
custom_js = """
const region_button = document.getElementById("region_button");
const selectedRegions = new Set();

// Update the displayed data based on selected categories
function updateDisplayedData() {
    const selectedIndices = [];
    datamap.metaData.region.forEach((reg, i) => {
        if (selectedRegions.has(reg)) {
            selectedIndices.push(i);
        }
    });
    datamap.addSelection(selectedIndices, "region_button");
}

// Add event listener to the button container
region_button.addEventListener('click', function (event) {
    // Only the coloured boxes are toggles; ignore clicks on the container,
    // its title or the row labels
    if (!event.target.classList.contains("box")) return;

    // Ensure the clicked element has an ID and is a valid button
    const selectedCategory = event.target.id;
    if (!selectedCategory) return;

    // Toggle the selection of the category
    if (selectedRegions.has(selectedCategory)) {
        selectedRegions.delete(selectedCategory);
        event.target.innerHTML = ""; // Uncheck
    } else {
        selectedRegions.add(selectedCategory);
        event.target.innerHTML = "✓"; // Check
    }

    // Update the data display
    updateDisplayedData();
});
"""

## Plotting

In [None]:
plot = datamapplot.create_interactive_plot(
    reduced_embedding,
    labels_big,
    labels,
    
    # Labels parameters
    text_collision_size_scale=3,
    min_fontsize=8,
    max_fontsize=28,
    
    # HoverText and search
    hover_text = titles,
    enable_search=True,
    on_click="window.open(`http://google.com/search?q=\"{hover_text}\"`)",
    
    # Title and Subtitle
    font_family="Montserrat",
    title="Information and Democracy Semantic Landscape",
    sub_title="A data map of literature from the Observatory on Information and Democracy's first research cycle",
    
    #Layout
    cluster_boundary_polygons=True,
    cluster_boundary_line_width=2,
    initial_zoom_fraction=6,
    darkmode=True,

    # Publication date histogram
    histogram_data=visualization_df["Publication Year"],
    histogram_group_datetime_by="year",
    histogram_range=(pd.to_datetime("2000-01-01"), pd.to_datetime("2025-08-08")),
    histogram_settings={
        "histogram_log_scale":False,
        "histogram_title":"Publication Year",
        "histogram_bin_fill_color":"#a64531",
        "histogram_bin_unselected_fill_color":"#d6a591",
        "histogram_bin_selected_fill_color":"#f68571",
        "histogram_width":300,
        "histogram_height":100,
    },
    
    # Add information for hovering
    extra_point_data=extra_data,
    hover_text_html_template=hover_text_template,
    
    # Add Global north - Global Majority Button
    custom_css=custom_css,
    custom_html=custom_html,
    custom_js=custom_js,
)

In [None]:
plot.save('OID_semantic_landscape_FINAL.html') # Save map as html 