In [1]:
import pandas as pd
import regex as re
import torch
from transformers import BertTokenizer, BertModel
import logging
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn import metrics
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs
from collections import Counter
import pickle
from TESI.APP import APosterioriaffinityPropagation as APP

## DATA CLEASING (can skip this part when you already have the clean file)

In [None]:
data = pd.read_csv('DATI.csv', sep = ";")

In [None]:
data[['Author', 'Year']] = data['Source'].str.extract(r'(.*?)\s*\((\d{4})', expand=True)
data = data.drop('Source', axis=1)

In [None]:
# Removing unwanted strings
data['Author'] = data['Author'].str.replace(" \(", "")
data['Year'] = data['Year'].str.replace(" \(", "").str.replace(" p. ...", "")

In [None]:
data = data.iloc[:, 1:]

In [None]:
data = data.sort_values(by='Year', ascending=True)

In [None]:
data['Year'] = data['Year'].replace('2005', '2006')

In [None]:
definition = data.iloc[:, 0]

In [None]:
clean_text = []
for row in definition:
    clean_row = row.lower()
    clean_row = re.sub(r'\([^)]*\)', '', clean_row)
    clean_row = re.sub(r'-', ' ', clean_row)
    clean_row = clean_row.replace("‘", "").replace("’", "")
    clean_rows = clean_row.replace("“", "").replace("”", "")
    clean_text.append(clean_rows)

In [None]:
texts = list(clean_text)

In [None]:
text_upgraded = []
for text in texts:
    text = text.replace(' ce ', ' circular economy ')
    text_upgraded.append(text)

In [None]:
count = 0
for text in text_upgraded:
    if ' ce ' in text:
        count += 1
print(count)

In [None]:
data['Clean Definitions'] = text_upgraded 

In [None]:
# Save the updated dataset in a csv file
data.to_csv('data_tesi.csv')

## Upload the clean dataset (once you have the embeddings file you can skip this part)

In [None]:
data = pd.read_csv('data_tesi.csv')

In [None]:
data[data['Year'] == 2017]['Clean Definitions'].count()

In [None]:
## create lists of definitions for each year
clust = {}

for year, defin in zip(data['Year'], data['Clean Definitions']):
    if year in clust:
        clust[year].append(defin)
    else:
        clust[year] = [defin]

# Extract BERT embeddings for 'circular economy' (once you have the embeddings file you can skip this part)

In [None]:
# Define a BertEmbedder class. The class will create embeddings for each indexed target expression 'circular economy', and compute the average
class BERTEmbedder:
    def __init__(self, model_name='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)

    def get_embeddings(self, texts, target_phrase):
        phrase_words = target_phrase.split()
        embeddings = []

        for text in texts:
            encoded_inputs = self.tokenizer.encode_plus(
                text,
                return_tensors='pt',
                truncation=True,
                padding=True,
                max_length=512
            )
            outputs = self.model(**encoded_inputs)
            last_hidden_state = outputs.last_hidden_state
            tokens = self.tokenizer.convert_ids_to_tokens(encoded_inputs["input_ids"][0])
            
            # Check for the presence of the target phrase and extract the matching embeddings
            phrase_embeddings = self._extract_phrase_embeddings(last_hidden_state, tokens, phrase_words)
            embeddings.extend(phrase_embeddings)

        return embeddings
    #Extract the average embedding for 'circular economy'
    def _extract_phrase_embeddings(self, hidden_states, tokens, phrase_words):
        phrase_embeddings = []
        for i in range(len(tokens) - len(phrase_words) + 1):
            if tokens[i:i+len(phrase_words)] == phrase_words:
                phrase_embedding = torch.mean(hidden_states[0, i:i+len(phrase_words)], dim=0)
                phrase_embeddings.append(phrase_embedding)
        return phrase_embeddings

In [None]:
# Extract the embeddings for the target phrase
def prepare_embeddings_by_year(clust_dict, target_phrase):
    embedder = BERTEmbedder()

    embeddings_by_year = {}
    for year, texts in clust_dict.items():
        embeddings_by_year[year] = embedder.get_embeddings(texts, target_phrase)

    return embeddings_by_year

In [None]:
target_phrase = 'circular economy'
embeddings_by_year = prepare_embeddings_by_year(clust, target_phrase)

In [None]:
embeddings_by_year

In [None]:
## SAVE FILE
with open('TESI/embeddings_by_year.pkl', 'wb') as file:
     pickle.dump(embeddings_by_year, file)

## Upload the embeddings for clustering

In [2]:
## OPEN FILE
with open('TESI/embeddings_by_year.pkl', 'rb') as file:
      embeddings_by_year = pickle.load(file)

In [3]:
word_embeddings = [embedding.detach().numpy().flatten() for year in embeddings_by_year.keys() for embedding in embeddings_by_year[year]]

In [4]:
word_embeddings = {}
for year in embeddings_by_year.keys():
    word_embeddings[year] = [embedding.detach().numpy().flatten() for embedding in embeddings_by_year[year]]

In [60]:
for embedding in word_embeddings[2006]:
    print(type(embedding))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [5]:
for year in word_embeddings.keys():
    for embedding in word_embeddings[year]:
        print(embedding.shape)

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)

## Incremental Clustering WiDiD 

In [52]:
#SENZA TRIMMING FACTOR
def cluster_word_embeddings_aff_prop(word_embeddings, random_state=5):
    clustering = AffinityPropagation(random_state=random_state).fit(word_embeddings)
    labels = clustering.labels_
    counts = Counter(labels)
    exemplars = clustering.cluster_centers_
    return labels, exemplars

def incremental_affinity_propagation(embeddings_per_year, random_state=5):
    year_to_centroids = {}
    year_to_labels = {}
    
    for year, embeddings in sorted(embeddings_per_year.items()):
        if year == min(embeddings_per_year.keys()):
            # 2006 standard AP
            labels, centroids = cluster_word_embeddings_aff_prop(embeddings, random_state=random_state)
        else:
            # Following years, APP
            # Precedent embeddings are replaced by their centroids
            previous_centroids = year_to_centroids[year - 1]
            cumulative_embeddings = np.vstack((previous_centroids, embeddings))
            
            # Run AP again on the combined set of previous centroids and current embeddings
            labels, new_centroids = cluster_word_embeddings_aff_prop(cumulative_embeddings, random_state=random_state)
            
            # Since we have a new set of centroids, we need to separate them into those that correspond to previous centroids (which will be discarded)
            # and those that correspond to the current year
            #THIS STEP MIGHT BE WRONG
            centroids = new_centroids[len(previous_centroids):]
            
        # Store labels and centroids for the current year
        year_to_centroids[year] = centroids
        year_to_labels[year] = labels
        
    return year_to_labels, year_to_centroids

In [54]:
year_to_labels, year_to_centroids = incremental_affinity_propagation(word_embeddings)

In [55]:
year_to_labels

{2006: array([0, 1, 1, 1], dtype=int64),
 2007: array([0, 1, 1, 1, 1, 2, 3, 3, 3], dtype=int64),
 2008: array([0, 1, 2, 1, 1, 2, 3, 2, 2, 3, 3, 3, 3, 3, 4, 1, 4, 2, 4, 4, 4],
       dtype=int64),
 2009: array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype=int64),
 2010: array([0, 0, 3, 1, 1, 1, 1, 2, 3, 3, 0], dtype=int64),
 2011: array([3, 3, 5, 1, 1, 5, 0, 1, 1, 1, 2, 3, 4, 4, 3, 5, 5], dtype=int64),
 2012: array([0, 1, 3, 3, 3, 2, 3, 3, 3], dtype=int64),
 2013: array([0, 3, 3, 0, 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3], dtype=int64),
 2014: array([3, 2, 2, 3, 3, 2, 0, 3, 3, 4, 1, 1, 4, 4, 2, 3, 2, 3, 3, 4, 3, 3,
        5, 5, 4], dtype=int64),
 2015: array([4, 1, 3, 0, 2, 4, 1, 1, 4, 1, 2, 3, 4, 4, 3, 5, 5, 5, 2, 3],
       dtype=int64),
 2016: array([4, 5, 0, 0, 0, 5, 4, 1, 2, 5, 0, 3, 3, 3, 4, 4, 3, 5, 5, 5],
       dtype=int64),
 2017: array([7, 0, 5, 2, 0, 7, 0, 1, 5, 0, 2, 2, 2, 2, 5, 2, 7, 3, 3, 3, 0, 4,
        4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 5, 3, 7, 5, 6, 7, 7, 5, 5, 2, 7, 5,
      

## Cluster Analysis