In [1]:
import pandas as pd
import gensim
from gensim.models.phrases import Phrases, Phraser, ENGLISH_CONNECTOR_WORDS
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from tqdm import tqdm
import numpy as np
import enchant
from time import time 
import multiprocessing
from copy import deepcopy
from collections import defaultdict

In [2]:
# Load the per-year token dictionary stored as a pickled object inside a .npy file;
# .item() unwraps the single stored Python object (used downstream as a dict).
# `allow_pickle` must be the boolean True: the former value, the string 'FALSE', only
# worked because any non-empty string is truthy, and it read as if pickling were disabled.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
tokens = np.load('tokens_per_year.npy', allow_pickle=True).item()

In [3]:
def build_bigram_dictionary(documents, min_count, threshold):
    """Merge frequent word pairs of ``documents`` into single bigram tokens.

    Args:
        documents: list of documents, each document being a list of tokens
            (a list of lists).
        min_count: forwarded to gensim ``Phrases`` as ``min_count``.
        threshold: forwarded to gensim ``Phrases`` as ``threshold``.

    Returns:
        A list with one entry per input document: the result of applying the
        fitted phrase model to that document.
    """
    phrase_model = Phrases(
        documents,
        min_count=min_count,
        threshold=threshold,
        connector_words=ENGLISH_CONNECTOR_WORDS,
    )
    # Phraser wraps the fitted Phrases model; it is only used to transform documents.
    bigram_transformer = Phraser(phrase_model)
    return [bigram_transformer[document] for document in documents]

**TRAIN WORD2VEC MODEL ON THE WHOLE SET**.

In [4]:
# Detect bigrams separately for every year of the dataset.
# The loop variable is deliberately NOT named `tokens`: reusing that name would rebind
# the loaded dictionary to the last year's documents, and re-running this cell would
# then fail on `tokens.items()`.
bigram_vocab = {}
for year, year_documents in tokens.items():
    # 11 and 20 are the min_count and threshold passed to build_bigram_dictionary.
    tokens_per_year = build_bigram_dictionary(year_documents, 11, 20)
    bigram_vocab[year] = tokens_per_year

In [6]:
# Flatten the per-year document lists into a single corpus covering all years.
wholeset = []
for papers in bigram_vocab.values():
    wholeset.extend(papers)

In [7]:
# Count how often each token (single word or bigram) occurs in the whole corpus.
word_freq = defaultdict(int)
for token in (tok for document in wholeset for tok in document):
    word_freq[token] += 1

# Number of distinct tokens.
len(word_freq)

20901

In [None]:
# The 200 most frequent tokens, most frequent first (ties keep insertion order).
ranked_tokens = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
[token for token, _count in ranked_tokens[:200]]

In [8]:
# Number of CPU cores reported for this machine.
cores = multiprocessing.cpu_count()

In [21]:
# Configure the model parameter by parameter. No `sentences` argument is passed on
# purpose, so the model stays uninitialised until build_vocab() / train() are called.
w2v_model = Word2Vec(sg=1,  # 1 = skip-gram training algorithm
                     vector_size=150,  # Dimensionality of the word vectors
                     min_count=10,  # Ignores all words with total absolute frequency lower than this - (2, 100)
                     window=10,  # Maximum distance between the current and predicted word within a sentence (window words on each side of the target)
                     sample=6e-5,  # Threshold for randomly downsampling high-frequency words
                     alpha=0.03,  # The initial learning rate
                     min_alpha=0.0007,  # Learning rate drops linearly to min_alpha as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00
                     negative=5,  # If > 0, negative sampling is used; the value is how many "noise words" are drawn. 0 disables negative sampling - (5, 20)
                     # Leave one core free, but never request 0 worker threads:
                     # `cores - 1` is 0 on a single-core machine.
                     workers=max(1, cores - 1)
                     )

In [22]:
# Build the vocabulary table from the whole corpus, which initialises the model.
# Word2Vec needs this step before training: it digests all the words, keeps the unique
# ones and does some basic counting on them. gensim's logging output (if enabled) shows
# the effect of `min_count` and `sample` on the corpus; these two parameters, `sample`
# in particular, were observed to have a large influence on model performance.
vocab_start = time()
w2v_model.build_vocab(wholeset)
elapsed_minutes = round((time() - vocab_start) / 60, 2)

print('Time to build vocab: {} mins'.format(elapsed_minutes))

Time to build vocab: 0.01 mins


In [23]:
# Train on the whole corpus for 30 epochs, reporting the wall-clock time in minutes.
train_start = time()
w2v_model.train(
    wholeset,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)
elapsed_minutes = round((time() - train_start) / 60, 2)

print('Time to train the model: {} mins'.format(elapsed_minutes))

Time to train the model: 1.25 mins


In [24]:
# Save the trained model to disk.
# File naming: word2vec_sg  = model trained with negative=10,
#              word2vec_sg2 = model trained with negative=5.
# save() returns None, so its result is intentionally not assigned: the former
# `model = w2v_model.save(...)` left `model` bound to None.
w2v_model.save("word2vec_sg2.model")

**FINETUNING OF THE MODEL ON THE DATASET OF EACH YEAR**

In [25]:
def _fine_tune_copy(base_model, documents):
    """Return a deep copy of ``base_model`` further trained on ``documents``.

    The copy's vocabulary is updated with ``documents`` and it is then trained
    on them for 30 epochs; ``base_model`` itself is left untouched.
    """
    tuned = deepcopy(base_model)
    tuned.build_vocab(documents, update=True)
    tuned.train(documents, total_examples=tuned.corpus_count, epochs=30)
    return tuned


# Reload the model trained on the whole corpus; every year starts from this same model.
model = Word2Vec.load("word2vec_sg2.model")

ft_models = {}
for year, documents in bigram_vocab.items():
    # Fine-tune an independent copy on this year's documents only.
    ft_model = _fine_tune_copy(model, documents)

    # Persist the year-specific model and keep it in memory for the following cells.
    ft_model.save(f"word2vec_{year}_sg2.model")
    ft_models[year] = ft_model

In [26]:
# Collect the word vectors (the `.wv` attribute) of each fine-tuned model, keyed by year.
# A comprehension is used so that no loop variable leaks into the notebook namespace:
# the former loop variable `model` overwrote the base model loaded from disk.
ft_vectors = {year: year_model.wv for year, year_model in ft_models.items()}

In [40]:
from sklearn.decomposition import PCA
import plotly.express as px

# One 'circular_economy' embedding per year, taken from that year's fine-tuned vectors.
years = list(ft_vectors.keys())
vectors = np.array([ft_vectors[year]['circular_economy'] for year in years])
# Number of documents per year, looked up by year so that marker sizes always match
# the plotted points instead of relying on two dicts sharing the same iteration order.
n_documents = [len(bigram_vocab[year]) for year in years]

# Project the yearly embeddings onto their first two principal components.
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(vectors)

df = pd.DataFrame({
    'PCA1': reduced_vectors[:, 0],
    'PCA2': reduced_vectors[:, 1],
    'Year': years
})

# Scatter plot: one labelled point per year, sized by that year's document count.
fig = px.scatter(df, x='PCA1', y='PCA2', text='Year', title='2D Representation of Circular Economy embeddings',
                 color='Year', template="plotly_white", size=n_documents)
fig.update_traces(textposition='top center')
fig.update_layout(showlegend=True)
fig.show()

In [41]:
# 3D representation of the yearly 'circular_economy' embeddings.
from sklearn.decomposition import PCA
import plotly.graph_objs as go

# ft_vectors holds the word vectors of each year's fine-tuned model.
years = list(ft_vectors.keys())
vectors = np.array([ft_vectors[year]['circular_economy'] for year in years])
# Document counts are looked up by year so marker sizes always match the plotted points
# instead of relying on two dicts sharing the same iteration order.
n_documents = [len(bigram_vocab[year]) for year in years]

# Reduce the embeddings to 3 components with PCA.
pca = PCA(n_components=3)
reduced_vectors = pca.fit_transform(vectors)

# Interactive visualisation with Plotly.
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers+text',
    marker=dict(
        # NOTE(review): raw document counts are used directly as marker sizes here;
        # if the counts are large, consider scaling them (marker sizeref/sizemode) - confirm.
        size=n_documents,
        color=np.arange(len(reduced_vectors)),  # Position index as colour; could use the years or another scale
        colorscale='Viridis',  # Change as desired
        opacity=0.8
    ),
    text=years,
    textposition="top center"
)])

fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()

In [42]:
from sklearn.manifold import TSNE
import plotly.express as px

# One 'circular_economy' embedding per year, taken from that year's fine-tuned vectors.
years = list(ft_vectors.keys())
vectors = np.array([ft_vectors[year]['circular_economy'] for year in years])
n_samples = vectors.shape[0]
# Document counts are looked up by year so marker sizes always match the plotted points
# instead of relying on two dicts sharing the same iteration order.
n_documents = [len(bigram_vocab[year]) for year in years]

# Dimensionality reduction with t-SNE.
# scikit-learn requires perplexity < n_samples, so the preferred value of 9 is capped
# when fewer than 10 yearly vectors are available (unchanged otherwise).
perplexity_value = min(9, n_samples - 1)
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity_value)  # n_components=2 gives a 2D representation
reduced_vectors_tsne = tsne.fit_transform(vectors)

# Collect the t-SNE coordinates in a DataFrame.
df_tsne = pd.DataFrame({
    't-SNE1': reduced_vectors_tsne[:, 0],
    't-SNE2': reduced_vectors_tsne[:, 1],
    'Year': years
})

# Scatter plot: one labelled point per year, sized by that year's document count.
fig = px.scatter(df_tsne, x='t-SNE1', y='t-SNE2', text='Year', title='2D Representation of Circular Economy embeddings using t-SNE',
                 color='Year', template="plotly_white", size=n_documents)
fig.update_traces(textposition='top center')
fig.update_layout(showlegend=True)
fig.show()