In [2]:
import pandas as pd
import gensim
from gensim.models.phrases import Phrases, Phraser, ENGLISH_CONNECTOR_WORDS
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from tqdm import tqdm
import numpy as np
import enchant
from time import time 
import multiprocessing
from copy import deepcopy
from collections import defaultdict
from SSE import build_bigram_dictionary, unite_concepts

In [3]:
# The .npy file holds a pickled Python object wrapped in a 0-d object array,
# so pickling must be enabled to read it; .item() unwraps the stored object.
# The flag is a real boolean: the previous string 'FALSE' was truthy and
# therefore enabled pickling while reading as if it disabled it.
# NOTE(security): unpickling can execute arbitrary code -- only load trusted files.
tokens = np.load('tokens_per_year4.npy', allow_pickle=True).item()

In [4]:
# Sanity check: display the type of the object unwrapped from the .npy file.
type(tokens)

dict

In [5]:
# Two-word concepts, each written as a (first_token, second_token) pair.
# The list is handed to unite_concepts together with the per-year tokens.
# NOTE(review): ('material', 'recovery') is listed twice; both entries are
# kept so the list contents stay exactly as before.
nsc_concepts = [
    ('consumer', 'awareness'),
    ('service', 'economy'),
    ('material', 'passport'),
    ('cradle', 'cradle'),
    ('plan', 'obsolescence'),
    ('durability', 'design'),
    ('design', 'durability'),
    ('resource', 'depletion'),
    ('sustainable', 'consumption'),
    ('sustainable', 'production'),
    ('waste', 'prevention'),
    ('waste', 'reduction'),
    ('waste', 'minimization'),
    ('energy', 'efficiency'),
    ('lean', 'production'),
    ('ecological', 'efficiency'),
    ('resource', 'optimization'),
    ('green', 'energy'),
    ('resource', 'efficiency'),
    ('product', 'life'),
    ('sharing', 'economy'),
    ('product', 'service'),
    ('product', 'longevity'),
    ('reverse', 'logistic'),
    ('closed', 'loop'),
    ('waste', 'valorization'),
    ('resource', 'conservation'),
    ('resource', 'recovery'),
    ('material', 'recovery'),
    ('industrial', 'ecosystem'),
    ('industrial', 'symbiosis'),
    ('industrial', 'ecology'),
    ('waste', 'management'),
    ('waste', 'stream'),
    ('material', 'recovery'),
    ('bio', 'mimicry'),
    ('urban', 'metabolism'),
    ('zero', 'waste'),
    ('zero', 'emission'),
    ('net', 'zero'),
]

In [6]:
# Run the project helper over the per-year tokens with the nsc_concepts pairs.
# NOTE(review): presumably joins each listed pair into a single token --
# confirm against SSE.unite_concepts.
tokens_bigr = unite_concepts(tokens, nsc_concepts)

In [7]:
# Collapse every adjacent ('second', 'hand') pair into the single token
# 'secondhand', scanning each document from left to right.
for docs in tokens_bigr.values():
    for doc in docs:
        merged = []
        pos = 0
        while pos < len(doc):
            if pos + 1 < len(doc) and doc[pos] == 'second' and doc[pos + 1] == 'hand':
                merged.append('secondhand')
                pos += 2  # skip the consumed 'hand'
            else:
                merged.append(doc[pos])
                pos += 1
        # Slice assignment rewrites the existing list object, so the documents
        # held by tokens_bigr are updated in place.
        doc[:] = merged

In [8]:
# Verification: count the adjacent ('second', 'hand') pairs still present
# in any document.
count = sum(
    1
    for docs in tokens_bigr.values()
    for doc in docs
    for left, right in zip(doc, doc[1:])
    if left == 'second' and right == 'hand'
)

print(count)

0


**TRAIN THE WORD2VEC MODEL ON THE WHOLE DATASET**

In [11]:
# Build the bigram vocabulary of the dataset, one entry per year.
# The loop variable is deliberately NOT named `tokens`: that name holds the
# dict loaded from 'tokens_per_year4.npy', and rebinding it here would make a
# re-run of the unite_concepts cell operate on a single year's documents.
bigram_vocab = {}
for year, year_docs in tokens_bigr.items():
    # NOTE(review): 11 and 20 are passed straight to the helper; presumably
    # frequency/score thresholds -- confirm in SSE.build_bigram_dictionary.
    bigram_vocab[year] = build_bigram_dictionary(year_docs, 11, 20)

In [12]:
# Persist the per-year bigram vocabulary (the dict is pickled inside the .npy).
# np.save returns None, so its result is not bound to a variable.
np.save('tokens_per_year_bigrams2.npy', bigram_vocab)

In [13]:
# Reload the saved vocabulary; the file holds a pickled object in a 0-d object
# array, so pickling must be enabled and .item() unwraps the stored object.
# The flag is a real boolean: the previous string 'FALSE' was truthy and
# therefore enabled pickling while reading as if it disabled it.
# NOTE(security): unpickling can execute arbitrary code -- only load trusted files.
tokens_bi = np.load('tokens_per_year_bigrams2.npy', allow_pickle=True).item()

In [14]:
# Sanity check: display the type of the object reloaded from disk.
type(tokens_bi)

dict

In [15]:
# Flatten the per-year collections into a single list of documents
# covering the whole dataset.
wholeset = []
for papers in tokens_bi.values():
    wholeset.extend(papers)

In [16]:
# Tally how many times each token occurs across all documents.
word_freq = defaultdict(int)
for token in (tok for doc in wholeset for tok in doc):
    word_freq[token] += 1

# Number of distinct tokens in the corpus.
len(word_freq)

22586

In [17]:
# Ten most frequent tokens, most frequent first (ties keep insertion order
# because the sort is stable).
ranked = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
[word for word, _count in ranked[:10]]

['circular_economy',
 'al',
 'use',
 'product',
 'material',
 'waste',
 'resource',
 'system',
 'environmental',
 'economic']

In [18]:
# Number of CPUs reported by the system; used when configuring the Word2Vec
# worker threads.
cores = multiprocessing.cpu_count()

In [19]:
# Configure the model parameter by parameter. No corpus is passed on purpose,
# so the model stays uninitialised until build_vocab/train are called.
w2v_model = Word2Vec(
    sg=1,  # 1 selects the skip-gram architecture
    vector_size=200,  # dimensionality of the feature vectors
    min_count=3,  # ignore words with total absolute frequency below this - (2, 100)
    window=7,  # maximum distance between the current and predicted word within a sentence
    sample=6e-5,  # threshold for randomly down-sampling higher-frequency words
    alpha=0.03,  # initial learning rate
    # Learning rate drops linearly to min_alpha as training progresses.
    # Rule of thumb for setting it: alpha - (min_alpha * epochs) ~ 0.00
    min_alpha=0.0007,
    negative=5,  # number of "noise words" drawn for negative sampling; 0 disables it - (5, 20)
    # Leave one core free for the rest of the system, but never go below one
    # worker: on a single-core machine `cores - 1` would be 0 and no training
    # thread would be started.
    workers=max(1, cores - 1),
)

In [20]:
# Word2Vec needs its vocabulary table before training: build_vocab digests the
# sentences, collects the unique words with their counts and thereby
# initialises the model. min_count and sample take effect at this stage, so
# this is where their influence on the retained corpus shows up.
t = time()

w2v_model.build_vocab(wholeset)

elapsed_min = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(elapsed_min))

Time to build vocab: 0.01 mins


In [21]:
# Train on the whole corpus for 40 epochs; corpus_count was recorded by
# build_vocab, and report_delay sets the seconds between progress reports.
t = time()
w2v_model.train(
    wholeset,
    total_examples=w2v_model.corpus_count,
    epochs=40,
    report_delay=1,
)

elapsed_min = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_min))

Time to train the model: 1.52 mins


In [22]:
# Save the model trained on the whole corpus.
#
# Log of previously saved variants (notes translated from the original):
#   word2vec_sg         10 "negativity" (presumably negative=10), window 10
#   word2vec_sg2        5 "negativity" (presumably negative=5), window 10
#   word2vec_sg3        5 "negativity" (presumably negative=5), window 5
#   word2vec_sg6        new dataset
#   word2vec_sg9        40 epochs, skip-gram
#   word2vec_sg13       min_count = 5, window = 6
#   word2vec_sg14_ndf   vector size 200 (current)
#   also used, no settings recorded: word2vec_cb2, word2vec_sg4,
#   word2vec_sg7, word2vec_sg13_ndf
#
# `save` writes to disk and returns None, so its result is not bound to a
# name; the model is read back with Word2Vec.load where it is needed.
w2v_model.save("word2vec_sg14_ndf.model")

**FINE-TUNING THE MODEL ON EACH YEAR'S DATASET**

In [23]:
t = time()

# Base model trained on the whole corpus; every year starts from a copy of it.
model = Word2Vec.load("word2vec_sg14_ndf.model")


def _finetune_copy(base_model, documents):
    """Return a deep copy of `base_model` further trained on `documents`.

    The copy's vocabulary is first extended with the words found in
    `documents` (update=True), then the copy is trained for 25 epochs on
    them. `base_model` itself is left untouched.
    """
    tuned = deepcopy(base_model)
    tuned.build_vocab(documents, update=True)
    tuned.train(documents, total_examples=tuned.corpus_count, epochs=25)
    return tuned


ft_models = {}
for year, documents in bigram_vocab.items():
    ft_model = _finetune_copy(model, documents)

    # Persist the year-specific model and keep it in memory as well.
    ft_model.save(f"word2vec_{year}_sg17.model")
    ft_models[year] = ft_model

elapsed_min = round((time() - t) / 60, 2)
print('Time to finetune the models: {} mins'.format(elapsed_min))

Time to finetune the models: 1.27 mins


In [24]:
# Collect the word vectors (`.wv`) of each fine-tuned model, keyed by year.
# Nothing is written to disk here; the vectors are only gathered in memory.
# A comprehension keeps the loop variables local, so the global `model`
# (the base model loaded from disk) is no longer rebound to the last year's
# fine-tuned model as a side effect.
ft_vectors = {year: year_model.wv for year, year_model in ft_models.items()}