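This notebook trains one Word2Vec embedding space per language for the CBA (collective bargaining agreement) texts and then computes semantic neighborhood density (SND) scores from those spaces.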

In [None]:
# packages
import pandas as pd

from google.colab import drive

import gensim
from gensim.models import Word2Vec
import re

from gensim.models import KeyedVectors

import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download("punkt")

from itertools import chain

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
## Mount Google Drive (Colab-only) so the corpus is reachable as a local path
drive.mount('/content/drive')

## Load data
data_path = '/content/drive/My Drive/dhh21/'
file_name = 'cba_texts'
extension = '.csv'
# Read the file as UTF-8. Decoding it as latin1 turned every multi-byte
# character into mojibake (e.g. "obbligatorietà" -> "obbligatorietÃ",
# "l’" -> "lâ"), which split and corrupted tokens downstream.
# NOTE(review): if a row turns out not to be valid UTF-8 this raises
# UnicodeDecodeError instead of silently mis-decoding — inspect that row.
df = pd.read_csv(data_path+file_name+extension, encoding='utf-8')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Inspect column names, dtypes and non-null counts of the loaded corpus
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1440 entries, 0 to 1439
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   cba_id       1440 non-null   object
 1   country      1440 non-null   object
 2   countrycode  1440 non-null   int64 
 3   locale       1440 non-null   object
 4   text         1440 non-null   object
dtypes: int64(1), object(4)
memory usage: 56.4+ KB


In [None]:
# Derive a language code per text: the first two characters of its locale
df['language'] = df['locale'].str[:2]

# Distinct language codes present in the corpus
df['language'].unique()

array(['FR', 'EN', 'DA', 'MT', 'DE', 'SL', 'LT', 'SV', 'TR', 'IT', 'FI',
       'BG', 'SK', 'CS', 'PT', 'NL', 'EL', 'RO', 'HR', 'ES', 'HU', 'ET',
       'PL', 'KM', 'VI', 'BA', 'AM', 'SR'], dtype=object)

Each language gets one embedding space

## Preprocessing

In [None]:
# Preprocessing: split every text into sentences, then every sentence into word tokens

## Sentence segmentation
# NOTE(review): no language argument is passed, so the same default Punkt model
# is applied to the texts of every language — confirm this is intended.
df['sen_tokenized'] = df['text'].map(nltk.sent_tokenize)

## Word tokenization: keep maximal runs of word characters (\w+),
## which drops punctuation and whitespace
tokenizer = RegexpTokenizer(r'\w+')
df['tokenized'] = [
  [tokenizer.tokenize(sentence) for sentence in sentences]
  for sentences in df['sen_tokenized']
]

# Token count of each text, summed over its sentences
df['length'] = df['tokenized'].map(lambda sentences: sum(len(tokens) for tokens in sentences))

# Check the total amount of tokens
print("Total tokens data-set: ",sum(df['length']))

# Check one text
df['tokenized'][10]

Total tokens data-set:  32320515


[['xml',
  'version',
  '1',
  '0',
  'encoding',
  'UTF',
  '8',
  'meta',
  'http',
  'equiv',
  'content',
  'type',
  'content',
  'text',
  'html',
  'charset',
  'UTF',
  '8',
  'title',
  'calcestruzzo',
  'title',
  'meta',
  'name',
  'generator',
  'content',
  'Amaya',
  'see',
  'http',
  'www',
  'w3',
  'org',
  'Amaya',
  'h1',
  'Contratto',
  'collettivo',
  'di',
  'lavoro',
  'per',
  'lâ',
  'industria',
  'svizzera',
  'dei',
  'prodotti',
  'in',
  'calcestruzzo',
  '2003',
  'h1',
  'div',
  'class',
  'cbaClause',
  'highlight',
  'id',
  'clause',
  'cbaratified',
  'h2',
  'Decreto',
  'del',
  'Consiglio',
  'federale',
  'che',
  'conferisce',
  'obbligatorietÃ',
  'generale',
  'al',
  'contratto',
  'collettivo',
  'di',
  'lavoro',
  'per',
  'lâ',
  'industria',
  'svizzera',
  'dei',
  'prodotti',
  'in',
  'calcestruzzo',
  'del',
  '10',
  'luglio',
  '2003',
  'h2',
  'div',
  'p',
  'fine',
  'del',
  'contratto',
  'collettivo',
  'di',
  'lavoro',

In [None]:
## Flatten each text into a single token list and keep only purely
## alphabetic tokens (numbers and mixed tokens such as 'w3' or 'h1' are dropped)

df['no_punct'] = [
  [token for sentence in sentences for token in sentence if token.isalpha()]
  for sentences in df['tokenized']
]

# Check one text
df['no_punct'][10]

['xml',
 'version',
 'encoding',
 'UTF',
 'meta',
 'http',
 'equiv',
 'content',
 'type',
 'content',
 'text',
 'html',
 'charset',
 'UTF',
 'title',
 'calcestruzzo',
 'title',
 'meta',
 'name',
 'generator',
 'content',
 'Amaya',
 'see',
 'http',
 'www',
 'org',
 'Amaya',
 'Contratto',
 'collettivo',
 'di',
 'lavoro',
 'per',
 'lâ',
 'industria',
 'svizzera',
 'dei',
 'prodotti',
 'in',
 'calcestruzzo',
 'div',
 'class',
 'cbaClause',
 'highlight',
 'id',
 'clause',
 'cbaratified',
 'Decreto',
 'del',
 'Consiglio',
 'federale',
 'che',
 'conferisce',
 'obbligatorietÃ',
 'generale',
 'al',
 'contratto',
 'collettivo',
 'di',
 'lavoro',
 'per',
 'lâ',
 'industria',
 'svizzera',
 'dei',
 'prodotti',
 'in',
 'calcestruzzo',
 'del',
 'luglio',
 'div',
 'p',
 'fine',
 'del',
 'contratto',
 'collettivo',
 'di',
 'lavoro',
 'quanto',
 'lo',
 'richieda',
 'il',
 'disbrigo',
 'delle',
 'pendenze',
 'o',
 'di',
 'altri',
 'avvenimenti',
 'che',
 'rientrano',
 'nella',
 'durata',
 'di',
 'validit

In [None]:
## Lowercase every token of every text
df['lowercase'] = [[token.lower() for token in tokens] for tokens in df['no_punct']]

## Subdividing by language

In [None]:
def make_list(lang, df):
  """Collect the lowercased token lists of all texts whose language is ``lang``.

  Args:
    lang: value matched against the ``language`` column.
    df: DataFrame with a ``language`` column and a ``lowercase`` column that
      holds one list of tokens per text.

  Returns:
    A list with one token list per matching text, in row order. Spaces,
    apostrophes and double quotes are removed from every token.
  """
  selected = df.loc[df['language'] == lang, 'lowercase'].tolist()
  strip_chars = re.compile("[ '\"]")
  return [[strip_chars.sub("", token) for token in tokens] for tokens in selected]

In [None]:
# Map every language code to the list of token lists of its texts
all_texts = {
  "%s" % language: make_list(language, df)
  for language in df['language'].unique()
}

In [None]:
# Sanity check: number of tokens in the second text stored under the 'NL' key
len(all_texts['NL'][1])

56051

## Build embedding spaces

In [None]:
import multiprocessing

# CPU count of the current machine; reused below as the Word2Vec worker count
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

cores

2

In [None]:
# Number of texts available for each language
for language_code, documents in all_texts.items():
  print(len(documents),language_code)

138 FR
414 EN
14 DA
3 MT
11 DE
5 SL
1 LT
15 SV
25 TR
86 IT
3 FI
1 BG
8 SK
3 CS
94 PT
102 NL
8 EL
24 RO
6 HR
256 ES
10 HU
4 ET
10 PL
5 KM
5 VI
176 BA
12 AM
1 SR


In [None]:
# Parameters
m_count = 10   # minimum count: ignores all words with a lower total absolute frequency (values noted: 10 20 30)
window = 5     # maximum distance between current and predicted word within a sentence
s = 300        # dimensionality of the feature vectors (values noted: 100 300)
model_type = 1 # 0: CBOW, 1: skip-gram
sample = 0     # threshold for random downsampling of high-frequency words; 0 switches downsampling off
alpha = 0.05   # initial learning rate
epochs = 10
min_alpha = alpha/epochs # learning rate drops linearly to min_alpha as training progresses
negative = 10  # if > 0, negative sampling is used and this many "noise words" are drawn;
               # if set to 0, no negative sampling is used (5-20)
workers = cores # how many worker threads train the model
seed = 2       # NOTE: with workers > 1 the runs are still not exactly reproducible (thread scheduling)

# gensim silently ignores everything past the first 10,000 tokens of a single
# training sentence. Every text here is one flat token list (one NL text alone
# has 56,051 tokens), so texts are cut into chunks of at most this many tokens;
# otherwise most of each long text never reaches training.
max_sentence_len = 10000

# Output directory for the trained models
model_path = '/content/drive/My Drive/dhh21/models/'

# Train and save embeddings (one model per language)
for lan, text in all_texts.items():
  # Split over-long texts so that no tokens are dropped during training
  sentences = [doc[start:start+max_sentence_len]
               for doc in text
               for start in range(0, len(doc), max_sentence_len)]

  # set parameters in model (gensim 3.x keyword names, e.g. `size`)
  model = Word2Vec(min_count=m_count,
                   window=window,
                   size=s,
                   sample=sample,  # previously declared above but never passed, so the library default was used
                   alpha=alpha,
                   sg=model_type,
                   min_alpha=min_alpha,
                   negative=negative,
                   seed=seed,
                   workers=workers)

  # Build vocabulary table
  model.build_vocab(sentences, progress_per=10000)

  # Train model
  model.train(sentences, total_examples=model.corpus_count, epochs=epochs, report_delay=2)

  # Summarize model
  print(model)

  # Summarize vocabulary (gensim 3.x: the vocabulary lives in model.wv.vocab)
  words = list(model.wv.vocab)
  print("\nVocabulary (first 50): \n", words[:50])

  # Save model; the language code is the file-name prefix.
  # A local name is used here so the `file_name` of the data-loading cell is not overwritten.
  word_vectors = model.wv

  word_vectors.save(model_path+lan+"_word2vec.wordvectors") # store just the words + their trained embeddings
  model.save(model_path+lan+"_word2vec.model") # saving the whole model

  # Load back with memory-mapping = read-only, shared across processes.
  #wv = KeyedVectors.load(model_path+lan+"_word2vec.wordvectors", mmap='r')

Word2Vec(vocab=8633, size=300, alpha=0.05)

Vocabulary (first 50): 
 ['xml', 'version', 'encoding', 'utf', 'meta', 'http', 'equiv', 'content', 'type', 'text', 'html', 'charset', 'title', 'name', 'generator', 'amaya', 'see', 'www', 'org', 'contratto', 'collettivo', 'di', 'lavoro', 'dei', 'laboratori', 'div', 'class', 'cbaclause', 'highlight', 'id', 'clause', 'decreto', 'del', 'consiglio', 'federale', 'che', 'conferisce', 'obbligatorietã', 'generale', 'al', 'svizzeri', 'aprile', 'art', 'p', 'il', 'presente', 'entro', 'in', 'vigore', 'giugno']
Word2Vec(vocab=6331, size=300, alpha=0.05)

Vocabulary (first 50): 
 ['meta', 'http', 'equiv', 'content', 'type', 'text', 'html', 'charset', 'utf', 'title', 'name', 'generator', 'amaya', 'see', 'www', 'org', 'european', 'appendix', 'to', 'abb', 'group', 'social', 'policy', 'preamble', 'p', 'in', 'addition', 'groupâ', 's', 'which', 'is', 'wholly', 'supported', 'by', 'the', 'employees', 'council', 'europe', 'following', 'also', 'applies', 'specificall

## Semantic neighborhood density

In [None]:
# Load wordvectors
from gensim.models import word2vec
from gensim.models import KeyedVectors
from google.colab import drive
import numpy as np
from numpy import dot
from numpy.linalg import norm
from gensim.models import Word2Vec
import pandas as pd
import random


seed = 24

model_path = "/content/drive/My Drive/dhh21/models/"
extension = "_word2vec.wordvectors"

# Load the saved word vectors of every language, memory-mapped read-only
all_models = {
  "%s" % language: KeyedVectors.load(model_path+language+extension, mmap='r')
  for language in df['language'].unique()
}

In [None]:
# Cosine similarity between two word vectors of the 'EN' model.
# all_models already holds KeyedVectors objects, so similarity() is called on
# them directly; the `.wv` alias on KeyedVectors is deprecated (it emitted a
# warning here) and no longer exists in gensim 4.
all_models["EN"].similarity("worker", "employee")

  """Entry point for launching an IPython kernel.


0.40750203

In [None]:
# Create a dictionary of dictionaries: for each language, every word in the
# vocabulary gets a semantic neighborhood density (SND) score, i.e. the mean
# cosine similarity to its nearest neighbors.
top_n = 10  # neighborhood size (made explicit; it is also the library default)

all_snd = {}
for language in df['language'].unique(): # Language loop

  vectors = all_models[language]
  vocabulary = vectors.vocab  # gensim 3.x vocabulary mapping

  snd = {}
  for word in vocabulary: # for each word in the vocabulary of a language
    # Query the KeyedVectors object directly: its `.wv` alias is deprecated
    # (it emitted a warning per call) and is gone in gensim 4.
    neighbors = vectors.most_similar(positive=[word], topn=top_n)
    cos_scores = [score for _, score in neighbors] # cosine similarities of the nearest neighbors
    snd[word] = sum(cos_scores)/len(cos_scores) # SND: average neighbor similarity
  all_snd["%s" %language] = snd # Save the dictionary of SND scores per language


  # Remove the CWD from sys.path while we load stuff.


In [None]:
# Keep, per language, only the extreme words: SND score above 0.9 or below 0.15

low_high = {
  "%s" % language: {
    "%s" % word: score
    for word, score in all_snd[language].items()
    if score > 0.9 or score < 0.15
  }
  for language in df['language'].unique()
}




In [None]:
# Inspect the extreme-SND words kept under the 'IT' key
low_high["IT"]

{'acute': 0.14214195013046266,
 'adolfo': 0.907177472114563,
 'ag': 0.9196115434169769,
 'aldo': 0.9217536270618438,
 'alessandra': 0.9100626349449158,
 'alessio': 0.9141308248043061,
 'alfonso': 0.9332591533660889,
 'alfredo': 0.928644448518753,
 'amendola': 0.936097902059555,
 'amministratori': 0.13145063146948816,
 'aneurismi': 0.13924963921308517,
 'annunziata': 0.927486139535904,
 'antonello': 0.9130688488483429,
 'asportazione': 0.13446995317935945,
 'assestamento': 0.11908338293433189,
 'assistibilitã': 0.1497451439499855,
 'attilio': 0.9212391257286072,
 'aum': 0.14159672036767007,
 'barbara': 0.9044173955917358,
 'bimbo': 0.12682541981339454,
 'capital': 0.9046370327472687,
 'carla': 0.9009617328643799,
 'carmelo': 0.9094054043293,
 'carmine': 0.9197932660579682,
 'cassagest': 0.1443023756146431,
 'cf': 0.1244855098426342,
 'cfu': 0.13114324808120728,
 'charset': 0.917548394203186,
 'chiara': 0.9434709072113037,
 'cinzia': 0.912238609790802,
 'colombo': 0.950298011302948,
 'co

In [None]:
# Persist the SND scores of all languages to Drive as a single pickle file
import pickle

with open('/content/drive/My Drive/dhh21/' + "SND_scores" + '.pkl', 'wb') as snd_file:
  pickle.dump(all_snd, snd_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the pickled SND scores and print the 'EN' entries whose word contains "contract".
# NOTE: pickle.load executes arbitrary code from the file — only open files you wrote yourself.
with open('/content/drive/My Drive/dhh21/' + "SND_scores" + '.pkl', 'rb') as snd_file:
  test = pickle.load(snd_file)

for word, score in test["EN"].items():
  if "contract" in word:
    print(score, word)


0.33133524656295776 contractual
0.3570935666561127 contract
0.3374549686908722 contracts
0.5082933962345123 contractseverancepay
0.48957888782024384 subcontractors
0.496165868639946 contractor
0.4308527082204819 contracted
0.42139846086502075 contracting
0.46933728754520415 contractors
0.5467705249786377 contracttrial
0.5552005469799042 contracttrialperiod


In [None]:
from math import sqrt

# Evaluate sqrt(2 - 2*c) for two sample values of c.
# NOTE(review): presumably c is a cosine similarity and the result the Euclidean
# distance between the corresponding unit vectors — confirm.
for c in (0.33, 0.93):
  print(sqrt(2-(2*c)))

1.1575836902790224
0.374165738677394


## Proportion of abstract words per language

In [None]:
# Per language, keep the words whose SND score is below 0.40
lowest15 = {
  "%s" % language: {
    "%s" % word: score
    for word, score in all_snd[language].items()
    if score < 0.40
  }
  for language in df['language'].unique()
}

In [None]:
def abstract_ratio(lan):
  """Return ``(lan, ratio)`` for one language.

  ``ratio`` is the number of words stored for ``lan`` in the module-level
  ``lowest15`` dictionary divided by the number of words stored for ``lan``
  in ``all_snd``.
  """
  n_abstract = len(lowest15[lan])
  n_total = len(all_snd[lan])
  return (lan, n_abstract/n_total)

# Print the (language, ratio) pair for every language in the corpus
for lan in df['language'].unique():
  print(abstract_ratio(lan))

('FR', 0.12602803197034634)
('EN', 0.3034275785815827)
('DA', 0.017094017094017096)
('MT', 0.009408602150537635)
('DE', 0.021505376344086023)
('SL', 0.014145810663764961)
('LT', 0.0)
('SV', 0.06621004566210045)
('TR', 0.05172413793103448)
('IT', 0.10406450909883404)
('FI', 0.015189873417721518)
('BG', 0.0)
('SK', 0.009719222462203024)
('CS', 0.0)
('PT', 0.14953751284686537)
('NL', 0.13967774420946627)
('EL', 0.0)
('RO', 0.07679558011049724)
('HR', 0.02040816326530612)
('ES', 0.26329814492159476)
('HU', 0.02598267821452365)
('ET', 0.0)
('PL', 0.012521343198634035)
('KM', 0.0)
('VI', 0.03508771929824561)
('BA', 0.27980956502921445)
('AM', 0.026954177897574125)
('SR', 0.0)
