# Imports

In [None]:
# Install (or upgrade) CADE from PyPI, then the gensim fork hosted on GitHub.
# NOTE(review): presumably this fork is the gensim build CADE needs — confirm.
!pip install -U cade
!pip install git+https://github.com/valedica/gensim.git

In [None]:
# Install gensim from PyPI.
# NOTE(review): this runs after the gensim fork was installed in the previous
# cell — confirm it does not replace the fork with a different version.
!pip install gensim

In [None]:
import os
from gensim.models import Word2Vec
from collections import Counter
import random
from cade.cade import CADE
import string

In [None]:
# Mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#os.chdir('/content/drive/MyDrive/Magistrale/Secondo semestre/DS/Progetto/')

# Functions 

In [None]:
def getsentences_W2V(file_name):
  """Load the tokenised sentences stored in ``<file_name>_nl.txt``.

  Each non-empty line is expected to look like the ``str()`` of a Python list
  of tokens, e.g. ``['in', 'the', 'beginning']``. The enclosing brackets, all
  single quotes and all spaces are removed and the rest is split on commas.

  Args:
    file_name: path of the corpus file without the ``_nl.txt`` suffix.

  Returns:
    A list of sentences, each one a list of token strings. Blank lines and
    empty lists (``[]``) are skipped instead of producing a ``['']`` sentence.
  """
  #os.chdir('/content/drive/MyDrive/Magistrale/Secondo semestre/DS/Progetto/Sentences_nl')
  sentences = []

  with open(file_name + '_nl.txt', 'r') as fp:
      for line in fp:
          # Strip surrounding whitespace and the brackets explicitly rather
          # than slicing a fixed number of characters: a line with trailing
          # blanks would otherwise keep its closing bracket in the last token.
          row = line.strip()
          if row.startswith('['):
              row = row[1:]
          if row.endswith(']'):
              row = row[:-1]

          row = row.replace('\'', '').replace(' ', '')
          if not row:
              # Blank line or empty list: there is no sentence to keep.
              continue

          sentences.append(row.split(','))

  return sentences

In [None]:
def getsentences_CADE(filename):
  """Flatten ``<filename>.txt`` into the plain-text file ``<filename>_nl_cade.txt``.

  Square brackets, single quotes and commas are removed from every line and
  each newline is turned into a space, so the written text contains no line
  breaks. The function writes the file and returns nothing.

  NOTE(review): the input is ``<filename>.txt`` while getsentences_W2V reads
  ``<file_name>_nl.txt`` — confirm this is the intended input file.
  """
  #os.chdir('/content/drive/MyDrive/Magistrale/Secondo semestre/DS/Progetto/Sentences_nl')
  # A single translation table applies all per-character substitutions.
  cleanup = str.maketrans({'[': None, ']': None, '\'': None, ',': None, '\n': ' '})

  with open(filename + '.txt', 'r') as source:
      pieces = [row.translate(cleanup) for row in source]

  with open(filename + '_nl_cade.txt', 'w') as target:
      target.write(''.join(pieces))

In [None]:
def training_W2V(sentences, holytext):
  """Train 30 Word2Vec models on `sentences` and save each of them.

  All runs share the same hyper-parameters. Run number ``k`` (0 to 29) is
  saved as ``<holytext.lower()>_<k>.model``, a path relative to the current
  working directory.
  """
  #os.chdir('/content/drive/MyDrive/Magistrale/Secondo semestre/DS/Progetto/' + holytext + '_nl_W2V_embeddings')
  # Hyper-parameters common to every run; `window` is not set, so the
  # library default applies (the original author notes it as 5).
  hyperparams = dict(
      min_count=10,  # ignore words with absolute frequency < 10
      size=300,      # vector size
      sg=1,          # skip-gram algorithm
      hs=0,
      negative=5,    # negative sampling with 5 noise words
      workers=5,     # faster process
      iter=6,        # 6 iterations
  )

  for run in range(30):
    w2v = Word2Vec(sentences=sentences, **hyperparams)
    w2v.save(f"{holytext.lower()}_{run}.model")

In [None]:
def training_CADE(holytext, religion):
  """Train 30 CADE slice models for one corpus.

  On every run the compass is trained again on ``compass.txt`` and a slice is
  trained on the corpus ``<religion>_sentences_nl_cade.txt``. While the slice
  is trained and saved, the corpus file is temporarily renamed to
  ``<holytext>_nl_cade<run>.txt``; afterwards it gets its original name back.
  NOTE(review): presumably the rename exists because CADE names the saved
  model after the input file — confirm.

  Args:
    holytext: name used to build the temporary per-run file name.
    religion: prefix of the corpus file ``<religion>_sentences_nl_cade.txt``.
  """
  #os.chdir('/content/drive/MyDrive/Magistrale/Secondo semestre/DS/Progetto/' + holytext + '_nl_CADE_embeddings')

  # NOTE(review): only `siter` is set to 6 here; if CADE has a separate
  # iteration count for slice training it keeps its default — confirm this
  # matches the intended 6 iterations.
  aligner = CADE(min_count=10,
                  size=300,
                  sg = 1,
                  #hs = 0,
                  ns = 5,
                  workers = 5,
                  siter = 6)

  corpus_path = religion + '_sentences_nl_cade.txt'

  for it in range(30):
    aligner.train_compass('compass.txt', overwrite=True)

    run_path = holytext + '_nl_cade' + str(it) + '.txt'
    os.rename(corpus_path, run_path)
    try:
      aligner.train_slice(run_path, save=True)
    finally:
      # Restore the original file name even when training fails; otherwise
      # the corpus stays under its temporary name and every later run (or a
      # re-execution of the cell) cannot find it.
      os.rename(run_path, corpus_path)

# Introduction
We want to perform a semantic analysis of the collected corpora using word2vec embeddings. Since the main focus of our work is the exploration and comparison of the resulting embeddings, we have to take their stability into account.
In fact, because of their stochastic component, word2vec embeddings are not stable: for example, the words most similar to a given word can change between models even when the models are trained on the same corpus.

To increase the stability of the resulting models, we make the following decisions. We use the Skip-Gram method because it seems to work better than other methods on semantic tasks (Mikolov et al., 2013) such as ours. We use Skip-Gram with Negative Sampling with 5 noise words because it seems to be more stable than Skip-Gram with Hierarchical Softmax (Hellrich & Hahn, 2016). We perform 6 iterations over the corpus for each embedding because this seems to be a good trade-off between computational cost and stability (Hellrich & Hahn, 2016). We set a context window of 5, a minimum frequency of 10 and a vector size of 300, as these are commonly used values.

In addition, to increase the significance of our conclusions, we train 30 embeddings for each corpus instead of one and perform the analysis by combining their results (Martina Schories, 2020).

We use Word2vec to create models for exploring each corpus individually. We use CADE to align the corpora and create further Word2vec models for comparing them. We use the same parameters described above in both cases.


# Word2vec - training 

In [None]:
# Corpus file prefix -> name of the corresponding holy text.
rel_dict = {
    'Christian_sentences': 'Bible',
    'Islam_sentences': 'Quran',
    'Hinduism_sentences': 'VedasUpanishads',
    'Buddhism_sentences': 'Tripitaka',
}

# Load every corpus and train/save its Word2Vec models.
for text in rel_dict:
  sentences = getsentences_W2V(text)
  training_W2V(sentences, rel_dict[text])

# CADE - training

In [None]:
# Write the flattened `<text>_nl_cade.txt` file for every corpus.
# getsentences_CADE has no return statement, so `sentences` ends up as None.
for text in rel_dict:
  sentences = getsentences_CADE(text)

In [None]:
#os.chdir('/content/drive/MyDrive/Magistrale/Secondo semestre/DS/Progetto/' + holytext + '_nl_CADE_embeddings')

In [None]:
# Concatenate the four `*_sentences_nl_cade.txt` corpora into compass.txt,
# the file that training_CADE passes to train_compass.
!cat Christian_sentences_nl_cade.txt Islam_sentences_nl_cade.txt Hinduism_sentences_nl_cade.txt Buddhism_sentences_nl_cade.txt > compass.txt

In [None]:
# training_CADE takes (holytext, religion): the holy-text name and the prefix
# of the `<religion>_sentences_nl_cade.txt` corpus file. The keys of rel_dict
# are `<religion>_sentences`, so the prefix is the key without that suffix.
for text in rel_dict:
  training_CADE(rel_dict[text], text.replace('_sentences', ''))