This notebook creates a word embedding space from a preprocessed CSV file containing texts.

In [None]:
## Import packages

import multiprocessing
import os
import re

import gensim
import pandas as pd
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from google.colab import drive

In [None]:
## Mount drive
# Makes the Google Drive contents available under /content/drive (Colab only).
drive.mount('/content/drive')

## Load data
data_path = '/path to data folder'  # Placeholder: set this to the folder that holds the csv
file_name = '1618_1791_preprocessed' # Change the file for which you want to create the embedding here
extension = '.csv'

# os.path.join inserts the path separator when data_path has no trailing slash;
# plain string concatenation would silently produce a wrong file name in that case.
csv_path = os.path.join(data_path, file_name + extension)

# NOTE(review): the frame previews in this notebook contain sequences such as 'vÃ³Ã³r',
# which look like UTF-8 text decoded as latin1 — confirm the real encoding of the csv
# before changing it; 'latin1' is kept here to preserve the current behaviour.
df = pd.read_csv(csv_path, encoding='latin1')

Mounted at /content/drive


In [None]:
# Quick sanity check of the loaded frame: column dtypes / non-null counts,
# followed by a preview of the first five rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83882 entries, 0 to 83882
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   level_0     83882 non-null  int64 
 1   type        83882 non-null  object
 2   title       83881 non-null  object
 3   papertitle  83882 non-null  object
 4   date        83882 non-null  object
 5   text        83882 non-null  object
dtypes: int64(1), object(5)
memory usage: 4.5+ MB


Unnamed: 0,level_0,type,title,papertitle,date,text
0,0,artikel,PORTUGAL.,Johannes Enschede en Zoonen,1775/05/04 00:00:00,"['lissabon', 'den', 'april', 't', 'zeer', 'gem..."
1,1,artikel,PORTUGAL.,"Callenfels, Mandelgreen en Taillefert",1775/05/09 00:00:00,"['lissabon', 'deni', 'april', 'mszeer', 'femee..."
2,2,artikel,ZWEEDEN.,C. Barlinckhoff [etc];A.S. Hoitsema,1775/12/01 00:00:00,"['stokkolm', 'den', 'november', 'deezer', 'dag..."
3,3,advertentie,Advertentie,Jan Verlem,1785/09/05 00:00:00,"['etÃªr', 'Ã®', 'tiffetij', 'sfitts', 'ftict',..."
4,4,artikel,ITALIÃÂN.,"Callenfels, Mandelgreen en Taillefert",1768/10/08 00:00:00,"['romen', 'september', 'bevel', 'zyn', 'heil',..."


In [None]:
# Number of CPU cores reported for this machine; used further down as the
# number of Word2Vec worker threads. `multiprocessing` comes from the imports cell.
cores = multiprocessing.cpu_count()

cores

4

In [None]:
## Making a list from the text column
# Each entry of df['text'] is the string form of a Python list ("['lissabon', 'den', ...]").
# Strip the surrounding brackets and split on commas. The result goes into a new
# variable instead of overwriting df['text'], so this cell can safely be re-run:
# running the string operations on an already-converted column would corrupt it.
text_dirty = df['text'].str[1:-1].str.split(',').tolist() # List of lists of raw tokens

# Remove the ' , ", and spaces from each token, and drop tokens that end up empty
# (a row holding "[]" would otherwise contribute a bogus '' word to the corpus).
text = [
    [token for token in (re.sub("[ '\"]", "", raw_token) for raw_token in article) if token]
    for article in text_dirty
]
text[10] # Check one item from the list

['genua',
 'den',
 'february',
 'engelfe',
 'conful',
 'alhier',
 'or',
 'dre',
 'hof',
 'groot',
 'brittannien',
 'onze',
 'regeering',
 'verklaart',
 'j',
 'dezelve',
 'uytwerkiogen',
 'gramfchap',
 'zyn',
 'groot',
 'brittan',
 'nifc',
 'majcfteyt',
 'gevoelen',
 'ingevalle',
 'dezelve',
 'direit',
 'indirect',
 'i',
 'proje',
 'cn',
 'liet',
 'hof',
 'vr',
 'n',
 'spanjen',
 'c',
 'zedert',
 'fteld',
 'zig',
 'raat',
 'defeiilie',
 'do',
 'weerzydli',
 'vlooten',
 'toulon',
 'i',
 'enbyde',
 'hicrifc',
 'eylanden',
 'noch',
 'tyding',
 'eenige',
 'verandering',
 'keui',
 'bn',
 'den',
 'i',
 'tbruary',
 'eenige',
 'dagen',
 'eene',
 'muur',
 'beboorende',
 'witte',
 'vrouwen',
 'kloofter',
 'onvoorziens',
 'ingeftort',
 'heefteen',
 'vrouws',
 'perfoon',
 'verpletterd',
 'ander',
 'zeergequetft',
 'i',
 'konncn',
 'ontdekken',
 'welke',
 'oorzaak',
 'inltorj',
 'ting',
 'dien',
 'jvluur',
 'weeft',
 'vertelt',
 'volgends',
 'gemelde',
 'klooller',
 'eene',
 'non',
 'opgetlootcn',
 

Gensim word2vec documentation
https://radimrehurek.com/gensim/models/word2vec.html

Tutorial on word2vec and gensim: https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial

Tutorial on streaming data to avoid RAM issues
https://rare-technologies.com/data-streaming-in-python-generators-iterators-iterables/

In [None]:
# Parameters
m_count = 10    # minimum count: ignores all words with a lower total absolute frequency (e.g. 10, 20, 30)
window = 5      # the maximum distance between current and predicted word within a sentence
s = 300         # dimensionality of the feature vectors (e.g. 100, 300)
model_type = 1  # 0:CBOW 1:skip-gram
sample = 0      # threshold to configure which high-frequency words are randomly downsampled; 0 disables downsampling
alpha = 0.05    # initial learning rate
epochs = 10
min_alpha = alpha/epochs # Learning rate will linearly drop to min_alpha as training progresses. set: alpha - (min_alpha * epochs) ~ 0.00
negative = 10   # If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn.
                # If set to 0, no negative sampling is used. - (5, 20)
workers = cores # how many workers train the model
seed = 2        # NOTE: gensim only guarantees fully reproducible runs with workers=1 (and a fixed PYTHONHASHSEED)

# gensim >= 4.0 renamed the `size` argument to `vector_size`; choose the keyword
# that matches the installed version so the cell runs on both.
size_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

# set parameters in model
# No corpus is passed here: the vocabulary is built and the model trained explicitly below.
# `sample` is passed explicitly; it was previously defined but never handed to the
# model, so gensim's default (1e-3) was silently used instead of the value above.
model = Word2Vec(min_count=m_count,
                 window=window,
                 alpha=alpha,
                 sg=model_type,
                 sample=sample,
                 min_alpha=min_alpha,
                 negative=negative,
                 seed=seed,
                 workers=workers,
                 **{size_kwarg: s})

# Build vocabulary table
model.build_vocab(text, progress_per=10000)

# Train model
model.train(text, total_examples=model.corpus_count, epochs=epochs, report_delay=2)

(485044017, 632451880)

In [None]:
# Summarize model
print(model)

# Summarize vocabulary
# gensim >= 4.0 exposes the vocabulary as the list `wv.index_to_key`;
# older releases only have the `wv.vocab` dict (removed in 4.0).
if hasattr(model.wv, 'index_to_key'):
    words = list(model.wv.index_to_key)
else:
    words = list(model.wv.vocab)
print("\nVocabulary (first 50): \n", words[:50])

# Save model
model_path = 'path to folder where to save the model'  # Placeholder: set this to the output folder

# os.path.join adds the path separator when model_path has no trailing slash;
# plain concatenation would write files next to the folder with a mangled name.
output_base = os.path.join(model_path, file_name + "_word2vec")

word_vectors = model.wv

word_vectors.save(output_base + ".wordvectors") #Store just the words + their trained embeddings.
model.save(output_base + ".model") #saving the whole model

# Load back with memory-mapping = read-only, shared across processes.
wv = KeyedVectors.load(output_base + ".wordvectors", mmap='r')

#vector = wv['computer']  # Get numpy vector of a word

Word2Vec(vocab=232193, size=300, alpha=0.05)

Vocabulary (first 50): 
 ['lissabon', 'den', 'april', 't', 'zeer', 'gemeen', 'alle', 'lyden', 'veele', 'voorbeelden', 'gezien', 'vrouwen', 'mans', 'gewaad', 'verkleed', 'vÃ³Ã³r', 'mannen', 'zyn', 'gehouden', 'kleed', 'vrouw', 'geruimen', 'tyd', 'voer', 'doorgegaan', 'zulks', 'wy', 'nochtans', 'deezer', 'dagen', 'voorbeeld', 'gehad', 'eenen', 'zederd', 'jaaren', 'sexe', 'omtrent', 'midden', 'voorige', 'maand', 'wierd', 'gafthuis', 'hospitaal', 'peifoon', 'gebragt', 'welk', 'ten', 'ras', 'ha', 'ir']


In [None]:
# Test if the saved file works: query the reloaded vectors for the ten nearest
# neighbours of "vrouwen".
wv.most_similar(positive=["vrouwen"], topn=10)

[('vronwen', 0.5560520887374878),
 ('dooronania', 0.5454427599906921),
 ('bywyven', 0.5393435955047607),
 ('viouwen', 0.5298631191253662),
 ('uytgeroeid', 0.5151073336601257),
 ('huwbare', 0.5143763422966003),
 ('huwbaar', 0.5133192539215088),
 ('vtouwen', 0.5106829404830933),
 ('vrouwenen', 0.5078840851783752),
 ('ongetrouwden', 0.5046734809875488)]