# Gensim

In [1]:
# load an example corpus as list of tokenised sentences
doc = list()
with open("../data/cl-stanza.conllu", "rt") as f:
    sentence = list()
    for line in f:
        if line.strip() == "":
            continue
        if not line.startswith("#"):
            sentence.append(line.strip().split("\t")[2])
        elif line.startswith("# sent"):
            if len(sentence) > 0:
                doc.append(sentence)
            sentence = list()

In [2]:
from gensim import corpora

dictionary = corpora.Dictionary(doc)
print(dictionary)

Dictionary<1051 unique tokens: [',', '.', 'Computational', 'a', 'approach']...>


In [3]:
vec = dictionary.doc2bow(doc[10])
vec

[(1, 1),
 (8, 1),
 (14, 2),
 (25, 1),
 (28, 1),
 (32, 1),
 (41, 1),
 (62, 1),
 (63, 1),
 (101, 1),
 (108, 2),
 (126, 1),
 (127, 1),
 (128, 1)]

In [4]:
bow_corpus = [dictionary.doc2bow(sentence) for sentence in doc]
print(bow_corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1)], [(0, 11), (1, 1), (8, 1), (14, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 2), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 2), (42, 1)], [(0, 1), (1, 1), (3, 2), (6, 1), (8, 1), (12, 1), (14, 1), (16, 1), (17, 3), (20, 2), (21, 1), (27, 1), (29, 2), (32, 1), (33, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1)], [(0, 1), (1, 1), (2, 1), (10, 1), (17, 2), (20, 5), (23, 1), (25, 2), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1)], [(1, 1), (2, 1), (3, 1), (6, 1), (8, 2), (12, 1), (14, 1), (17, 1), (19, 1), (20, 2), (60, 1), (61, 1), (68, 1), (72, 1)

# Word Vectors

## eigenes Modell erstellen

In [5]:
from gensim.models import Word2Vec

model = Word2Vec(doc, vector_size=24, epochs=100)
word_vectors = model.wv

print(word_vectors)

KeyedVectors<vector_size=24, 145 keys>


In [6]:
print(word_vectors['linguistics'])

[ 1.7967169  -0.20208105 -0.18179016  0.5991084   0.23164481 -0.2145266
  0.3477853   1.2548358   0.72315013  0.3746076   0.72004646 -0.6576926
  0.03615884 -0.17627965 -0.8289117  -1.3597018   0.27324116 -0.19430436
  0.16330776 -0.735643    0.28826642  0.10702375  1.4444692   0.11934896]


## Ähnlichkeiten berechnen

In [7]:
word_vectors.similarity("linguistics", "computational")

0.93920165

In [8]:
word_vectors.most_similar("linguistics")

[('computational', 0.9392016530036926),
 ('field', 0.8981980681419373),
 ('theoretical', 0.8688869476318359),
 ('area', 0.8649697303771973),
 ('apply', 0.8636681437492371),
 ('Computational', 0.8623982667922974),
 ('research', 0.8565813302993774),
 ('science', 0.8342342376708984),
 ('study', 0.8019986152648926),
 ('application', 0.7885540127754211)]

In [9]:
try:
    word_vectors.similarity("linguistics", "sociology")
except KeyError:
    pass

## vortrainierte Vektoren

In [None]:
import gensim.downloader as api
# this takes some time to download and load the model
word_vectors = api.load("glove-wiki-gigaword-100")

In [None]:
result = word_vectors.most_similar('linguistics')
most_similar_key, similarity = result[0]
print(f"{most_similar_key}: {similarity:.4f}")

In [None]:
result = word_vectors.most_similar('sociology')
most_similar_key, similarity = result[0]
print(f"{most_similar_key}: {similarity:.4f}")

In [None]:
result = word_vectors.most_similar('linguistics', negative='anthropology')
most_similar_key, similarity = result[0]
print(f"{most_similar_key}: {similarity:.4f}")

In [None]:
result = word_vectors.most_similar(['woman', 'king'], negative='man')
most_similar_key, similarity = result[0]
print(f"{most_similar_key}: {similarity:.4f}")

In [None]:
result = word_vectors.most_similar(['woman', 'king'])
most_similar_key, similarity = result[0]
print(f"{most_similar_key}: {similarity:.4f}")

In [None]:
result = word_vectors.most_similar(['woman', 'king'], negative='man')
most_similar_key, similarity = result[0]
print(f"{most_similar_key}: {similarity:.4f}")

## MWUs

In [None]:
similarity = word_vectors.n_similarity(['computational', 'linguistics'], ['natural', 'language', 'processing'])
print(f"{similarity:.4f}")

In [None]:
similarity = word_vectors.n_similarity(['corpus', 'linguistics'], ['anthropology'])
print(f"{similarity:.4f}")

In [None]:
similarity = word_vectors.n_similarity(['comutational', 'linguistics'], ['fun'])
print(f"{similarity:.4f}")