# Introducción a gensim

Los ejemplos aquí mostrados fueron tomados de https://www.machinelearningplus.com/nlp/gensim-tutorial/

# How to create a Dictionary from a list of sentences?

In [1]:
import gensim
from gensim import corpora
from pprint import pprint



In [2]:
# Build a gensim Dictionary from sentences that are already in memory.
documents = [
    "The Saudis are preparing a report that will acknowledge that",
    "Saudi journalist Jamal Khashoggi's death was the result of an",
    "interrogation that went wrong, one that was intended to lead",
    "to his abduction from Turkey, according to two sources.",
]

documents_2 = [
    "One source says the report will likely conclude that",
    "the operation was carried out without clearance and",
    "transparency and that those involved will be held",
    "responsible. One of the sources acknowledged that the",
    "report is still being prepared and cautioned that",
    "things could change.",
]

# Whitespace-tokenize every sentence; str.split() already returns a list of words.
texts = [doc.split() for doc in documents]

# Assign an integer id to every unique token.
dictionary = corpora.Dictionary(texts)

# Summary: number of unique tokens plus a short preview of them.
print(dictionary)

Dictionary(33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)


In [3]:
# Show the word-to-id map: a dict keyed by token string whose values are the integer ids.
print(dictionary.token2id)

{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32}


In [4]:
# A second batch of sentences (this rebinds the name `documents_2`).
documents_2 = [
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Whitespace-tokenize the new sentences.
texts_2 = [doc.split() for doc in documents_2]

# Extend the existing dictionary in place with the tokens of the new documents.
dictionary.add_documents(texts_2)

# The summary should now report more unique tokens than before.
print(dictionary)

Dictionary(48 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)


# How to create a Dictionary from one or more text files?

In [5]:
from gensim.utils import simple_preprocess
from smart_open import smart_open
import os

# Create a gensim dictionary from a single text file.
# Each line is treated as one document and tokenized with simple_preprocess
# (deacc=True strips accent marks). The `with` block closes the file as soon as
# the Dictionary constructor has consumed every line; passing a bare open(...)
# into the generator would leave the handle open.
with open('sample.txt', encoding='utf-8') as sample_file:
    dictionary = corpora.Dictionary(
        simple_preprocess(line, deacc=True) for line in sample_file
    )

# Token to Id map
print(dictionary.token2id)

{'army': 0, 'china': 1, 'chinese': 2, 'force': 3, 'liberation': 4, 'of': 5, 'people': 6, 'recently': 7, 'recruited': 8, 'rocket': 9, 'tank': 10, 'technicians': 11, 'the': 12, 'think': 13, 'companies': 14, 'daily': 15, 'from': 16, 'on': 17, 'pla': 18, 'private': 19, 'reported': 20, 'saturday': 21, 'and': 22, 'appointment': 23, 'at': 24, 'ceremony': 25, 'experts': 26, 'founding': 27, 'hao': 28, 'letters': 29, 'other': 30, 'received': 31, 'science': 32, 'technology': 33, 'zhang': 34, 'according': 35, 'by': 36, 'defense': 37, 'national': 38, 'panel': 39, 'published': 40, 'report': 41, 'to': 42, 'as': 43, 'fellow': 44, 'his': 45, 'honored': 46, 'will': 47, 'conduct': 48, 'design': 49, 'fields': 50, 'into': 51, 'like': 52, 'members': 53, 'overall': 54, 'research': 55, 'serve': 56, 'which': 57, 'five': 58, 'for': 59, 'launching': 60, 'missile': 61, 'missiles': 62, 'network': 63, 'system': 64, 'years': 65, 'counterparts': 66, 'enjoy': 67, 'firms': 68, 'owned': 69, 'said': 70, 'same': 71, 'stat

In [6]:
class ReadTxtFiles(object):
    """Stream the tokenized lines of every file in a directory.

    Iterating yields one token list per line (as produced by
    ``simple_preprocess``), so a whole directory can be fed to
    ``corpora.Dictionary`` without loading the files into memory.
    The object can be iterated more than once; every pass re-reads the files.

    Parameters
    ----------
    dirname : str
        Path of the directory whose files are read.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            path = os.path.join(self.dirname, fname)
            # os.listdir also returns sub-directories (for example
            # `.ipynb_checkpoints`); open() would raise on them, so skip them.
            if not os.path.isfile(path):
                continue
            # `with` closes each file as soon as its lines are exhausted
            # (or when the generator itself is closed).
            with open(path, encoding='latin') as handle:
                for line in handle:
                    yield simple_preprocess(line)

# Directory whose files are streamed line by line through ReadTxtFiles.
path_to_text_directory = "lsa_sports_food_docs"

# Build the dictionary from the token lists yielded by the ReadTxtFiles iterable.
dictionary = corpora.Dictionary(ReadTxtFiles(path_to_text_directory))

In [7]:
# Display the token -> id mapping of the dictionary built from the directory.
dictionary.token2id

{'accompanying': 309,
 'according': 175,
 'achaya': 176,
 'across': 0,
 'activity': 1,
 'ad': 177,
 'added': 310,
 'advances': 69,
 'advantage': 486,
 'after': 96,
 'ago': 311,
 'aid': 386,
 'all': 143,
 'allow': 487,
 'along': 144,
 'already': 178,
 'also': 207,
 'alters': 488,
 'although': 2,
 'america': 466,
 'amongst': 268,
 'an': 97,
 'ancient': 179,
 'and': 3,
 'another': 269,
 'any': 98,
 'are': 4,
 'areas': 422,
 'around': 70,
 'as': 5,
 'association': 208,
 'associazione': 447,
 'at': 99,
 'attempts': 71,
 'available': 387,
 'back': 489,
 'badminton': 6,
 'baked': 423,
 'baking': 357,
 'ball': 58,
 'baseball': 59,
 'bases': 72,
 'bat': 60,
 'batsmen': 100,
 'batter': 73,
 'batting': 61,
 'be': 7,
 'beach': 8,
 'beans': 145,
 'became': 209,
 'because': 210,
 'become': 424,
 'been': 101,
 'between': 62,
 'birthplace': 180,
 'black': 235,
 'body': 236,
 'boiling': 312,
 'both': 401,
 'bounce': 490,
 'bounces': 491,
 'breakfast': 237,
 'breaks': 238,
 'broad': 358,
 'but': 281,
 '

# How to create a bag of words corpus in gensim?

In [8]:
# Two short example documents
my_docs = [
    "Who let the dogs out?",
    "Who? Who? Who? Who?",
]

# Tokenize every document
tokenized_list = list(map(simple_preprocess, my_docs))

# Start from an empty dictionary; doc2bow(..., allow_update=True) registers
# unseen tokens while converting each document to (token_id, count) pairs.
mydict = corpora.Dictionary()
mycorpus = [
    mydict.doc2bow(tokens, allow_update=True)
    for tokens in tokenized_list
]
pprint(mycorpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(4, 4)]]


In [12]:
# Swap every token id for its word so the counts are human-readable.
word_counts = [
    [(mydict[token_id], freq) for token_id, freq in bow]
    for bow in mycorpus
]
pprint(word_counts)

[[('dogs', 1), ('let', 1), ('out', 1), ('the', 1), ('who', 1)], [('who', 4)]]


# How to create a bag of words corpus from a text file?

In [13]:
from gensim.utils import simple_preprocess
from smart_open import smart_open
import nltk
# Fetch the NLTK 'stopwords' data package; it must be present before the words are loaded below.
nltk.download('stopwords')  # run once
from nltk.corpus import stopwords
# English stop words. NOTE(review): not referenced again in the visible cells — confirm it is used later.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fhbapto\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [14]:
class BoWCorpus(object):
    """Memory-friendly bag-of-words corpus that streams a text file line by line.

    Each line of the file is treated as one document. Iterating yields the
    bag-of-words (a list of ``(token_id, count)`` pairs) of every line without
    holding the whole file in memory.

    Parameters
    ----------
    path : str
        Location of the text file, opened with ``smart_open`` using the
        'latin' encoding.
    dictionary : gensim.corpora.Dictionary
        Dictionary used to map tokens to ids. It is updated in place
        (``allow_update=True``) with any token it has not seen yet.

    Notes
    -----
    The module-level ``mydict`` is additionally merged with ``dictionary``,
    but only when the two are different objects. Merging a dictionary with
    itself would add its own document statistics to itself once per line.
    """

    def __init__(self, path, dictionary):
        self.filepath = path
        self.dictionary = dictionary

    def __iter__(self):
        global mydict  # OPTIONAL, only if updating the source dictionary.
        # `with` closes the stream when the file is exhausted or the generator
        # is closed early, instead of leaking the handle.
        with smart_open(self.filepath, encoding='latin') as stream:
            for line in stream:
                # tokenize
                tokenized_list = simple_preprocess(line, deacc=True)

                # create bag of words (also registers unseen tokens)
                bow = self.dictionary.doc2bow(tokenized_list, allow_update=True)

                # update the source dictionary (OPTIONAL); skipped when the
                # corpus already writes straight into `mydict`
                if mydict is not self.dictionary:
                    mydict.merge_with(self.dictionary)

                # lazy return the BoW
                yield bow

In [15]:
# Start from an empty dictionary; it is filled in while the corpus is streamed.
mydict = corpora.Dictionary()

# Wrap the text file in a lazily evaluated corpus (memory friendly).
bow_corpus = BoWCorpus('sample.txt', dictionary=mydict)

# Stream the corpus and show the (token_id, count) pairs of every line.
for bow in bow_corpus:
    print(bow)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1)]
[(14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)]
[(5, 2), (12, 1), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]
[(3, 1), (9, 1), (12, 2), (18, 1), (22, 1), (26, 1), (32, 1), (33, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1)]
[(15, 1), (17, 1), (18, 1), (21, 1)]
[(3, 1), (9, 1), (14, 1), (16, 1), (19, 1), (22, 2), (26, 2), (32, 1), (33, 1), (34, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1)]
[(3, 1), (5, 2), (9, 1), (10, 1), (12, 1), (13, 1), (18, 1), (43, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1)]
[(12, 1), (22, 1), (33, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1)]
[(12, 3), (16, 1), (26, 1), (41, 1), (43, 1), (47, 1), (66, 1), (67, 1), (68, 1), (69, 1), (

# How to save a gensim dictionary and corpus to disk and load them back?

In [16]:
# Save the Dict and Corpus
mydict.save('mydict.dict')  # save dict to disk
# serialize() consumes the streamed corpus and writes it to 'bow_corpus.mm'.
corpora.MmCorpus.serialize('bow_corpus.mm', bow_corpus)  # save corpus to disk

In [17]:
# Restore the dictionary that was written to disk.
loaded_dict = corpora.Dictionary.load('mydict.dict')

# Re-open the serialized corpus and print every document;
# the counts come back as floats.
corpus = corpora.MmCorpus('bow_corpus.mm')
for doc in corpus:
    print(doc)

[(0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0), (9, 1.0), (10, 1.0), (11, 1.0), (12, 1.0), (13, 1.0)]
[(14, 1.0), (15, 1.0), (16, 1.0), (17, 1.0), (18, 1.0), (19, 1.0), (20, 1.0), (21, 1.0)]
[(5, 2.0), (12, 1.0), (22, 2.0), (23, 1.0), (24, 1.0), (25, 1.0), (26, 1.0), (27, 1.0), (28, 1.0), (29, 1.0), (30, 1.0), (31, 1.0), (32, 1.0), (33, 1.0), (34, 1.0)]
[(3, 1.0), (9, 1.0), (12, 2.0), (18, 1.0), (22, 1.0), (26, 1.0), (32, 1.0), (33, 1.0), (35, 1.0), (36, 1.0), (37, 1.0), (38, 1.0), (39, 1.0), (40, 1.0), (41, 1.0), (42, 1.0)]
[(15, 1.0), (17, 1.0), (18, 1.0), (21, 1.0)]
[(3, 1.0), (9, 1.0), (14, 1.0), (16, 1.0), (19, 1.0), (22, 2.0), (26, 2.0), (32, 1.0), (33, 1.0), (34, 1.0), (43, 1.0), (44, 1.0), (45, 1.0), (46, 1.0), (47, 1.0)]
[(3, 1.0), (5, 2.0), (9, 1.0), (10, 1.0), (12, 1.0), (13, 1.0), (18, 1.0), (43, 1.0), (47, 1.0), (48, 1.0), (49, 1.0), (50, 1.0), (51, 1.0), (52, 1.0), (53, 1.0), (54, 1.0), (55, 1.0), (56, 1.0), (57, 1.0)]
[(12, 1.0)