In [1]:
import gensim
from gensim import corpora
from pprint import pprint

# Build a gensim Dictionary from a handful of in-memory sentences.
documents = [
    "The Saudis are preparing a report that will acknowledge that",
    "Saudi journalist Jamal Khashoggi's death was the result of an",
    "interrogation that went wrong, one that was intended to lead",
    "to his abduction from Turkey, according to two sources.",
]

documents_2 = [
    "One source says the report will likely conclude that",
    "the operation was carried out without clearance and",
    "transparency and that those involved will be held",
    "responsible. One of the sources acknowledged that the",
    "report is still being prepared and cautioned that",
    "things could change.",
]

# Whitespace-split every sentence into its list of words.
texts = [doc.split() for doc in documents]
print(texts)

# Assign an integer id to every distinct word seen in `texts`.
dictionary = corpora.Dictionary(texts)

# Summary line: number of unique tokens plus a short preview.
print(dictionary)

[['The', 'Saudis', 'are', 'preparing', 'a', 'report', 'that', 'will', 'acknowledge', 'that'], ['Saudi', 'journalist', 'Jamal', "Khashoggi's", 'death', 'was', 'the', 'result', 'of', 'an'], ['interrogation', 'that', 'went', 'wrong,', 'one', 'that', 'was', 'intended', 'to', 'lead'], ['to', 'his', 'abduction', 'from', 'Turkey,', 'according', 'to', 'two', 'sources.']]
Dictionary(33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)


https://www.machinelearningplus.com/nlp/gensim-tutorial/
Here texts is the list of lists of words from the documents.
dictionary holds the unique tokens (words) found in the documents, i.e. the vocabulary used for the bag of words.
dictionary.token2id provides an id for each token (word).
dictionary is a gensim Dictionary object.
It is also possible to update an existing dictionary to include new words.

In [2]:
# Show the word to id map: token2id is keyed by the token string and
# holds the integer id the dictionary assigned to that token.
print(dictionary.token2id)

{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32}


In [3]:
# A second batch of sentences whose words are added to the existing dictionary.
documents_2 = [
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Whitespace-split each new sentence into words.
texts_2 = [doc.split() for doc in documents_2]

# Extend `dictionary` in place with the new documents.
# NOTE: this mutates `dictionary`, so re-running the cell feeds the same
# documents to it again.
dictionary.add_documents(texts_2)
print(dictionary)

Dictionary(48 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)


In [4]:
from gensim.utils import simple_preprocess
from smart_open import smart_open
import os

# Create a gensim dictionary from a single text file.
# Each line of the file is tokenized with simple_preprocess(deacc=True) and
# streamed into the Dictionary. The `with` block makes sure the file handle
# is closed as soon as the Dictionary has been built (the original bare
# open() call left the handle open).
with open('sample.txt', encoding='utf-8') as sample_file:
    dictionary_file = corpora.Dictionary(
        simple_preprocess(line, deacc=True) for line in sample_file
    )

# Token to Id map
print(dictionary_file.token2id)

{'being': 0, 'clear': 1, 'confidence': 2, 'correct': 3, 'either': 4, 'headed': 5, 'hypothesis': 6, 'is': 7, 'of': 8, 'or': 9, 'prediction': 10, 'state': 11, 'that': 12, 'action': 13, 'best': 14, 'chosen': 15, 'comes': 16, 'course': 17, 'effective': 18, 'fidere': 19, 'from': 20, 'latin': 21, 'most': 22, 'the': 23, 'which': 24, 'word': 25, 'having': 26, 'in': 27, 'means': 28, 'one': 29, 'self': 30, 'therefore': 31, 'to': 32, 'trust': 33, 'arrogance': 34, 'believing': 35, 'comparison': 36, 'hubris': 37, 'something': 38, 'this': 39, 'unmerited': 40, 'are': 41, 'capable': 42, 'excessive': 43, 'not': 44, 'overconfidence': 45, 'someone': 46, 'they': 47, 'when': 48, 'any': 49, 'be': 50, 'belief': 51, 'can': 52, 'failure': 53, 'for': 54, 'regard': 55, 'succeeding': 56, 'without': 57, 'and': 58, 'as': 59, 'because': 60, 'fail': 61, 'fulfilling': 62, 'it': 63, 'lack': 64, 'may': 65, 'prophecy': 66, 'those': 67, 'try': 68, 'ability': 69, 'an': 70, 'have': 71, 'innate': 72, 'rather': 73, 'succeed':

In [5]:
# Bag-of-words corpus built from two short in-memory sentences.
my_docs = [
    "Who let the dogs out?",
    "Who? Who? Who? Who?",
]

# Turn each sentence into a list of tokens.
tokenized_list = [simple_preprocess(doc) for doc in my_docs]

# Start with an empty dictionary; allow_update=True lets doc2bow register
# each document's tokens in `mydict` while it builds that document's
# (token_id, count) pairs.
mydict = corpora.Dictionary()
mycorpus = []
for tokens in tokenized_list:
    mycorpus.append(mydict.doc2bow(tokens, allow_update=True))

pprint(mycorpus)
#> [[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(4, 4)]]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(4, 4)]]


The simple_preprocess utility of gensim creates the list of words from the document.
corpora.Dictionary can then be used to create the dictionary (i.e. words with an index).
mycorpus (a list of lists) gives the token index and the number of times it appears in each string.
word_counts gives each word and the number of times it appears, as a list of lists.

In [6]:
# Translate every (token_id, count) pair of the corpus back into a readable
# (word, count) pair by looking the id up in `mydict`.
word_counts = [
    [(mydict[token_id], count) for token_id, count in bow]
    for bow in mycorpus
]
pprint(word_counts)
#> [[('dogs', 1), ('let', 1), ('out', 1), ('the', 1), ('who', 1)], [('who', 4)]]

[[('dogs', 1), ('let', 1), ('out', 1), ('the', 1), ('who', 1)], [('who', 4)]]


The following shows how to create a bag of words from a very big file by reading it line by line instead of loading it all into memory at once.

In [None]:
from gensim.utils import simple_preprocess
from smart_open import smart_open
import nltk
# Fetch the NLTK 'stopwords' resource; only needs to succeed once per environment.
nltk.download('stopwords')  # run once
from nltk.corpus import stopwords
# English stop-word list.
# NOTE(review): `stop_words` is not referenced by any other code visible in
# this notebook - confirm whether it is still needed.
stop_words = stopwords.words('english')


class BoWCorpus(object):
    """Stream a text file as bag-of-words vectors, one vector per line.

    The file is read lazily, so it never has to fit in memory as a whole.

    Parameters
    ----------
    path : str
        Location of the text file; passed to ``smart_open`` and decoded
        with the ``'latin'`` encoding.
    dictionary : gensim.corpora.Dictionary
        Dictionary used to convert tokens to ids. It is updated in place
        while iterating, because ``doc2bow`` is called with
        ``allow_update=True``.
    """

    def __init__(self, path, dictionary):
        self.filepath = path
        self.dictionary = dictionary

    def __iter__(self):
        """Yield the ``doc2bow`` result for every line of the file, in order."""
        # The context manager closes the handle once the generator is
        # exhausted or discarded, instead of leaving it open.
        with smart_open(self.filepath, encoding='latin') as lines:
            for line in lines:
                # tokenize
                tokenized_list = simple_preprocess(line, deacc=True)

                # Build the bag of words; allow_update=True registers new
                # tokens in self.dictionary, so no separate merge into a
                # module-level dictionary is needed. Merging the dictionary
                # passed in by the caller with itself on every line would
                # re-add its own document statistics each time.
                yield self.dictionary.doc2bow(tokenized_list, allow_update=True)


# Start from an empty Dictionary; it is filled while the corpus is iterated.
mydict = corpora.Dictionary()

# Memory-friendly corpus: 'sample.txt' is only read when iteration begins.
bow_corpus = BoWCorpus('sample.txt', dictionary=mydict)

# Emit the (token_id, count) pairs for every line of the file.
for bow in bow_corpus:
    print(bow)

#> [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)]
#> [(12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]
#> ... truncated ...

In [None]:
# Persist a gensim dictionary and corpus, then read both back from disk.

# --- write ---
mydict.save('mydict.dict')                               # dictionary -> disk
corpora.MmCorpus.serialize('bow_corpus.mm', bow_corpus)  # corpus -> disk

# --- read ---
loaded_dict = corpora.Dictionary.load('mydict.dict')
corpus = corpora.MmCorpus('bow_corpus.mm')

# Show each document of the reloaded corpus.
for bow in corpus:
    print(bow)

In [None]:
# Create the TF-IDF matrix (corpus) in gensim
from gensim import models
import numpy as np

documents = [
    "This is the first line",
    "This is the second sentence",
    "This third document",
]

# Tokenize each sentence once and reuse the tokens for both the
# Dictionary and the bag-of-words corpus.
tokenized_docs = [simple_preprocess(line) for line in documents]
mydict = corpora.Dictionary(tokenized_docs)
corpus = [mydict.doc2bow(tokens) for tokens in tokenized_docs]

# Raw term frequencies: one [word, count] pair per token of each document.
for bow in corpus:
    print([[mydict[token_id], freq] for token_id, freq in bow])

# [['first', 1], ['is', 1], ['line', 1], ['the', 1], ['this', 1]]
# [['is', 1], ['the', 1], ['this', 1], ['second', 1], ['sentence', 1]]
# [['this', 1], ['document', 1], ['third', 1]]

# Fit the TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus, smartirs='ntc')

# TF-IDF weights, rounded to two decimals for display.
for weighted_doc in tfidf[corpus]:
    print([[mydict[token_id], np.around(weight, decimals=2)] for token_id, weight in weighted_doc])
# [['first', 0.66], ['is', 0.24], ['line', 0.66], ['the', 0.24]]
# [['is', 0.24], ['the', 0.24], ['second', 0.66], ['sentence', 0.66]]
# [['document', 0.71], ['third', 0.71]]

In [8]:
# Generate bigrams and trigrams
import gensim
from gensim import corpora
from pprint import pprint
from gensim.utils import simple_preprocess
from smart_open import smart_open
import os

# # Create gensim dictionary form a single text file
# dct = corpora.Dictionary(simple_preprocess(line, deacc=True) for line in open('sample.txt', encoding='utf-8'))


# # dataset = [wd for wd in dataset]

# # dct = corpora.Dictionary(dataset)
# # corpus = [dct.doc2bow(line) for line in dataset]

# # Build the bigram models
# bigram = gensim.models.phrases.Phrases(dct, min_count=3, threshold=10)

# # Construct bigram
# print(bigram[dct[0]])

# Sample sentences about the Khashoggi report.
# NOTE(review): `documents` is not used by the rest of this cell.
documents = ["The Saudis are preparing a report that will acknowledge that", 
             "Saudi journalist Jamal Khashoggi's death was the result of an", 
             "interrogation that went wrong, one that was intended to lead", 
             "to his abduction from Turkey, according to two sources."]

# NOTE(review): `documents_2` is not used by the rest of this cell either.
documents_2 = ["One source says the report will likely conclude that", 
                "the operation was carried out without clearance and", 
                "transparency and that those involved will be held", 
                "responsible. One of the sources acknowledged that the", 
                "report is still being prepared and cautioned that", 
                "things could change."]

# Two paragraphs about the French Revolution; this is the text the phrase
# models below are trained on.
documents_1 =["The French Revolution was a watershed event in modern European history that began in 1789 and ended in the late 1790s with the ascent of Napoleon Bonaparte. Although it failed to achieve all of its goals and at times degenerated into a chaotic bloodbath, the French Revolution played a critical role in shaping modern nations by showing the world the power inherent in the will of the people.",
              "During this period, French citizens razed and redesigned their country’s political landscape, uprooting centuries-old institutions such as absolute monarchy and the feudal system. The upheaval was caused by widespread discontent with the French monarchy and the poor economic policies of King Louis XVI, who met his death by guillotine, as did his wife Marie Antoinette. "]

# Whitespace-split each paragraph of `documents_1` into words.
texts = [doc.split() for doc in documents_1]

# Dictionary over the same tokens (kept for parity with the earlier cells).
dictionary = corpora.Dictionary(texts)

# Train the bigram phrase model on the tokenized paragraphs.
bigram = gensim.models.phrases.Phrases(texts, min_count=3, threshold=10)

# Apply it to the first paragraph.
first_doc = texts[0]
print(bigram[first_doc])

# Train the trigram model on the bigram-transformed corpus.
trigram = gensim.models.phrases.Phrases(bigram[texts], threshold=10)

# Apply bigram then trigram detection to the first paragraph.
print(trigram[bigram[first_doc]])


['The', 'French', 'Revolution', 'was', 'a', 'watershed', 'event', 'in', 'modern', 'European', 'history', 'that', 'began', 'in', '1789', 'and', 'ended', 'in', 'the', 'late', '1790s', 'with', 'the', 'ascent', 'of', 'Napoleon', 'Bonaparte.', 'Although', 'it', 'failed', 'to', 'achieve', 'all', 'of', 'its', 'goals', 'and', 'at', 'times', 'degenerated', 'into', 'a', 'chaotic', 'bloodbath,', 'the', 'French', 'Revolution', 'played', 'a', 'critical', 'role', 'in', 'shaping', 'modern', 'nations', 'by', 'showing', 'the', 'world', 'the', 'power', 'inherent', 'in', 'the', 'will', 'of', 'the', 'people.']
['The', 'French', 'Revolution', 'was', 'a', 'watershed', 'event', 'in', 'modern', 'European', 'history', 'that', 'began', 'in', '1789', 'and', 'ended', 'in', 'the', 'late', '1790s', 'with', 'the', 'ascent', 'of', 'Napoleon', 'Bonaparte.', 'Although', 'it', 'failed', 'to', 'achieve', 'all', 'of', 'its', 'goals', 'and', 'at', 'times', 'degenerated', 'into', 'a', 'chaotic', 'bloodbath,', 'the', 'French

The following can be done using gensim:

Generate a Dictionary and a Corpus
Create a Dictionary from a list of sentences
Create a Dictionary from one or more text files
Create a bag of words corpus in gensim
Create a bag of words corpus from external text file
Save a gensim dictionary and corpus to disk and load them back
Create the TFIDF matrix (corpus) in gensim
Use gensim downloader API to load datasets
Create bigrams and trigrams using Phraser models
Create topic models with LDA
Interpret the LDA Topic Model’s output
Create a LSI topic model using gensim
Train Word2Vec model using gensim
Update an existing Word2Vec model with new data
Extract word vectors using pre-trained Word2Vec and FastText models
Create document vectors using Doc2Vec
Compute similarity metrics like cosine similarity and soft cosine similarity
Summarize text documents