# Bag of words from text 

In [1]:
import gensim
from gensim import corpora
from pprint import pprint

In [15]:
from gensim.utils import simple_preprocess
from smart_open import smart_open

In [2]:
# Two toy documents used to demonstrate the bag-of-words pipeline
my_docs = [
    "Who let the cats in?",
    "Why? Why? Why? Why?",
]

In [4]:
# Tokenise every document with gensim's simple_preprocess
text_list = list(map(simple_preprocess, my_docs))

In [5]:
# Tokenised documents: one list of lowercase tokens per entry of my_docs
text_list

[['who', 'let', 'the', 'cats', 'in'], ['why', 'why', 'why', 'why']]

In [6]:
# Build the token <-> id mapping from the tokenised documents
mydict = corpora.Dictionary(documents=text_list)

In [7]:
# token2id maps each unique token to its integer id
mydict.token2id

{'cats': 0, 'in': 1, 'let': 2, 'the': 3, 'who': 4, 'why': 5}

In [10]:
# Convert each tokenised document into its bag-of-words vector: a list of
# (token_id, count) pairs. mydict was already built from text_list, so
# allow_update is left at its default (False); passing True here would feed
# the same documents into the dictionary's statistics a second time.
corpus = [mydict.doc2bow(doc) for doc in text_list]

In [11]:
# Display the bag-of-words vectors: one list of (token_id, count) pairs per document
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(5, 4)]]

The `(0, 1)` in the list above means that the word with id=0 appears 1 time in the first document.
Likewise, the `(5, 4)` in the second list item means that the word with id=5 appears 4 times in the second document.

### To map token ids back to words (a bag of words keeps counts, not the original word order)

In [12]:
# Swap each token id for its word via the dictionary, keeping the counts
word_counts = [
    [(mydict[token_id], freq) for token_id, freq in bow_doc]
    for bow_doc in corpus
]
word_counts

[[('cats', 1), ('in', 1), ('let', 1), ('the', 1), ('who', 1)], [('why', 4)]]

# Bag of words from text file

In [16]:
class Bowcorpus(object):
    """Streamed bag-of-words corpus backed by a text file.

    Every line of the file at ``path`` is treated as one document. Iterating
    over an instance opens the file, tokenises each line with
    ``simple_preprocess`` and yields the result of ``dictionary.doc2bow`` for
    that line, so the file is consumed line by line.

    Parameters
    ----------
    path : str
        Location of the text file; passed unchanged to ``smart_open``.
    dictionary : object with a ``doc2bow(tokens, allow_update=...)`` method
        Typically a ``gensim.corpora.Dictionary``. It is called with
        ``allow_update=True``, so it is updated on EVERY pass over the
        corpus, not only the first one.
    """

    def __init__(self,path,dictionary):
        """Store the file location and the dictionary used for conversion."""
        self.path=path
        self.dictionary=dictionary

    def __iter__(self):
        """Yield one bag-of-words vector per line of the file.

        The file is opened afresh on each call, which makes the corpus
        re-iterable, and is closed when iteration ends or is abandoned.
        """
        # NOTE(review): the cell that serializes this corpus emits a
        # deprecation warning pointing at migration notes; it presumably
        # concerns the `smart_open` function (superseded by `smart_open.open`)
        # - confirm before upgrading the library.
        with smart_open(self.path) as handle:
            for line in handle:
                #tokenize the text
                tokenized_list=simple_preprocess(line)

                #bag of words for the words; also registers new tokens
                bow=self.dictionary.doc2bow(tokenized_list,allow_update=True)

                yield bow
            
            

In [23]:
# Start from an empty dictionary; it is filled while Bowcorpus reads the file
mydict = corpora.Dictionary()
bow_corpus = Bowcorpus('sample.txt', mydict)

In [24]:
# Print one bag-of-words vector for every line of the text file
for bow_doc in bow_corpus:
    print(bow_doc)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 3), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1), (17, 2), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1)]


# Saving and loading back the dictionary

In [25]:
# Save the dictionary (token <-> id mapping) to disk
mydict.save('mydict.dict')


In [26]:
# Write the bag-of-words corpus to disk in Matrix Market (.mm) format
corpora.MmCorpus.serialize(fname='bow_corpus.mm', corpus=bow_corpus)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [28]:
# Read the saved dictionary back from disk and display it
loaded_dict = corpora.Dictionary.load(fname='mydict.dict')
loaded_dict

<gensim.corpora.dictionary.Dictionary at 0x7f8db2ab10b8>

In [29]:
# Load the serialized corpus and print each document's vector
# (counts come back as floats, as the output below shows)
corpus = corpora.MmCorpus(fname='bow_corpus.mm')
for bow_doc in corpus:
    print(bow_doc)

[(0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0), (9, 3.0), (10, 1.0), (11, 1.0), (12, 1.0), (13, 1.0), (14, 2.0), (15, 1.0), (16, 1.0), (17, 2.0), (18, 1.0), (19, 1.0), (20, 1.0), (21, 1.0), (22, 1.0), (23, 1.0)]
