In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import timeit
import spacy
import bson
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, HdpModel, LdaModel, LdaMulticore
from nltk.corpus import stopwords
import helper as he
# English stop-word list from NLTK; passed to he.remove_stopwords in the
# preprocessing cell below.
stop_words = stopwords.words('english')

## Load data

In [4]:
# Event whose article dump is processed by this notebook.
# Other event names used with it: "farmers", "gst", "demonetization".
event_name = "aadhar"
print('Loading Documents...')

documents = []

# The dump is read from '<event_name>-all.bson', resolved against the current
# working directory, and decoded in one go into a list of records.
bson_path = event_name + '-all.bson'
with open(bson_path, 'rb') as bson_file:
    data = bson.decode_all(bson_file.read())

# Only the 'text' field of each record is kept for topic modelling.
documents.extend(record['text'] for record in data)

Loading Documents...


## Document Preprocessing

In [5]:
print('Simple Preprocessing')

# Raw-string patterns: '\S' and '\s' are not valid Python string escapes, so
# the non-raw literals used previously trigger an invalid-escape warning on
# recent Python versions. Compiling once also avoids re-parsing the pattern
# on every call.
_AT_PHRASE_RE = re.compile(r'\S*@\S*\s?')
_WHITESPACE_RE = re.compile(r'\s+')


def _clean_document(text):
    """Apply the simple text clean-up to a single document.

    Steps, in order:
      1. drop every whitespace-delimited phrase containing '@'
         (plus one trailing whitespace character, if present);
      2. collapse each run of whitespace into a single space;
      3. remove apostrophes (').

    Args:
        text: raw document text (str).

    Returns:
        The cleaned text as a new string.
    """
    text = _AT_PHRASE_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text)
    return text.replace("'", "")


# The comprehension builds a fresh list, so `documents` is left untouched
# without needing an explicit copy.
data = [_clean_document(doc) for doc in documents]

Simple Preprocessing


In [6]:
# Turn the cleaned documents into token lists.
# NOTE(review): he.sent_to_words is defined in the local helper module;
# presumably it yields one list of words per document - confirm there.
data_words = list(he.sent_to_words(data))

print('Building Bigrams')
# Phrase (bigram) detection is trained on the full token lists.
# The higher the threshold, the fewer phrases are formed.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

print('Removing Stopwords')
# Stop words are dropped using the NLTK list loaded with the imports.
data_words_nostops = he.remove_stopwords(data_words, stop_words)

print('Forming Bigrams')
# The trained phraser is applied to the stop-word-free token lists.
data_words_bigrams = he.make_bigrams(data_words_nostops, bigram_mod)

print('Lemmatizing Data')
# Part-of-speech tags handed to the lemmatization helper as `allowed_postags`.
lemma_pos_tags = ['NOUN', 'ADJ', 'VERB', 'ADV']
data_lemmatized = he.lemmatization(
    data_words_bigrams, allowed_postags=lemma_pos_tags)

Building Bigrams
Removing Stopwords
Forming Bigrams
Lemmatizing Data


## Dictionary

In [7]:
# Vocabulary size is governed by `keep_n`.
# It has to be tuned by hand at this stage: try several vocabulary sizes and
# see which works best. Roughly 8-10% of the number of documents has worked
# well so far:
#   - Digital India: vocab size 1000 (12412 documents)
#   - GST:           vocab size 1500 (approx. 15k documents)

print('Creating Dictionary')
# Map every token of the lemmatized documents to an integer id.
id2word = corpora.Dictionary(data_lemmatized)

# Prune the vocabulary: drop tokens appearing in fewer than `no_below`
# documents or in more than the `no_above` fraction of documents, then keep
# at most the `keep_n` most frequent remaining tokens.
vocab_filter = {
    'no_below': 5,
    'no_above': 0.95,
    'keep_n': 1800,
    'keep_tokens': None,
}
id2word.filter_extremes(**vocab_filter)

Creating Dictionary


In [8]:
# The lemmatized documents serve as the corpus.

print('Converting corpus using dictionary')
# Bag-of-words (term/document frequency) form of every document, built
# with the filtered dictionary.
corpus = list(map(id2word.doc2bow, data_lemmatized))

# Persist the lemmatized texts, the dictionary and the corpus together in
# one pickle file named after the event.
pickle_path = 'corp_' + event_name + '.pkl'
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump((data_lemmatized, id2word, corpus), pickle_file)

Converting corpus using dictionary
