## Importing the Required Libraries

In [4]:
import nltk
#nltk.download("stopwords")
import numpy as np
import json
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

# pyLDAvis visualization
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning)



## Preparing Data

In [5]:
def load_data(file):
    """Parse the UTF-8 encoded JSON document at ``file`` and return the result."""
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def write_data(file, data):
    """Serialize ``data`` as JSON with a 4-space indent into ``file`` (UTF-8).

    The file is opened in write mode, so existing content is replaced.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)

In [6]:
# Bind the English stop-word list to `stopwords`.
# The corpus reader is imported under a private alias so this cell stays
# idempotent: assigning the list to `stopwords` replaces the reader imported at
# the top of the notebook, so reading `.words` from that same name on a second
# run would raise AttributeError.
from nltk.corpus import stopwords as _nltk_stopwords

stopwords = _nltk_stopwords.words("english")

In [8]:
# Load the JSON test data and print its values as a list.
data = load_data("data/testData.json")
print(list(data.values()))

['D. R. Horton, Inc. engages in the construction and sale of single-family housing. It operates through the following segments: Homebuilding and Financial Services. The Homebuilding segment includes the sub-segments East, Midwest, Southeast, South Central, Southwest and West regions. The Financial Services segment provides mortgage financing and title agency services to homebuyers in many of its homebuilding markets. The company was founded by Donald Ray Horton in 1978 and is headquartered in Arlington, TX.']


In [10]:
def lemmatization(texts, allowed_postags=("NOUN",)):  # only allow nouns
    """Reduce each text to the lemmas of its tokens with an allowed POS tag.

    Parameters
    ----------
    texts : iterable of str
        Raw documents to process.
    allowed_postags : collection of str, optional
        Coarse part-of-speech tags (compared against ``token.pos_``) to keep.
        Defaults to nouns only. The default is an immutable tuple so the
        shared default object can never be mutated between calls.

    Returns
    -------
    list of str
        One string per input text, in input order, holding the kept lemmas
        joined by single spaces.
    """
    # Parser and NER are disabled; only POS tags and lemmas are read below.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    # nlp.pipe streams all texts through the pipeline instead of invoking the
    # pipeline separately for every text.
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(texts)
    ]


In [13]:
# Lemmatize every loaded value, keeping only the default POS tags (nouns).
lemmatized_text = lemmatization(list(data.values()))
print(lemmatized_text)

['construction sale family housing segment homebuilding segment sub - segment region segment mortgage financing title agency service homebuyer homebuilding market company']


In [100]:
def gen_words(text):
    """Tokenize one or more documents with gensim's ``simple_preprocess``.

    Parameters
    ----------
    text : str, bytes, or iterable of str
        A single document, or a collection of documents (for example the list
        returned by ``lemmatization``).

    Returns
    -------
    list of list of str
        One token list per document. A single document yields a one-element
        outer list, so the result can always be passed to
        ``corpora.Dictionary``.
    """
    # A bare string is one document; wrap it so it is tokenized as a whole
    # rather than being iterated character by character.
    documents = [text] if isinstance(text, (str, bytes)) else text

    # Tokenize each document separately (deacc=True strips accent marks) so
    # that every document keeps its own token list.
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in documents]

# Tokenize the lemmatized text into word lists for the dictionary/corpus step.
data_words = gen_words(lemmatized_text)
print(data_words)

[['construction', 'sale', 'family', 'housing', 'segment', 'homebuilding', 'segment', 'sub', 'segment', 'region', 'segment', 'mortgage', 'financing', 'title', 'agency', 'service', 'homebuyer', 'homebuilding', 'market', 'company']]


In [102]:
# Map every unique token in the tokenized documents to an integer id.
id2word = corpora.Dictionary(data_words)
print(id2word)

# Convert each tokenized document to its bag-of-words form, a list of
# (token_id, count) pairs. A comprehension is used so no loop temporaries are
# left behind in the notebook's global namespace.
corpus = [id2word.doc2bow(doc_tokens) for doc_tokens in data_words]
print(corpus)


Dictionary(16 unique tokens: ['agency', 'company', 'construction', 'family', 'financing']...)
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 4), (13, 1), (14, 1), (15, 1)]]


In [103]:
# Train an LDA topic model on the bag-of-words corpus built above.
# NOTE(review): num_topics=30 is requested although the corpus printed above
# holds a single document with 16 unique tokens — confirm this is intended.
# Exact hyperparameter semantics are defined by gensim's LdaModel documentation.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=30,
                                           random_state=100,  # presumably a fixed seed for repeatable runs — confirm
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [105]:
# Presumably switches pyLDAvis to inline notebook rendering — see pyLDAvis docs.
pyLDAvis.enable_notebook()
# Prepare the visualization data from the trained model, corpus and dictionary.
# The meanings of mds="mmds" and R=30 are defined by pyLDAvis — TODO confirm
# they match the intended layout method and number of displayed terms.
vis = gensimvis.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
# Bare last expression so the notebook displays the prepared object.
vis

  default_term_info = default_term_info.sort_values(
