**https://www.machinelearningplus.com/nlp/lemmatization-examples-python/**

In [2]:
import pandas as pd
df = pd.read_csv('../data/brownfields_data_with_county_geoid.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
study_fields = ['Description/History']
study_data = df[study_fields].rename(columns = {'Description/History' : 'Description'})
study_data.head()

Unnamed: 0,Description
0,The subject property consists of approximately...
1,Historic land use of the Bridgepoint Business ...
2,52 vacant lots in the City of St. Louis that w...
3,52 vacant lots in the City of St. Louis that w...
4,52 vacant lots in the City of St. Louis that w...


In [4]:
study_data = study_data.fillna('')

In [5]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
cachedStopWords = stopwords.words("english")

  _nan_object_mask = _nan_object_array != _nan_object_array


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dineshmurali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dineshmurali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dineshmurali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
import sys
!{sys.executable} -m spacy download en

You should consider upgrading via the '/Users/dineshmurali/miniconda3/bin/python -m pip install --upgrade pip' command.[0m

[93m    Linking successful[0m
    /Users/dineshmurali/miniconda3/lib/python3.5/site-packages/en_core_web_sm
    -->
    /Users/dineshmurali/miniconda3/lib/python3.5/site-packages/spacy/data/en

    You can now load the model via spacy.load('en')



In [8]:
import spacy
from bs4 import BeautifulSoup
from textblob import TextBlob
import re


# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
nlp = spacy.load('en', disable=['parser', 'ner'])

sentence = "The striped bats are hanging on their feet for best"

# Parse the sentence using the loaded 'en' model object `nlp`
doc = nlp(sentence)

# Extract the lemma for each token and join
" ".join([token.lemma_ for token in doc])

'the strip bat be hang on -PRON- foot for good'

In [22]:
# My list of stop words. these were generated based on few iterations of the model
stop_list = ['site', 'property', 'use', 'approximately', 'building', 'build', 'inc', 'llc', 'mary']

# Updates spaCy's default stop words list with my additional words. 
nlp.Defaults.stop_words.update(stop_list)

def spacyLemmatize(text):
    #1 remove html tags
    # Initialize the BeautifulSoup object to strip off html tags     
    textNoHtml = BeautifulSoup(text, "html.parser").get_text()
    #2 remove numbers and punctuation
    # Use regular expressions to do a find-and-replace
    lettersOnly = re.sub("[^a-zA-Z]"," ",textNoHtml)
    # 3. Convert to lower case, split into individual words
    words = lettersOnly.lower().split()
    #3 remove stop words
    # In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(cachedStopWords)
    woStopWords = [word for word in words if not word in stops]

    doc = nlp(" ".join(woStopWords))
    
    # Pronouns (like "I" and "you" get lemmatized to '-PRON-', so I'm removing those.
    return ([token.lemma_ for token in doc if token.lemma_ != '-PRON-'])
    


lemmatized = study_data['Description'].apply(spacyLemmatize)

In [17]:
lemmatized[:20]

0     [subject, property, consist, approximately, ac...
1     [historic, land, use, bridgepoint, business, p...
2     [vacant, lot, city, st, louis, develop, reside...
3     [vacant, lot, city, st, louis, develop, reside...
4     [vacant, lot, city, st, louis, develop, reside...
5     [vacant, lot, city, st, louis, develop, reside...
6     [vacant, lot, city, st, louis, develop, reside...
7     [vacant, lot, city, st, louis, develop, reside...
8     [vacant, lot, city, st, louis, develop, reside...
9     [vacant, lot, city, st, louis, develop, reside...
10    [vacant, lot, city, st, louis, develop, reside...
11    [vacant, lot, city, st, louis, develop, reside...
12    [vacant, lot, city, st, louis, develop, reside...
13    [acre, parcel, contain, two, vacant, building,...
14    [acre, parcel, contain, two, vacant, building,...
15    [past, us, include, commerical, use, parking, ...
16    [property, always, use, commercially, historic...
17    [acre, property, formerly, use, store, tra

In [23]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Creates, which is a mapping of word IDs to words.
words = corpora.Dictionary(lemmatized)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in lemmatized]

In [24]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [20]:
from pprint import pprint

#1st iteration
pprint(lda_model.print_topics(num_words=10))

[(0,
  '0.118*"site" + 0.069*"use" + 0.032*"former" + 0.029*"station" + '
  '0.025*"currently" + 0.025*"operate" + 0.021*"city" + 0.021*"own" + '
  '0.019*"vacant" + 0.018*"owner"'),
 (1,
  '0.064*"site" + 0.030*"tank" + 0.027*"ust" + 0.026*"storage" + 0.026*"oil" + '
  '0.018*"remove" + 0.015*"petroleum" + 0.014*"material" + 0.013*"waste" + '
  '0.013*"underground"'),
 (2,
  '0.121*"property" + 0.024*"parcel" + 0.024*"portion" + 0.023*"subject" + '
  '0.021*"develop" + 0.021*"residential" + 0.019*"land" + 0.017*"least" + '
  '0.017*"commercial" + 0.017*"vacant"'),
 (3,
  '0.077*"school" + 0.033*"store" + 0.023*"center" + 0.021*"hospital" + '
  '0.019*"grocery" + 0.017*"community" + 0.015*"tract" + 0.013*"new" + '
  '0.011*"mary" + 0.011*"theater"'),
 (4,
  '0.052*"company" + 0.040*"operation" + 0.033*"industrial" + 0.027*"include" '
  '+ 0.026*"facility" + 0.022*"railroad" + 0.022*"storage" + 0.021*"yard" + '
  '0.019*"use" + 0.019*"manufacturing"'),
 (5,
  '0.106*"shop" + 0.077*"repa

In [25]:
from pprint import pprint

#1st iteration
pprint(lda_model.print_topics(num_words=10))

[(0,
  '0.068*"building" + 0.044*"use" + 0.034*"property" + 0.024*"commercial" + '
  '0.023*"residential" + 0.023*"currently" + 0.023*"build" + 0.019*"vacant" + '
  '0.017*"former" + 0.017*"station"'),
 (1,
  '0.033*"property" + 0.032*"city" + 0.020*"phase" + 0.016*"purchase" + '
  '0.015*"site" + 0.015*"county" + 0.011*"assessment" + 0.010*"esa" + '
  '0.010*"community" + 0.010*"redevelopment"'),
 (2,
  '0.032*"mr" + 0.030*"owner" + 0.024*"sell" + 0.019*"paper" + 0.016*"llc" + '
  '0.015*"ownership" + 0.012*"substance" + 0.012*"purchase" + 0.012*"inc" + '
  '0.011*"ravine"'),
 (3,
  '0.101*"shop" + 0.074*"repair" + 0.068*"auto" + 0.042*"automotive" + '
  '0.031*"sale" + 0.031*"service" + 0.026*"car" + 0.025*"garage" + '
  '0.022*"machine" + 0.022*"maintenance"'),
 (4,
  '0.103*"property" + 0.055*"site" + 0.029*"portion" + 0.028*"subject" + '
  '0.025*"develop" + 0.021*"least" + 0.019*"since" + 0.019*"occupy" + '
  '0.017*"historical" + 0.015*"land"'),
 (5,
  '0.049*"parcel" + 0.044*"s