In [1]:
import pandas as pd
import requests
import io
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator with a fixed value so that
# code drawing from it produces the same numbers on every run.
np.random.seed(2018)


def reader(url, timeout=60):
    """Download a UTF-8 encoded CSV file over HTTP and parse it with pandas.

    Parameters
    ----------
    url : str
        Address of the CSV file to download.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``; bounds how long to wait for the server
        to connect / send data so a stalled connection cannot hang the kernel.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    UnicodeDecodeError
        If the response body is not valid UTF-8.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of handing an HTML error page
    # to read_csv, which would otherwise yield a nonsensical DataFrame.
    response.raise_for_status()
    csv_text = response.content.decode('utf-8')
    return pd.read_csv(io.StringIO(csv_text))

# Source files: plain-text exports of emails sent to Boulder City Council,
# one CSV per calendar year.
URL_2019 = "https://www-static.bouldercolorado.gov/docs/opendata/CouncilEmails_PlainText2019.csv"
URL_2020 = "https://www-static.bouldercolorado.gov/docs/opendata/CouncilEmails_PlainText2020.csv"

data_2019 = reader(URL_2019)
data_2020 = reader(URL_2020)

Note that this is the second iteration of this analysis. I built the first iteration of the LDA model in R; however, I ran into memory constraints on my personal computer. I switched to a Python environment so that I could use the free Google Colab tool and have access to cloud compute.

## Data Cleaning

The data cleaning we do is pretty simple.

1. We remove all "No Reply" emails. These are typically automated messages that aren't relevant to this study.
1. We remove all stop words, which do not provide value in a probabilistic model such as LDA.
1. We stem and lemmatize the documents.


In [11]:
# Preview the raw 2019 export to check the columns that were loaded.
display(data_2019)

Unnamed: 0,SentFrom,SentTo,SentCC,ReceivedDate,EmailSubject,PlainTextBody,MessageIdentifier
0,Petition For Boulder Homeless Services,Council,,2019-12-31 22:44:46.0000000 +00:00,"Petition For Homeless Services, Signature Numb...","Dear Boulder City Council members, We are writ...",AAMkADQ2ZmVlYWI4LWI1MmEtNDc1NC05ZjhkLTI5YTA3ZD...
1,Petition For Boulder Homeless Services,Council,,2019-12-31 20:20:04.0000000 +00:00,"Petition For Homeless Services, Signature Numb...","Dear Boulder City Council members, We are writ...",AAMkADQ2ZmVlYWI4LWI1MmEtNDc1NC05ZjhkLTI5YTA3ZD...
2,Petition For Boulder Homeless Services,Council,,2019-12-31 19:28:02.0000000 +00:00,"Petition For Homeless Services, Signature Numb...","Dear Boulder City Council members, We are writ...",AAMkADQ2ZmVlYWI4LWI1MmEtNDc1NC05ZjhkLTI5YTA3ZD...
3,Petition For Boulder Homeless Services,Council,,2019-12-31 15:37:14.0000000 +00:00,"Petition For Homeless Services, Signature Numb...","Dear Boulder City Council members, We are writ...",AAMkADQ2ZmVlYWI4LWI1MmEtNDc1NC05ZjhkLTI5YTA3ZD...
4,Petition For Boulder Homeless Services,Council,,2019-12-31 15:26:31.0000000 +00:00,"Petition For Homeless Services, Signature Numb...","Dear Boulder City Council members, We are writ...",AAMkADQ2ZmVlYWI4LWI1MmEtNDc1NC05ZjhkLTI5YTA3ZD...
...,...,...,...,...,...,...,...
7630,David Figueroa,"Ana Vangelena, Council",,2019-01-01 13:36:34.0000000 +00:00,Jon Benet Ramsey 2019,“My vision concerning Jon Benet Ramsey By Davi...,AAMkADQ2ZmVlYWI4LWI1MmEtNDc1NC05ZjhkLTI5YTA3ZD...
7631,Meagan Arango,Council,,2019-01-01 11:51:35.0000000 +00:00,Severe Weather Shelter - Support ASAP,"Greetings Council, and Happy New Year to you. ...",AAMkADQ2ZmVlYWI4LWI1MmEtNDc1NC05ZjhkLTI5YTA3ZD...
7632,Max Weller,Council,,2019-01-01 11:45:27.0000000 +00:00,STAY SOBER and stay alive outdoors!,"Dear Council members, Same message I've been p...",AAMkADQ2ZmVlYWI4LWI1MmEtNDc1NC05ZjhkLTI5YTA3ZD...
7633,Kenneth Flowe,Council,,2019-01-01 08:54:04.0000000 +00:00,Updated: 2019 Martin Luther King Day Talent Sh...,This is a note to invite you to a very excitin...,AAMkADQ2ZmVlYWI4LWI1MmEtNDc1NC05ZjhkLTI5YTA3ZD...


In [7]:

from nltk.corpus import stopwords
# NLTK's English stop-word list.
# NOTE(review): preprocess() below filters with gensim's STOPWORDS, and the
# only visible use of `stop` is in commented-out code — confirm whether this
# list is still needed elsewhere in the notebook.
stop = stopwords.words('english')
# Module-level English Snowball stemmer; lemmatize_stemming() reads this name.
stemmer = SnowballStemmer('english')
def cleaner(raw_data):
    """Filter the raw email table down to rows usable for text modelling.

    Rows whose ``SentFrom`` value is exactly ``"No Reply"`` are discarded, as
    are rows with a missing ``PlainTextBody``. The input frame is left
    untouched; a new DataFrame that keeps the original row labels is returned.
    """
    not_automated = raw_data["SentFrom"].ne("No Reply")
    return raw_data[not_automated].dropna(subset=["PlainTextBody"])

def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with a WordNet
    lemmatizer, and the resulting lemma is then run through the module-level
    ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Turn one document into a list of normalized tokens.

    The text is tokenized with gensim's ``simple_preprocess``; tokens that are
    in gensim's ``STOPWORDS`` set or are three characters or shorter are
    dropped, and each remaining token is passed through
    ``lemmatize_stemming``. Token order is preserved.
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3
    ]
    
    
#.apply(lambda x: [item for item in x if item not in stop])   
#data = cleaner( 
#    pd.concat([data_2019, data_2020]))

# Combine both years, then drop automated and empty messages.
# Each yearly frame carries its own 0..n-1 row labels, so a plain concat would
# produce duplicate labels; ignore_index=True gives every email a unique one.
# Resetting the index after cleaning keeps row labels equal to row positions,
# so label i in `data` / `processed_docs` is also element i of any list built
# by iterating over `processed_docs`.
data = cleaner(
    pd.concat([data_2019, data_2020], ignore_index=True)
).reset_index(drop=True)

# Tokenize, filter and stem every email body (one token list per email).
processed_docs = data['PlainTextBody'].map(preprocess)



In [12]:
# Show the token lists of the first ten emails.
processed_docs.head(10)

0    [dear, boulder, citi, council, member, write, ...
1    [dear, boulder, citi, council, member, write, ...
2    [dear, boulder, citi, council, member, write, ...
3    [dear, boulder, citi, council, member, write, ...
4    [dear, boulder, citi, council, member, write, ...
5    [dear, boulder, citi, council, member, write, ...
6    [dear, boulder, citi, council, member, write, ...
7    [dear, boulder, citi, council, member, write, ...
8    [dear, boulder, citi, council, member, write, ...
9    [hello, council, member, write, today, respons...
Name: PlainTextBody, dtype: object

## Generate Bag of Words


In [14]:
# Build the vocabulary: every distinct token in the corpus gets an integer id.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Print the first eleven (id, token) pairs as a sanity check.
# NOTE(review): this preview runs before filter_extremes below; gensim's docs
# say ids are compacted after pruning, so ids printed here may not match the
# ids used later — confirm.
for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 10:
        break

# Prune the vocabulary in place using the thresholds below
# (no_below=15, no_above=0.5, keep_n=100000).
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)


0 absorb
1 access
2 addit
3 address
4 adjud
5 affili
6 appoint
7 appropri
8 area
9 aris
10 base


In [17]:
# Convert every tokenized document into its bag-of-words form: a list of
# (token_id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

# Spot-check a single document by printing each token it contains together
# with how often that token occurs.
bow_doc_4310 = bow_corpus[4310]
for token_id, occurrences in bow_doc_4310:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     occurrences))

Word 38 ("fund") appears 4 time.


TypeError: 'NoneType' object is not subscriptable