### Step 1. Getting the cleaned Twitter data from object storage and displaying the first 5 rows

In [1]:
import urllib.request
import pandas as pd 

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import nltk
from nltk.corpus import wordnet as wn

# Where the pre-cleaned tweet export lives, and the name of the local copy.
target_url = "https://swift-yeg.cloud.cybera.ca:8080/v1/AUTH_233e84cd313945c992b4b585f7b9125d/geeky-summit/tweets_cleaned.csv"
file_name = "tweets_cleaned1.csv"

# Fetch the CSV and store it locally under `file_name`.
urllib.request.urlretrieve(url=target_url, filename=file_name)

# Parse `created_at_date` as datetimes while reading, so it can be filtered by date.
tweets = pd.read_csv(file_name, parse_dates=["created_at_date"])

# Show the first five rows as the cell output.
tweets.head()

Unnamed: 0,created_at_date,hashtags_string,user_string,user_location,lang,longitude,latitude,name,screen_name,extended_tweet,extended_tweet_cleaned
0,2018-11-02 21:01:56,,Symin16,Toronto ✈ Calgary,en,,,♠,jessmayumba85,@Symin16 I’d like to know who,I’d like to know who
1,2018-11-02 21:02:01,,TwoCanSamAdams,YYC,en,,,hannahrae cuddleslug,thimblewad,@TwoCanSamAdams Legit. There are still chunks ...,Legit. There are still chunks in the sink an...
2,2018-11-02 21:02:05,job Calgary SupplyChain Hiring CareerArc,,Calgary,en,51.004583,-114.007914,TMJ - CAL Manuf Jobs,tmj_cal_manuf,Can you recommend anyone for this #job in #Cal...,"Can you recommend anyone for this in , AB? ..."
3,2018-11-02 21:02:10,,,🌎📱,en,,,Sunny Rai,TheSunsRay,Kids See Ghosts: love this track,Kids See Ghosts: love this track
4,2018-11-02 21:02:13,Calgary job,,Calgary,en,50.997882,-114.074005,TMJ-CAL Retail Jobs,tmj_cal_retail,"See our latest #Calgary, AB #job and click to ...","See our latest , AB and click to apply: bar..."


### Step 2. Preparing the data for LDA
#### Subsetting by day

In [2]:
# Keep only the tweets posted on November 5. The month is checked as well as
# the day-of-month: `dt.day == 5` on its own would also match the 5th of any
# other month present in the data.
tweets_subset_nov5 = tweets.loc[
    (tweets["created_at_date"].dt.month == 11)
    & (tweets["created_at_date"].dt.day == 5)
]

#### Splitting "extended_tweet_cleaned" column into tokens (words)

In [3]:
# Pull the cleaned tweet text out as a plain Python list, one entry per tweet.
# pandas reads empty CSV cells as NaN (a float), which a text tokenizer cannot
# process, so missing values are replaced with "" to keep every element a
# string while preserving the row order of `tweets_subset_nov5`.
data = tweets_subset_nov5["extended_tweet_cleaned"].fillna("").tolist()
print(data[:2])

['Here some the   :)  ', 'this is the most beautiful picture i have ever seen']


In [4]:
def sent_to_words(sentences, min_len=4, max_len=15):
    """Lazily tokenize each sentence with ``gensim.utils.simple_preprocess``.

    Parameters
    ----------
    sentences : iterable of str
        Raw texts, one document per element.
    min_len : int, optional
        Forwarded as ``min_len`` to ``simple_preprocess``. Defaults to 4,
        the value this notebook used originally.
    max_len : int, optional
        Forwarded as ``max_len`` to ``simple_preprocess``. Defaults to 15,
        the value this notebook used originally.

    Yields
    ------
    list of str
        The tokens of one sentence, in the same order as the input sentences.
    """
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(sentence, min_len=min_len, max_len=max_len)

# Run the tokenizer over every tweet and keep the results in a list
# (sent_to_words is a generator, so it has to be consumed once here).
data_tokens = [words for words in sent_to_words(data)]

# Peek at the first two tokenized tweets.
print(data_tokens[:2])

[['here', 'some'], ['this', 'most', 'beautiful', 'picture', 'have', 'ever', 'seen']]


#### Excluding stopwords and lemmatizing

In [5]:
# Make sure NLTK's 'stopwords' package is available locally.
nltk.download('stopwords')

# English stop words, held in a set for membership tests.
stop_words = {*nltk.corpus.stopwords.words('english')}
# Optional: extend the set with corpus-specific words, e.g.
# stop_words.add('calgary')
print(stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tatianameleshko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
{"you've", 'some', 'yourselves', 't', 'we', 'hadn', 'above', 'this', 'same', 'further', 'our', 'y', 'while', 'were', 'yourself', 'his', 'wasn', 'they', 'not', 'so', "you'd", 'are', 's', 'isn', 'over', 'for', 'or', "doesn't", "shouldn't", 'against', 'whom', 'does', 'down', 'each', 'nor', 'me', 'where', 'a', 'up', 'own', 'you', "don't", 'below', 'been', 'very', 'will', 'having', 'at', 'ourselves', 'has', 'shan', 'itself', 'yours', 'mustn', 'theirs', 'too', "you're", 'no', 'ain', 'under', 'on', 'when', 'mightn', 'than', 'more', "didn't", 'here', 'wouldn', 'your', 'am', 'do', 're', "won't", 'have', 'herself', 'from', 'by', 'doesn', 'i', 'all', 'won', 'about', 'was', 'did', 'as', 'of', 'such', 'only', 'but', 'doing', 'both', 'these', "wouldn't", 'most', 'ours', 'just', 'm', "hadn't", 'in', "you'll", "hasn't", "wasn't", "mustn't", 'because', 

In [6]:
# Fetch NLTK's 'wordnet' data package; `wn` (nltk.corpus.wordnet) is what this
# notebook uses for lemmatizing.
nltk.download('wordnet')

def get_lemma(word):
    """Return the WordNet base form of ``word``, or ``word`` itself if there is none.

    ``wn.morphy`` may give back ``None``; in that case the input is passed
    through unchanged so callers always receive a usable token.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tatianameleshko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Quick check of the lemmatizer on an inflected word.
print(get_lemma("goes"))

go


In [8]:
def tokens_to_lda(data_tokens):
    """Yield, for every tokenized document, its lemmatized non-stop-word tokens.

    Tokens found in the module-level ``stop_words`` set are dropped; each
    remaining token is replaced by ``get_lemma(token)``. Documents are
    produced lazily, in input order, and may come out empty.
    """
    for document in data_tokens:
        yield [get_lemma(word) for word in document if word not in stop_words]
# Clean every tokenized tweet and collect the results into a list.
tokens = [*tokens_to_lda(data_tokens)]

# Peek at the first two cleaned documents.
print(tokens[:2])

[[], ['beautiful', 'picture', 'ever', 'see']]


#### Creating dictionary and corpus objects for LDA model

In [9]:
from gensim import corpora

# Build the gensim dictionary from all cleaned documents.
dictionary = corpora.Dictionary(tokens)

# Convert each document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, tokens))

### Step 3. Building an LDA model with 7 topics and displaying the top 8 words for every topic

In [10]:
# Number of topics the model is asked to find.
NUM_TOPICS = 7

# Train the LDA model on the bag-of-words corpus with a fixed random_state.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=15,
    random_state=100,
)

# Print one entry per topic, each listing its 8 highest-weighted words.
topics = ldamodel.print_topics(num_words=8)
for topic in topics:
    print(topic)

(0, '0.026*"calgary" + 0.018*"click" + 0.017*"want" + 0.015*"work" + 0.012*"alberta" + 0.012*"latest" + 0.012*"apply" + 0.012*"like"')
(1, '0.013*"anyone" + 0.013*"would" + 0.009*"recommend" + 0.008*"check" + 0.008*"come" + 0.008*"make" + 0.007*"work" + 0.007*"calgary"')
(2, '0.008*"scotty" + 0.006*"support" + 0.006*"ever" + 0.005*"different" + 0.005*"say" + 0.005*"think" + 0.005*"need" + 0.005*"like"')
(3, '0.015*"request" + 0.013*"close" + 0.011*"good" + 0.009*"love" + 0.008*"people" + 0.007*"make" + 0.006*"open" + 0.006*"snow"')
(4, '0.013*"great" + 0.011*"request" + 0.009*"happy" + 0.009*"time" + 0.009*"open" + 0.008*"birthday" + 0.007*"close" + 0.007*"amaze"')
(5, '0.011*"know" + 0.010*"great" + 0.010*"interest" + 0.010*"could" + 0.008*"want" + 0.007*"make" + 0.007*"year" + 0.006*"years"')
(6, '0.011*"know" + 0.009*"right" + 0.008*"like" + 0.007*"need" + 0.006*"contact" + 0.006*"canadian" + 0.005*"question" + 0.005*"people"')


### Step 4. Visualizing the model

In [11]:
import pyLDAvis

# pyLDAvis 3.3 renamed the gensim adapter from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old
# one so the cell runs on either version.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# Prepare the visualization data from the trained model and render it as the
# cell output.
lda_display = gensimvis.prepare(ldamodel, corpus, dictionary, sort_topics=True)
pyLDAvis.display(lda_display)

#### Exercise: try different days, try modifying the stop words, the number of topics, and `min_len`/`max_len` in `gensim.utils.simple_preprocess`