### Step 1. Getting the cleaned Twitter data from object storage and displaying the first 5 rows

In [27]:
import urllib.request
import ast
import pandas as pd 

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


# Remote location of the cleaned tweet export and the name of the local copy.
target_url = "https://swift-yeg.cloud.cybera.ca:8080/v1/AUTH_233e84cd313945c992b4b585f7b9125d/geeky-summit/tweets_cleaned.csv"
file_name = "tweets_cleaned1.csv"

# Download the CSV into the working directory; urlretrieve hands back the
# local path it wrote to (the same value as `file_name`).
local_csv, _headers = urllib.request.urlretrieve(target_url, file_name)

# Load the file, parsing `created_at_date` as datetimes so the `.dt` accessor
# can be used on it later, then preview the first five rows.
tweets = pd.read_csv(local_csv, parse_dates=["created_at_date"])
tweets.head()

Unnamed: 0,created_at_date,hashtags_string,user_string,user_location,lang,longitude,latitude,name,screen_name,extended_tweet,extended_tweet_cleaned
0,2018-11-02 21:01:56,,Symin16,Toronto ✈ Calgary,en,,,♠,jessmayumba85,@Symin16 I’d like to know who,I’d like to know who
1,2018-11-02 21:02:01,,TwoCanSamAdams,YYC,en,,,hannahrae cuddleslug,thimblewad,@TwoCanSamAdams Legit. There are still chunks ...,Legit. There are still chunks in the sink an...
2,2018-11-02 21:02:05,job Calgary SupplyChain Hiring CareerArc,,Calgary,en,51.004583,-114.007914,TMJ - CAL Manuf Jobs,tmj_cal_manuf,Can you recommend anyone for this #job in #Cal...,"Can you recommend anyone for this in , AB? ..."
3,2018-11-02 21:02:10,,,🌎📱,en,,,Sunny Rai,TheSunsRay,Kids See Ghosts: love this track,Kids See Ghosts: love this track
4,2018-11-02 21:02:13,Calgary job,,Calgary,en,50.997882,-114.074005,TMJ-CAL Retail Jobs,tmj_cal_retail,"See our latest #Calgary, AB #job and click to ...","See our latest , AB and click to apply: bar..."


### Step2 Prepare dataset for LDA
#### Subsetting by day

In [81]:
# Boolean mask: True for tweets whose timestamp has day-of-month equal to 5.
is_day_5 = tweets["created_at_date"].dt.day == 5
tweets_subset_nov5 = tweets.loc[is_day_5]

#### Splitting "extended_tweet_cleaned" column into tokens (words)

In [103]:
# One cleaned tweet text per row of the day's subset, in row order.
# A text cell that is empty in the CSV is read back by pandas as NaN (a float),
# and the tokenizer used in the next step expects strings. Replacing missing
# values with "" keeps one entry per tweet (so positions still line up with
# `tweets_subset_nov5`) and simply yields an empty token list for those rows.
data = tweets_subset_nov5["extended_tweet_cleaned"].fillna("").tolist()
print(data[:2])

['Here some the   :)  ', 'this is the most beautiful picture i have ever seen']


In [102]:
def sent_to_words(sentences, min_len=4, max_len=15):
    """Lazily tokenize each sentence with ``gensim.utils.simple_preprocess``.

    Parameters
    ----------
    sentences : iterable of str
        Texts to tokenize, one document per item.
    min_len : int, optional
        Minimum token length, forwarded to ``simple_preprocess``. Defaults to 4,
        the value this notebook originally hard-coded.
    max_len : int, optional
        Maximum token length, forwarded to ``simple_preprocess``. Defaults to 15,
        the value this notebook originally hard-coded.

    Yields
    ------
    list of str
        The tokens of one sentence, in the same order as ``sentences``.
    """
    for sentence in sentences:
        # Exposing the length bounds as parameters lets the exercise at the end
        # of the notebook vary them without editing this function.
        yield gensim.utils.simple_preprocess(sentence, min_len=min_len, max_len=max_len)

# Materialize the generator: one token list per tweet, in the same order as `data`.
data_tokens = [words for words in sent_to_words(data)]

# Peek at the first two tokenized tweets.
print(data_tokens[:2])

[['here', 'some'], ['this', 'most', 'beautiful', 'picture', 'have', 'ever', 'seen']]


#### Excluding stopwords and lemmatizing

In [93]:
# Import nltk in this cell: the notebook otherwise imports it only in a later
# cell, so on a fresh kernel (Restart & Run All) this cell would fail with
# NameError before reaching that import.
import nltk

# Fetch the stop-word lists, then build a set for fast membership tests.
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))
# Optionally extend the set with domain-specific words, e.g.:
#stop_words.add('calgary')
print(stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tatianameleshko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
{'wasn', 'those', 'too', 'ain', 'whom', 'o', 'don', 'couldn', 'by', 'both', "haven't", 'do', 'her', 'while', 'of', 'below', 'such', 'these', 'aren', 'being', 'what', 'we', "hasn't", 'she', 'our', 'you', 'them', 'that', 'was', "isn't", 'doing', 'out', 'as', 'own', 'at', 'm', "you'll", 'any', 'hadn', 'this', "doesn't", 'is', 'yourselves', 'should', 'shouldn', "wasn't", 'did', 'and', 'their', 'herself', "couldn't", 'between', 'myself', 'for', 'in', 'd', 'himself', 'if', "won't", 'all', 'wouldn', 'most', 'a', 're', 'some', "aren't", "she's", 'doesn', 'it', 'off', 'there', 'no', "you'd", 'theirs', 'him', 'where', "that'll", 'again', 'other', 'haven', "hadn't", 'can', 'until', 'ours', "shan't", 'or', 'won', 'needn', 'then', 'am', "mustn't", 'weren', 'to', 'with', "weren't", 'who', 'nor', 'the', 'on', 'over', 'itself', 'very', 'your', "needn't

In [85]:
import nltk
# Fetch the WordNet corpus; `wn.morphy` (used by get_lemma below) looks words up in it.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

def get_lemma(word):
    """Return the WordNet base form of ``word``.

    Falls back to ``word`` unchanged when ``wn.morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tatianameleshko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [86]:
# Quick sanity check of the lemmatizer on an inflected verb.
print(get_lemma("goes"))

go


In [101]:
def tokens_to_lda(data_tokens):
    """Yield, for each tokenized document, its tokens ready for LDA.

    Tokens found in the module-level ``stop_words`` set are dropped; each
    remaining token is replaced by ``get_lemma(token)``. Token order within a
    document and document order are preserved.
    """
    for doc in data_tokens:
        yield [get_lemma(tok) for tok in doc if tok not in stop_words]
# Run the stop-word / lemma pipeline over every tokenized tweet.
tokens = [doc for doc in tokens_to_lda(data_tokens)]
print(tokens[:2])

[[], ['beautiful', 'picture', 'ever', 'see']]


#### Creating dictionary and corpus objects for LDA model

In [95]:
from gensim import corpora

# Build the vocabulary from the prepared token lists.
dictionary = corpora.Dictionary(tokens)
# Convert every document to its bag-of-words form via `dictionary.doc2bow`.
corpus = list(map(dictionary.doc2bow, tokens))

### Step3 Building LDA model with 8 topics and displaying top 8 words for every topic

In [90]:
NUM_TOPICS = 8
NUM_WORDS = 8       # how many top words to print per topic
RANDOM_STATE = 100  # fixed seed: LDA training is stochastic, so without it
                    # the topics change on every run

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=RANDOM_STATE,
)

# Other LdaModel options worth experimenting with: update_every, chunksize,
# alpha='auto', per_word_topics=True.

# Each entry is a (topic_id, "weight*word + ...") pair.
topics = ldamodel.print_topics(num_words=NUM_WORDS)
for topic in topics:
    print(topic)

(0, '0.023*"love" + 0.019*"know" + 0.011*"best" + 0.009*"would" + 0.009*"come" + 0.008*"need" + 0.007*"today" + 0.006*"find"')
(1, '0.010*"want" + 0.010*"good" + 0.009*"calgary" + 0.009*"click" + 0.009*"like" + 0.008*"work" + 0.008*"details" + 0.007*"scotty"')
(2, '0.012*"request" + 0.010*"open" + 0.010*"year" + 0.008*"like" + 0.008*"back" + 0.008*"need" + 0.007*"street" + 0.006*"iphone"')
(3, '0.008*"think" + 0.008*"know" + 0.007*"make" + 0.007*"right" + 0.007*"like" + 0.005*"thank" + 0.005*"today" + 0.005*"event"')
(4, '0.012*"vote" + 0.008*"right" + 0.008*"left" + 0.007*"wrong" + 0.007*"america" + 0.006*"good" + 0.006*"city" + 0.006*"ever"')
(5, '0.011*"happy" + 0.010*"make" + 0.007*"life" + 0.007*"right" + 0.006*"join" + 0.006*"still" + 0.005*"better" + 0.005*"long"')
(6, '0.019*"calgary" + 0.017*"latest" + 0.014*"anyone" + 0.014*"great" + 0.014*"alberta" + 0.013*"opening" + 0.012*"work" + 0.011*"apply"')
(7, '0.020*"close" + 0.015*"request" + 0.011*"concern" + 0.009*"complete" + 0

### Step4 Visualizing the model 

In [91]:
import pyLDAvis

# pyLDAvis 3.x renamed the gensim adapter module from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old
# one so the cell runs on either version.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# sort_topics=False is passed so the topics are not re-sorted by the visualizer.
lda_display = gensimvis.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

#### Exercise: try different days, and try modifying the stopwords, the number of topics, and `min_len` / `max_len` in `gensim.utils.simple_preprocess`