https://github.com/susanli2016/Machine-Learning-with-Python

In [None]:
import csv
%load_ext autoreload
%autoreload 2
from progress import ProgressTracker

In [None]:
import pandas as pd
import numpy as np

In [None]:
import spacy
from spacy.lang.en import English

# A blank English pipeline is all tokenize() needs: it only reads tokenizer
# output and lexical attributes (orth_, lower_, like_url). The earlier
# `spacy.load('en')` call loaded a full model and discarded the result, and
# the 'en' shortcut name no longer resolves on spaCy v3+, so it was dropped.
parser = English()

def tokenize(text):
    """Split *text* into normalised tokens for LDA.

    The text is run through the module-level spaCy ``parser``. Whitespace-only
    tokens are dropped, URL-like tokens are replaced by ``'URL'``, tokens that
    start with ``'@'`` are replaced by ``'SCREEN_NAME'``, and every other token
    is emitted in lower case.

    Returns:
        A list of token strings, in document order.
    """
    lda_tokens = []
    for tok in parser(text):
        surface = tok.orth_
        if surface.isspace():
            continue
        if tok.like_url:
            normalised = 'URL'
        elif surface.startswith('@'):
            normalised = 'SCREEN_NAME'
        else:
            normalised = tok.lower_
        lda_tokens.append(normalised)
    return lda_tokens

In [None]:
import nltk
nltk.download('wordnet')

In [None]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the base form WordNet's ``morphy`` finds for *word*.

    Falls back to *word* unchanged when ``morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize *word* with a fresh NLTK ``WordNetLemmatizer`` instance."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [None]:
for w in ['dogs', 'ran', 'discouraged']:
    print(w, get_lemma(w), get_lemma2(w))

In [None]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

In [None]:
def prepare_text_for_lda(text):
    """Turn raw *text* into the list of lemmas fed to the LDA dictionary.

    Steps, in order: tokenize with ``tokenize``; keep only tokens longer than
    four characters; drop tokens found in ``en_stop``; map each survivor
    through ``get_lemma``.

    Returns:
        A list of strings (possibly empty).
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in en_stop
    ]

In [None]:
import random
text_data = []
with open('data/dataset.csv') as f:
    for line in f:
        # Keep roughly 1% of the lines. The random draw happens first so the
        # (comparatively expensive) tokenisation only runs on lines that are
        # actually kept; one draw is still made per line, so the sample is the
        # same as when every line was tokenised up front.
        if random.random() <= .99:
            continue
        tokens = prepare_text_for_lda(line)
        print(tokens)
        text_data.append(tokens)

In [None]:
# Read at most maxTweets rows (header included) from the tweet CSV and collect
# column 1 into full_texts and column 2 into twitter_accounts.
maxTweets = 4000000
tracker = ProgressTracker(maxTweets)
full_texts = []
twitter_accounts = []
# newline='' is what the csv module expects, so that newlines embedded in
# quoted fields are parsed by the reader instead of being translated by open().
with open('data/tweetsDFMin.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0  # number of rows consumed so far (stays 0 for an empty file)
    # range() comes first so that zip stops before pulling an extra row from
    # the reader once the limit is reached.
    for row_index, row in zip(range(maxTweets), csv_reader):
        line_count = row_index + 1
        if row_index == 0:
            print(f'Column names are {", ".join(row)}')
        elif len(row) >= 3:
            # Only rows that have both columns are kept, so the two lists
            # always stay the same length; shorter rows are skipped.
            full_texts.append(row[1])
            twitter_accounts.append(row[2])
        # Progress is reported for every row, including skipped ones.
        tracker.update(row_index)
    print(f'Processed {line_count} lines.')
    print(f'Got {len(full_texts)} tweets')
    print(f'Got {len(twitter_accounts)} account ids')

In [None]:
tweetsDF = pd.DataFrame({"full_text": full_texts, "twitter_account": twitter_accounts})

In [None]:
tweetsDF.to_csv("data/tweetsDFMinReadable.csv", index=False)

In [None]:
tweetsDF = pd.read_csv("data/tweetsDFMinReadable.csv")

In [None]:
tweetsDF

In [None]:
# Treat each of the first maxTweets tweets as its own document.
maxTweets = 100000
# Positional slicing: `.loc[:maxTweets]` slices by label and includes the end
# label, which on a default 0..n-1 index yields maxTweets + 1 rows.
rawTweets = tweetsDF["full_text"].iloc[:maxTweets]

## Alternatively, treat each account's set of tweets as one document

In [None]:
# One document per account: all of an account's tweets joined with ' --\n '.
tweets_by_account = tweetsDF.groupby('twitter_account')['full_text']
rawTweets = tweets_by_account.apply(lambda texts: ' --\n '.join(texts))

In [None]:
# preppedTweets = [prepare_text_for_lda(text) for text in rawTweets]

In [None]:
# Run every raw document through prepare_text_for_lda, reporting progress to
# the tracker after each one.
preppedTweets = []
tracker = ProgressTracker(len(rawTweets), 5)
for doc_index, raw_text in enumerate(rawTweets):
    doc_tokens = prepare_text_for_lda(raw_text)
    preppedTweets.append(doc_tokens)
    tracker.update(doc_index)

In [None]:
text_data = preppedTweets

In [None]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

In [None]:
# Send log records at INFO level and above to the file 'lda_model.log' (not the
# terminal), each prefixed with a timestamp and level name.
# Approach per https://miningthedetails.com/blog/python/lda/GensimLDA/
import logging
logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
logging.info("hi")

In [None]:
corpus = [dictionary.doc2bow(text) for text in text_data]

In [None]:
import pickle
# Persist the bag-of-words corpus and the dictionary so later cells can reload
# them. The context manager guarantees the pickle file is flushed and closed.
with open('data/corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('data/dictionary.gensim')

### Try 20 topics

In [None]:
import gensim
NUM_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('LDAModels/model20.gensim')

In [None]:
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

In [None]:
ldamodel = gensim.models.ldamodel.LdaModel.load("LDAModels/model20.gensim")

In [None]:
i = 4
print(ldamodel.get_document_topics(corpus[i]))
print(text_data[i])

In [None]:
tweetsDF.loc[i,"full_text"]

In [None]:
for NUM_TOPICS in (5, 10):
    print(NUM_TOPICS)
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
    ldamodel.save(f'LDAModels/model{NUM_TOPICS}.gensim')

In [None]:
maxTweets

In [None]:
# Train and save LDA models for two document definitions:
#   doGroup=False -> each tweet is a document (first maxTweets tweets)
#   doGroup=True  -> each account's tweets, joined together, form one document
#                    (drawn from the first maxTweets*10 tweets)
maxTweets = 100000
for doGroup in (False, True):
    print(f'Start doGroup={doGroup}')
    if doGroup:
        rawTweets = tweetsDF[:maxTweets*10].groupby('twitter_account')['full_text'].apply(lambda x: ' --\n '.join(x))
    else:
        # Positional slice: `.loc[:maxTweets]` is label-inclusive and would
        # return maxTweets + 1 rows on a default 0..n-1 index.
        rawTweets = tweetsDF['full_text'].iloc[:maxTweets]

    preppedTweets = [prepare_text_for_lda(text) for text in rawTweets]

    # Dictionary and corpus are rebuilt for each document definition, so after
    # the loop they correspond to the last one processed (doGroup=True).
    text_data = preppedTweets
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]

    for NUM_TOPICS in (5, 10, 20):
        if not doGroup and NUM_TOPICS == 20:
            # The 20-topic model is only trained for the grouped documents.
            continue
        print(doGroup, NUM_TOPICS)
        ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
        ldamodel.save(f'LDAModels/model{NUM_TOPICS}.grouped={doGroup}.gensim')

In [None]:
maxTweets = 100000
tweetsDF[:maxTweets*10].groupby('twitter_account')['full_text'].apply(lambda x: "%s" % ' --\n '.join(x))

## Which model is best?

In [None]:
# Load the 5-topic model trained on ungrouped tweets and show the topic
# distribution it assigns to document i, next to that document's tokens.
# NOTE(review): `corpus` and `text_data` are whatever the most recently run
# cell left in memory. If that was the doGroup training loop, they hold the
# grouped documents and their dictionary, which would not match this
# ungrouped model's word ids — confirm before reading anything into the output.
ldamodel = gensim.models.ldamodel.LdaModel.load("LDAModels/model5.grouped=False.gensim")
i = 4
print(ldamodel.get_document_topics(corpus[i]))
print(text_data[i])

In [None]:
class LDATwitterModel(object):
    """Bundle a tweet DataFrame with its per-account grouped view and saved LDA models.

    Attributes are documented on ``__init__``.
    """

    def __init__(self, tweetsDF, maxTweets=None):
        """Store the first *maxTweets* rows and build the grouped view.

        Args:
            tweetsDF: DataFrame with ``'twitter_account'`` and ``'full_text'``
                columns.
            maxTweets: Number of leading rows to keep; ``None`` keeps all rows.

        Sets:
            self.tweetsDF: the truncated DataFrame.
            self.groupedDF: one row per account, with that account's texts
                joined by ``' --\\n '`` in the ``'full_text'`` column.
        """
        self.tweetsDF = tweetsDF[:maxTweets]
        self.groupedDF = (
            self.tweetsDF.groupby('twitter_account')['full_text']
            .apply(lambda texts: ' --\n '.join(texts))
            .reset_index()
        )

    def generateModel(self):
        """Placeholder; model generation is not implemented yet."""
        pass

    def exposeModel(self, NUM_TOPICS=5, grouped=False, tweetsDF=None):
        """Load and return a previously saved LDA model.

        Args:
            NUM_TOPICS: Topic count encoded in the saved model's filename.
            grouped: Whether to load the grouped-documents variant.
            tweetsDF: Currently unused; kept so existing call shapes still work.

        Returns:
            The model loaded from
            ``LDAModels/model{NUM_TOPICS}.grouped={grouped}.gensim``.
        """
        ldamodel = gensim.models.ldamodel.LdaModel.load(f"LDAModels/model{NUM_TOPICS}.grouped={grouped}.gensim")
        return ldamodel

In [None]:
tweetsDF[:maxTweets].groupby('twitter_account')['full_text'].apply(lambda x: "%s" % ' --\n '.join(x)).reset_index()

In [None]:
# Score an unseen document: preprocess it, convert it to bag-of-words with the
# current dictionary, and ask the current model for its topic distribution.
new_doc = prepare_text_for_lda(
    'Practical Bayesian Optimization of Machine Learning Algorithms'
)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
new_doc_topics = ldamodel.get_document_topics(new_doc_bow)
print(new_doc_topics)

In [None]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15)
ldamodel.save('LDAModels/model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

In [None]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('LDAModels/model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

### pyLDAvis

In [None]:
# Reload the persisted dictionary, corpus and 5-topic model for visualisation.
# The dictionary path matches where it is saved ('data/dictionary.gensim');
# the previous bare 'dictionary.gensim' pointed at a file this notebook never
# writes.
dictionary = gensim.corpora.Dictionary.load('data/dictionary.gensim')
with open('data/corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('LDAModels/model5.gensim')

In [None]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [None]:
# Visualise the 3-topic model. It is saved under LDAModels/, so load it from
# there (the previous bare 'model3.gensim' path is never written here).
lda3 = gensim.models.ldamodel.LdaModel.load('LDAModels/model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

In [None]:
# Visualise the 10-topic model. It is saved under LDAModels/, so load it from
# there (the previous bare 'model10.gensim' path is never written here).
lda10 = gensim.models.ldamodel.LdaModel.load('LDAModels/model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)

In [None]:
# Load the pickled grouped corpus and display it as the cell's result. The
# file is opened in a context manager so the handle is closed after loading.
# NOTE: only unpickle files you created yourself; pickle can execute code.
with open("data/NLTKprocessedGroupedCorpus.pkl", "rb") as grouped_corpus_file:
    grouped_corpus = pickle.load(grouped_corpus_file)
grouped_corpus