https://github.com/susanli2016/Machine-Learning-with-Python

## Always

In [4]:
import csv
%load_ext autoreload
%autoreload 2
from progress import ProgressTracker

In [11]:
import pandas as pd
import numpy as np

In [61]:
import pickle

## LDA Preprocess

In [12]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [13]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/daveyproctor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [15]:
for w in ['dogs', 'ran', 'discouraged']:
    print(w, get_lemma(w), get_lemma2(w))

dogs dog dog
ran run ran
discouraged discourage discouraged


In [16]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daveyproctor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

## Skip

In [None]:
import random
text_data = []
with open('data/dataset.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .99:
            print(tokens)
            text_data.append(tokens)

In [None]:
maxTweets = 4000000
tracker = ProgressTracker(maxTweets)
full_texts = []
twitter_accounts = []
with open('data/tweetsDFMin.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row, line_count in zip(csv_reader, range(maxTweets)):
        if line_count == 0:
            print(f'Column names are {", ".join(row)}')
        else:
            try:
                full_texts.append(row[1])
            except IndexError:
                continue
            try:
                twitter_accounts.append(row[2])
            except IndexError:
                full_texts.pop()
                pass
        tracker.update(line_count)
    print(f'Processed {line_count} lines.')
    print(f'Got {len(full_texts)} tweets')
    print(f'Got {len(twitter_accounts)} account ids')

In [None]:
tweetsDF = pd.DataFrame({"full_text": full_texts, "twitter_account": twitter_accounts})

In [None]:
tweetsDF.to_csv("data/tweetsDFMinReadable.csv", index=False)

## Get data

In [71]:
tweetsDF = pd.read_csv("data/tweetsDFMinReadable.csv")

In [72]:
maxTweets = None
rawTweets = tweetsDF.loc[:maxTweets,"full_text"]

## Or, choosing to treat tweet sets as documents

In [None]:
rawTweets = tweetsDF.groupby('twitter_account')['full_text'].apply(lambda x: "%s" % ' --\n '.join(x))

In [None]:
# preppedTweets = [prepare_text_for_lda(text) for text in rawTweets]

## Too slow

In [30]:
test = rawTweets[0]
import json
json.loads(json.dumps(prepare_text_for_lda(text)))

['SCREEN_NAME',
 '1)some',
 'thought',
 'value',
 'reading',
 'fiction',
 'early',
 'book',
 'mostly']

In [31]:
tweetsDF.loc[0, "tweet_word_roots"] = json.dumps(prepare_text_for_lda(text))

## Slightly faster if you initialize column first, not much

In [38]:
tweetsDF["tweet_word_roots"] = np.nan

## Preprocess all

In [73]:
tracker = ProgressTracker(len(rawTweets), 1)
preppedTweets = []
for i, text in enumerate(rawTweets):
    preppedTweets.append(prepare_text_for_lda(text))
    tracker.update(i)

1 percent done. i: 29334. Time Elapsed: 30.657868146896362
2 percent done. i: 58668. Time Elapsed: 61.07813882827759
3 percent done. i: 88002. Time Elapsed: 93.2192280292511
4 percent done. i: 117336. Time Elapsed: 119.55691194534302
5 percent done. i: 146670. Time Elapsed: 148.64076280593872
6 percent done. i: 176004. Time Elapsed: 181.2862069606781
7 percent done. i: 205338. Time Elapsed: 208.6675248146057
8 percent done. i: 234672. Time Elapsed: 237.32731199264526
9 percent done. i: 264006. Time Elapsed: 269.7783181667328
10 percent done. i: 293340. Time Elapsed: 297.81162190437317
11 percent done. i: 322674. Time Elapsed: 328.09203696250916
12 percent done. i: 352007. Time Elapsed: 361.8807260990143
13 percent done. i: 381341. Time Elapsed: 394.57716608047485
14 percent done. i: 410675. Time Elapsed: 425.6127791404724
15 percent done. i: 440009. Time Elapsed: 456.46178698539734
16 percent done. i: 469343. Time Elapsed: 488.3932740688324
17 percent done. i: 498677. Time Elapsed: 522

In [74]:
pickle.dump(preppedTweets, open("data/LDApreprocessedTweets.pkl", "wb"))

In [76]:
preppedTweets1 = pickle.load(open("data/LDApreprocessedTweets.pkl", "rb"))
preppedTweets == preppedTweets1

True

In [77]:
tweetsDF["preppedTweets"] = np.nan

In [79]:
jsons = [json.dumps(preppedTweets[i]) if i < len(preppedTweets) else np.nan for i in range(len(tweetsDF))]

In [80]:
tweetsDF["preppedTweets"] = jsons

In [81]:
json.loads(tweetsDF.loc[0,"preppedTweets"])

['SCREEN_NAME',
 '1)some',
 'thought',
 'value',
 'reading',
 'fiction',
 'early',
 'book',
 'mostly']

In [82]:
tweetsDF.to_csv("data/tweetsDFMinReadableLDApreprocessed.csv", index=False)

In [83]:
tweetsDF = pd.read_csv("data/tweetsDFMinReadableLDApreprocessed.csv")

## Get groups in a good spot

In [89]:
sum(preppedTweets[1:10]

TypeError: unsupported operand type(s) for +: 'int' and 'list'

In [253]:
maxTweets = 500000
tweetsDF[:maxTweets].groupby('twitter_account')['full_text'].apply(lambda x: "%s" % ' --\n '.join(x)).reset_index()

Unnamed: 0,twitter_account,full_text
0,HuntingtonMayor,RT @medcouragement: Great Ted Talk by one of m...
1,Ianpgary,RT @NikoLusiani: What an opening salvo in the ...
2,IlhanMN,We are literally watching a manufactured crisi...
3,InhofePress,RT @tulsaworld: Sen. @JimInhofe defends milita...
4,JAHimes,@dyanna27 @RepJohnLarson @RepJoeCourtney @rosa...
5,JMVivancoHRW,El influyente congresista @RepMcGovern dice qu...
6,JPenaMelnyk,So honored to receive this award with @CherylK...
7,JRClemmons,I proudly joined the @TNJusticeCenter &amp; #h...
8,JacksonLeeTX18,This is wonderful news. I am pleased that #ju...
9,JacksonforIndy,#Composure #GodsGrace https://t.co/CnwJUZfpWs ...


In [273]:
tweetsDF.preppedTweets[0]

'["SCREEN_NAME", "1)some", "thought", "value", "reading", "fiction", "early", "book", "mostly"]'

In [276]:
tweetsDF.columns

Index(['full_text', 'twitter_account', 'preppedTweets'], dtype='object')

In [279]:
tracker = ProgressTracker(2065, 5)
def decodeFlatten(df):
    print(df.columns)
    jsonlsts = df.preppedTweets
    print(jsonlsts[0])
    print(df.preppedTweets[0])
    raise RuntimeError
    tracker.update()
#     print(jsonlsts)
    lsts = [json.loads(arr) for arr in jsonlsts] 
    flatlst = []
    for lst in lsts:
        for item in lst:
            flatlst.append(item)
#     print("flatlist", flatlst)
    return json.dumps(lst)

In [280]:
groupedDF = tweetsDF[:maxTweets].groupby('twitter_account').aggregate(decodeFlatten).reset_index()
groupedDF

Index(['full_text', 'twitter_account', 'preppedTweets'], dtype='object')


AttributeError: 'Series' object has no attribute 'columns'

In [None]:
tweetsDF

In [244]:
groupedDF.preppedTweets[0]

'["video", "SCREEN_NAME", "white", "house", "morning", "look"]'

In [141]:
groupedDF.to_csv("data/tweetsDFMinReadableLDApreprocessedGrouped.csv", index=False)
groupedDF = pd.read_csv("data/tweetsDFMinReadableLDApreprocessedGrouped.csv")

## Done preprocessing ^

# LDA

In [162]:
import gensim

In [148]:
from gensim import corpora

In [163]:
import pyLDAvis.gensim

In [149]:
# Set up log to terminal, per https://miningthedetails.com/blog/python/lda/GensimLDA/
import logging
logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [227]:
tweetsDF = pd.read_csv("data/tweetsDFMinReadableLDApreprocessed.csv")

In [225]:
groupedDF = pd.read_csv("data/tweetsDFMinReadableLDApreprocessedGrouped.csv")

In [226]:
groupedDF

Unnamed: 0,twitter_account,preppedTweets
0,1globalneighbor,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
1,1victorgomez,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
2,4budcook,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
3,73eldridge,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
4,7BOOMERESIASON,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
5,92ndKSHouseDist,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
6,ABrindisiNY,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
7,AC2016RNC,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
8,AD26Mathis,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
9,ADonovanCDFI,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."


In [151]:
open("hi1")

FileNotFoundError: [Errno 2] No such file or directory: 'hi1'

In [154]:
pickle.load

<function _pickle.load(file, *, fix_imports=True, encoding='ASCII', errors='strict')>

In [155]:
corpora.Dictionary.load

<bound method SaveLoad.load of <class 'gensim.corpora.dictionary.Dictionary'>>

'data/tweetsDFMinReadableLDApreprocessed.csv'

In [187]:
import sys

In [218]:
class LDATwitterModel(object):
    def __init__(self, tweetsDF=None, docNum=None, NUM_TOPICS=5, grouped=False, passes=15):
        """ tweetsDF should have a preppedTweets field """
        
        helperPrefix = "LDAHelpers/"
        helperSuffix = f".docNum={docNum}.grouped={grouped}"
        self.modelPrefix = "LDAModels/"
        self.modelSuffix = helperSuffix + f".NUM_TOPICS={NUM_TOPICS}.gensim"
        self.modelPath = self.modelPrefix + "model" + self.modelSuffix
        
        # Params
        self.NUM_TOPICS = NUM_TOPICS
        self.passes = passes
        
        # Get documents
        sys.stdout.write("Getting Documents...")
        if tweetsDF is None:
            Grouped = "Grouped"
            empty = ""
            dfPath = f"data/tweetsDFMinReadableLDApreprocessed{Grouped if grouped else empty}.csv"
            sys.stdout.write(f"path {dfPath}...")
            self.tweetsDF = pd.read_csv(dfPath)
        else:
            self.tweetsDF = tweetsDF
        sys.stdout.write("DF loaded...")
        self.text_data = [json.loads(arr) for arr in self.tweetsDF.loc[:docNum,"preppedTweets"]]
        print("Done.")

        sys.stdout.write("Getting Dictionary...")
        try:
            dictPath = helperPrefix + "dictionary" + helperSuffix + ".gensim"
            self.dictionary = corpora.Dictionary.load(dictPath)
            print(f"Reloaded from {dictPath}")
        except FileNotFoundError:
            self.dictionary = corpora.Dictionary(self.text_data)
            self.dictionary.save(dictPath)
        print("Done.")
        
        sys.stdout.write("Getting Corpus...")
        try:
            corpPath = helperPrefix + "corpus" + helperSuffix + ".pkl"
            self.corpus = pickle.load(open(corpPath, "rb"))
            print(f"Reloaded from {corpPath}")
        except FileNotFoundError:
            self.corpus = [self.dictionary.doc2bow(text) for text in self.text_data]
            pickle.dump(corpus, open(corpPath, 'wb'))
        print("Done.")
        
    def train(self):
        try:
            self.ldamodel = gensim.models.ldamodel.LdaModel.load(self.modelPath)
            print(f"Found pretrained model {self.modelPath}")
            return
        except FileNotFoundError:
            pass
        self.ldamodel = gensim.models.ldamodel.LdaModel(self.corpus, num_topics = self.NUM_TOPICS, id2word=self.dictionary, passes=self.passes)
        self.ldamodel.save(self.modelPath)

#         lda_display = pyLDAvis.gensim.prepare(self.ldamodel, self.corpus, self.dictionary, sort_topics=False)
#         pyLDAvis.display(lda_display)

In [219]:
model = LDATwitterModel(tweetsDF=groupedDF, docNum=None, grouped=True)

Getting Documents...DF loaded...Done.
Getting Dictionary...Done.
Getting Corpus...Done.


In [220]:
model.train()

In [221]:
topics = model.ldamodel.print_topics(num_words=4)

In [222]:
topics

[(0, '0.167*"morning" + 0.167*"SCREEN_NAME" + 0.167*"house" + 0.167*"white"'),
 (1, '0.167*"morning" + 0.167*"SCREEN_NAME" + 0.167*"house" + 0.167*"white"'),
 (2, '0.167*"morning" + 0.167*"SCREEN_NAME" + 0.167*"house" + 0.167*"white"'),
 (3, '0.167*"morning" + 0.167*"SCREEN_NAME" + 0.167*"house" + 0.167*"white"'),
 (4, '0.439*"video" + 0.211*"look" + 0.139*"white" + 0.091*"house"')]

In [223]:
model.tweetsDF

Unnamed: 0,twitter_account,preppedTweets
0,1globalneighbor,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
1,1victorgomez,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
2,4budcook,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
3,73eldridge,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
4,7BOOMERESIASON,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
5,92ndKSHouseDist,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
6,ABrindisiNY,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
7,AC2016RNC,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
8,AD26Mathis,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
9,ADonovanCDFI,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."


In [224]:
groupedDF

Unnamed: 0,twitter_account,preppedTweets
0,1globalneighbor,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
1,1victorgomez,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
2,4budcook,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
3,73eldridge,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
4,7BOOMERESIASON,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
5,92ndKSHouseDist,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
6,ABrindisiNY,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
7,AC2016RNC,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
8,AD26Mathis,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."
9,ADonovanCDFI,"[""video"", ""SCREEN_NAME"", ""white"", ""house"", ""mo..."


## Scrap

In [85]:
text_data = preppedTweets

In [47]:
corpus = [dictionary.doc2bow(text) for text in text_data]

In [48]:
corpus[0:2] # document is tuples of word index, count. bag of words.

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(1, 1),
  (7, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1)]]

In [52]:
dictionary.doc2bow(["hello", "hello"])

[(2460, 2)]

In [None]:
import pickle
pickle.dump(corpus, open('data/corpus.pkl', 'wb'))
dictionary.save('data/dictionary.gensim')

### Try 20 topics

In [None]:
import gensim
NUM_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('LDAModels/model20.gensim')

In [None]:
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

In [None]:
ldamodel = gensim.models.ldamodel.LdaModel.load("LDAModels/model20.gensim")

In [None]:
i = 4
print(ldamodel.get_document_topics(corpus[i]))
print(text_data[i])

In [None]:
tweetsDF.loc[i,"full_text"]

In [None]:
for NUM_TOPICS in (5, 10):
    print(NUM_TOPICS)
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
    ldamodel.save(f'LDAModels/model{NUM_TOPICS}.gensim')

In [None]:
maxTweets

In [None]:
maxTweets = 100000
for doGroup in (False, True):
    print(f'Start doGroup={doGroup}')
    if doGroup:
        # 500 docs by long
        rawTweets = tweetsDF[:maxTweets*10].groupby('twitter_account')['full_text'].apply(lambda x: "%s" % ' --\n '.join(x))
    else:
        # maxTweets docs by short
        rawTweets = tweetsDF.loc[:maxTweets,"full_text"]

    preppedTweets = []
    for i, text in enumerate(rawTweets):
        preppedTweets.append(prepare_text_for_lda(text))

    text_data = preppedTweets
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]

    for NUM_TOPICS in (5, 10, 20):
        if doGroup == False and NUM_TOPICS == 20:
            break
        print(doGroup, NUM_TOPICS)
        ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
        ldamodel.save(f'LDAModels/model{NUM_TOPICS}.grouped={doGroup}.gensim')

In [None]:
maxTweets = 100000
tweetsDF[:maxTweets*10].groupby('twitter_account')['full_text'].apply(lambda x: "%s" % ' --\n '.join(x))

## Which model is best?

In [None]:
ldamodel = gensim.models.ldamodel.LdaModel.load("LDAModels/model5.grouped=False.gensim")
i = 4
print(ldamodel.get_document_topics(corpus[i]))
print(text_data[i])

In [9]:
ldamodel = LDATwitterModel(tweetsDF)

In [10]:
len(corpus)

NameError: name 'corpus' is not defined

In [None]:
new_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

In [None]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15)
ldamodel.save('LDAModels/model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

In [None]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('LDAModels/model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

### pyLDAvis

In [None]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('data/corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('LDAModels/model5.gensim')

In [None]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [None]:
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

In [None]:
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)

In [None]:
pickle.load(open("data/NLTKprocessedGroupedCorpus.pkl", "rb"))