In [4]:
# Import raw comment data
import json
from id_hash import hash

with open('data/physics_podcast_1.json', 'r') as f:
    corpus = json.load(f)

messages = [c['message'] for c in corpus]
for c in corpus:
    c['id'] = hash(c['message'], c['message'], c['message'])
    c['likes'] = 0
print(len(corpus))

1323


In [18]:
# Analyze each comment
import re
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

nlp = spacy.load('en_core_web_lg')
nlp.add_pipe('spacytextblob')

def clean(dirty):
    clean = dirty.lower().strip()
    clean = re.sub("(\'s)", "", clean)
    clean = re.sub(r'<.*?( \/)*>', '', clean)
    return clean

def analyze(m):
    doc = nlp(m)
    topics = []
    ents = []
    sentiment = doc._.polarity
    
    for ent in doc.ents:
        ents.append(ent.text)
        topics.append((clean(ent.text), ent.start, ent.end, ent.label_))
        
    for chunk in doc.noun_chunks:
        start, end = chunk.start, chunk.end
        for token in chunk:
            if not token.is_stop:
                break
            start += 1 # Remove stop words from start
        span = doc[start:end]
        if (not span) | (clean(span.text) in ents):
            pass; # Avoid duplicates
        topics.append((clean(span.text), span.start, span.end, 'NOUN_CHUNK'))
        
    return topics, sentiment

for c in corpus:
    topics, sentiment = analyze(c['message'])
    c['topics'] = topics
    c['sentiment'] = round(sentiment, 3)
    
print(corpus[0])

{'time': -400, 'message': 'Love your channel these days ... killing it with the guests ...', 'id': '1e28d7861f918e66', 'likes': 0, 'topics': [('these days', 3, 5, 'DATE'), ('channel', 2, 3, 'NOUN_CHUNK'), ('', 8, 8, 'NOUN_CHUNK'), ('guests', 10, 11, 'NOUN_CHUNK')], 'sentiment': 0.5}


In [48]:
# Cluster comment topics
from collections import Counter

def cluster_comments(comments, user_labs=[]): 
    clustered = {} # by token
    for c in comments:
        if 'topics' not in c.keys():
            pass
        for token, start, end, label in c['topics']:
            edge = (c['id'], label, c['likes'], c['sentiment'])            
            if token in clustered.keys():
                clustered[token].append(edge)
            else:
                clustered[token] = [edge] # Sort by token

    # Group and Count
    named_entities = []
    noun_chunks = []
    frequency = {}
    for token in clustered:# {...token: [...(id, label, likes, sentiment)]}
        t = clustered[token] 
        n = len(t)
        if label in user_labs:
            label = user_labs[label]
        else:
            all_labels = [lab for (i, lab, lik, s) in t]
            label = Counter(all_labels).most_common(1)[0][0]
        edges = [(i, lik, s) for (i, lab, lik, s) in clustered[token]]
        if label == 'NOUN_CHUNK':
            noun_chunks.append((token, label, edges))
            frequency[token] = len(edges)
        else:
            named_entities.append((token, label, edges))
            frequency[token] = len(edges)
    return named_entities, noun_chunks, frequency

named_entities, noun_chunks, frequency = cluster_comments(corpus)

print(named_entities[0], noun_chunks[0], len(frequency))

('these days', 'DATE', [('1e28d7861f918e66', 0, 0.5), ('548aa7a35387fd32', 0, 0.25), ('79931d6d28e2b50b', 0, 0.375)]) ('channel', 'NOUN_CHUNK', [('1e28d7861f918e66', 0, 0.5), ('3ee7f3cf37c22024', 0, 0.0)]) 2299


In [49]:
# Parse named entities
def cluster_ne(named_entities, frequency, user_subs=[]):
    subs = {}
    subbed = {}
    def add(token, label, edges):
        if token in subbed.keys():
            new_edges = subbed[token][1] + edges
            subbed[token] = (label, new_edges)
        else:
            subbed[token] = (label, edges)

    named_entity_longest = sorted(named_entities, key=lambda e: len(e[0]), reverse=True)
    for token, label, edges in named_entity_longest:
        used = False
        if token in user_subs:
            repl = user_subs[token]
            subs[token] = repl  
            add(repl, label, edges)
            used = True
        elif label == 'PERSON':
            included = [(key, frequency[key]) for key in subbed.keys() if token in key]
            if len(included) > 0:
                (repl, freq) = sorted(included, key=lambda i: i[1], reverse=True)[0]
                collision = freq / frequency[token]
                if (freq > 2) & (collision > 0.3):
                    subs[token] = repl if not repl in subs.keys() else subs[repl]
                    add(subs[token], label, edges)
                    used = True
        elif label in ['CARDINAL', 'TIME', 'QUANTITY']:
            used = True # Remove
        if used == False:
            add(token, label, edges)
            
    named_entities_subbed = []
    for token in subbed:
        (label, edges) = subbed[token]
        named_entities_subbed.append((token, label, len(edges), edges))
    named_entities_sorted = sorted(named_entities_subbed, key=lambda c: c[2], reverse=True)
    return named_entities_sorted, subs
    
user_subs = {'eric': 'eric weinstein', 'max': 'max tegmark', 'gu': 'geometric unity'}
named_entities_sorted, ne_subs = cluster_ne(named_entities, frequency, user_subs)
print(named_entities_sorted[5])

('@church', 'ORG', 18, [('8ad2436dc945ea88', 0, 0.375), ('ad1a615ee1dc55bd', 0, 0.375), ('d425ff0c2880f7da', 0, 0.402), ('917cf84f93cad293', 0, 0.0), ('40d4379d6e02af0b', 0, 0.0), ('40d4379d6e02af0b', 0, 0.0), ('668d696dc728aebb', 0, 0.0), ('0b2db982638ae2f5', 0, -0.9), ('0b2db982638ae2f5', 0, -0.9), ('97bdaa337027d1c7', 0, 0.0), ('a5b7455a6685d7fe', 0, 0.0), ('fdc28f0a5fd1045d', 0, 0.0), ('3bce5158010f9b64', 0, 0.175), ('3fc86ff46d40c063', 0, 0.0), ('9d6ababab4d32e10', 0, 0.6), ('9d6ababab4d32e10', 0, 0.6), ('3b8419c73cbbfd9d', 0, -0.175), ('911733f598249d40', 0, 0.0)])


In [56]:
# Parse noun chunks
import html
from itertools import combinations

def cluster_nc(noun_chunks, ne_tokens, frequency, user_subs=[]):
    ngrams = {}
    for edge in noun_chunks:
        cleaned = edge[0]
        cleaned = html.unescape(cleaned)

        # Search all permutations of words
        words = cleaned.split(" ")
        local_ngrams = []            
        max_n = 4
        for n in range(max(len(words), max_n)):
            local_ngrams += [list(x) for x in combinations(words, n)]

        used = False    
        for n in local_ngrams:
            cleaned = " ".join(n).strip()
            if (cleaned not in ne_tokens) & (cleaned not in user_subs) & (cleaned != ""):
                if cleaned in ngrams:
                    ngrams[cleaned].append(edge)
                    used = True
        if not used:
            ngrams[cleaned] = [edge]            
    ngrams_by_length = sorted([(n, len(ngrams[n]), ngrams[n]) for n in ngrams], key=lambda i : len(i[0]), reverse=True)

    ng_clust = {}
    ngram_subs = {}
    for token, freq, edges in ngrams_by_length:
        included = [(c, len(ngrams[c])) for c in ng_clust if token in c]

        if len(included) == 0:
            ng_clust[token] = ngrams[token]
        else:
            # most similar !== most frequent
            most_similar = sorted(included, key=lambda i: i[1], reverse=True)[0]

            if (most_similar[1] > 2) & (most_similar[1] > len(ngrams[token])*0.6):
                ngram_subs[token] = most_similar[0]
                ng_clust[most_similar[0]] += ngrams[token]
            else:
                ng_clust[token] = ngrams[token]

    ng_topics = [(n, ng_clust[n][0][1], len(ng_clust[n][0][2]), ng_clust[n][0][2]) for n in ng_clust]
    ng_topics_sorted = sorted(ng_topics, key=lambda t : t[2], reverse=True)
    return ng_topics_sorted, ngram_subs
                    
ne_tokens = [token for (token, label, n, edges) in named_entities_sorted]
noun_chunks_sorted, ngram_subs = cluster_nc(noun_chunks, ne_tokens, frequency, ne_subs)
print(noun_chunks_sorted[10])

('god', 'NOUN_CHUNK', 17, [('21bb304bbc32c13d', 0, 0.0), ('8b83ab32a47f7e76', 0, 0.0), ('061ba9aa8d19dabc', 0, 0.0), ('ac16cc7d0e8b5862', 0, -0.188), ('d98d6c33f78113dd', 0, 0.0), ('d2029b7113a115a9', 0, 0.0), ('c2a9f53fae851fe5', 0, 0.292), ('f2d72379797356a2', 0, 0.183), ('c3315c3e5a40ec3b', 0, 0.0), ('c3315c3e5a40ec3b', 0, 0.0), ('48f4894a154dff49', 0, 0.8), ('48f4894a154dff49', 0, 0.8), ('c01be916764d2444', 0, 0.2), ('bb50b1c468270f1a', 0, 0.0), ('a803ef2f327a5fb9', 0, -0.25), ('e2bd2aa941ddf209', 0, -0.3), ('49967926f90c4dd1', 0, -0.312)])


In [73]:
# Sort
def topic_score(n, all_likes):
    return n + int(all_likes)*0.1

def sort(named_entities_sorted, noun_chunks_sorted, n_topics=200):
    all_ents = named_entities_sorted[:n_topics] + noun_chunks_sorted[:n_topics]
    all_parsed = []
    for (token, label, n, edges) in all_ents:
        commentIds = []
        all_likes = 0
        all_sentiment = {'pos': 0, 'neg': 0, 'neu': 0}
        for (commentId, likes, sentiment) in edges:
            commentIds.append(commentId)
            all_likes += likes
            if sentiment > 0: 
                all_sentiment['pos'] += 1
            elif sentiment < 0:
                all_sentiment['neg'] += 1
            else:
                all_sentiment['neu'] += 1
            score = topic_score(n, all_likes)
        all_parsed.append((token, label, score, all_likes, all_sentiment, commentIds))
    all_sorted = sorted(all_parsed, key=lambda e: e[2], reverse=True)
    return all_sorted[:n_topics]

all_sorted = sort(named_entities_sorted, noun_chunks_sorted)
for s in all_sorted[:10]:
    print(s[0:5])


('eric weinstein', 'PERSON', 215.0, 0, {'pos': 95, 'neg': 21, 'neu': 99})
('brian keating', 'PERSON', 123.0, 0, {'pos': 62, 'neg': 6, 'neu': 55})
('max tegmark', 'PERSON', 101.0, 0, {'pos': 43, 'neg': 16, 'neu': 42})
('physics', 'NOUN_CHUNK', 51.0, 0, {'pos': 21, 'neg': 5, 'neu': 25})
('universe', 'NOUN_CHUNK', 39.0, 0, {'pos': 12, 'neg': 5, 'neu': 22})
('geometric unity', 'PERSON', 30.0, 0, {'pos': 4, 'neg': 4, 'neu': 22})
('entropy', 'NOUN_CHUNK', 29.0, 0, {'pos': 11, 'neg': 3, 'neu': 15})
('time', 'NOUN_CHUNK', 29.0, 0, {'pos': 7, 'neg': 7, 'neu': 15})
('science', 'NOUN_CHUNK', 24.0, 0, {'pos': 8, 'neg': 5, 'neu': 11})
('@brett harris', 'PERSON', 22.0, 0, {'pos': 5, 'neg': 3, 'neu': 14})


In [86]:
import time

class Timer():
    
    def __init__(self):
        self.last_time = time.time()
        
    def step(self, message):
        this_time = time.time()
        print(f'{message}: {this_time - self.last_time}')
        self.last_time = this_time

In [87]:
timer = Timer()

with open('data/tsla_q420_earnings.json', 'r') as f:
    corpus = json.load(f)
timer.step("open file")

for c in corpus:
    c['id'] = hash(c['message'], c['message'], c['message'])
    c['likes'] = 0

    topics, sentiment = analyze(c['message'])
    c['topics'] = topics
    c['sentiment'] = round(sentiment, 3)
timer.step("analyze")
    
named_entities, noun_chunks, frequency = cluster_comments(corpus)
timer.step("cluster")

user_subs = {'eric': 'eric weinstein', 'max': 'max tegmark', 'gu': 'geometric unity'}
named_entities_sorted, ne_subs = cluster_ne(named_entities, frequency, user_subs)
timer.step("named entities")

ne_tokens = [token for (token, label, n, edges) in named_entities_sorted]
noun_chunks_sorted, ngram_subs = cluster_nc(noun_chunks, ne_tokens, frequency, ne_subs)
timer.step("noun chunks")

all_sorted = sort(named_entities_sorted, noun_chunks_sorted)
timer.step("sort")

for s in all_sorted[:10]:
    print(s[0:5])


open file: 0.021181821823120117
analyze: 22.96806311607361
cluster: 0.026554107666015625
named entities: 0.004396677017211914
noun chunks: 0.1861402988433838
sort: 0.004126787185668945
('elon', 'NOUN_CHUNK', 193.0, 0, {'pos': 65, 'neg': 16, 'neu': 112})
('tesla', 'NOUN_CHUNK', 189.0, 0, {'pos': 52, 'neg': 24, 'neu': 113})
('rob', 'PERSON', 161.0, 0, {'pos': 70, 'neg': 6, 'neu': 85})
('gme', 'ORG', 124.0, 0, {'pos': 15, 'neg': 16, 'neu': 93})
('tsla', 'NOUN_CHUNK', 91.0, 0, {'pos': 15, 'neg': 14, 'neu': 62})
('fsd', 'ORG', 87.0, 0, {'pos': 26, 'neg': 12, 'neu': 49})
('stock', 'NOUN_CHUNK', 82.0, 0, {'pos': 15, 'neg': 6, 'neu': 61})
('amc', 'ORG', 69.0, 0, {'pos': 7, 'neg': 13, 'neu': 49})
('audio', 'NOUN_CHUNK', 60.0, 0, {'pos': 36, 'neg': 10, 'neu': 14})
('wsb', 'NOUN_CHUNK', 54.0, 0, {'pos': 4, 'neg': 19, 'neu': 31})
