In [None]:
import json
!pip install jsonlines
import jsonlines
import random

In [None]:
# Tweetset documentation
#https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/MBOJNS

In [None]:
# Pull every parsed record from the JSON Lines dump into an in-memory list.
tweets = []
with jsonlines.open('tweets-senate-010120-present.jsonl', 'r') as reader:
    tweets.extend(reader)

In [None]:
len(tweets)

In [None]:
# Draw a 20% sample without replacement. A dedicated, seeded generator keeps
# the sample identical across kernel restarts: later cells look rows up by
# hard-coded position (e.g. 17, 1138, 2912), which only makes sense when the
# sample is fixed. Using random.Random avoids touching the global RNG state.
SAMPLE_SEED = 2020
sample = random.Random(SAMPLE_SEED).sample(tweets, len(tweets) // 5)

In [None]:
# Fields copied from each raw tweet. A value of None means "copy this
# top-level field as-is"; a list names the sub-fields to lift out of that
# nested object and store flat on the reduced tweet.
fields_to_keep = dict(
    full_text=None,
    retweet_count=None,
    created_at=None,
    user=['name', 'screen_name'],
)

In [None]:
# Shrink every sampled tweet down to the fields listed in fields_to_keep.
# Sub-fields of nested objects are stored flat, under the sub-field's own name.
tweetset = []
for raw in sample:
    slim = {}
    for field, subfields in fields_to_keep.items():
        if subfields:
            for sub in subfields:
                slim[sub] = raw[field][sub]
        else:
            slim[field] = raw[field]
    tweetset.append(slim)

In [None]:
len(tweetset)

In [None]:
with open('./senate-tweetset-sample-2020.json', 'w') as f:
    json.dump(tweetset, f)

In [None]:
# Number of tweets containing the exact (case-sensitive) string 'COVID'.
sum('COVID' in tweet['full_text'] for tweet in tweetset)

In [None]:
# Same count, but case-insensitive: the text is upper-cased before matching.
sum('COVID' in tweet['full_text'].upper() for tweet in tweetset)

In [None]:
!pip install -U spacy

In [None]:
!python -m spacy download en_core_web_md

In [None]:
import en_core_web_md

In [None]:
# Load the medium English pipeline exactly once. The cell previously loaded
# it twice (en_core_web_md.load() followed by spacy.load of the same model),
# discarding the first result immediately and paying the load time twice.
import spacy

nlp = spacy.load("en_core_web_md")

In [None]:
%time docs = list(nlp.pipe([tweet['full_text'] for tweet in tweetset]))

In [None]:
len(docs)

In [None]:
docs[0]

In [None]:
docs[1]

In [None]:
[token for token in docs[1] if token.like_url]

In [None]:
# Inspecting entities
# https://spacy.io/api/annotation#named-entities

In [None]:
# Map each distinct entity string to its label. When the same string occurs
# more than once, the label from its last occurrence wins.
entities = {}
for doc in docs:
    for ent in doc.ents:
        entities[ent.text] = ent.label_

In [None]:
entities

In [None]:
# Calculating top persons discussed
from collections import Counter

# One count per PERSON entity occurrence, across all parsed tweets.
person_count = Counter(
    ent.text
    for doc in docs
    for ent in doc.ents
    if ent.label_ == 'PERSON'
)

In [None]:
person_count.most_common(50)

In [None]:
# Calculating average tweet length in tokens: len(doc) counts the spaCy tokens
# in a tweet (not the characters per word), so despite its name `word_lengths`
# holds one token count per tweet.
word_lengths = [len(doc) for doc in docs]

In [None]:
sum(word_lengths) / len(word_lengths)

In [None]:
# Parts of speech
# https://spacy.io/api/annotation#pos-tagging

In [None]:
from collections import defaultdict

# Lemma frequencies grouped by part-of-speech tag: pos_dict[tag][lemma] -> count.
pos_dict = defaultdict(Counter)
for token in (tok for doc in docs for tok in doc):
    pos_dict[token.pos_][token.lemma_] += 1

In [None]:
pos_dict['ADJ'].most_common(50)

In [None]:
# Mapping ents to accounts

In [None]:
# For every account name, count the PERSON entities mentioned in its tweets.
# docs was built from tweetset in order, so the two sequences pair up one-to-one.
favorite_persons = defaultdict(Counter)
for tweet, doc in zip(tweetset, docs):
    account = tweet['name']
    people = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
    if people:
        favorite_persons[account].update(people)

In [None]:
# Most frequent users
# Tally how many sampled tweets each account name contributed.
users = Counter(tweet['name'] for tweet in tweetset)

In [None]:
users.most_common(10)

In [None]:
favorite_persons['Elizabeth Warren'].most_common(10)

In [None]:
favorite_persons['Senator Ted Cruz'].most_common(10)

In [None]:
# Similarity vectors
banana = nlp('banana')
orange = nlp('orange')
apple = nlp('apple')
dog = nlp('dog')
cat = nlp('cat')

In [None]:
banana.similarity(orange)

In [None]:
banana.similarity(dog)

In [None]:
banana.similarity(apple)

In [None]:
dog.similarity(cat)

In [None]:
dog.similarity(nlp('wolf'))

In [None]:
# Removing stop words, white space, punctuation, and links
docs[0]

In [None]:
[token for token in docs[0] if not token.is_stop and not token.is_space and not token.is_punct and not token.like_url]

In [None]:
def remove_stops(doc):
    """Return the tokens of ``doc`` worth keeping, in their original order.

    A token is dropped when any of its ``is_stop``, ``is_space``,
    ``is_punct`` or ``like_url`` flags is truthy.
    """
    def is_noise(token):
        return token.is_stop or token.is_space or token.is_punct or token.like_url

    return [token for token in doc if not is_noise(token)]

In [None]:
# Think of similarity as a rough measure of how interchangeable two words are: their vectors are close when the words tend to appear in similar contexts (it is a similarity score, not a literal probability)

In [None]:
import numpy as np

In [None]:
# Get the vector of a document = average of the token vectors
# We can use this to get the vector of the tokens minus stopwords, etc.
def vectorize_without_stops(doc):
    """Average the vectors of the tokens that survive ``remove_stops``.

    Returns a 1-D array with the same shape as ``doc.vector``. When no token
    survives filtering (e.g. a tweet that is only a link), the result is an
    all-NaN vector of that same shape rather than the scalar NaN (plus a
    "Mean of empty slice" RuntimeWarning) that ``np.mean`` yields for an
    empty array. Existing ``np.isnan(np.sum(vec))`` checks still detect it,
    and every entry of ``doc_vecs`` now has a consistent shape.
    """
    kept = remove_stops(doc)
    if not kept:
        return np.full_like(doc.vector, np.nan)
    vectors = np.array([token.vector for token in kept])
    return np.mean(vectors, axis=0)

In [None]:
doc_vecs = [vectorize_without_stops(doc) for doc in docs]

In [None]:
from numpy.linalg import norm
from numpy import inner
import pandas as pd

In [None]:
def cosine_sim(doc1, doc2):
    """Cosine similarity between two vectors, with degenerate inputs mapped to 0.

    Returns 0 when either vector contains NaN (previously only ``doc2`` was
    checked, so swapping the arguments changed the result and pairwise
    matrices came out asymmetric) or when either vector has zero norm
    (previously a division by zero producing NaN and a RuntimeWarning).
    Otherwise returns ``inner(doc1, doc2) / (norm(doc1) * norm(doc2))``.
    """
    if np.isnan(np.sum(doc1)) or np.isnan(np.sum(doc2)):
        return 0
    denominator = norm(doc1) * norm(doc2)
    if denominator == 0:
        return 0
    return inner(doc1, doc2) / denominator

In [None]:
tweet_df = pd.DataFrame.from_records(tweetset)

In [None]:
warren_df = tweet_df.loc[tweet_df['name'].str.contains('Warren')]

In [None]:
warren_df

In [None]:
warren_df.loc[17].full_text

In [None]:
# Score every tweet vector against one reference tweet (position 17 in doc_vecs).
# A Counter is used so that most_common() can rank the scores afterwards.
target = doc_vecs[17]
sim_scores = Counter(
    {idx: cosine_sim(target, vec) for idx, vec in enumerate(doc_vecs)}
)

In [None]:
scores = sim_scores.most_common(10)

In [None]:
tweet_df.loc[[score[0] for score in scores]]

In [None]:
# What about an average vector for each Senator -- can we plot this?
!pip install sklearn
from sklearn.decomposition import PCA

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt

In [None]:
top_tweets = tweet_df.sort_values(by='retweet_count', ascending=False).head(100)

In [None]:
# Sort the index so we can match it up to the list of parsed documents
top_tweets = top_tweets.sort_index()

In [None]:
top_tweet_vecs = {i: doc_vec for i, doc_vec in enumerate(doc_vecs) if i in top_tweets.index}

In [None]:
# Pairwise cosine similarity between the selected tweet vectors, stored as
# nested dicts: sim_scores[row_index][column_index] -> score.
sim_scores = {
    row_id: {
        col_id: cosine_sim(row_vec, col_vec)
        for col_id, col_vec in top_tweet_vecs.items()
    }
    for row_id, row_vec in top_tweet_vecs.items()
}

In [None]:
sim_matrix = pd.DataFrame.from_dict(sim_scores, orient='index')

In [None]:
sim_matrix

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Project the selected tweet vectors down to two dimensions for plotting.
# t-SNE is stochastic; a fixed random_state keeps the layout stable between
# runs, which matters because the following cells inspect specific points
# picked by their label on the plot.
tsne_model = TSNE(n_components=2, random_state=0)
feature_vecs = tsne_model.fit_transform(np.array(list(top_tweet_vecs.values())))

In [None]:
xs, ys = feature_vecs[:,0], feature_vecs[:,1]

In [None]:
# Scatter the 2-D embedding and label each point with its row index in top_tweets.
plt.rcParams['figure.figsize'] = (12, 10)
plt.scatter(xs, ys)
for label, x, y in zip(top_tweets.index, xs, ys):
    plt.annotate(label, (x, y))

In [None]:
top_tweets.loc[[1138, 2912]].full_text.values

In [None]:
top_tweets.loc[[7143, 4060]].full_text.values

In [None]:
top_tweets.loc[[6565, 5851]].full_text.values

In [None]:
top_tweets.loc[[1990, 6406, 5357, 6916]].full_text.values

In [None]:
top_tweets.loc[[5732, 7368]].full_text.values

In [None]:
docs[7940].similarity(docs[6372])

In [None]:
# Build a map of each Tweet's vectorized representation to its author
# NOTE(review): `top_tweeters` was referenced below but is not defined in any
# visible cell, so this cell raised NameError on a fresh kernel. Assuming it
# was meant to be the ten most active accounts (the ones listed by
# users.most_common(10)) -- confirm the intended cutoff.
top_tweeters = {name for name, _ in users.most_common(10)}

senator_scores = defaultdict(list)
for i, vec in enumerate(doc_vecs):
    # Only keep tweets written by one of the selected accounts.
    if tweetset[i]['name'] not in top_tweeters:
        continue
    # Skip tweets whose vector is undefined (no tokens left after filtering).
    if np.isnan(np.sum(vec)):
        continue
    # Filtering is by display name, grouping is by screen name.
    senator_scores[tweetset[i]['screen_name']].append(vec)

In [None]:
# Now average these
# Each value is replaced in place by the mean of its tweet vectors. ndmin=2
# makes the cell safe to re-run: an already-averaged 1-D vector is treated as
# a single row and maps to itself, instead of collapsing to a scalar.
# The loop variable is named `vecs` so it does not overwrite the global
# `scores` list produced by the earlier similarity ranking.
for name, vecs in senator_scores.items():
    senator_scores[name] = np.mean(np.array(vecs, ndmin=2), axis=0)

In [None]:
embedded = PCA(n_components=2).fit_transform(list(senator_scores.values()))

In [None]:
# Plot the 2-D PCA coordinates, one separately drawn and labelled point per
# key of senator_scores.
xs, ys = embedded[:,0], embedded[:,1]
plt.rcParams["figure.figsize"] = (20, 15)
for name, x, y in zip(senator_scores.keys(), xs, ys):
    plt.scatter(x, y)
    plt.annotate(name, (x, y))