In [None]:
import api_wrapper as api
import os
import pandas as pd
import time

### Query user profile info

In [None]:
token = pd.read_csv("/Users/dankoban/Documents/EM6586_DB_Management/bearer_token.csv")
os.environ["BEARER_TOKEN"] = token['token'][0]

seed_accounts = ['NFL']
seed_account_info = api.get_users(usernames=seed_accounts)                    
seed_account_info.head()

### Query for posting activity

In [None]:
tweets = api.get_user_activity(usernames=seed_accounts, record_count = 10000)

In [None]:
(tweets.
 sort_values('like_count', ascending = False).
 filter(['author_screen_name', 'text', 'conversation_id', 
         'like_count', 'reply_count', 'retweet_count', 
         'referenced_tweet_type']).
 head())

### Extract Most Liked URLs

In [None]:
most_liked_urls = tweets['id'][tweets['like_count'] >= 10000].tolist()
len(most_liked_urls)

In [None]:
resp = api.get_user_activity_simple('cnn', token = 0)

In [None]:
resp

In [None]:
el = api.extract_el(tweets)
urls = el[el['edge_type'] == 'url'].rename(columns={'to': 'url'})
urls[['status_id', 'url']][urls['status_id'].isin(most_liked_urls)]

### Extract Hashtags

In [None]:
hashtags = el[el['edge_type'] == 'hashtag'].rename(columns={'to': 'hashtag'})
top_hashtags = (hashtags.groupby('hashtag').
                agg({'status_id': len}).                  
                reset_index().
                rename(columns={'hashtag': 'hashtag', 'status_id': 'hashtag_count'}).
                sort_values('hashtag_count', ascending = False).
                reset_index(drop = True).
                query('hashtag_count >= 5'))
top_hashtags

### Tweets by hashtag

In [None]:
pd.set_option('display.max_colwidth', None)
tweets_by_hashtag = (hashtags[['hashtag', 'status_id']][hashtags['hashtag'].isin(top_hashtags.hashtag)].             
                     merge(tweets[['id','text']], how='left', left_on='status_id', right_on='id').
                     filter(['hashtag', 'text']).
                     sort_values('hashtag').
                     reset_index(drop=True))
tweets_by_hashtag.head(10)

### Clean text

In [None]:
import re
tweets_by_hashtag['text'] = tweets_by_hashtag['text'].apply(lambda x: re.sub(r'http\S+', ' ', x))
tweets_by_hashtag['text'] = tweets_by_hashtag['text'].apply(lambda x: re.sub(r'@\S+', ' ', x))
tweets_by_hashtag['text'] = tweets_by_hashtag['text'].apply(lambda x: re.sub(r'\W+', ' ', x))
tweets_by_hashtag.head(10)

### Pool tweets by hashtag

In [None]:
pooled_df = tweets_by_hashtag.groupby("hashtag")["text"].apply(lambda x: "".join(x)).reset_index()
pooled_df 

### Create a corpus and dictionary

In [None]:
from sklearn.feature_extraction.text import CountVectorizer 
from gensim import corpora, models, matutils

texts = pooled_df['text'].tolist()

vectorizer = CountVectorizer(stop_words='english', 
                             ngram_range = (1,1), 
                             token_pattern="\\b[a-z][a-z][a-z]+\\b",
                             max_df=1.0, 
                             min_df=3,
                             max_features=1000000) 

vectorizer.fit(texts)
doc_word = vectorizer.transform(texts).transpose()
corpus = matutils.Sparse2Corpus(doc_word)
    
word2id = dict((v, k) for v, k in vectorizer.vocabulary_.items())
id2word = dict((v, k) for k, v in vectorizer.vocabulary_.items())

dictionary = corpora.Dictionary()
dictionary.id2token = id2word
dictionary.token2id = word2id
    
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

### Fit a topic model

In [None]:
from gensim.models.wrappers import LdaMallet
from gensim.models import CoherenceModel

cm_values = []
for k in range(5,15):
    os.environ.update({'MALLET_HOME':r'/Users/dankoban/mallet-2.0.8/'})

    lda = LdaMallet(mallet_path = '/Users/dankoban/mallet-2.0.8/bin/mallet', 
                    corpus=corpus, num_topics=k, id2word=dictionary, 
                    workers = 20, iterations = 500, random_seed = 1)

    cm = CoherenceModel(model=lda, corpus=corpus, coherence='u_mass')
    coherence_val = cm.get_coherence()
    print(str(k) + ": " + str(coherence_val))
    cm_values.append(coherence_val)

In [None]:
plot_df = pd.DataFrame({'k': range(5,15), 'val': cm_values})
plot_df

In [None]:
lda = LdaMallet(mallet_path = '/Users/dankoban/mallet-2.0.8/bin/mallet', 
                corpus=corpus, num_topics=12, id2word=dictionary, 
                workers = 20, iterations = 500, random_seed = 1)

In [None]:
tm_results = lda[corpus]

corpus_topics = [sorted(topics, 
                        key=lambda record: -record[1])[0] for topics in tm_results]

topics = [[(term, round(wt, 3)) for term, wt in lda.show_topic(n, topn=10)] 
                                for n in range(0, lda.num_topics)]

topics_df = pd.DataFrame([', '.join([term for term, wt in topic]) for topic in topics], 
                         columns = ['Terms per Topic'], 
                         index=['Topic'+str(t) for t in range(1, lda.num_topics+1)] )
topics_df

### Evaluate topic quality

In [None]:
cm = CoherenceModel(model=lda, corpus=corpus, coherence='u_mass')
cm.get_coherence_per_topic()

### Import tokens

In [1]:
import pandas as pd
tokens = pd.read_csv("/Users/dankoban/Desktop/api_keys/tokens.csv")
awis_token = tokens['key'][tokens['name'] == 'awis_token'].tolist()[0]
awis_access_id = tokens['key'][tokens['name'] == 'awis_access_id'].tolist()[0]
twitter_bearer_token = tokens['key'][tokens['name'] == 'twitter_academic_bearer_token'].tolist()[0]

### Query news sites for Alexa rank

https://www.alexa.com/popular-articles/public-health#

In [2]:
from util import get_alexa_rank

domains = ['cnn.com', 'dailywire.com', 'publichealth.lacounty.gov']
for domain in domains:
    print(get_alexa_rank(domain, api_key = awis_token, access_id = awis_access_id))

{'DataUrl': 'cnn.com/', 'Rank': '73'}
{'DataUrl': 'dailywire.com/', 'Rank': '3001'}
{'DataUrl': 'publichealth.lacounty.gov/', 'Rank': '4333'}


### Query Twitter 

https://developer.twitter.com/en/docs/twitter-api/early-access

In [143]:
import api_wrapper as api
import os

os.environ["BEARER_TOKEN"] = twitter_bearer_token

seed_accounts = ['starsandstripes']
seed_account_info = api.get_users(usernames=seed_accounts)                    
seed_account_info.head()

Unnamed: 0,user_id,username,name,description,location,created_at,followers_count,following_count,tweet_count,listed_count
0,9130702,starsandstripes,Stars and Stripes,Stars and Stripes provides independent news an...,Washington D.C.,2007-09-27T19:58:13.000Z,251314.0,477.0,146891.0,3538.0


In [156]:
def rehydrate_status_ids(ids, token = 0):
    # authenticate with end point
    bearer_token = os.environ.get("BEARER_TOKEN")
    tweet_fields = "tweet.fields=author_id,conversation_id,public_metrics"
    ids = [str(id) for id in ids]
    ids = "ids=" + ",".join(ids)        
    url = "https://api.twitter.com/2/tweets?{}&{}".format(ids, tweet_fields)
    
    # submit GET request - submit a query to the API      
    response = requests.request("GET", url, 
                                headers = {"Authorization": "Bearer {}".format(bearer_token)})   
    
    return response

r = rehydrate_status_ids(ids = ['1374747080602828806'])
r.json()

{'data': [{'id': '1374747080602828806',
   'public_metrics': {'retweet_count': 22,
    'reply_count': 6,
    'like_count': 24,
    'quote_count': 4},
   'text': 'Russia outsources disinformation efforts to foreign troll farms\n https://t.co/BKrOO6knnh',
   'author_id': '9130702',
   'conversation_id': '1374747080602828806'}]}

In [155]:
def rehydrate_conversation(conversation_id):
    # authenticate with end point
    bearer_token = os.environ.get("BEARER_TOKEN")
    tweet_fields = "tweet.fields=lang,author_id,conversation_id"
    url = 'https://api.twitter.com/2/tweets/search/recent?query=conversation_id:' + conversation_id
    
    # submit GET request - submit a query to the API      
    response = requests.request("GET", url, 
                                headers = {"Authorization": "Bearer {}".format(bearer_token)})   
    
    return response

r = rehydrate_conversation(conversation_id = '1374747080602828806')
r.json()

{'data': [{'id': '1374754758183985157',
   'text': '@starsandstripes Biden/Harris used Indian troll farms to spread theirs ... but y’all aren’t ready to talk about the manufactured consent in the US'},
  {'id': '1374752841252880386',
   'text': '@starsandstripes They now have a mutual agreement with CNN to counter the internal propaganda for China'},
  {'id': '1374751696497283073',
   'text': '@starsandstripes While we are at it\nalso a quantitative number\nthat are State Dept/USAID/CIA funded SM accounts\nlinked to troll farms (paid to tweet/like)\nBoth inside US and foreign'},
  {'id': '1374750663788400643',
   'text': "@starsandstripes Now do DNC funded type\nACORN foreign paid trollops\nand their farms in China, Africa, EU, and India\nI'll wait"},
  {'id': '1374749150403776516',
   'text': '@starsandstripes And this is new? Been happening since the Soviet Cold War.'},
  {'id': '1374748906085515270',
   'text': '@starsandstripes Nah!  America government are simply clowns! America is

In [159]:
def find_retweets(conversation_id):
    # authenticate with end point
    bearer_token = os.environ.get("BEARER_TOKEN")
    tweet_fields = "tweet.fields=lang,author_id,conversation_id"
    url = 'https://api.twitter.com/2/tweets/search/recent?query=Russia outsources disinformation efforts to foreign troll farms\n'
    
    # submit GET request - submit a query to the API      
    response = requests.request("GET", url, 
                                headers = {"Authorization": "Bearer {}".format(bearer_token)})   
    
    return response

r = find_retweets(conversation_id = '1374747080602828806')
r.json()

{'data': [{'id': '1374770808896196608',
   'text': 'RT @starsandstripes: Russia outsources disinformation efforts to foreign troll farms\n https://t.co/BKrOO6knnh'},
  {'id': '1374770278702669832',
   'text': 'Russia outsources disinformation efforts to foreign troll farms\n\nhttps://t.co/sVGjAs4cX6'},
  {'id': '1374765785621614597',
   'text': 'RT @starsandstripes: Russia outsources disinformation efforts to foreign troll farms\n https://t.co/BKrOO6knnh'},
  {'id': '1374764428143386627',
   'text': 'RT @DFRLab: "Governments and their spy agencies launching influence operations against their adversaries... are increasingly turning to com…'},
  {'id': '1374762930449784833',
   'text': 'RT @DFRLab: "Governments and their spy agencies launching influence operations against their adversaries... are increasingly turning to com…'},
  {'id': '1374762640522813441',
   'text': 'RT @starsandstripes: Russia outsources disinformation efforts to foreign troll farms\n https://t.co/BKrOO6knnh'},
  {'

In [89]:
import requests
def search_url(url, token = 0):
    # authenticate with end point
    bearer_token = os.environ.get("BEARER_TOKEN")
    
    # query profile info to get a user_id
    #user_info = get_users(username)
    #user_id = user_info['user_id'][0]
    
    # generate query string    
    # 'https://api.twitter.com/2/tweets/search/all?query=caturday%20has:images%20-is:retweet&tweet.fields=created_at,author_id,lang&max_results=20'
    # 'url:stackoverflow.com'
    # https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query
    if token == 0: 
        url =  'https://api.twitter.com/2/tweets/search/all?query=' + url + '%20has:images%20-is:retweet&tweet.fields=created_at,author_id,lang&max_results=20'
    else:
        url =  'https://api.twitter.com/2/tweets/search/?query=' + url + '/tweets?tweet.fields=conversation_id,in_reply_to_user_id,created_at,author_id,entities,public_metrics,geo,lang,referenced_tweets&max_results=100&pagination_token=' + token    
    
    # submit GET request - submit a query to the API      
    response = requests.request("GET", url, 
                                headers = {"Authorization": "Bearer {}".format(bearer_token)})   
    
    return response

r = search_url(url = 'url:stackoverflow.com', token = 0)
r.json()


{'data': [{'lang': 'en',
   'created_at': '2021-03-24T16:30:27.000Z',
   'text': 'PHP Interactive shell open when run any command in Command prompt https://t.co/h3ycb59LKe #php #command #commandprompt https://t.co/mLaLewHJC6',
   'author_id': '1098709061334429696',
   'id': '1374760533325455360'},
  {'lang': 'en',
   'created_at': '2021-03-24T16:25:25.000Z',
   'text': 'Push one branch to another while ignoring a file in github https://t.co/9KGq1qKAhl #github #git #azuredevops https://t.co/sDE23vY3Vb',
   'author_id': '1098709061334429696',
   'id': '1374759267161477123'},
  {'lang': 'en',
   'created_at': '2021-03-24T16:20:23.000Z',
   'text': 'Adding ST_Transform in a PL/pgSQL function https://t.co/JhqtdFyVPv #postgresql #postgis #plpgsql #coordinatetransformation https://t.co/ZGalnyBGnm',
   'author_id': '1098709061334429696',
   'id': '1374758000850796551'},
  {'lang': 'en',
   'created_at': '2021-03-24T16:15:21.000Z',
   'text': 'Is it legal to create a null rvalue reference in a 

{'meta': {'result_count': 0}}

In [28]:
q = 'http%3A%2F%2Fwww.theguardian.com%2Fhealthcare-network%2Fviews-from-the-nhs-frontline%2F2015%2Fjul%2F13%2Fwasting-gps-time-no-i-cant-prescribe-you-new-shoes'
q
'https://api.twitter.com/2/tweets/search/all?query=' + q +  '&tweet.fields=created_at,author_id,lang&max_results=20'
'https://twitter.com/i/api/1.1/search/typeahead.json?q=http%3A%2F%2Fwww.theguardian.com%2Fhealthcare-network%2Fviews-from-the-nhs-frontline%2F2015%2Fjul%2F13%2Fwasting-gps-time-no-i-cant-prescribe-you-new-shoes&src=search_box&result_type=events%2Cusers%2Ctopics'

'https://api.twitter.com/2/tweets/search/all?query=http%3A%2F%2Fwww.theguardian.com%2Fhealthcare-network%2Fviews-from-the-nhs-frontline%2F2015%2Fjul%2F13%2Fwasting-gps-time-no-i-cant-prescribe-you-new-shoes&tweet.fields=created_at,author_id,lang&max_results=20'