In [13]:
from goose3 import Goose

In [16]:
from readability import Document

In [17]:
import requests

In [18]:
# determine the number of times that common twitter reference phrases appear

twitter_phrases = ['tweeted', 'to twitter', 'tweets', 'tweeting', 'retweet', 'in a tweet', 'to tweet', 'tweet from', 
                  'wrote on twitter', 'said on twitter']


def count_twitter_phrases(article_text):
    twitter_phrase_count = 0
    # find the first occurence of the twitter phrase, then continue searching for the 
    # next occurence of the twitter phrase from the index of end of the current twitter phrase
    # instance until there are no more twitter phrases located 
    for twitter_phrase in twitter_phrases:
        start_index = 0
        phrase_index = 0
        while (phrase_index != -1):
            phrase_index = article_text.find(twitter_phrase, start_index)
            start_index = phrase_index + len(twitter_phrase)
            if phrase_index != -1:
                twitter_phrase_count += 1
    return twitter_phrase_count


In [75]:
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    
    def __init__(self):
        self.tweet_embed_count = 0
        self.twitter_phrase_count = 0
        HTMLParser.__init__(self)
    
    
    def handle_starttag(self, tag, attrs):
        # check for anchor tags 
        if tag == 'a':
            # check if href is defined in the anchor tag
            for name, value in attrs:
                if name == 'href':
                    # check if twitter.com is in the href url
                    if 'twitter.com' in value:
                        self.tweet_embed_count += 1
    
    
                        
                    
                        

In [96]:
import json
def count_tweets_json(story_id):
    with open('stories/' + str(story_id) + '.json') as f:
        article_json = json.load(f)
    html_parser = MyHTMLParser()
    article_full_text = article_json['story_text']
    article_html = article_json['raw_first_download_file']
    article_text = Document(article_html)
    html_parser.feed(article_text.summary())
    tweet_embed_count = html_parser.tweet_embed_count
    return (count_twitter_phrases(article_full_text), tweet_embed_count)


In [97]:
def count_tweet_embed_json(story_id):
    return (count_tweets_json(story_id))[0]

In [98]:
def count_tweet_refs_json(story_id):
    return (count_tweets_json(story_id))[1]

In [99]:
import pandas as pd
# import csv file with the urls for each article with labeled amount of tweets per article 
# into a dataframe

story_csv_url = 'tweet_story_ids.csv'
story_df = pd.read_csv(story_csv_url)

In [102]:
def generate_tweet_counts_json(article_df):
    article_df['tweet_mentions'] = article_df.apply(lambda row: count_tweet_refs_json(row['stories_id']), axis = 1)
    article_df['tweet_embeds'] = article_df.apply(lambda row: count_tweet_embed_json(row['stories_id']), axis = 1)
    article_df['goose3_tweet_embeds'] = article_df.apply(lambda row: count_tweets_goose(row['url']), axis = 1)
    
    return article_df

In [103]:
generate_tweet_counts_json(story_df)

Unnamed: 0.1,Unnamed: 0,stories_id,title,url,tweet_embeds,tweet_mentions,goose3_tweet_embeds
0,0,1948372128,Neil Gaiman calls out fans complaining about S...,https://www.monstersandcritics.com/tv/neil-gai...,1,1,0
1,1,1880746751,Brazilian politician's cunning plan to fight C...,https://www.theguardian.com/world/2021/mar/16/...,1,0,0
2,2,1872948216,Royal Family Missed 'Greatest Opportunity for ...,https://www.newsweek.com/royal-family-missed-g...,2,0,1
3,3,1822028130,Virtual memorial for late Phoenix civil rights...,http://rssfeeds.azcentral.com/~/641472518/0/ph...,1,2,0
4,4,1863258802,Labor Groups And Progressives Urge Biden To Su...,https://www.huffpost.com/entry/labor-groups-an...,1,1,0
...,...,...,...,...,...,...,...
96,96,1845803266,"Earnings Scheduled For February 8, 2021",http://feeds.benzinga.com/~r/benzinga/~3/NLo--...,0,0,0
97,97,1817680913,US Capitol unrest leads to Steve Kerr's condem...,https://upstract.com/p/yrtxefw3?ref=rss&rd=1,0,0,0
98,98,1842855441,Santa Anita horse racing consensus picks for F...,https://www.ocregister.com/2021/02/04/santa-an...,0,0,0
99,99,1839643569,The 'Cloffice' Is the New Cubicle: Expert Insp...,https://www.sfgate.com/realestate/article/The-...,0,0,0


In [94]:
# return the total 'twitter occurences'
def count_tweets(article_url):
    r = requests.get(article_url) 
    article_full_text = (r.text).lower()
    article_text = Document(article_full_text)
    html_parser = MyHTMLParser()
    html_parser.feed(article_text.summary())
    tweet_embed_count = html_parser.tweet_embed_count
    return (count_twitter_phrases(article_full_text), tweet_embed_count)
    


In [79]:
def count_tweets_goose(article_url):
    g = Goose()
    article = g.extract(url=article_url)
    tweet_embed_count = len(article.tweets)
    return tweet_embed_count

In [6]:
def count_tweet_embed(article_url):
    tweet_info = count_tweets(article_url)
    return tweet_info[1]

In [5]:
def count_tweet_refs(article_url):
    tweet_info = count_tweets(article_url)
    return tweet_info[0]

In [4]:
# calculate recall: true positives / true positives + false negatives 
# calculate precision: true positives / total positives 

def calculate_recall_and_precision(gold_labels, given_labels):
    true_pos = 0
    false_neg = 0
    total_pos = 0
    for label_index in range(len(gold_labels)):
        if given_labels[label_index]:
            total_pos += 1
            if gold_labels[label_index]:
                true_pos += 1
        else:
            if gold_labels[label_index]:
                false_neg += 1
    recall = true_pos / (true_pos + false_neg)
    precision = true_pos / total_pos
    print('Recall: ' , recall)
    print('Precision: ', precision)
    return recall, precision

In [3]:
import pandas as pd
# import csv file with the urls for each article with labeled amount of tweets per article 
# into a dataframe

article_csv_url = 'tweet_embed_data.csv'
article_df = pd.read_csv(article_csv_url)

In [80]:
def generate_tweet_counts(article_df):
    article_df['has_tweets'] = article_df.apply(lambda row: row['tweet_count'] > 0, axis = 1)
    article_df['calc_embedded'] = article_df.apply(lambda row: count_tweet_embed(row['url']), axis = 1)
    article_df['goose3_tweet_embeds'] = article_df.apply(lambda row: count_tweets_goose(row['url']), axis = 1)
    article_df['calc_tweet_refs'] = article_df.apply(lambda row: count_tweet_refs(row['url']), axis = 1)
    article_df['calc_has_tweets'] = article_df.apply(lambda row: row['calc_embedded'] > 0 or 
                                                     row['calc_tweet_refs'] > 0, axis = 1)
    
    # calculate and print precision and recall
    recall, precision = calculate_recall_and_precision(article_df['has_tweets'], article_df['calc_has_tweets'])
    return article_df
    

In [95]:
generate_tweet_counts(article_df)

Recall:  0.9166666666666666
Precision:  0.868421052631579


Unnamed: 0,url,tweet_count,has_tweets,calc_embedded,goose3_tweet_embeds,calc_tweet_refs,calc_has_tweets
0,https://www.theguardian.com/world/2021/may/13/...,2,True,0,0,4,True
1,https://www.usatoday.com/story/sports/mlb/colu...,2,True,2,0,5,True
2,https://www.cnn.com/2021/05/23/europe/belarus-...,1,True,0,0,4,True
3,https://www.foxnews.com/opinion/biden-southern...,1,True,1,0,0,True
4,https://newrepublic.com/article/161084/republi...,1,True,0,1,2,True
5,https://www.cnn.com/2021/05/17/investing/bitco...,6,True,4,1,20,True
6,https://www.breitbart.com/economy/2021/03/03/w...,4,True,0,3,2,True
7,https://www.csmonitor.com/World/Asia-South-Cen...,1,True,0,0,1,True
8,https://www.vice.com/en/article/wx8wm5/arkansa...,1,True,0,0,3,True
9,https://www.staradvertiser.com/2021/05/17/brea...,1,True,0,0,0,False
