In [26]:
import requests
import json
import time
import math
import os

In [27]:
import pandas as pd
from flair.models import TextClassifier
from flair.data import Sentence
import re
from segtok.segmenter import split_single
from textblob import TextBlob


In [28]:
api_key = os.environ['NYT_SECRET']
comments_endpoint = "https://api.nytimes.com/svc/community/v3/user-content/url.json"

comments_api_args = {
    "offset": 0,
    "url": ""
}

In [29]:
#functions to do API stuff

def call_nyt_api(endpoint, api_key, api_args):
    """A (theoretically...) reusable function to call any NYT api endpoint.
    api_args is a dict of parameters that are appended to the API request string
    returns a json object"""
    args = []
    for arg in api_args.items():
        full_arg = "&{key}={value}".format(key=arg[0],value=arg[1])
        args.append(full_arg)
    arg_string = ''.join(args)
    request_string = ("{endpoint}?api-key={api_key}{arg_string}".format(endpoint=endpoint, api_key=api_key, arg_string=arg_string))
    api_obj = requests.get(request_string)
    print(f"HTTP Response: {api_obj.status_code}")
    return api_obj.json()


def get_comment_page(offset, url):
    """get a single page of comments. returns a JSON object"""
    endpoint = comments_endpoint
    return call_nyt_api(endpoint, api_key, {"offset": offset, "url": url})

def get_all_comments(url):
    """iterate through all pages of comments. returns a JSONL string object"""
    comments = []
    page0 = get_comment_page(0,url)
    total_comments = page0["results"]["totalParentCommentsFound"]
    print(f"Fetched Page 0. {total_comments} total comments found. Fetching {math.ceil(total_comments/25)} pages")
    for comment in page0["results"]["comments"]:
        comments.append({"comment_id": comment["commentID"],"comment":comment["commentBody"]})
    offset = 25
    #free tier api limit - sleep at least 6 seconds to avoid getting rate-limited
    time.sleep(7)
# to get all comments
     while offset < total_comments:
# to get just 2 pages of comments for faster testing
#    while offset < 50:
        page = get_comment_page(offset,url)
        for comment in page["results"]["comments"]:
            comments.append({"comment_id": comment["commentID"],"comment":comment["commentBody"]})
        print(f"Fetched Page {offset/25}. {total_comments - offset} comments left to fetch.")
        offset += 25
        #free tier api limit - sleep at least 6 seconds to avoid getting rate-limited
        time.sleep(7)
    comments_with_url = []
    for comment in comments:
        comments_with_url.append(json.dumps({"article_url": url, "comment_id": comment["comment_id"], "comment": comment["comment"]}))
    comments_jsonl = '\n'.join(comments_with_url)
    return comments_jsonl

In [50]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    
    I stole this from stack overflow because it's fun :D 
    
    https://stackoverflow.com/questions/3173320/text-progress-bar-in-terminal-with-block-characters
    
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent if int(float(percent)) < 100 else 100}% {suffix}', end = printEnd)


In [41]:
#functions to do sentiment analysis stuff

def clean(raw):
    """ Remove hyperlinks and markup """
    result = re.sub("<[a][^>]*>(.+?)</[a]>", 'Link.', raw)
    result = re.sub('&gt;', "", result)
    result = re.sub('&#x27;', "'", result)
    result = re.sub('&quot;', '"', result)
    result = re.sub('&#x2F;', ' ', result)
    result = re.sub('<p>', ' ', result)
    result = re.sub('</i>', '', result)
    result = re.sub('&#62;', '', result)
    result = re.sub('<i>', ' ', result)
    result = re.sub("\n", '', result)
    return result

def make_sentences(text):
    """ Break apart text into a list of sentences """
    sentences = [sent for sent in split_single(text)]
    return sentences

def predict(sentence):
    """ Predict the sentiment of a sentence """
    if sentence == "":
        return {'flair':0,'textblob':0}
    text = Sentence(sentence)
    # stacked_embeddings.embed(text)
    classifier.predict(text)
    value = text.labels[0].to_dict()['value']
    #print(text.to_dict())
    if value == 'POSITIVE':
        flair_result = text.to_dict()['labels'][0]['confidence']
    else:
        flair_result = -(text.to_dict()['labels'][0]['confidence'])
    if sentence == "":
        textblob_result = 0
    blob = TextBlob(sentence)
    textblob_result = blob.sentiment.polarity
    textblob_subjectivity = blob.sentiment.subjectivity
    return {'flair':round(flair_result, 3), 'textblob':round(textblob_result,3), 'subjectivity':round(textblob_subjectivity,3)}

In [42]:
# gets the URL we want to analyze comments on

url = input('Input Article URL: ')


Input Article URL: https://www.nytimes.com/2022/05/04/opinion/math-education.html


In [43]:
# fetch all comments. this is a JSONL string, with the assumption that at some point I might want to put this in bigquery and do the sentiment analysis on vertexAI

all_comments = get_all_comments(url)

HTTP Response: 200
Fetched Page 0. 104 total comments found. Fetching 5 pages
HTTP Response: 200
Fetched Page 1.0. 79 comments left to fetch.
HTTP Response: 200
Fetched Page 2.0. 54 comments left to fetch.


In [44]:
classifier = TextClassifier.load('en-sentiment')

sentence_scores = []

2022-05-04 14:57:30,547 loading file /Users/danaholmes/.flair/models/sentiment-en-mix-distillbert_4.pt


In [51]:


l = len(all_comments.splitlines())

printProgressBar(0, l, prefix = 'Progress:', suffix = 'Complete', length = 50)

for i, comment in enumerate(all_comments.splitlines(), start=1):
    p = json.loads(comment)
    # print(p["comment"])
    p_blob = p["comment"]
    sentences = make_sentences(clean(p_blob))
    printProgressBar(i + 1, l, prefix = 'Reticulating Splines:', suffix = 'Complete', length = 50)
    for sentence in sentences:
        #print({'commentID': p["comment_id"],'sentence':sentence, 'score':predict(sentence)})
        prediction = predict(sentence)
        sentence_scores.append({'commentID': p["comment_id"],'sentence':sentence, 'flair_score':prediction['flair'], 'text_blob_score':prediction['textblob'], 'subjectivity':prediction['subjectivity']})

Reticulating Splines: |██████████████████████████████████████████████████| 100% Completee

In [52]:
df = pd.DataFrame()

# print(sentence_scores[0])
for sentence in sentence_scores:
#     #print({"comment_id": sentence["commentID"], "sentence": sentence["sentence"], "score": sentence["score"]})
    df = df.append({"comment_id": sentence["commentID"], "sentence": sentence["sentence"], "flair_score": sentence["flair_score"], "textblob_score": sentence["text_blob_score"], "subjectivity": sentence["subjectivity"]}, ignore_index=True)

In [53]:
pd.set_option('display.max_colwidth', None)

#df.loc[df['subjectivity'] > .5]

Unnamed: 0,comment_id,flair_score,sentence,subjectivity,textblob_score
1,118165246.0,-0.999,"Otherwise, we are always behind.We also aren't talking about remedial education that is sorely needed to prepare young people that were ill-served by their poor schools just because they were born poor.",0.533,-0.233
2,118165241.0,0.994,"Parents can be a terrific help in math, even at an early age.",0.650,0.050
5,118165241.0,0.633,It is not difficult when you make it a game.,0.700,-0.075
6,118165228.0,-0.982,Why is it a surprise that US students are behind in math?,0.700,-0.400
7,118165228.0,0.997,"This country is behind in every metric compared to other advanced countries who put their money into health care, education, and quality of life for the average citizen as opposed to giving huge tax cuts to the oligarchs who fund their chosen politicians.",0.595,0.025
...,...,...,...,...,...
450,118160839.0,-0.984,"As a high school math teacher with 28 years of experience, it's my opinion that poor math performance in the U.S. isn't due to any one cause but is instead a perfect storm of several factors:(1) A culture of immediate gratification and shortened attention spans.",0.503,0.127
451,118160839.0,-0.992,"Teens embody the ""I want it now"" attitude prevlant today which allows little patience for mastering the many facts and skills necessary for mathematics.",0.667,0.104
453,118160839.0,-1.000,"The dopamine bursts they seek in Tik-Tok videos, Instagram posts and the like are no match for the static images and abstractions inherent in math.",0.900,0.500
454,118160839.0,-0.966,NYU professor Jonathan Haidt has written extensively about the need to limit screen time and cell phone use.(2) Unstable homes where children are anxious and unhappy.,0.744,-0.283


In [56]:
#loc method to filter to only comments with subjectivity >.5
average_by_comment = df.loc[df['subjectivity'] > .5].groupby(['comment_id'], as_index = False).mean()


average_comment_score_flair = average_by_comment["flair_score"].mean(axis = 'index')
average_comment_score_textblob = average_by_comment["textblob_score"].mean(axis = 'index')
count_comments = average_by_comment["comment_id"].count()
# deviation = average_by_comment["score"].std()

print(f"Total Comments: {count_comments}")
print("Average Comment Scores:")
print(f"  Flair: {average_comment_score_flair}")
print(f"  TextBlob: {average_comment_score_textblob}")
# print(f"Standard Deviation: {deviation}")


Total Comments: 62
Average Comment Scores:
  Flair: -0.08499543010752689
  TextBlob: 0.11792930107526882
