# Bring in Libraries and Packages

In [1]:
import html
import json
import itertools
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from urllib.error import HTTPError, URLError
from urllib.request import urlopen
from urllib.parse import urlparse
import preprocessor as p 
import re
from nrclex import NRCLex
import operator

# Bring in Data

Reading in the saved `telemedcovid_nodups_expand_sentiment` CSV (the last line of the cell below) skips steps 4 through 6.

In [2]:
# Raw tweet pulls: three JSON-lines exports whose file names end in
# 210425_start, 200828_start and 200415_start.
all_telemedcovid_210425 = pd.read_json(
    "/Volumes/LaCie/Cancer_COVID/Data/210425_all_telemedicine_all_covid_tweets_210425_start.jsonl",
    lines=True,
)
all_telemedcovid_200828 = pd.read_json(
    "/Volumes/LaCie/Cancer_COVID/Data/210425_all_telemedicine_all_covid_tweets_200828_start.jsonl",
    lines=True,
)
all_telemedcovid_200415 = pd.read_json(
    "/Volumes/LaCie/Cancer_COVID/Data/210425_all_telemedicine_all_covid_tweets_200415_start.jsonl",
    lines=True,
)

# Previously written results file. The path is relative, so it is resolved
# against the current working directory.
# NOTE(review): the notebook writes this file name under /Volumes/LaCie/Cancer_COVID/Data/ - confirm
# the working directory points at the same copy.
telemedcovid_nodups_expand_sentiment = pd.read_csv(
    filepath_or_buffer="210425_telemedicine_covid_nodups_expand.csv"
)

# Functions

In [3]:
def get_vader_score(sid, text, dataframe):
    """
    Score one text with VADER and return the results frame with that score added as a new row.

    Note that this assumes that the column names of `dataframe` match the keys of the
    dict returned by `sid.polarity_scores` (i.e., "neg", "neu", "pos", "compound").
    Any key that is not already a column is added as a new column.

    param sid: The SentimentIntensityAnalyzer() instance created outside the function
               (any object exposing `polarity_scores(text) -> dict` works)
    param text (str): A string of text to analyze
    param dataframe (DataFrame): The pandas dataframe to add the result row to

    Returns a new DataFrame with a fresh 0..n-1 index; `dataframe` itself is not modified.
    """
    scores = sid.polarity_scores(text)
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat with a one-row frame is the supported equivalent.
    return pd.concat([dataframe, pd.DataFrame([scores])], ignore_index = True)

In [4]:
def link_extractor(url):
    """
    Resolve a URL to the address that is actually reached when it is opened
    (e.g. after redirects from a shortened link).

    param url (str): The URL to open

    Returns the final URL reported by the response, or the original `url`
    unchanged when opening it fails with an HTTPError or URLError.
    """
    try:
        # Open the URL once and close the response when done. The earlier
        # version opened it twice (once to test, once to return), never closed
        # either response, and let a failure of the second request escape.
        with urlopen(url) as response:
            return response.geturl() #Keep new URL
    except (HTTPError, URLError):
        #URL is dead (sometimes this doesn't work), leaving in abbreviated format
        return url #Keep original URL

In [5]:
def url_replacement(text, token_type = "url"):
    """
    Function that will take in a block of text and replace every http(s) url with a token of some type
    text (str): A block of text that may contain one or more urls
    token_type (str): A specification on what token should replace the url. Default is "url", which just returns "url". Other options include:
        "domain", which returns the last dot-separated piece of the host (e.g., "gov url")
        "host", which returns the host of the website and domain (e.g., "cdc.gov url")
        Any other value is treated like "host".

    Returns the text with each url replaced; text without urls is returned unchanged.
    If a url cannot be parsed, a message is printed and it is replaced with " url".
    """
    # Raw string so the backslashes reach the regex engine as written.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    def _token_for(match):
        # Build the replacement token for one matched url.
        url = match.group(0)
        if token_type == "url": #For the base case
            return "url"
        try:
            host = urlparse(url).netloc #Extract the full host name
        except ValueError:
            print(str("URL parse error with " + url))
            return " url"
        if token_type == "domain": #For the case of just extracting the domain
            return host.split(".")[-1] + " url"
        return host + " url" #Extracting full host name

    # Substitute each match in place. Replacing by string value (str.replace)
    # would also rewrite the start of any longer url that begins with a shorter
    # one, leaving fragments such as "url/b" behind.
    return re.sub(url_pattern, _token_for, text)

In [23]:
def get_emotions(text, dataframe):
    """
    Function that will take a text and return a dataframe of emotions (excluding positive and negative)
    Params:
        text (str): The tweet of interest
        dataframe (Pandas DataFrame): The emotions dataframe to add values

    Return:
        New dataframe (0..n-1 index) holding the previous rows plus one row with the
        NRCLex affect frequencies of `text` and a "max_emotion" column naming the
        emotion with the highest frequency (the first one encountered on ties).
        The dataframe passed in is not modified.
    """
    nrc_dict = NRCLex(text).affect_frequencies
    # Drop the two sentiment keys and the 'anticip' key, which NRCLex reports
    # alongside 'anticipation'.
    for unwanted_key in ('positive', 'negative', 'anticip'):
        nrc_dict.pop(unwanted_key, None)
    nrc_dict["max_emotion"] = max(nrc_dict.items(), key=operator.itemgetter(1))[0]
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat with a one-row frame is the supported equivalent.
    return pd.concat([dataframe, pd.DataFrame([nrc_dict])], ignore_index= True)

# Data Processing

In [7]:
# Stack the three pulls into one frame. ignore_index=True gives every row its
# own 0..n-1 label; without it each pull keeps its own labels, so the same
# label would refer to rows from several pulls.
telemedcovid = pd.concat(
    [all_telemedcovid_210425, all_telemedcovid_200828, all_telemedcovid_200415],
    ignore_index=True,
)

In [8]:
# First rows of the combined frame, followed by its (rows, columns) shape.
print(telemedcovid.head(), telemedcovid.shape, sep="\n") #361,752

                                                text     author_id  \
0  The future of #healthcare delivery isn’t #tele...  1.144224e+18   
1  5 Reasons Why Every Small Business Needs an Em...  2.454696e+07   
2  Time Management Secrets for College Students: ...  2.194020e+08   
3  Procrastination Solutions For College Students...  2.194020e+08   
4      What a mess.....  \n\nhttps://t.co/0fb51cePEf  1.381735e+18   

                                      public_metrics  \
0  {'retweet_count': 0, 'reply_count': 0, 'like_c...   
1  {'retweet_count': 2, 'reply_count': 0, 'like_c...   
2  {'retweet_count': 3, 'reply_count': 0, 'like_c...   
3  {'retweet_count': 3, 'reply_count': 0, 'like_c...   
4  {'retweet_count': 0, 'reply_count': 0, 'like_c...   

                 created_at            id  \
0 2021-04-24 23:45:04+00:00  1.386104e+18   
1 2021-04-24 23:40:43+00:00  1.386103e+18   
2 2021-04-24 23:37:12+00:00  1.386102e+18   
3 2021-04-24 23:36:56+00:00  1.386102e+18   
4 2021-04-24 23:3

## Drop Duplicates

In [9]:
# Keep only the first row seen for each tweet id.
# NOTE(review): `id` prints in scientific notation in the preview output, which
# suggests it is stored as a float - confirm that distinct ids cannot collide.
telemedcovid_nodups = telemedcovid.drop_duplicates(subset='id', keep='first')
print(telemedcovid_nodups.shape) #352,490 rows

(352490, 14)


## Expand columns 

In [11]:
# Replace each dict-valued column with one column per key, in this order.
telemedcovid_nodups_expand = telemedcovid_nodups
for nested_column in ('public_metrics', 'entities', 'geo'):
    remaining_columns = telemedcovid_nodups_expand.drop([nested_column], axis=1)
    expanded_columns = telemedcovid_nodups_expand[nested_column].apply(pd.Series)
    telemedcovid_nodups_expand = pd.concat([remaining_columns, expanded_columns], axis=1)

# Text Processing

## Actual Processing

In [13]:
# Start from the raw tweet text, then clean it step by step.
telemedcovid_nodups_expand['processed_text'] = telemedcovid_nodups_expand['text']
print("Now processing text!")
telemedcovid_nodups_expand['processed_text'] = (
    telemedcovid_nodups_expand['processed_text']
    .apply(str)                                           # Change to string
    .apply(html.unescape)                                 # Remove HTML escape characters
    .apply(lambda tweet: url_replacement(tweet, "host"))  # Swap each url for "<host> url"
    .apply(p.clean)                                       # Preprocessor removes hashtags and cleans text
)

Now processing text!


## Processed write-out to csv

In [14]:
# Checkpoint the processed (not yet scored) tweets to disk.
telemedcovid_nodups_expand.to_csv(
    path_or_buf="/Volumes/LaCie/Cancer_COVID/Data/210425_telemedicine_covid_nodups_expand.csv"
)

# Sentiment Analysis with Vader

## Run Sentiment Analysis

Takes about an hour

In [15]:
sid = SentimentIntensityAnalyzer()
# Score every tweet first, then build the frame in one go. Adding one row at a
# time copies the whole frame on every iteration (quadratic in the number of
# tweets) and relied on DataFrame.append, which pandas 2.0 removed.
vader_scores = [sid.polarity_scores(text) for text in telemedcovid_nodups_expand['processed_text']]
sentiments = pd.DataFrame(vader_scores, columns = ['neg', 'neu', 'pos', 'compound'])

## Combine the Data Sets

In [16]:
# Give both frames the same 0..n-1 index (in place) so that the column-wise
# concat pairs each tweet with its own sentiment row by position.
for frame in (telemedcovid_nodups_expand, sentiments):
    frame.reset_index(drop=True, inplace=True)
telemedcovid_nodups_expand_sentiment = pd.concat(
    (telemedcovid_nodups_expand, sentiments),
    axis=1,
)

## Write Out to CSV

In [18]:
# Save the tweets together with their VADER scores.
telemedcovid_nodups_expand_sentiment.to_csv(
    path_or_buf="/Volumes/LaCie/Cancer_COVID/Data/210425_telemedicine_covid_nodups_expand.csv"
)

# Emotion Analysis with NRCLex

In [24]:
# Collect one record per tweet and build the frame once at the end. Adding one
# row at a time copies the whole frame on every iteration (quadratic in the
# number of tweets) and relied on DataFrame.append, which pandas 2.0 removed.
emotion_records = []
for text in telemedcovid_nodups_expand_sentiment.processed_text:
    nrc_frequencies = dict(NRCLex(text).affect_frequencies)
    # Same filtering as get_emotions: drop the sentiment keys and 'anticip'.
    for unwanted_key in ('positive', 'negative', 'anticip'):
        nrc_frequencies.pop(unwanted_key, None)
    nrc_frequencies["max_emotion"] = max(nrc_frequencies.items(), key=operator.itemgetter(1))[0]
    emotion_records.append(nrc_frequencies)
emotion_df = pd.DataFrame(emotion_records, columns = ['fear', 'anger', 'trust', 'surprise', 'sadness', 'disgust', 'joy', 'anticipation', 'max_emotion'])

KeyboardInterrupt: 

In [62]:
#cancercovid_nodups_expand.reset_index(drop=True, inplace=True)
#emotion_df.reset_index(drop=True, inplace=True)
#test = pd.concat([cancercovid_nodups_expand, emotion_df], axis=1)
#test.head()

Unnamed: 0,author_id,text,created_at,id,context_annotations,users,newest_id,oldest_id,result_count,next_token,...,fear,anger,anticip,trust,surprise,sadness,disgust,joy,anticipation,max_emotion
0,1.325606e+18,@mtgreenee You ran knowing unopposed (so guara...,2021-04-12 23:59:48+00:00,1.381759e+18,,,,,,,...,,,,,,,,,,
1,251410700.0,So this Covid vaccination is called the Johnso...,2021-04-12 23:58:51+00:00,1.381759e+18,"[{'domain': {'id': '65', 'name': 'Interests an...",,,,,,...,,,,,,,,,,
2,405966400.0,@LadyDi53086 @EricaRN4USA @ky_statesman See my...,2021-04-12 23:58:37+00:00,1.381759e+18,"[{'domain': {'id': '123', 'name': 'Ongoing New...",,,,,,...,,,,,,,,,,
3,1.17416e+18,@HotepJesus &gt;made a microchip to detect ear...,2021-04-12 23:56:46+00:00,1.381758e+18,"[{'domain': {'id': '65', 'name': 'Interests an...",,,,,,...,,,,,,,,,,
4,1.205541e+18,"@WessHerman @MaximeBernier I'm old, but not th...",2021-04-12 23:56:40+00:00,1.381758e+18,"[{'domain': {'id': '123', 'name': 'Ongoing New...",,,,,,...,,,,,,,,,,


In [89]:
# Show the first five emotion rows.
emotion_df.head(n=5)

Unnamed: 0,fear,anger,trust,surprise,sadness,disgust,joy,anticipation,max_emotion
0,0.095238,0.095238,0.095238,0.047619,0.142857,0.047619,0.047619,0.047619,sadness


In [19]:
# Single-tweet check of the emotion pipeline, run on the raw text of the row labelled 0.
emotion_df = pd.DataFrame(columns = ['fear', 'anger', 'trust', 'surprise', 'sadness', 'disgust', 'joy', 'anticipation', 'max_emotion'])
nrc_text = NRCLex(telemedcovid_nodups_expand_sentiment.text[0])
nrc_dict = nrc_text.affect_frequencies
# Drop the two sentiment keys and the 'anticip' key.
nrc_dict.pop('positive', None)
nrc_dict.pop('negative', None)
nrc_dict.pop('anticip', None)
# Emotion with the highest frequency (the first one encountered on ties).
nrc_max_emotion = max(nrc_dict.items(), key=operator.itemgetter(1))[0]
nrc_dict["max_emotion"] = nrc_max_emotion
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with a one-row frame is the supported equivalent.
emotion_df = pd.concat([emotion_df, pd.DataFrame([nrc_dict])], ignore_index= True)
print(emotion_df)

   fear  anger     trust  surprise  sadness  disgust  joy  anticipation  \
0   0.0    0.0  0.142857       0.0      0.0      0.0  0.0      0.285714   

    max_emotion  
0  anticipation  


In [21]:
# Raw NRCLex frequencies for the row labelled 0, before any keys are dropped.
NRCLex(telemedcovid_nodups_expand_sentiment["text"][0]).affect_frequencies

{'fear': 0.0,
 'anger': 0.0,
 'anticip': 0.0,
 'trust': 0.14285714285714285,
 'surprise': 0.0,
 'positive': 0.5714285714285714,
 'negative': 0.0,
 'sadness': 0.0,
 'disgust': 0.0,
 'joy': 0.0,
 'anticipation': 0.2857142857142857}