# Read in Libraries and Packages

In [1]:
import html
import nltk
import numpy as np
import pandas as pd
import preprocessor as p 
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from urllib.parse import urlparse


# Functions

## URL Processing

In [2]:
def url_replacement(text, token_type = "url"):
    """
    Replace every http(s) url found in a block of text with a token.

    text (str): A block of text that may contain one or more urls
    token_type (str): A specification of what token should replace each url.
        Default is "url", which replaces the url with just "url". Other options include:
        "domain", which keeps the last dot-separated piece of the network location (e.g., "gov url")
        "host", which keeps the full network location (e.g., "cdc.gov url")
        Any value other than "url" or "domain" is handled like "host".

    Returns (str): The text with each url replaced. When a url cannot be parsed
        (urlparse raises ValueError, e.g. an unbalanced "[" in the host), a message
        is printed and that url is replaced with " url" (empty host).
    """
    # Raw string so that "\(" is a regex escape rather than an invalid Python string escape
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    def _token_for(match):
        """Build the replacement token for a single matched url."""
        url = match.group(0)

        if token_type == "url": #For the base case, no parsing is needed
            return "url"

        try:
            netloc = urlparse(url).netloc #Parse once and reuse the result
        except ValueError:
            print(str("URL parse error with " + url))
            netloc = ""

        if token_type == "domain": #Keep only the piece after the last "."
            return netloc.split(".")[-1] + " url"

        return netloc + " url" #Keep the full host name

    # Replacing match-by-match (instead of str.replace on each found url) keeps a short url
    # from clobbering the start of a longer url that shares the same prefix.
    return re.sub(url_pattern, _token_for, text)

## VADER Sentiment Analysis

In [3]:
def get_vader_score(sid, text, dataframe):
    """
    Function that will score a text with VADER and return the dataframe with that score added as a new row.
    Note that this assumes that the column names match the VADER output
    (i.e., "neg", "neu", "pos", "compound")
    param sid: The analyzer object defined outside the function; it must provide
        polarity_scores(text) returning a dict of scores (e.g., SentimentIntensityAnalyzer())
    param text (str): A string of text to analyze
    param dataframe (DataFrame): The pandas dataframe to append results to (not modified in place)
    return (DataFrame): A new dataframe holding the previous rows plus one row of scores,
        re-indexed from 0
    """
    scores = sid.polarity_scores(text)
    # DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
    # pd.concat with a one-row frame gives the same result on old and new versions.
    return pd.concat([dataframe, pd.DataFrame([scores])], ignore_index = True)

# Bring in Data

In [4]:
# Folder that holds the input CSV files (also reused later when writing results)
file_path = "/Users/catherinepollack/Documents/dartmouth/research/aim3_facebook_covid19_obesity/data"

# Health sentences
file_name = "/211124_tokenized_sentences_expanded_health_liwc.csv"
health_liwc_labeled = pd.read_csv(f"{file_path}{file_name}")

# Non-health sentences
file_name = "/211124_tokenized_sentences_expanded_nonhealth_liwc.csv"
nonhealth_liwc_labeled = pd.read_csv(f"{file_path}{file_name}")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Text Processing

## General Preprocessing

In [5]:
def preprocess_text_columns(frame):
    """
    Add the cleaned text columns to a dataframe, modifying it in place.

    frame (DataFrame): A dataframe whose raw text is stored in column "A"

    Columns written:
        "processed_text_bert": the text as a string with HTML escape characters resolved
            (kept separate for BERT-specific embeddings so no additional information is removed)
        "processed_text": the same text after url replacement, preprocessor cleaning,
            lowercasing, digit removal, and replacement of every non-letter with a space

    Returns None so that calling it last in a cell does not display the dataframe.
    """
    print("Now processing text!")
    frame['processed_text'] = frame.A.apply(str).apply(html.unescape) #Change to string, remove HTML escape characters
    frame['processed_text_bert'] = frame['processed_text'] #Snapshot before any further cleaning
    frame['processed_text'] = (
        frame['processed_text']
        .apply(lambda x: url_replacement(x, "host")) #Swap each url for "<host> url"
        .apply(p.clean) #Preprocessor removes hashtags and cleans text
        .apply(str.lower) #Convert to lowercase
        .apply(lambda x: ''.join([i for i in x if not i.isdigit()])) #Remove numbers
        # The class must end in "A-Z": the previous "A-z" range also covered [ \ ] ^ _ and `,
        # so those symbols were not being removed.
        .apply(lambda x: re.sub('[^a-zA-Z]', " ", x)) #Remove non-letters
    )


preprocess_text_columns(health_liwc_labeled)
preprocess_text_columns(nonhealth_liwc_labeled)

Now processing text!
URL parse error with https://[n.neurology.org/content/63/12/2240.short](http://n.neurology.org/content/63/12/2240.short)
URL parse error with http://www.quranrevolution.com](http://www.quranrevolution.com/?fbclid=IwAR1fuqGkhOetgeMuYF9jCT63pFjrPMxPgtuni1I9D7EHR3fn52YbZxIeJw0)
URL parse error with https://www.scimedregister.com](https://www.scimedregister.com/)**
URL parse error with https://811.novascotia.ca](https://811.novascotia.ca/)
URL parse error with https://henote.com](https://henote.com/)/
Now processing text!
URL parse error with http://www.keenanmckenzie.com]
URL parse error with http://abesun.com](http://abesun.com/)
URL parse error with https://pelicanthree.bandcamp.com](https://l.facebook.com/l.php?u=https%3A%2F%2Fpelicanthree.bandcamp.com%2F%3Ffbclid%3DIwAR239gGXKf1JlQKeI4EuVuuuqXl655HmQMKE2Kn8vVDw56CDWGH1Oo6e2dQ&h=AT0rVc4qLafqU2Ob_4o3x5Hnpx4bWNz_Kl0K3Pi4pkLpktY7nPEC5XaoTUhtVjYcPj_60KbkkCQRCyBe7K72raTeFooBRN2kY2izQHhf67ATsAtaWepgidz2if2p_bMcUq3KXFKtyw

## Tokenization

In [6]:
print("Now tokenizing!")
# Split each cleaned sentence into a list of word tokens, health frame first
for labeled_frame in (health_liwc_labeled, nonhealth_liwc_labeled):
    labeled_frame['tokens'] = labeled_frame.processed_text.apply(word_tokenize)

Now tokenizing!


## Stop Word Removal

In [7]:
stop_words = set(stopwords.words('english'))


def _drop_stop_words(tokens):
    """Return the tokens that are not in the English stop word set, in their original order."""
    return [token for token in tokens if token not in stop_words]


health_liwc_labeled['tokens'] = health_liwc_labeled.tokens.apply(_drop_stop_words)
nonhealth_liwc_labeled['tokens'] = nonhealth_liwc_labeled.tokens.apply(_drop_stop_words)

## Lemmatizing

In [8]:
lemma = WordNetLemmatizer()


def _lemmatize_as_verbs(tokens):
    """Lemmatize every token in the list, treating each one as a verb (pos = 'v')."""
    return [lemma.lemmatize(word = token, pos = 'v') for token in tokens]


health_liwc_labeled['tokens'] = health_liwc_labeled.tokens.apply(_lemmatize_as_verbs)
nonhealth_liwc_labeled['tokens'] = nonhealth_liwc_labeled.tokens.apply(_lemmatize_as_verbs)

## Converting Back to Final String

In [9]:
# Glue each token list back into one space-separated string for the vectorizers
health_liwc_labeled['final_text'] = health_liwc_labeled.tokens.apply(' '.join)
nonhealth_liwc_labeled['final_text'] = nonhealth_liwc_labeled.tokens.apply(' '.join)

# Health Feature Matrix

In [11]:
print("Now running TFIDF!")
# Both vectorizers use bigrams only and the same document-frequency cutoffs,
# so the feature names taken from the count vectorizer are used to label the TF-IDF columns.
count_vectorizer = CountVectorizer(ngram_range = (2,2), min_df = 0.01, max_df = 0.75)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2,2), min_df = 0.01, max_df = 0.75)
print("Count Vectorizer")
count_matrix = count_vectorizer.fit_transform(health_liwc_labeled['final_text'])
print("Get Feature Names")
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only when the newer method is unavailable.
if hasattr(count_vectorizer, "get_feature_names_out"):
    features = count_vectorizer.get_feature_names_out().tolist()
else:
    features = count_vectorizer.get_feature_names()
print("TFIDF Vectorizer")
tfidf_matrix = tfidf_vectorizer.fit_transform(health_liwc_labeled['final_text'])
scores = np.asarray(tfidf_matrix) # NOTE(review): not used anywhere in this cell
sums = tfidf_matrix.sum(axis = 0) # Total TF-IDF weight of each term across all rows
print("Compiling Rankings")
data = [(term, sums[0,col]) for col, term in enumerate(features)]
ranking = pd.DataFrame(data, columns = ['term', 'rank'])
words = ranking.sort_values('rank', ascending = False)
print(words.head())

# Creating list of terms that can be used as column names
terms = list(features)
print(terms)

# toarray() gives a plain ndarray (todense() returns the legacy np.matrix type)
tf_idf_dataframe = pd.DataFrame(tfidf_matrix.toarray(), columns = terms)

print("Now running sentiment analysis!")
sid = SentimentIntensityAnalyzer()
# Score every text first and build the frame once: growing a DataFrame one row at a time
# is quadratic and relied on DataFrame.append, which no longer exists in pandas 2.0.
sentiments = pd.DataFrame(
    [sid.polarity_scores(text) for text in health_liwc_labeled['processed_text_bert']],
    columns = ['neg', 'neu', 'pos', 'compound'])

print("Now running feature matrix!")
# reset_index() without drop=True keeps each frame's previous index as an "index" column,
# so the written file contains one such column per frame.
feature_matrix_no_bert = pd.concat([tf_idf_dataframe.reset_index(), 
                            sentiments.reset_index(), 
                            health_liwc_labeled.reset_index()],
                          axis = 1)
file_name = "/211214_feature_matrix_no_bert_health_comparator.csv"
feature_matrix_no_bert.to_csv(str(file_path + file_name))

Now running TFIDF!
Count Vectorizer
Get Feature Names
TFIDF Vectorizer
Compiling Rankings
               term          rank
4           com url  13153.893147
10    give headache  12207.176996
24      sore throat   6943.649977
21  severe headache   6591.665318
9      get headache   6077.591715
['back pain', 'bite ly', 'blood pressure', 'body ache', 'com url', 'cough shortness', 'difficulty breathe', 'fever cough', 'fever headache', 'get headache', 'give headache', 'headache sore', 'include fever', 'loss taste', 'ly url', 'migraine headache', 'migraine headaches', 'muscle pain', 'nausea vomit', 'pain headache', 'runny nose', 'severe headache', 'shortness breath', 'side effect', 'sore throat', 'symptoms include', 'taste smell']
Now running sentiment analysis!
Now running feature matrix!


# NonHealth Comparator

In [12]:
print("Now running TFIDF!")
# Both vectorizers use bigrams only and the same document-frequency cutoffs,
# so the feature names taken from the count vectorizer are used to label the TF-IDF columns.
count_vectorizer = CountVectorizer(ngram_range = (2,2), min_df = 0.01, max_df = 0.75)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2,2), min_df = 0.01, max_df = 0.75)
print("Count Vectorizer")
count_matrix = count_vectorizer.fit_transform(nonhealth_liwc_labeled['final_text'])
print("Get Feature Names")
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only when the newer method is unavailable.
if hasattr(count_vectorizer, "get_feature_names_out"):
    features = count_vectorizer.get_feature_names_out().tolist()
else:
    features = count_vectorizer.get_feature_names()
print("TFIDF Vectorizer")
tfidf_matrix = tfidf_vectorizer.fit_transform(nonhealth_liwc_labeled['final_text'])
scores = np.asarray(tfidf_matrix) # NOTE(review): not used anywhere in this cell
sums = tfidf_matrix.sum(axis = 0) # Total TF-IDF weight of each term across all rows
print("Compiling Rankings")
data = [(term, sums[0,col]) for col, term in enumerate(features)]
ranking = pd.DataFrame(data, columns = ['term', 'rank'])
words = ranking.sort_values('rank', ascending = False)
print(words.head())

# Creating list of terms that can be used as column names
terms = list(features)
print(terms)

# toarray() gives a plain ndarray (todense() returns the legacy np.matrix type)
tf_idf_dataframe = pd.DataFrame(tfidf_matrix.toarray(), columns = terms)

print("Now running sentiment analysis!")
sid = SentimentIntensityAnalyzer()
# Score every text first and build the frame once: growing a DataFrame one row at a time
# is quadratic and relied on DataFrame.append, which no longer exists in pandas 2.0.
sentiments = pd.DataFrame(
    [sid.polarity_scores(text) for text in nonhealth_liwc_labeled['processed_text_bert']],
    columns = ['neg', 'neu', 'pos', 'compound'])

print("Now running feature matrix!")
# reset_index() without drop=True keeps each frame's previous index as an "index" column,
# so the written file contains one such column per frame.
feature_matrix_no_bert = pd.concat([tf_idf_dataframe.reset_index(), 
                            sentiments.reset_index(), 
                            nonhealth_liwc_labeled.reset_index()],
                          axis = 1)
file_name = "/211214_feature_matrix_no_bert_nonhealth_comparator.csv"
feature_matrix_no_bert.to_csv(str(file_path + file_name))

Now running TFIDF!
Count Vectorizer
Get Feature Names
TFIDF Vectorizer
Compiling Rankings
                  term         rank
7        bass clarinet  7968.855305
61       play clarinet  5729.969439
25             com url  3977.094907
41      flute clarinet  3910.408249
22  clarinet saxophone  2807.481308
['ableton mix', 'alto flute', 'alto sax', 'alto saxophone', 'alto tenor', 'aquasonic voice', 'bandcamp com', 'bass clarinet', 'bass guitar', 'bass saxophones', 'bass tromboon', 'bob moor', 'brontosaurus tank', 'cello aquasonic', 'christmas flute', 'clarinet alto', 'clarinet bass', 'clarinet contrabass', 'clarinet contralto', 'clarinet flute', 'clarinet piano', 'clarinet player', 'clarinet saxophone', 'clarinets saxophones', 'clown headquarter', 'com url', 'contrabass clarinet', 'contralto clarinet', 'crotales cymbells', 'dahlman trumpet', 'double bass', 'double reeds', 'drum percussion', 'electric bass', 'english horn', 'eric dahlman', 'eric woods', 'evil clown', 'facebook com', 'flute