# Read in Libraries and Packages

In [1]:
import html
import nltk
import numpy as np
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('vader_lexicon')
import pandas as pd
import preprocessor as p 
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from urllib.parse import urlparse


# Functions

## URL Processing

In [2]:
def url_replacement(text, token_type = "url"):
    """
    Replace every URL found in a block of text with a placeholder token.

    text (str): A block of text that may contain one or more URLs.
    token_type (str): What should replace each URL. Default is "url". Options:
        "url", which replaces the URL with the literal token "url"
        "domain", which keeps the last dot-separated piece of the host (e.g., "gov url")
        "host", which keeps the full host name (e.g., "cdc.gov url")
        Any other value is treated the same as "host".

    Returns (str): The text with each URL replaced. When a URL cannot be parsed,
    a message is printed and that URL is replaced with " url" (empty host/domain).
    """
    # NOTE(review): inside the character class, `$-_` is a *range* that also
    # admits characters such as `]`, `(` and `)`. Markdown-style links like
    # "http://a.com](http://b.com)" are therefore captured as a single URL,
    # which urlparse may then reject. The pattern is left unchanged here so
    # that the set of matched spans stays the same.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    def _token_for(match):
        """Build the replacement token for a single matched URL."""
        url = match.group(0)
        if token_type == "url": # Base case: no parsing needed
            return "url"
        try:
            host = urlparse(url).netloc # Full host name
        except ValueError:
            print(str("URL parse error with " + url))
            host = ""
        else:
            if token_type == "domain":
                host = host.split(".")[-1] # Keep only the piece after the last dot
        return host + " url"

    # Substituting match-by-match (instead of calling str.replace once per URL)
    # means a URL that is a prefix of a later, longer URL can no longer corrupt it.
    return re.sub(url_pattern, _token_for, text)

## VADER Sentiment Analysis

In [3]:
def get_vader_score(sid, text, dataframe):
    """
    Score one text with VADER and return the dataframe with that score added as a new row.
    Note that this assumes that the column names match the VADER output
    (i.e., "neg", "neu", "pos", "compound")
    param sid: An analyzer object exposing polarity_scores(text), e.g. a
        SentimentIntensityAnalyzer() created outside the function
    param text (str): A string of text to analyze
    param dataframe (DataFrame): The pandas dataframe to append results to
    return (DataFrame): A new dataframe (the input is not modified) holding the
        previous rows plus one row of scores, with the index renumbered from 0
    """
    scores = sid.polarity_scores(text)
    new_row = pd.DataFrame([scores])

    if len(dataframe) == 0:
        # Nothing to stack onto yet: return the single row, keeping the
        # caller's column order first and adding any extra score keys after.
        columns = list(dataframe.columns) + [c for c in new_row.columns if c not in dataframe.columns]
        return new_row.reindex(columns = columns)

    # DataFrame.append was deprecated and then removed in pandas 2.0;
    # pd.concat is the supported way to add rows.
    return pd.concat([dataframe, new_row], ignore_index = True)

# Bring in Data

In [4]:
# Directory holding the input data and the CSV to read from it.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of the data.
file_path = "/Users/catherinepollack/Documents/dartmouth/research/aim3_facebook_covid19_obesity/data"
file_name = "/211103_tokenized_sentences_expanded_with_liwc.csv"
# file_name already starts with "/", so the two pieces are joined directly.
liwc_labeled = pd.read_csv(f"{file_path}{file_name}")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Text Processing

## General Preprocessing

In [5]:
print("Now processing text!")
# Start from column A coerced to str.
# NOTE(review): str() turns missing values into the literal text "nan" - confirm that is acceptable.
liwc_labeled['processed_text'] = liwc_labeled.A.apply(str) #Change to string
liwc_labeled['processed_text'] = liwc_labeled.processed_text.apply(html.unescape) #Decode HTML entities (e.g., "&amp;" -> "&")
# Snapshot taken before URL replacement, cleaning, and lowercasing; used later for VADER and BERT.
liwc_labeled['processed_text_bert'] = liwc_labeled['processed_text'] #Create new column for BERT-specific embeddings (don't want to remove additional information)
liwc_labeled['processed_text'] = liwc_labeled.processed_text.apply(lambda x: url_replacement(x, "host")) #Replace each URL with "<host> url"
liwc_labeled['processed_text'] = liwc_labeled.processed_text.apply(p.clean) #Preprocessor removes hashtags and cleans text
liwc_labeled['processed_text'] = liwc_labeled.processed_text.apply(str.lower) #Convert to lowercase
liwc_labeled['processed_text'] = liwc_labeled.processed_text.apply(lambda x: ''.join([i for i in x if not i.isdigit()])) #Remove numbers
# The class must be A-Z: the previous A-z range also covered "[", "\", "]", "^", "_" and "`",
# so those characters were left in the text instead of being replaced.
liwc_labeled['processed_text'] = liwc_labeled.processed_text.apply(lambda x: re.sub('[^a-zA-Z]', " ", x)) #Remove non-letters

Now processing text!
URL parse error with https://www.tipsforbehealthy.com](https://l.facebook.com/l.php?u=https%3A%2F%2Fwww.tipsforbehealthy.com%2Fhealthy-ife%2Fget-know-about-gastritis%2F%3Ffbclid%3DIwAR3QNxFY2_BH8FvvHcUBk98bFV7mVmtV2p1FpDBqqkAoLpvdaZeXjtuznHY&h=AT1ORIMPsDN2oFzmqzshTqbfXWpxuJe5gAjqr7UR8z9WQXf2PamX8Uhf_WGUzipFuxNX-B5VWffHkeM0C-wB8mh52qwvU_x25UDSQMupsGdZbta0H2BXTtM-RzChT2msUauKQgQbVD1WgU63os4rE_Bh8N9UkbNRgsP9Iw)
URL parse error with http://www.endobariatric.com](https://www.youtube.com/redirect?q=http%3A%2F%2Fwww.endobariatric.com&redir_token=QUFFLUhqbktmOGlXd05jYWIxWnRqUVJudkRJeE5hNDVFd3xBQ3Jtc0trdDRycTdTUzZqY2ZiNC10dXM3b1hSWC1FX2RnZnYySUFFR0g1T2Q2bEpiekFSNWwxdm9Vcll0NWl5V2xYXy1NZ3NCeFhwNGloOWdHZVBoU3RONjBFVXpGcWlrY0JKOVVWOXNBZDk3N0lENi1TNGd0dw%3D%3D&v=Kr7CkMIFiHU&event=video_description)
URL parse error with http://green-weight-loss-tea.blogspot.com]Click
URL parse error with https://cspinet.org](https://cspinet.org/)).
URL parse error with http://green-weight-loss-t

## Tokenization

In [6]:
print("Now tokenizing!")
# Split each cleaned sentence into a list of word tokens.
liwc_labeled['tokens'] = liwc_labeled['processed_text'].map(word_tokenize)

Now tokenizing!


## Stop Word Removal

In [7]:
# English stop words as a set so each membership check is a hash lookup.
stop_words = set(stopwords.words('english'))
# Keep only the tokens that are not stop words, row by row.
liwc_labeled['tokens'] = liwc_labeled['tokens'].map(
    lambda toks: [tok for tok in toks if tok not in stop_words]
)

## Lemmatizing

In [8]:
lemma = WordNetLemmatizer()
# Lemmatize every token as a verb (pos = 'v'), row by row.
liwc_labeled['tokens'] = liwc_labeled['tokens'].map(
    lambda toks: [lemma.lemmatize(tok, pos = 'v') for tok in toks]
)

## Converting Back to Final String

In [9]:
# Re-assemble each token list into one space-separated string for the vectorizers.
liwc_labeled['final_text'] = liwc_labeled['tokens'].apply(' '.join)

# TF-IDF Bigrams

In [25]:
print("Now running TFIDF!")
# Bigrams only; drop terms appearing in fewer than 1% or more than 75% of documents.
# Both vectorizers are configured identically, and the column labels taken from the
# count vectorizer below are applied to the TF-IDF matrix on that basis.
count_vectorizer = CountVectorizer(ngram_range = (2,2), min_df = 0.01, max_df = 0.75)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2,2), min_df = 0.01, max_df = 0.75)
print("Count Vectorizer")
count_matrix = count_vectorizer.fit_transform(liwc_labeled['final_text'])
print("Get Feature Names")
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(count_vectorizer, "get_feature_names_out"):
    features = list(count_vectorizer.get_feature_names_out())
else:
    features = count_vectorizer.get_feature_names()
print("TFIDF Vectorizer")
tfidf_matrix = tfidf_vectorizer.fit_transform(liwc_labeled['final_text'])
# NOTE(review): np.asarray on a sparse matrix does not produce a dense array of
# scores, and `scores` is not used again in the visible notebook - candidate for removal.
scores = np.asarray(tfidf_matrix)
sums = tfidf_matrix.sum(axis = 0) # Column totals: one summed TF-IDF weight per term
print("Compiling Rankings")
# Pair every term with its summed weight, then sort to see the heaviest bigrams.
data = [(term, sums[0, col]) for col, term in enumerate(features)]
ranking = pd.DataFrame(data, columns = ['term', 'rank'])
words = ranking.sort_values('rank', ascending = False)
print(words.head())

# Dense per-document TF-IDF table (rows = documents, columns = terms).
tf_idf_dataframe = pd.DataFrame(tfidf_matrix.todense())

# Creating list of terms that can be used as column names
terms = list(features)
print(terms)

tf_idf_dataframe.columns = list(terms)

Now running TFIDF!
Count Vectorizer
Get Feature Names
TFIDF Vectorizer
Compiling Rankings
                 term          rank
29      heart disease  18460.524645
6   childhood obesity  17806.254527
45   overweight obese  17576.063684
7             com url  17399.354790
65        weight loss  16643.897074
['api whatsapp', 'arm reduce', 'belly fat', 'blood pressure', 'body fat', 'body weight', 'childhood obesity', 'com url', 'detail whatsapp', 'diabetes heart', 'diabetes obesity', 'diet exercise', 'diet without', 'disease obesity', 'effect skip', 'effect without', 'exercise side', 'exercise without', 'fat post', 'fat thyroid', 'fatigue exhaustion', 'fight obesity', 'figure reduce', 'figure thighs', 'fit figure', 'forever detail', 'formula relieve', 'get slim', 'health problems', 'heart disease', 'high blood', 'high cholesterol', 'increase risk', 'intensive formula', 'lose weight', 'ly url', 'meals get', 'meals slim', 'morbidly obese', 'mummy tummy', 'new study', 'obesity diabetes', 'obes

# Sentiment Analysis

In [33]:
print("Now running sentiment analysis!")
sid = SentimentIntensityAnalyzer()
# Score every sentence first and build the frame once at the end. Growing a
# DataFrame one row per iteration re-copies all earlier rows each time (quadratic
# in the number of sentences) and relied on DataFrame.append, removed in pandas 2.0.
sentiments = pd.DataFrame(
    [sid.polarity_scores(text) for text in liwc_labeled['processed_text_bert']],
    columns = ['neg', 'neu', 'pos', 'compound'],
)

Now running sentiment analysis!


# BERT Embeddings

In [37]:
print("Now running BERT Embeddings!")
bert_model = SentenceTransformer('bert-base-cased')
# Encode the minimally processed sentences (the pre-cleaning snapshot column).
bert_embeddings = bert_model.encode(liwc_labeled["processed_text_bert"].tolist())

# One row per sentence; label the embedding dimensions "BERT 0", "BERT 1", ...
bert_dataframe = pd.DataFrame(bert_embeddings)
bert_dataframe.columns = [f"BERT {i}" for i in range(bert_dataframe.shape[1])]

Now running BERT Embeddings!


Downloading: 100%|██████████| 437/437 [00:00<00:00, 57.5kB/s]
Downloading: 100%|██████████| 8.98k/8.98k [00:00<00:00, 8.64MB/s]
Downloading: 100%|██████████| 570/570 [00:00<00:00, 897kB/s]
Downloading: 100%|██████████| 436M/436M [00:10<00:00, 41.6MB/s]
Downloading: 100%|██████████| 436k/436k [00:00<00:00, 882kB/s]
Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 28.7kB/s]
Downloading: 100%|██████████| 213k/213k [00:00<00:00, 777kB/s]
Some weights of the model checkpoint at /Users/catherinepollack/.cache/torch/sentence_transformers/bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task 

KeyboardInterrupt: 

In [20]:
# Number of sentences that would be sent to the BERT encoder.
len(liwc_labeled["processed_text_bert"])

629960

In [39]:
print("Now running feature matrix!")
# Place the TF-IDF columns, the VADER scores, and the original LIWC-labelled
# columns side by side, aligned by row position.
# NOTE(review): reset_index() without drop=True keeps each frame's old index as
# an extra column, so the result likely carries several index columns - confirm
# this is intended before changing it, since it affects the CSV layout.
feature_matrix_no_bert = pd.concat(
    [frame.reset_index() for frame in (tf_idf_dataframe, sentiments, liwc_labeled)],
    axis = 1,
)
# Written next to the input data; file_name is reused for the output file.
file_name = "/211105_feature_matrix_no_bert.csv"
feature_matrix_no_bert.to_csv(f"{file_path}{file_name}")

Now running feature matrix!
