# Bring in Libraries and Packages

In [1]:
import html
import nltk
import numpy as np
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('vader_lexicon')
import pandas as pd
import preprocessor as p 
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from urllib.parse import urlparse


# Functions

## URL Replacement

In [2]:
def url_replacement(text, token_type = "url"):
    """
    Function that will take in a block of text and replace the url with a token of some type
    text (str): A block of text that contains a url
    token_type (str): A specfication on what token should replace the url. Default is "url", which just returns "url". Other options include: 
        "domain", which returns the domain (e.g., ".gov url")
        "host", which returns the host of the website and domain (e.g., "cdc.gov url")
    """
    urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    # Get a list of all urls
    
    if urls == []: # If list is blank
        return text #Return previous text since nothing needs to change
    
    new_text = text[:] #Deep copy of the text to a new object
    
    if token_type == "url": #For the base case
        for url in urls:
            new_text = new_text.replace(url, "url") #Replace each url with the "url" token

    elif token_type == "domain": #For the case of just extracting the domain
        for url in urls:
            try: 
                urlparse(url).netloc.split(".")[-1] #Extract just the domain 
            except ValueError:
                print(str("URL parse error with " + url))
                domain = ""
            else:
                domain = urlparse(url).netloc.split(".")[-1] #Extract just the domain
                
            domain_url = domain + " url"
            new_text = new_text.replace(url, domain_url)
            
    else: #Extracting full host name
        for url in urls:
            try:
                urlparse(url).netloc #Extract the full host name
            except ValueError:
                print(str("URL parse error with " + url))
                domain = ""
            else:
                domain = urlparse(url).netloc #Extract the full host name
            domain_url = domain + " url"
            new_text = new_text.replace(url, domain_url)
        
    return new_text

## VADER Sentiment Analysis

In [3]:
def get_vader_score(sid, text, dataframe):
    """
    Function that will take in a text and return an estimated valence. 
    Note that this assumes that the column names match the VADER output
    (i.e., "neg", "neu", "pos", "compound")
    param sid (str): Name of the SentimentIntensityAnalyzer() defined outside the function 
    param text (str): A string of text to analyze
    param dataframe (DataFrame): The pandas dataframe to append results to
    """
    scores = sid.polarity_scores(text)
    return(dataframe.append(scores, ignore_index = True))

# Bring in Data

In [4]:
data_file_path = "/Users/catherinepollack/Documents/dartmouth/research/aim3_facebook_covid19_obesity/data/"
#facebook_1 = pd.read_csv(str(data_file_path + "210627_headaches_1.csv"))
#facebook_2 = pd.read_csv(str(data_file_path + "210627_headaches_2.csv"))
#facebook_2 = pd.read_csv(str(data_file_path + "210627_headaches_3.csv"))
#facebook = pd.concat([facebook_1, facebook_2], axis = 0)
#facebook = facebook.reset_index()
#facebook.to_csv(data_file_path + "220119_combined_facebook_data_health_control.csv")

facebook_liwc = pd.read_csv(data_file_path + "220204_combined_facebook_data_health_control_liwc.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/catherinepollack/Documents/dartmouth/research/aim3_facebook_covid19_obesity/data/220204_combined_facebook_data_health_control_liwc.csv.csv'

# Rename Columns with Metadata

In [6]:
colname_mapping = dict(zip(list(facebook_liwc.columns[1:42]), list(facebook_liwc.loc[0, "B":"AQ"])))
facebook_liwc = facebook_liwc.rename(columns = colname_mapping)
facebook_liwc = facebook_liwc.iloc[1:,1:]

# Remove Posts on Pet-Specific Pages

In [8]:
#Identifying which could be pet related
set(facebook_liwc["Page Category"]) 
#ADOPTION_SERVICE, ANIMAL_RESCUE_SERVICE, ANIMAL_SHELTER, AQUARIUM, AQUATIC_PET_STORE,
#DOG_BREEDER, DOG_DAY_CARE_CENTER, DOG_PARK, DOG_TRAINING, DOG_WALKER, EQUESTRIAN_FACILITY
#HORSEBACK_RIDING_SERVICE, HORSE_TRAINER, KENNEL, PET, PETTING_ZOO, PET_ADOPTION_SERVICE
#PET_BREEDER, PET_CAFE, PET_GROOMER, PET_SERVICE, PET_SITTER, PET_STORE, PET_SUPPLIES
#REPTILE_PET_STORE, ZOO

possible_pet_tags = ["ADOPTION_SERVICE", "ANIMAL_RESCUE_SERVICE", "ANIMAL_SHELTER", "AQUARIUM", "AQUATIC_PET_STORE",
"DOG_BREEDER", "DOG_DAY_CARE_CENTER", "DOG_PARK", "DOG_TRAINING", "DOG_WALKER", "EQUESTRIAN_FACILITY",
"HORSEBACK_RIDING_SERVICE", "HORSE_TRAINER", "KENNEL", "PET", "PETTING_ZOO", "PET_ADOPTION_SERVICE",
"PET_BREEDER", "PET_CAFE", "PET_GROOMER", "PET_SERVICE", "PET_SITTER", "PET_STORE", "PET_SUPPLIES",
"REPTILE_PET_STORE", "ZOO"]

facebook_pet = facebook_liwc[(facebook_liwc["Page Category"].isin(possible_pet_tags))] #4,392 in pet category
facebook_liwc_no_pet = facebook_liwc[-(facebook_liwc["Page Category"].isin(possible_pet_tags))] #522,467 not in pet category

# Text Pre-Processing

In [9]:
print("Now processing text!")
facebook_liwc_no_pet['processed_text'] = facebook_liwc_no_pet.Message.apply(str) #Change to string
facebook_liwc_no_pet['processed_text'] = facebook_liwc_no_pet.processed_text.apply(html.unescape) #Remove HTML escape characters
facebook_liwc_no_pet['processed_text_bert'] = facebook_liwc_no_pet['processed_text'] #Create new column for BERT-specific embeddings (don't want to remove additional information)
facebook_liwc_no_pet['processed_text'] = facebook_liwc_no_pet.processed_text.apply(lambda x: url_replacement(x, "host"))
facebook_liwc_no_pet['processed_text'] = facebook_liwc_no_pet.processed_text.apply(p.clean) #Preprocessor removes hashtags and cleans text
facebook_liwc_no_pet['processed_text'] = facebook_liwc_no_pet.processed_text.apply(str.lower) #Convert to lowercase
facebook_liwc_no_pet['processed_text'] = facebook_liwc_no_pet.processed_text.apply(lambda x: ''.join([i for i in x if not i.isdigit()])) #Remove numbers
facebook_liwc_no_pet['processed_text'] = facebook_liwc_no_pet.processed_text.apply(lambda x: re.sub('[^a-zA-z]', " ", x)) #Remove non-letters

Now processing text!
URL parse error with https://www.ktmrestaurant.co](https://www.ktmrestaurant.com/)m


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_liwc_no_pet['processed_text'] = facebook_liwc_no_pet.Message.apply(str) #Change to string
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_liwc_no_pet['processed_text'] = facebook_liwc_no_pet.processed_text.apply(html.unescape) #Remove HTML escape characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-

URL parse error with https://www.iskm.international](https://www.iskm.international/?fbclid=IwAR13dmb9p4erlkVIughijk-5zsB_6rOO2ykBDkcwYKoLriLbDnzeimQnwL8)
URL parse error with https://www.srikrishnamandir.org](https://www.srikrishnamandir.org/?fbclid=IwAR15h4-7TcXXUuAg0ZKOY3UkVTD5iIlxXD-5SBBdLjuuUMb3qEWV2Vy_axk)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_liwc_no_pet['processed_text'] = facebook_liwc_no_pet.processed_text.apply(lambda x: url_replacement(x, "host"))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_liwc_no_pet['processed_text'] = facebook_liwc_no_pet.processed_text.apply(p.clean) #Preprocessor removes hashtags and cleans text
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/use

# Additional Processing

In [10]:
print("Now tokenizing")
facebook_liwc_no_pet['tokens'] = facebook_liwc_no_pet.processed_text.apply(word_tokenize)

print("Now removing stop words")
stop_words = set(stopwords.words('english'))
facebook_liwc_no_pet['tokens'] = facebook_liwc_no_pet.tokens.apply(lambda x: [item for item in x if item not in stop_words])

print("Now lemmatizing")
lemma = WordNetLemmatizer()
facebook_liwc_no_pet['tokens'] = facebook_liwc_no_pet.tokens.apply(lambda x: [lemma.lemmatize(word = w, pos = 'v') for w in x])

print("Now converting back to final string")

facebook_liwc_no_pet['final_text'] = facebook_liwc_no_pet.tokens.apply(lambda x: ' '.join(x))

Now tokenizing


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_liwc_no_pet['tokens'] = facebook_liwc_no_pet.processed_text.apply(word_tokenize)


Now removing stop words


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_liwc_no_pet['tokens'] = facebook_liwc_no_pet.tokens.apply(lambda x: [item for item in x if item not in stop_words])


Now lemmatizing
Now converting back to final string


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_liwc_no_pet['tokens'] = facebook_liwc_no_pet.tokens.apply(lambda x: [lemma.lemmatize(word = w, pos = 'v') for w in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_liwc_no_pet['final_text'] = facebook_liwc_no_pet.tokens.apply(lambda x: ' '.join(x))


# Feature Matrix Construction

## TF-IDF

In [11]:
print("Now running TFIDF!")
count_vectorizer = CountVectorizer(ngram_range = (2,2), min_df = 0.01, max_df = 0.75)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2,2), min_df = 0.01, max_df = 0.75)
print("Count Vectorizer")
count_matrix = count_vectorizer.fit_transform(facebook_liwc_no_pet['final_text'])
print("Get Feature Names")
features = count_vectorizer.get_feature_names()
print("TFIDF Vectorizer")
tfidf_matrix = tfidf_vectorizer.fit_transform(facebook_liwc_no_pet['final_text'])
scores = np.asarray(tfidf_matrix)
sums = tfidf_matrix.sum(axis = 0)
data = []
print("Compiling Rankings")
for col, term in enumerate(features):
    data.append((term, sums[0,col]))
ranking = pd.DataFrame(data, columns = ['term', 'rank'])
words = ranking.sort_values('rank', ascending = False)
print(words.head())

tf_idf_dataframe = pd.DataFrame(tfidf_matrix.todense())

# Creating list of terms that can be used as column names
terms = []
for col, term in enumerate(features):
    terms.append(term)
print(terms)

tf_idf_dataframe.columns = [x for x in terms]

Now running TFIDF!
Count Vectorizer
Get Feature Names
TFIDF Vectorizer




Compiling Rankings
             term         rank
3         com url  4594.378119
21         ly url  2866.673888
12  give headache  2214.499337
1         bite ly  2200.703441
22      make sure  1397.267744
['back pain', 'bite ly', 'blood pressure', 'com url', 'come back', 'even though', 'every day', 'feel like', 'fever headache', 'first time', 'get headache', 'get rid', 'give headache', 'go away', 'go back', 'go home', 'headache go', 'high blood', 'joint pain', 'last night', 'look like', 'ly url', 'make sure', 'migraine headache', 'migraine headaches', 'nausea vomit', 'one day', 'severe headache', 'side effect', 'sore throat', 'take care', 'year old', 'years ago']


## Sentiment Analysis

In [12]:
print("Now running sentiment analysis!")
sid = SentimentIntensityAnalyzer()
sentiments = pd.DataFrame(columns = ['neg', 'neu', 'pos', 'compound'])
for text in facebook_liwc_no_pet['processed_text_bert']:
    sentiments = get_vader_score(sid, text, sentiments)

Now running sentiment analysis!


## Compiling Feature Matrix

In [13]:
print("Now running feature matrix!")
feature_matrix_no_bert = pd.concat([tf_idf_dataframe.reset_index(), 
                            sentiments.reset_index(), 
                            facebook_liwc_no_pet.reset_index()],
                          axis = 1)
file_name = "/220127_feature_matrix_no_bert_health_comparator.csv"
feature_matrix_no_bert.to_csv(str(data_file_path + file_name))

Now running feature matrix!
