# Bring in Libraries and Packages

In [1]:
import html
import nltk
import numpy as np
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('vader_lexicon')
import pandas as pd
import preprocessor as p 
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from urllib.parse import urlparse

# Functions

## URL Replacement

In [2]:
def url_replacement(text, token_type = "url"):
    """
    Function that will take in a block of text and replace the url with a token of some type
    text (str): A block of text that contains a url
    token_type (str): A specfication on what token should replace the url. Default is "url", which just returns "url". Other options include: 
        "domain", which returns the domain (e.g., ".gov url")
        "host", which returns the host of the website and domain (e.g., "cdc.gov url")
    """
    urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    # Get a list of all urls
    
    if urls == []: # If list is blank
        return text #Return previous text since nothing needs to change
    
    new_text = text[:] #Deep copy of the text to a new object
    
    if token_type == "url": #For the base case
        for url in urls:
            new_text = new_text.replace(url, "url") #Replace each url with the "url" token

    elif token_type == "domain": #For the case of just extracting the domain
        for url in urls:
            try: 
                urlparse(url).netloc.split(".")[-1] #Extract just the domain 
            except ValueError:
                print(str("URL parse error with " + url))
                domain = ""
            else:
                domain = urlparse(url).netloc.split(".")[-1] #Extract just the domain
                
            domain_url = domain + " url"
            new_text = new_text.replace(url, domain_url)
            
    else: #Extracting full host name
        for url in urls:
            try:
                urlparse(url).netloc #Extract the full host name
            except ValueError:
                print(str("URL parse error with " + url))
                domain = ""
            else:
                domain = urlparse(url).netloc #Extract the full host name
            domain_url = domain + " url"
            new_text = new_text.replace(url, domain_url)
        
    return new_text

## VADER Sentiment Analysis

In [3]:
def get_vader_score(sid, text, dataframe):
    """
    Function that will take in a text and return an estimated valence. 
    Note that this assumes that the column names match the VADER output
    (i.e., "neg", "neu", "pos", "compound")
    param sid (str): Name of the SentimentIntensityAnalyzer() defined outside the function 
    param text (str): A string of text to analyze
    param dataframe (DataFrame): The pandas dataframe to append results to
    """
    scores = sid.polarity_scores(text)
    return(dataframe.append(scores, ignore_index = True))

# Bring in Data

In [18]:
data_file_path = "/Users/catherinepollack/Documents/dartmouth/research/aim3_facebook_covid19_obesity/data/"
#facebook_1 = pd.read_csv(str(data_file_path + "210526_facebook_obesity_1.csv"))
#facebook_2 = pd.read_csv(str(data_file_path + "210526_facebook_obesity_2.csv"))
#facebook = pd.concat([facebook_1, facebook_2], axis = 0)
#facebook = facebook.reset_index()
#facebook.to_csv(data_file_path + "220119_combined_facebook_data.csv")

instagram_liwc = pd.read_csv(data_file_path + "220128_obesity_instagram_liwc.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Rename Columns with Metadata

In [20]:
colname_mapping = dict(zip(list(instagram_liwc.columns[0:22]), list(instagram_liwc.loc[0, "A":"V"])))
instagram_liwc = instagram_liwc.rename(columns = colname_mapping)
instagram_liwc = instagram_liwc.iloc[1:,:]

In [21]:
instagram_liwc.head()

Unnamed: 0,Account,User Name,Followers at Posting,Post Created,Post Created Date,Post Created Time,Type,Total Interactions,Likes,Comments,...,Comma,Colon,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP
1,Madelaine Petsch,madelame,22034634,2020-07-29 11:36:38 EDT,2020-07-29,11:36:38,Photo,1 624 548,1624521,27,...,6.85,0.0,0.0,0.4,0.0,0.81,0.0,1.61,0.81,5.65
2,Madelaine Petsch,madelame,22034634,2020-08-06 11:12:27 EDT,2020-08-06,11:12:27,Photo,1 478 909,1478828,81,...,6.3,0.0,0.0,0.42,0.42,0.42,0.0,1.26,0.84,5.88
3,BIG BOSS 🔑🔑,keke,10285242,2020-12-01 15:29:45 EST,2020-12-01,15:29:45,Album,1 351 318,1282835,68483,...,1.91,0.0,0.0,0.0,0.95,0.0,1.43,2.86,0.0,0.0
4,Michelle - Weightloss⬇️145LBS,michobabyy,433211,2020-11-03 10:18:16 EST,2020-11-03,10:18:16,Video,1 086 092,1058004,28088,...,5.88,0.0,0.0,0.0,0.27,1.07,0.0,1.6,0.0,3.74
5,Lizzo,lizzobeeating,9851742,2021-03-06 17:30:35 EST,2021-03-06,17:30:35,Video,643 410,620898,22512,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.88,0.0,0.0


# Text Pre-Processing

In [24]:
print("Now processing text!")
instagram_liwc['processed_text'] = instagram_liwc.Description.apply(str) #Change to string
instagram_liwc['processed_text'] = instagram_liwc.processed_text.apply(html.unescape) #Remove HTML escape characters
instagram_liwc['processed_text_bert'] = instagram_liwc['processed_text'] #Create new column for BERT-specific embeddings (don't want to remove additional information)
instagram_liwc['processed_text'] = instagram_liwc.processed_text.apply(lambda x: url_replacement(x, "host"))
instagram_liwc['processed_text'] = instagram_liwc.processed_text.apply(p.clean) #Preprocessor removes hashtags and cleans text
instagram_liwc['processed_text'] = instagram_liwc.processed_text.apply(str.lower) #Convert to lowercase
instagram_liwc['processed_text'] = instagram_liwc.processed_text.apply(lambda x: ''.join([i for i in x if not i.isdigit()])) #Remove numbers
instagram_liwc['processed_text'] = instagram_liwc.processed_text.apply(lambda x: re.sub('[^a-zA-z]', " ", x)) #Remove non-letters

Now processing text!


# Additional Processing

In [52]:
print("Now tokenizing")
instagram_liwc['tokens'] = instagram_liwc.processed_text.apply(word_tokenize)

print("Now removing stop words")
stop_words = set(stopwords.words('english'))
instagram_liwc['tokens'] = instagram_liwc.tokens.apply(lambda x: [item for item in x if item not in stop_words])

print("Now lemmatizing")
lemma = WordNetLemmatizer()
instagram_liwc['tokens'] = instagram_liwc.tokens.apply(lambda x: [lemma.lemmatize(word = w, pos = 'v') for w in x])

print("Now converting back to final string")

instagram_liwc['final_text'] = instagram_liwc.tokens.apply(lambda x: ' '.join(x))

Now tokenizing


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_liwc_no_pet['tokens'] = facebook_liwc_no_pet.processed_text.apply(word_tokenize)


Now removing stop words


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_liwc_no_pet['tokens'] = facebook_liwc_no_pet.tokens.apply(lambda x: [item for item in x if item not in stop_words])


Now lemmatizing


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_liwc_no_pet['tokens'] = facebook_liwc_no_pet.tokens.apply(lambda x: [lemma.lemmatize(word = w, pos = 'v') for w in x])


Now converting back to final string


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_liwc_no_pet['final_text'] = facebook_liwc_no_pet.tokens.apply(lambda x: ' '.join(x))


# Feature Matrix Construction

## TF-IDF

In [53]:
print("Now running TFIDF!")
count_vectorizer = CountVectorizer(ngram_range = (2,2), min_df = 0.01, max_df = 0.75)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2,2), min_df = 0.01, max_df = 0.75)
print("Count Vectorizer")
count_matrix = count_vectorizer.fit_transform(facebook_liwc_no_pet['final_text'])
print("Get Feature Names")
features = count_vectorizer.get_feature_names()
print("TFIDF Vectorizer")
tfidf_matrix = tfidf_vectorizer.fit_transform(facebook_liwc_no_pet['final_text'])
scores = np.asarray(tfidf_matrix)
sums = tfidf_matrix.sum(axis = 0)
data = []
print("Compiling Rankings")
for col, term in enumerate(features):
    data.append((term, sums[0,col]))
ranking = pd.DataFrame(data, columns = ['term', 'rank'])
words = ranking.sort_values('rank', ascending = False)
print(words.head())

tf_idf_dataframe = pd.DataFrame(tfidf_matrix.todense())

# Creating list of terms that can be used as column names
terms = []
for col, term in enumerate(features):
    terms.append(term)
print(terms)

tf_idf_dataframe.columns = [x for x in terms]

Now running TFIDF!
Count Vectorizer
Get Feature Names
TFIDF Vectorizer




Compiling Rankings
              term          rank
44         com url  37808.973986
336    weight loss  26146.941193
193         ly url  15374.250588
186    lose weight  14369.514303
138  heart disease  13899.112503
['abnormal hormone', 'abnormal level', 'abuse poor', 'acquire secondary', 'alcohol consumption', 'alcohol drug', 'also help', 'anxiety alcohol', 'anxiety issue', 'api whatsapp', 'arm reduce', 'atherosclerosis gloom', 'attack land', 'back fruit', 'bad dream', 'baldness ulcers', 'become slim', 'begin wth', 'belly fat', 'best result', 'best without', 'bite ly', 'blood pressure', 'blood sugar', 'body fat', 'body image', 'body mass', 'body weight', 'brain chemicals', 'business boom', 'call neurotransmitters', 'call whatsapp', 'cancer baldness', 'capsule natural', 'cardiovascular disease', 'cause cause', 'cause early', 'challenge reduce', 'chemicals call', 'chemicals low', 'childhood obesity', 'chronic diseases', 'click link', 'climax occur', 'com url', 'command tone', 'confiden

## Sentiment Analysis

In [54]:
print("Now running sentiment analysis!")
sid = SentimentIntensityAnalyzer()
sentiments = pd.DataFrame(columns = ['neg', 'neu', 'pos', 'compound'])
for text in facebook_liwc_no_pet['processed_text_bert']:
    sentiments = get_vader_score(sid, text, sentiments)

Now running sentiment analysis!


## Compiling Feature Matrix

In [57]:
print("Now running feature matrix!")
feature_matrix_no_bert = pd.concat([tf_idf_dataframe.reset_index(), 
                            sentiments.reset_index(), 
                            facebook_liwc_no_pet.reset_index()],
                          axis = 1)
file_name = "/220120_feature_matrix_no_bert.csv"
feature_matrix_no_bert.to_csv(str(data_file_path + file_name))

Now running feature matrix!
