# Bring in Libraries and Packages

In [2]:
import html
import nltk
import numpy as np
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('vader_lexicon')
import pandas as pd
import preprocessor as p 
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from urllib.parse import urlparse

# Functions

## URL Replacement

In [3]:
def url_replacement(text, token_type = "url"):
    """
    Replace every http(s) url found in a block of text with a placeholder token.

    text (str): A block of text that may contain one or more urls
    token_type (str): A specification of what token should replace each url. Default is "url". Options:
        "url", which replaces the url with the literal token "url"
        "domain", which keeps the last dot-separated piece of the host (e.g., "gov url")
        anything else (e.g., "host"), which keeps the full host name (e.g., "www.cdc.gov url")

    Returns (str): the text with each url replaced. Text without urls is returned unchanged.
    If a url cannot be parsed (urlparse raises ValueError), a message is printed
    and the url is replaced with " url" (i.e., an empty host followed by the token).
    """
    # Raw string so that "\(" and "\)" reach the regex engine without relying on
    # Python passing unknown string escapes through unchanged.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    def make_token(match):
        """Build the replacement token for a single matched url."""
        url = match.group(0)
        if token_type == "url": #For the base case
            return "url"
        try:
            netloc = urlparse(url).netloc #Extract the full host name
        except ValueError:
            print(str("URL parse error with " + url))
            netloc = ""
        if token_type == "domain": #For the case of just extracting the domain
            netloc = netloc.split(".")[-1]
        return netloc + " url"

    # Substituting match-by-match (instead of calling str.replace once per url)
    # keeps a short url from clobbering the start of a longer url that shares
    # its prefix, e.g. "http://a.com" and "http://a.com/b".
    return re.sub(url_pattern, make_token, text)

## VADER Sentiment Analysis

In [4]:
def get_vader_score(sid, text, dataframe):
    """
    Score one text with VADER and return the dataframe with the scores added as a new row.
    Note that this assumes that the column names match the VADER output
    (i.e., "neg", "neu", "pos", "compound")
    param sid: The analyzer object defined outside the function; only its
        polarity_scores(text) method is used and it must return a dict of scores
    param text (str): A string of text to analyze
    param dataframe (DataFrame): The pandas dataframe to append results to
    return (DataFrame): A new dataframe (the input is not modified) containing the
        previous rows plus one row of scores, re-indexed from 0
    """
    scores = sid.polarity_scores(text)
    # DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
    # is the supported equivalent.
    new_row = pd.DataFrame([scores])
    return pd.concat([dataframe, new_row], ignore_index = True)

# Bring in Data

In [5]:
# Directory holding the input files; every read and write in this notebook is
# built by string concatenation onto this prefix, so it must end with "/".
# NOTE(review): this is an absolute path on one machine -- anyone else running
# the notebook has to edit it first.
data_file_path = "/Users/catherinepollack/Documents/dartmouth/research/aim3_facebook_covid19_obesity/data/"
# One-off step (kept for provenance, not executed): combine the two Facebook
# exports into a single CSV.
#facebook_1 = pd.read_csv(str(data_file_path + "210526_facebook_obesity_1.csv"))
#facebook_2 = pd.read_csv(str(data_file_path + "210526_facebook_obesity_2.csv"))
#facebook = pd.concat([facebook_1, facebook_2], axis = 0)
#facebook = facebook.reset_index()
#facebook.to_csv(data_file_path + "220119_combined_facebook_data.csv")

# Load the Instagram CSV that the rest of the notebook works on.
# NOTE(review): judging by the file name this is LIWC output for the non-health
# control accounts -- confirm against the data dictionary.
instagram_liwc = pd.read_csv(data_file_path + "220204_non_health_control_instagram_liwc.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Rename Columns with Metadata

In [6]:
# Map the first 22 column labels to the values stored in row 0 under the
# columns labelled "A" through "V" (label slices are inclusive), then use that
# mapping to rename the columns.
# NOTE(review): assumes the first 22 columns are exactly "A".."V" in order, so
# the zip pairs each label with its own row-0 value -- confirm.
colname_mapping = dict(zip(list(instagram_liwc.columns[0:22]), list(instagram_liwc.loc[0, "A":"V"])))
instagram_liwc = instagram_liwc.rename(columns = colname_mapping)
# Drop the first row, whose values were just used as column names.
# NOTE: this cell overwrites instagram_liwc, so it is not safe to run twice
# without reloading the data (a second run would drop another row).
instagram_liwc = instagram_liwc.iloc[1:,:]

In [7]:
# Preview the first rows to check the renamed columns.
instagram_liwc.head()

Unnamed: 0,Account,User Name,Followers at Posting,Post Created,Post Created Date,Post Created Time,Type,Total Interactions,Likes,Comments,...,Comma,Colon,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP
1,9GAG: Go Fun The World,9gag,50345255,2019-04-04 23:01:12 EDT,2019-04-04,23:01:12,Video,622 442,572695,49747,...,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,50.0
2,Instagram,instagram,273642637,2019-01-04 14:39:30 EST,2019-01-04,14:39:30,Photo,607 980,600071,7909,...,6.75,0.61,0.0,0.0,0.0,1.84,4.91,3.68,1.23,1.23
3,jack antonoff,jackantonoff,371822,2020-07-24 10:34:12 EDT,2020-07-24,10:34:12,Photo,126 257,123140,3117,...,1.79,0.6,0.0,0.0,0.0,0.3,0.0,1.19,0.0,2.39
4,Overheard New York,overheardnewyork,1472824,2021-03-15 12:13:22 EDT,2021-03-15,12:13:22,Photo,104 944,103912,1032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.33
5,Rini (Harini Raghavan),rinimusic,18878,2020-03-26 21:57:39 EDT,2020-03-26,21:57:39,Video,94 918,94683,235,...,1.23,0.0,0.0,0.0,4.94,1.23,1.23,1.23,0.0,13.58


# Text Pre-Processing

In [8]:
# Build two text columns from Description:
#   processed_text_bert - only stringified and HTML-unescaped
#   processed_text      - additionally url-tokenised, cleaned, lowercased,
#                         stripped of digits and of every non-letter character
print("Now processing text!")
instagram_liwc['processed_text'] = instagram_liwc.Description.apply(str) #Change to string
instagram_liwc['processed_text'] = instagram_liwc.processed_text.apply(html.unescape) #Remove HTML escape characters
instagram_liwc['processed_text_bert'] = instagram_liwc['processed_text'] #Create new column for BERT-specific embeddings (don't want to remove additional information)
instagram_liwc['processed_text'] = instagram_liwc.processed_text.apply(lambda x: url_replacement(x, "host"))
instagram_liwc['processed_text'] = instagram_liwc.processed_text.apply(p.clean) #Preprocessor removes hashtags and cleans text
instagram_liwc['processed_text'] = instagram_liwc.processed_text.apply(str.lower) #Convert to lowercase
instagram_liwc['processed_text'] = instagram_liwc.processed_text.apply(lambda x: ''.join([i for i in x if not i.isdigit()])) #Remove numbers
# The character class must be A-Z: the range A-z also spans "[", "\", "]", "^",
# "_" and "`", which would otherwise survive this step.
instagram_liwc['processed_text'] = instagram_liwc.processed_text.apply(lambda x: re.sub('[^a-zA-Z]', " ", x)) #Remove non-letters

Now processing text!
URL parse error with https://taipeimaf.com]


# Additional Processing

In [9]:
# Turn the cleaned text into tokens, filter and lemmatize them, and join the
# result back into one string per row (stored in final_text).
print("Now tokenizing")
instagram_liwc['tokens'] = instagram_liwc['processed_text'].apply(word_tokenize)

print("Now removing stop words")
stop_words = set(stopwords.words('english'))
instagram_liwc['tokens'] = instagram_liwc['tokens'].apply(
    lambda token_list: [token for token in token_list if token not in stop_words]
)

print("Now lemmatizing")
lemma = WordNetLemmatizer()
# Every token is lemmatized as a verb (pos = 'v').
instagram_liwc['tokens'] = instagram_liwc['tokens'].apply(
    lambda token_list: [lemma.lemmatize(word = token, pos = 'v') for token in token_list]
)

print("Now converting back to final string")

instagram_liwc['final_text'] = instagram_liwc['tokens'].apply(' '.join)

Now tokenizing
Now removing stop words
Now lemmatizing
Now converting back to final string


# Feature Matrix Construction

## TF-IDF

In [10]:
# Build bigram count and TF-IDF matrices from final_text, rank the bigrams by
# their summed TF-IDF weight, and expose the TF-IDF matrix as a dataframe
# whose columns are the bigrams.
print("Now running TFIDF!")
# Only bigrams, kept when they appear in at least 1% and at most 75% of rows.
count_vectorizer = CountVectorizer(ngram_range = (2,2), min_df = 0.01, max_df = 0.75)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2,2), min_df = 0.01, max_df = 0.75)


def _feature_names(vectorizer):
    """Return a fitted vectorizer's vocabulary as a list of plain strings.

    get_feature_names() was removed in scikit-learn 1.2 in favour of
    get_feature_names_out(); fall back to the old name on older installs.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out().tolist()
    return list(vectorizer.get_feature_names())


print("Count Vectorizer")
count_matrix = count_vectorizer.fit_transform(instagram_liwc['final_text'])
print("Get Feature Names")
features = _feature_names(count_vectorizer)
print("TFIDF Vectorizer")
tfidf_matrix = tfidf_vectorizer.fit_transform(instagram_liwc['final_text'])
# Column labels are taken from the vectorizer that produced the matrix, so the
# labels cannot drift if the two vectorizers' settings ever diverge.
terms = _feature_names(tfidf_vectorizer)
# Dense copy of the TF-IDF weights (np.asarray on a sparse matrix only wraps
# it in a 0-d object array, which is not usable as a score table).
scores = tfidf_matrix.toarray()
sums = tfidf_matrix.sum(axis = 0) # 1 x n_terms matrix of per-term totals
print("Compiling Rankings")
data = [(term, sums[0, col]) for col, term in enumerate(terms)]
ranking = pd.DataFrame(data, columns = ['term', 'rank'])
words = ranking.sort_values('rank', ascending = False)
print(words.head())

tf_idf_dataframe = pd.DataFrame(scores)

# List of terms used as column names
print(terms)

tf_idf_dataframe.columns = terms

Now running TFIDF!
Count Vectorizer
Get Feature Names
TFIDF Vectorizer




Compiling Rankings
                  term         rank
16            link bio  1589.639510
0        bass clarinet   686.058867
23       play clarinet   583.488713
8         follow daily   437.870319
26  principal clarinet   297.771456
['bass clarinet', 'bio listen', 'chamber music', 'chat soon', 'clarinet concerto', 'classical music', 'click link', 'com url', 'follow daily', 'full version', 'full video', 'happy birthday', 'happy practice', 'high school', 'hit follow', 'join us', 'link bio', 'listen full', 'look forward', 'much love', 'new york', 'niv week', 'part video', 'play clarinet', 'playlist bio', 'please hit', 'principal clarinet', 'version post', 'week playlist', 'youtube channel']


## Sentiment Analysis

In [11]:
# Score every row's lightly-processed text with VADER; one row of
# neg / neu / pos / compound scores per input row, indexed from 0.
print("Now running sentiment analysis!")
sid = SentimentIntensityAnalyzer()
# Collect all score dicts first and build the dataframe once: growing a
# dataframe one row at a time copies it on every iteration.
sentiments = pd.DataFrame(
    [sid.polarity_scores(text) for text in instagram_liwc['processed_text_bert']],
    columns = ['neg', 'neu', 'pos', 'compound'],
)

Now running sentiment analysis!


## Compiling Feature Matrix

In [12]:
# Place the TF-IDF columns, the sentiment scores and the original data side by
# side and write the result to CSV.
print("Now running feature matrix!")
# reset_index() gives all three frames a fresh 0..n-1 index so that concat
# lines the rows up by position.
# NOTE(review): because drop=True is not passed, each frame's old index is kept
# as an extra column, so the output has several columns named "index" --
# confirm downstream readers expect that.
# NOTE(review): assumes the three frames have the same number of rows in the
# same order; nothing here checks it.
feature_matrix_no_bert = pd.concat([tf_idf_dataframe.reset_index(), 
                            sentiments.reset_index(), 
                            instagram_liwc.reset_index()],
                          axis = 1)
# data_file_path already ends with "/", so the leading "/" here yields a
# double slash in the final path.
file_name = "/220204_feature_matrix_no_bert_instagram_non_health_comparator.csv"
feature_matrix_no_bert.to_csv(str(data_file_path + file_name))

Now running feature matrix!
