# Bring in Libraries and Packages

In [7]:
import itertools
import nltk
import pandas as pd

from nltk.tokenize import word_tokenize

# Functions

In [8]:
def token_sample(counter, text, terms = "obese_only"):
    """
    Function that takes in a text and returns only those tokens that contain the word "obese" or "obesity"
    
    Params:
        text (str): The text of interest
        terms (str): The search terms of interest. Takes "obese_only" (only looks for obese or obesity) or "expanded" (looks for words that are synonymous with obesity as curated previously)
        counter (int): Keeps track of which text the token comes from
    Exceptions:
        TypeError (if text is nan)
        
    Returns:
        sent_obese (list): A list of sentences that contain "obese" or "obesity"
    """
    
    local_sent_obese = [] #Initialize list
    
    try:
        word_tokenize(text)
        
    except TypeError:
        #print("Encountered a nan! Returning nothing")
        return local_sent_obese   
    else:
        sent_all = word_tokenize(text) #Tokenize text as sentences
    
    if terms == "obese_only": #Only pull obese terms
        for sent in sent_all: #For each sentences
        #print(sent)
            if "obese" in sent or "obesity" in sent: #If "obesity" or "obese" in sentence
                local_sent_obese.append(sent) #Add to obesity-specific string
                
    else: #If "expanded" list of search terms
        expanded_obese_terms = ["obese", "obesity", "body weight", "bariatrics", "chubb", "fat", "paunch", "plump", "rotund", "stout", "chunk", "portl", "adipos", "corpulent", "porcine"]
        for sent in sent_all:
            if any(x in sent for x in expanded_obese_terms):
                local_sent_obese.append(sent)
    
    return local_sent_obese, [counter]*len(local_sent_obese)

# Bring in Data

In [2]:
data_file_path = "/Users/catherinepollack/Documents/dartmouth/research/aim3_facebook_covid19_obesity/data/"
facebook_1 = pd.read_csv(str(data_file_path + "210526_facebook_obesity_1.csv"))
facebook_2 = pd.read_csv(str(data_file_path + "210526_facebook_obesity_2.csv"))
facebook = pd.concat([facebook_1, facebook_2], axis = 0)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
facebook = facebook.reset_index()

# Remove Posts on Pet-Specific Pages

In [4]:
#Identifying which could be pet related
set(facebook["Page Category"]) 
#ADOPTION_SERVICE, ANIMAL_RESCUE_SERVICE, ANIMAL_SHELTER, AQUARIUM, AQUATIC_PET_STORE,
#DOG_BREEDER, DOG_DAY_CARE_CENTER, DOG_PARK, DOG_TRAINING, DOG_WALKER, EQUESTRIAN_FACILITY
#HORSEBACK_RIDING_SERVICE, HORSE_TRAINER, KENNEL, PET, PETTING_ZOO, PET_ADOPTION_SERVICE
#PET_BREEDER, PET_CAFE, PET_GROOMER, PET_SERVICE, PET_SITTER, PET_STORE, PET_SUPPLIES
#REPTILE_PET_STORE, ZOO

possible_pet_tags = ["ADOPTION_SERVICE", "ANIMAL_RESCUE_SERVICE", "ANIMAL_SHELTER", "AQUARIUM", "AQUATIC_PET_STORE",
"DOG_BREEDER", "DOG_DAY_CARE_CENTER", "DOG_PARK", "DOG_TRAINING", "DOG_WALKER", "EQUESTRIAN_FACILITY",
"HORSEBACK_RIDING_SERVICE", "HORSE_TRAINER", "KENNEL", "PET", "PETTING_ZOO", "PET_ADOPTION_SERVICE",
"PET_BREEDER", "PET_CAFE", "PET_GROOMER", "PET_SERVICE", "PET_SITTER", "PET_STORE", "PET_SUPPLIES",
"REPTILE_PET_STORE", "ZOO"]

facebook_pet = facebook[(facebook["Page Category"].isin(possible_pet_tags))] #4,392 in pet category
facebook_no_pet = facebook[-(facebook["Page Category"].isin(possible_pet_tags))] #522,467 not in pet category

In [17]:
facebook_no_pet = facebook_no_pet.reset_index(drop = True)

# Tokenzing at Message Level

In [21]:
facebook_no_pet["message_tokenize"] = ""
for i in range(0, len(facebook_no_pet)):
    try: 
        word_tokenize(facebook_no_pet.loc[i, "Message"])

    except TypeError:
        print(f"Encountered a nan at {i}! Returning nothing")
        
    else:
        facebook_no_pet.message_tokenize[i] = word_tokenize(facebook_no_pet.Message[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_no_pet.message_tokenize[i] = word_tokenize(facebook_no_pet.Message[i])


Encountered a nan at 4321! Returning nothing
Encountered a nan at 6080! Returning nothing
Encountered a nan at 7782! Returning nothing
Encountered a nan at 15252! Returning nothing


KeyboardInterrupt: 

['A',
 'morbidly',
 'obese',
 'baby',
 'has',
 'baffled',
 'doctors',
 '-',
 'by',
 'weighing',
 'a',
 'whopping',
 '38lbs',
 '!']

In [None]:
tokenized_sentences = []
indices = []
for i in range(0, len(lines)):
    tokenized_sentences_loop, indices_loop = token_sample(i, lines[i], "expanded")
    tokenized_sentences.append(tokenized_sentences_loop)
    indices.append(indices_loop)

#tokenized_sentences, indices = [token_sample(x, "expanded") for x in lines]