# Read in Modules

In [2]:
import itertools
import nltk
import pandas as pd
#nltk.download('punkt')

from nltk.tokenize import sent_tokenize

# Bring in Data

In [5]:
# Folder that holds the input files read below and the text file written later on.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
data_file_path = "/Users/catherinepollack/Documents/dartmouth/research/aim3_facebook_covid19_obesity/data/"

# Read the raw text file into a list with one entry per line (trailing newlines kept).
# The encoding is given explicitly so non-ASCII characters (emoji, curly quotes)
# decode the same way regardless of the platform's default encoding.
with open(data_file_path + 'full_list_of.txt', encoding="utf-8") as sentences:
    lines = sentences.readlines()

# Index table; its "Unnamed: 0" column is paired with the tokenized lines when exporting.
index_list = pd.read_csv(data_file_path + "220108_index_list.csv")

# Functions

## Selectively Sample Sentences

In [8]:
def token_sample(text, terms = "obese_only", ignore_case = False):
    """
    Split a text into sentences and keep only the sentences that mention an obesity-related term.

    Matching is a plain substring test, so a term such as "fat" also matches inside
    longer words (e.g. "father"). By default the test is case-sensitive, which means a
    sentence that only contains "Obesity" (capitalised) is NOT returned unless
    ignore_case is True.

    Params:
        text (str): The text of interest
        terms (str): The search terms of interest. "obese_only" looks only for "obese" or "obesity";
            any other value (conventionally "expanded") uses the curated list of obesity synonyms
        ignore_case (bool): If True, compare against the lower-cased sentence. Defaults to False,
            which preserves the original case-sensitive behaviour

    Exceptions:
        None raised for non-string input: a TypeError from the tokenizer (e.g. when text is nan)
        is caught and an empty list is returned instead

    Returns:
        list: The sentences (unchanged, in original order) that contain at least one search term
    """

    obese_only_terms = ("obese", "obesity")
    expanded_obese_terms = ("obese", "obesity", "body weight", "bariatrics", "chubb", "fat", "paunch", "plump", "rotund", "stout", "chunk", "portl", "adipos", "corpulent", "porcine")

    try:
        sent_all = sent_tokenize(text) #Tokenize text as sentences (once; the result is reused below)
    except TypeError:
        return [] #Non-string input such as nan: nothing to search

    #Anything other than "obese_only" falls back to the expanded list, as before
    search_terms = obese_only_terms if terms == "obese_only" else expanded_obese_terms

    matching_sentences = []
    for sent in sent_all:
        #All search terms are lower-case, so lower-casing the sentence is enough
        haystack = sent.lower() if ignore_case else sent
        if any(term in haystack for term in search_terms):
            matching_sentences.append(sent) #Keep the sentence exactly as tokenized

    return matching_sentences

# Tokenize

In [9]:
# Apply the expanded keyword filter to every input line. The result has one
# entry per line: the (possibly empty) list of matching sentences for that line.
tokenized_sentences = list(map(lambda raw_line: token_sample(raw_line, "expanded"), lines))

## Turning Multiple Lists Into One List

In [5]:
# Flatten the per-line lists into a single list of sentences, preserving order.
tokenized_sentences_combined = [
    sentence
    for sentences_for_line in tokenized_sentences
    for sentence in sentences_for_line
]

# Saving All Matching Sentences

In [9]:
# Write every matched sentence to disk, one sentence per line.
# The context manager closes the file even if a write fails, and the explicit
# encoding keeps non-ASCII characters from depending on the platform default.
with open(data_file_path + "211103_tokenized_sentences_expanded.txt", "w", encoding="utf-8") as textfile:
    for element in tokenized_sentences_combined:
        textfile.write(element + "\n")

In [27]:
# Spot-check: display the list of matched sentences for the third input line (index 2).
tokenized_sentences[2]

['Now, ask yourself why me jiggling my butt fat made you feel weird...🧐 ...After you’ve sorted through that, decide if you’re interested & committed to growing your knowledge around how fat works in your body, & then, if you feel so inclined, please read on: That three letter word: F-A-T... we allow it to rule our lives without really considering what it is, why it is, & what we can do about it.',
 'types of fat cells in the body: white, brown, & beige.',
 'While some white fat cells are necessary for good hormonal health, too much white fat is very harmful.',
 'On the other end of the spectrum, brown fat, when fully activated, generates 300 times more heat than any other tissue in the body; Just 2oz.',
 'of brown fat appear capable of burning several hundred calories per day—the equivalent of a 30-min.',
 'Similarly, beige cells can help burn fat rather than store it.',
 'It’s believed that certain hormones & enzymes released when you’re stressed, cold, or when you exercise can help c

In [47]:
# Pair each value of the index table's "Unnamed: 0" column with the matched
# sentences of the line at the same position, and export the pairs as CSV.
# The plain DataFrame constructor is used because the input is a sequence of
# (index value, sentence list) pairs, not a dict; the resulting frame and CSV are
# the same as before (two unnamed columns, 0 and 1).
# NOTE(review): zip stops at the shorter input, so a length mismatch between
# index_list and tokenized_sentences is silently truncated -- confirm they align.
# NOTE(review): the CSV is written to the current working directory, not to
# data_file_path like the other outputs -- confirm this is intended.
index_sentence_pairs = list(zip(list(index_list["Unnamed: 0"]), tokenized_sentences))
pd.DataFrame(index_sentence_pairs).to_csv("220109_meta_text_mapping.csv")