# Read in Modules

In [12]:
import pandas as pd
import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/catherinepollack/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Bring in Data

In [3]:
# Directory that holds the input data (machine-specific absolute path).
data_file_path = "/Users/catherinepollack/Documents/dartmouth/research/aim3_facebook_covid19_obesity/data/"

# Read the input file into a list with one element per line of the file.
# readlines() keeps each line's trailing newline character.
with open(data_file_path + 'full_list_of.txt') as sentences:
    lines = sentences.readlines()

# Functions

## Selectively Sample Sentences

In [21]:
def token_sample(text, terms = "obese_only", ignore_case = False):
    """
    Split a text into sentences and return only the sentences that contain
    at least one obesity-related search term.

    Matching is a plain substring test, so a term matches anywhere inside a
    sentence, including inside longer words (e.g. "fat" also matches "father").
    By default the comparison is case-sensitive, so a sentence that contains
    only "Obesity" does not match "obesity".

    Params:
        text (str): The text of interest
        terms (str): Which search terms to use. "obese_only" looks only for
            "obese" or "obesity". Any other value (conventionally "expanded")
            uses the expanded list of obesity-related terms defined below.
        ignore_case (bool): If True, each sentence is lower-cased before it is
            compared with the (lower-case) search terms. Defaults to False.

    Exceptions:
        None raised for a TypeError from the tokenizer: if sent_tokenize
        raises TypeError for the input (e.g. the text is nan rather than a
        string), the error is caught and an empty list is returned.

    Returns:
        list: The sentences of text, in their original order and unmodified,
            that contain at least one search term
    """

    try:
        sent_all = sent_tokenize(text) #Tokenize text as sentences (only once)
    except TypeError:
        return [] #Text could not be tokenized; nothing to sample

    if terms == "obese_only": #Only pull obese terms
        search_terms = ["obese", "obesity"]
    else: #Any other value selects the "expanded" list of search terms
        search_terms = ["obese", "obesity", "body weight", "bariatrics", "chubb", "fat", "paunch", "plump", "rotund", "stout", "chunk", "portl", "adipos", "corpulent", "porcine"]

    def _has_term(sent):
        #Compare against a lower-cased copy so the original sentence is what gets returned
        haystack = sent.lower() if ignore_case else sent
        return any(term in haystack for term in search_terms)

    return [sent for sent in sent_all if _has_term(sent)]

In [22]:
# One list of matching sentences per input line, using the expanded search terms
tokenized_sentences = [token_sample(line, terms="expanded") for line in lines]