# Read in Modules

In [1]:
import itertools
import nltk
import pandas as pd
#nltk.download('punkt')

from nltk.tokenize import sent_tokenize

# Bring in Data

In [2]:
# NOTE(review): absolute, machine-specific directory; must be changed to run this notebook elsewhere
data_file_path = "/Users/catherinepollack/Documents/dartmouth/research/aim3_facebook_covid19_obesity/data/"

#Each line of these files is later tokenized as a separate text; readlines() keeps the trailing newline
with open(data_file_path + '211118full_list_of_nonhealth_texts.txt') as sentences:
    nonhealth_lines = sentences.readlines()

with open(data_file_path + '211118full_list_of_health_texts.txt') as sentences:
    health_lines = sentences.readlines()

# Functions

## Selectively Sample Sentences

In [3]:
def token_sample(text, terms = "obese_only", case_sensitive = True):
    """
    Function that splits a text into sentences and returns only those sentences that contain at least one search term

    Matching is by plain substring, so a short stem such as "fat" also matches longer words like "father".

    Params:
        text (str): The text of interest
        terms (str): The search terms of interest. Takes:
            "obese_only" (only looks for "obese" or "obesity")
            "expanded" (looks for the curated list of obesity-related words and stems)
            "health_comparator" (looks for "headache" or "migraine")
            any other value (looks for "clarinet", the non-health comparator)
        case_sensitive (bool): If True (default), terms only match in lowercase, exactly as listed. If False, each sentence is lowercased before matching so capitalised forms (e.g. "Obesity") also match

    Exceptions:
        None propagated for non-string input: a TypeError raised by the tokenizer (e.g. if text is nan) is caught and an empty list is returned

    Returns:
        local_sent_obese (list): The sentences that contain at least one search term, in their original order
    """

    try:
        sent_all = sent_tokenize(text) #Tokenize text as sentences (only once per text)
    except TypeError:
        #print("Encountered a nan! Returning nothing")
        return []

    #Pick the list of substrings to search for
    if terms == "obese_only": #Only pull obese terms
        search_terms = ["obese", "obesity"]
    elif terms == "expanded": #If "expanded" list of search terms
        search_terms = ["obese", "obesity", "body weight", "bariatrics", "chubb", "fat", "paunch", "plump", "rotund", "stout", "chunk", "portl", "adipos", "corpulent", "porcine"]
    elif terms == "health_comparator": #Only pull health comparator terms
        search_terms = ["headache", "migraine"]
    else: #Any other value: only pull non-health terms for clarinet
        search_terms = ["clarinet"]

    local_sent_obese = [] #Initialize list
    for sent in sent_all: #For each sentence
        haystack = sent if case_sensitive else sent.lower() #All search terms are lowercase
        if any(term in haystack for term in search_terms):
            local_sent_obese.append(sent) #Keep the sentence with its original casing

    return local_sent_obese

# Tokenize

In [4]:
#One list of matching sentences per input line (headache/migraine for the health texts)
health_tokenized_sentences = list(
    map(lambda line: token_sample(line, "health_comparator"), health_lines)
)
#"non-health" is not a named mode, so token_sample falls through to its "clarinet" branch
nonhealth_tokenized_sentences = list(
    map(lambda line: token_sample(line, "non-health"), nonhealth_lines)
)


In [10]:
#Flatten the per-text lists of sentences into one flat list per corpus, keeping order
health_tokenized_sentences_combined = [
    sentence
    for per_text in health_tokenized_sentences
    for sentence in per_text
]
nonhealth_tokenized_sentences_combined = [
    sentence
    for per_text in nonhealth_tokenized_sentences
    for sentence in per_text
]

# Saving Tokenized Sentences

In [11]:
#Write one sentence per line; "with" guarantees each file is closed even if a write fails
with open(data_file_path + "211124_tokenized_sentences_expanded_health.txt", "w") as textfile:
    for element in health_tokenized_sentences_combined:
        textfile.write(element + "\n")

with open(data_file_path + "211124_tokenized_sentences_expanded_nonhealth.txt", "w") as textfile:
    for element in nonhealth_tokenized_sentences_combined:
        textfile.write(element + "\n")