# Tokenizing data collected from Reddit

Emilio Lehoucq - 5/15/24

## Importing libraries

In [1]:
import pandas as pd
import nltk
import re
# Tokenizer models; nltk.word_tokenize is used later in this notebook
# NOTE(review): some newer NLTK releases look for 'punkt_tab' instead of 'punkt' - confirm against the installed version
nltk.download('punkt')
# English stopword corpus; must be downloaded before the list below can be loaded
nltk.download('stopwords')
# Module-level list of English stopwords, used to filter tokens
stopwords = nltk.corpus.stopwords.words('english')
# WordNet data; nltk.WordNetLemmatizer is used later in this notebook
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Defining functions for this script

In [2]:
def remove_urls(text, replacement_text=""):
  """
  Strip URLs out of a piece of text.

  Anything that starts with http://, https://, or www. is treated as a URL
  and is matched up to (not including) the next whitespace character.

  Input: text (any object; it is converted with str() before matching),
         replacement_text (string substituted for each URL, empty by default)
  Output: the text with every URL replaced (string)
  Dependencies: re
  Taken from: https://www.geeksforgeeks.org/remove-urls-from-string-in-python/
  """
  # https?:// -> scheme with an optional "s"; www\. -> URLs written without a scheme;
  # \S+ -> the rest of the URL, i.e. one or more non-whitespace characters
  return re.sub(r'https?://\S+|www\.\S+', replacement_text, str(text))

def tokenizer(post):
  """
  Function to tokenize posts.

  URLs are removed, the text is split with nltk.word_tokenize, and every token
  is lowercased. Only tokens that are alphanumeric and not in the module-level
  `stopwords` list are kept. Each kept token is also stemmed (Porter stemmer)
  and lemmatized (WordNet lemmatizer); bigrams and trigrams are then built from
  each of the three resulting lists.

  Missing posts (None or NaN, which is how pandas represents empty cells) yield
  nine empty lists instead of being converted to the literal text "None"/"nan"
  and tokenized as if it were a real word.

  Input: post (string; None/NaN are treated as a post without text)
  Output: unigrams, stems, lemmas, bigrams, bigrams_stems, bigrams_lemmas, trigrams, trigrams_stems, trigrams_lemmas (tuple)
  Dependencies: NLTK, remove_urls, stopwords
  """
  # A missing post has no text: NaN is the only float that is not equal to itself
  if post is None or (isinstance(post, float) and post != post):
    return [], [], [], [], [], [], [], [], []
  # Remove URLs from post
  post = remove_urls(post)
  # Build the stemmer and lemmatizer once per post rather than once per token
  stemmer = nltk.PorterStemmer()
  lemmatizer = nltk.WordNetLemmatizer()
  # Create variables to store unigrams, stems, and lemmas
  unigrams, stems, lemmas = [], [], []
  # Tokenize into unigrams, stems, and lemmas
  for unigram in nltk.word_tokenize(str(post)):
    # Lowercase
    unigram = unigram.lower()
    # Keep only alphanumeric and remove stopwords
    if unigram.isalnum() and unigram not in stopwords:
      # Store unigrams
      unigrams.append(unigram)
      # Store stems
      stems.append(stemmer.stem(unigram))
      # Store lemmas
      lemmas.append(lemmatizer.lemmatize(unigram))
  # Create lists of bigrams
  bigrams = list(nltk.bigrams(unigrams))
  bigrams_stems = list(nltk.bigrams(stems))
  bigrams_lemmas = list(nltk.bigrams(lemmas))
  # Create lists of trigrams
  trigrams = list(nltk.trigrams(unigrams))
  trigrams_stems = list(nltk.trigrams(stems))
  trigrams_lemmas = list(nltk.trigrams(lemmas))
  # Return tuple with everything
  return unigrams, stems, lemmas, bigrams, bigrams_stems, bigrams_lemmas, trigrams, trigrams_stems, trigrams_lemmas

## Reading data

In [3]:
# Raw Reddit data, read directly from the CSV file hosted on GitHub
raw_data_url = 'https://raw.githubusercontent.com/emiliolehoucq/mindfulness/main/data_raw_reddit_4_17_2024.csv'
df = pd.read_csv(raw_data_url)

## Tokenizing posts

In [4]:
# Columns that receive the tokenizer output, listed in the same order as the
# tuple returned by tokenizer()
token_columns = [
  'selftext_unigrams',
  'selftext_stems',
  'selftext_lemmas',
  'selftext_bigrams',
  'selftext_bigrams_stems',
  'selftext_bigrams_lemmas',
  'selftext_trigrams',
  'selftext_trigrams_stems',
  'selftext_trigrams_lemmas',
]

# Tokenize every post exactly once; each element is the 9-tuple for one row
tokenized_posts = [tokenizer(post) for post in df['selftext']]

# Store results one whole column at a time instead of cell by cell.
# dtype=object keeps each cell as a Python list, and reusing df.index keeps the
# values aligned with their rows whatever the index looks like.
for position, column in enumerate(token_columns):
  df[column] = pd.Series(
    [tokens[position] for tokens in tokenized_posts],
    index=df.index,
    dtype=object,
  )

## Make sure the data frame looks as expected

In [5]:
# Select the columns whose names start with "selftext_" and display the first rows
tokenized_columns = df.filter(regex='^selftext_')
tokenized_columns.head()

Unnamed: 0,selftext_unigrams,selftext_stems,selftext_lemmas,selftext_bigrams,selftext_bigrams_stems,selftext_bigrams_lemmas,selftext_trigrams,selftext_trigrams_stems,selftext_trigrams_lemmas
0,"[hello, friends, ready, make, meditation, habi...","[hello, friend, readi, make, medit, habit, lif...","[hello, friend, ready, make, meditation, habit...","[(hello, friends), (friends, ready), (ready, m...","[(hello, friend), (friend, readi), (readi, mak...","[(hello, friend), (friend, ready), (ready, mak...","[(hello, friends, ready), (friends, ready, mak...","[(hello, friend, readi), (friend, readi, make)...","[(hello, friend, ready), (friend, ready, make)..."
1,"[would, say, radical, stuff, parents, ensured,...","[would, say, radic, stuff, parent, ensur, prep...","[would, say, radical, stuff, parent, ensured, ...","[(would, say), (say, radical), (radical, stuff...","[(would, say), (say, radic), (radic, stuff), (...","[(would, say), (say, radical), (radical, stuff...","[(would, say, radical), (say, radical, stuff),...","[(would, say, radic), (say, radic, stuff), (ra...","[(would, say, radical), (say, radical, stuff),..."
2,"[curious, hear, opinion, edit, guess, would, k...","[curiou, hear, opinion, edit, guess, would, kn...","[curious, hear, opinion, edit, guess, would, k...","[(curious, hear), (hear, opinion), (opinion, e...","[(curiou, hear), (hear, opinion), (opinion, ed...","[(curious, hear), (hear, opinion), (opinion, e...","[(curious, hear, opinion), (hear, opinion, edi...","[(curiou, hear, opinion), (hear, opinion, edit...","[(curious, hear, opinion), (hear, opinion, edi..."
3,"[monks, say, right, way, meditate, usually, we...","[monk, say, right, way, medit, usual, western,...","[monk, say, right, way, meditate, usually, wes...","[(monks, say), (say, right), (right, way), (wa...","[(monk, say), (say, right), (right, way), (way...","[(monk, say), (say, right), (right, way), (way...","[(monks, say, right), (say, right, way), (righ...","[(monk, say, right), (say, right, way), (right...","[(monk, say, right), (say, right, way), (right..."
4,"[happened, several, years, ago, remembered, me...","[happen, sever, year, ago, rememb, medit, good...","[happened, several, year, ago, remembered, med...","[(happened, several), (several, years), (years...","[(happen, sever), (sever, year), (year, ago), ...","[(happened, several), (several, year), (year, ...","[(happened, several, years), (several, years, ...","[(happen, sever, year), (sever, year, ago), (y...","[(happened, several, year), (several, year, ag..."


## Export data to CSV

In [6]:
# Write the full data frame to a CSV file in the current working directory.
# NOTE(review): to_csv is called with its defaults, so the index is written as an
# extra leading column - confirm that downstream readers expect it.
# NOTE(review): list-valued cells are presumably stored as their text
# representation and will need parsing when the file is read back - verify.
output_path = 'data_tokenized_reddit_5_15_24.csv'
df.to_csv(output_path)