# Tokenizing data collected from Wikipedia

Emilio Lehoucq - 5/21/24

## Importing libraries and downloading NLTK resources

In [1]:
import pandas as pd
import nltk
import re
# Download the NLTK resources used below: tokenizer models for nltk.word_tokenize,
# the stopword lists, and WordNet for the lemmatizer
nltk.download('punkt')
# Newer NLTK releases load the tokenizer models from 'punkt_tab' instead of 'punkt';
# requesting both keeps this notebook runnable from a fresh kernel on either version
nltk.download('punkt_tab')
nltk.download('stopwords')
# English stopwords, used by tokenizer() to filter tokens (must come after the stopwords download)
stopwords = nltk.corpus.stopwords.words('english')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/emiliolehoucq/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/emiliolehoucq/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/emiliolehoucq/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Defining functions for this script

In [2]:
def remove_urls(text, replacement_text=""):
  """
  Function to replace every URL found in a piece of text.
  A URL is anything starting with http://, https://, or www. and running up to the next whitespace.
  Input: text (any object; it is converted with str() before matching), replacement_text (string, default "")
  Output: the text with each URL substituted by replacement_text (string)
  Dependencies: re
  Taken from: https://www.geeksforgeeks.org/remove-urls-from-string-in-python/
  """
  # Coerce to a string first so non-string inputs can still be searched
  raw_text = str(text)
  # Pattern: "http://" or "https://" followed by non-whitespace, or "www." followed by non-whitespace
  return re.sub(r'https?://\S+|www\.\S+', replacement_text, raw_text)

def tokenizer(article):
  """
  Function to tokenize articles.
  URLs are removed, tokens are lowercased, and only alphanumeric tokens that are
  not in the module-level stopwords list are kept. Stems and lemmas are computed
  for each kept token, and bigrams/trigrams are built from the kept sequences.
  Input: article (string; None or a float NaN is treated as an empty article)
  Output: unigrams, stems, lemmas, bigrams, bigrams_stems, bigrams_lemmas, trigrams, trigrams_stems, trigrams_lemmas (tuple of lists)
  Dependencies: NLTK, remove_urls and stopwords (both defined earlier in this script)
  """
  # Treat missing text as an empty article: str() would otherwise turn a NaN/None
  # into the text "nan"/"None", which would then be tokenized as if it were a word
  if article is None or (isinstance(article, float) and article != article):
    article = ""
  # Remove URLs from article (remove_urls always returns a string)
  article = remove_urls(article)
  # Build the stemmer, lemmatizer, and stopword lookup once per article rather than once per token
  stemmer = nltk.PorterStemmer()
  lemmatizer = nltk.WordNetLemmatizer()
  stopword_set = set(stopwords)
  # Create variables to store unigrams, stems, and lemmas
  unigrams, stems, lemmas = [], [], []
  # Tokenize into unigrams, stems, and lemmas
  for token in nltk.word_tokenize(article):
    # Lowercase
    token = token.lower()
    # Keep only alphanumeric and remove stopwords
    if token.isalnum() and token not in stopword_set:
      unigrams.append(token)
      stems.append(stemmer.stem(token))
      lemmas.append(lemmatizer.lemmatize(token))
  # Create lists of bigrams
  bigrams = list(nltk.bigrams(unigrams))
  bigrams_stems = list(nltk.bigrams(stems))
  bigrams_lemmas = list(nltk.bigrams(lemmas))
  # Create lists of trigrams
  trigrams = list(nltk.trigrams(unigrams))
  trigrams_stems = list(nltk.trigrams(stems))
  trigrams_lemmas = list(nltk.trigrams(lemmas))
  # Return tuple with everything
  return unigrams, stems, lemmas, bigrams, bigrams_stems, bigrams_lemmas, trigrams, trigrams_stems, trigrams_lemmas

## Reading data

In [3]:
# Load the raw data collected from Wikipedia; the 'text' column is what gets tokenized below
df = pd.read_csv(filepath_or_buffer='data_raw_wikipedia_4_17_2024.csv')

## Tokenizing articles

In [4]:
# Define columns to store results
# Names of the columns that store the results, in the same order as the tuple returned by tokenizer()
token_columns = [
  'text_unigrams',
  'text_stems',
  'text_lemmas',
  'text_bigrams',
  'text_bigrams_stems',
  'text_bigrams_lemmas',
  'text_trigrams',
  'text_trigrams_stems',
  'text_trigrams_lemmas',
]

# Tokenize every article once, without writing to the data frame while iterating over it
# (the pandas documentation for iterrows advises against modifying what is being iterated)
tokenized_articles = [tokenizer(article) for article in df['text']]

# Store results: one object column per tokenizer output, each cell holding a list
for position, column in enumerate(token_columns):
  df[column] = pd.Series(
    [outputs[position] for outputs in tokenized_articles],
    index=df.index,  # align explicitly with the existing rows
    dtype=object,    # keep each list as a single cell value
  )

## Checking that the data frame looks as expected

In [5]:
# Preview the first rows of the columns whose names start with "text_" (the tokenization results)
df.filter(regex='^text_').head(n=5)

Unnamed: 0,text_unigrams,text_stems,text_lemmas,text_bigrams,text_bigrams_stems,text_bigrams_lemmas,text_trigrams,text_trigrams_stems,text_trigrams_lemmas
0,"[mindfulness, wikipedia, jump, content, main, ...","[mind, wikipedia, jump, content, main, menu, m...","[mindfulness, wikipedia, jump, content, main, ...","[(mindfulness, wikipedia), (wikipedia, jump), ...","[(mind, wikipedia), (wikipedia, jump), (jump, ...","[(mindfulness, wikipedia), (wikipedia, jump), ...","[(mindfulness, wikipedia, jump), (wikipedia, j...","[(mind, wikipedia, jump), (wikipedia, jump, co...","[(mindfulness, wikipedia, jump), (wikipedia, j..."
1,"[mudita, wikipedia, jump, content, main, menu,...","[mudita, wikipedia, jump, content, main, menu,...","[mudita, wikipedia, jump, content, main, menu,...","[(mudita, wikipedia), (wikipedia, jump), (jump...","[(mudita, wikipedia), (wikipedia, jump), (jump...","[(mudita, wikipedia), (wikipedia, jump), (jump...","[(mudita, wikipedia, jump), (wikipedia, jump, ...","[(mudita, wikipedia, jump), (wikipedia, jump, ...","[(mudita, wikipedia, jump), (wikipedia, jump, ..."
2,"[buddhism, pakistan, wikipedia, jump, content,...","[buddhism, pakistan, wikipedia, jump, content,...","[buddhism, pakistan, wikipedia, jump, content,...","[(buddhism, pakistan), (pakistan, wikipedia), ...","[(buddhism, pakistan), (pakistan, wikipedia), ...","[(buddhism, pakistan), (pakistan, wikipedia), ...","[(buddhism, pakistan, wikipedia), (pakistan, w...","[(buddhism, pakistan, wikipedia), (pakistan, w...","[(buddhism, pakistan, wikipedia), (pakistan, w..."
3,"[art, wikipedia, jump, content, main, menu, ma...","[art, wikipedia, jump, content, main, menu, ma...","[art, wikipedia, jump, content, main, menu, ma...","[(art, wikipedia), (wikipedia, jump), (jump, c...","[(art, wikipedia), (wikipedia, jump), (jump, c...","[(art, wikipedia), (wikipedia, jump), (jump, c...","[(art, wikipedia, jump), (wikipedia, jump, con...","[(art, wikipedia, jump), (wikipedia, jump, con...","[(art, wikipedia, jump), (wikipedia, jump, con..."
4,"[roman, empire, wikipedia, jump, content, main...","[roman, empir, wikipedia, jump, content, main,...","[roman, empire, wikipedia, jump, content, main...","[(roman, empire), (empire, wikipedia), (wikipe...","[(roman, empir), (empir, wikipedia), (wikipedi...","[(roman, empire), (empire, wikipedia), (wikipe...","[(roman, empire, wikipedia), (empire, wikipedi...","[(roman, empir, wikipedia), (empir, wikipedia,...","[(roman, empire, wikipedia), (empire, wikipedi..."


## Exporting data to CSV

In [6]:
# Write the tokenized data frame to disk
# NOTE(review): with the default arguments the row index is written as an extra unnamed column
# and list-valued cells are saved as text - confirm that downstream readers expect this
df.to_csv(path_or_buf='data_tokenized_wikipedia_5_21_24.csv')