<a href="https://colab.research.google.com/github/danielruskin/inst808_fall_2025_final_project/blob/main/inst_808_final_project_preprocess_nyt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code for preprocessing OCR text from the NYT ads

In [None]:
# Optional: mount Google Drive so that files under /content/drive are reachable.
# This relies on the google.colab package, so it only applies when running on Colab.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import os
import re
import string
from datetime import datetime
import numpy as np

### Load data

In [None]:
# Folder on the mounted Drive that holds the project data; reused later when saving.
data_path = '/content/drive/My Drive/Courses/INST808'

# Read the tab-separated ad export into a DataFrame.
file_path = os.path.join(data_path, 'nyt_ads_modified.tsv')
df_nyt = pd.read_csv(file_path, delimiter='\t')

### Preprocess text

In [None]:
# For now, skip the rows that have the OCR tab issue.
# These are positions in the frame as loaded. This cell reassigns `df_nyt`, so it is
# NOT idempotent: re-running it without reloading the data drops further rows.
rows_with_tab_issue = [4, 10, 16, 18, 19, 28, 32, 33, 39, 60, 63, 65, 69]
labels_to_drop = df_nyt.index[rows_with_tab_issue]
df_nyt = df_nyt.drop(labels_to_drop).reset_index(drop=True)  # renumber rows 0..n-1

In [None]:
# Drop every column that contains at least one missing value.
# This should leave just two columns; later cells read `text` and `text_modified`,
# so both of those must survive this step.
df_nyt = df_nyt.dropna(axis='columns')

In [None]:
# Extract each ad's date from the `text` column: the date is the last
# ';'-separated field that precedes the '; ProQuest' marker.
citation_head = df_nyt['text'].str.split('; ProQuest', expand=True)[0]
date_strings = citation_head.str.split(';').str[-1].str.strip()  # trim so the format matches exactly

# Abbreviated month, day, four-digit year (a string such as "Jan 5, 2004").
DATE_FORMAT = "%b %d, %Y"
dates = [datetime.strptime(date_string, DATE_FORMAT) for date_string in date_strings]
df_nyt['date'] = dates

In [None]:
def _flatten_ocr_newlines(raw_text):
  r"""Turn the literal two-character `\n` sequences in `raw_text` into plain spacing.

  Paragraph breaks are ignored for now. To keep them later, swap the doubled
  sequence for a placeholder before these steps and restore it afterwards.
  """
  rejoined = raw_text.replace('-\\n', '')   # a dash right before a line break is one word split in two
  spaced = rejoined.replace('\\n', ' ')     # every other line break becomes a space
  return re.sub(r'\s+', ' ', spaced)        # squeeze runs of whitespace down to one space


texts = [_flatten_ocr_newlines(raw_text) for raw_text in df_nyt['text_modified']]
df_nyt['text_modified_2'] = texts

In [None]:
# NOTE: the printed output of this cell was cleared before pushing to GitHub because it is very long.
# Print every cleaned text under a header carrying its row index, for manual inspection.
for i, cleaned_text in df_nyt['text_modified_2'].items():
  print(f'TEXT {i}')
  print(cleaned_text)

### Tokenize and Lemmatize

In [None]:
!pip install spacy
import spacy
nlp = spacy.load("en_core_web_sm")



In [None]:
def tokenize_rule_based(text):
  """Return the lowercased spaCy lemmas of `text` as a list.

  The text is run through the module-level `nlp` pipeline. A token is left out
  when it is a stop word, punctuation, number-like, or only one character long.
  """
  lemmas = []
  for token in nlp(text):
    if token.is_stop or token.is_punct or token.like_num:
      continue
    if len(token) == 1:
      continue
    lemmas.append(token.lemma_.lower())  # keep only the lowercase form of the lemma
  return lemmas

In [None]:
# Tokenize and lemmatize every cleaned text with spaCy; each row gets its own list of lemmas.
df_nyt['spacy'] = df_nyt['text_modified_2'].map(tokenize_rule_based)

### Combine bi-grams that occur often

In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
from gensim.models.phrases import Phrases, Phraser

In [None]:
# Merge frequent two-word phrases (e.g., "climate change") into single tokens joined by "_".
MIN_BIGRAM_COUNT = 10   # a phrase must appear at least this many times
BIGRAM_THRESHOLD = 10   # gensim's default scoring threshold
EXAMPLE_ROW = 20        # row printed below to illustrate the effect

bigram_model = Phrases(df_nyt['spacy'], min_count=MIN_BIGRAM_COUNT, threshold=BIGRAM_THRESHOLD)
bigram_phraser = Phraser(bigram_model)
df_nyt['final_tokens'] = [bigram_phraser[token_list] for token_list in df_nyt['spacy']]

# Example: "climate change" becomes one token in the first printout; the second
# printout shows the same row before phrase merging.
print(df_nyt['final_tokens'].iloc[EXAMPLE_ROW])
print(df_nyt['spacy'].iloc[EXAMPLE_ROW])

['weather', 'climate', 'debate', 'climate_change', 'understandable', 'tendency', 'use', 'recent', 'weather', 'event', 'draw', 'conclusion', 'global_warming', 'weather', 'climate', 'climate', 'far', 'complex', 'know', 'weather', 'clear', 'climate', 'region', 'climate', 'define', 'prevail', 'behavior', 'weather', 'include', 'variability', 'decade', 'weather', 'ordinarily', 'consider', 'establish', 'average', 'condition', 'variability', 'climate', 'recent', 'record', 'cold', 'weather', 'northeast', 'u.s.', 'indicate', 'cool', 'clitate', 'year', 'record', 'summer', 'heat', 'europe', 'confirm', 'warm', 'world', 'geological', 'evidence', 'indicate', 'earth', 'climate', 'hds', 'vary', 'continuously', 'warm', 'cool', 'change', 'earth', 'factor', 'diverse', 'variation', 'sunlight', 'earth', 'magnetic', 'field', 'asteroid', 'impact', 'sun', 'moon', 'earth', 'orbital', 'interaction', 'cosmic', 'ray', 'flux', 'continental', 'drift', 'fluctuation', 'ea', 'level', 'volcanic', 'eruption', 'change', '

### Save preprocessed text to file

In [None]:
# Write the preprocessed frame next to the input file as a tab-separated file, without the index.
# Note: the list-valued columns (`spacy`, `final_tokens`) are written as their string
# representation and must be parsed back into lists when this file is read again.
file_path = os.path.join(data_path, 'nyt_ads_modified_preprocessed.tsv')
df_nyt.to_csv(path_or_buf=file_path, sep='\t', index=False)