<a href="https://colab.research.google.com/github/danielruskin/inst808_fall_2025_final_project/blob/main/inst_808_final_project_preprocess_facebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Code for preprocessing text from the Facebook ads

In [1]:
# Optional code block for mounting your Google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import os
import re
import string
from datetime import datetime
import numpy as np

### Load data

In [3]:
data_path = '/content/drive/My Drive/Courses/INST808'
df_facebook = pd.read_csv(os.path.join(data_path, 'ads_data', 'ads.tsv'), sep='\t')
df1 = pd.read_csv(os.path.join(data_path, 'ads_data', 'ads1.tsv'), sep='\t')
df_facebook = pd.concat([df_facebook, df1])
df2 = pd.read_csv(os.path.join(data_path, 'ads_data', 'ads2.tsv'), sep='\t')
df_facebook = pd.concat([df_facebook, df2])
df3 = pd.read_csv(os.path.join(data_path, 'ads_data', 'ads3.tsv'), sep='\t')
df_facebook = pd.concat([df_facebook, df3])
df4 = pd.read_csv(os.path.join(data_path, 'ads_data', 'ads4.tsv'), sep='\t')
df_facebook = pd.concat([df_facebook, df4])
df5 = pd.read_csv(os.path.join(data_path, 'ads_data', 'ads5.tsv'), sep='\t')
df_facebook = pd.concat([df_facebook, df5])
df6 = pd.read_csv(os.path.join(data_path, 'ads_data', 'ads6.tsv'), sep='\t')
df_facebook = pd.concat([df_facebook, df6])

In [None]:
# Define list of US states
us_states = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"]

In [None]:
df_facebook['ad'].iloc[0]

"['CCS projects could help unlock…\\n\\n✓ Job Creation\\n✓ Billions in Investments\\n✓ Lower Emissions\\n✓ American Energy Security']"

In [None]:
df_facebook['ad_text'] = df_facebook['ad'].apply(lambda x: x[2:-2]) # get rid of weird list formatting from API request

In [None]:
df_facebook['ad_text'].iloc[0]

'CCS projects could help unlock…\\n\\n✓ Job Creation\\n✓ Billions in Investments\\n✓ Lower Emissions\\n✓ American Energy Security'

In [None]:
texts = df_facebook['ad_text']

# replace every state name with [STATE], so we can identify duplicates that only differ by state name
for state in us_states:
  texts = texts.str.replace(state, '[STATE]', case=False) # case-insensitive, to handle uppercase/lowercase differences

df_facebook['ad_text_modified'] = texts

### Preprocess text by removing newlines and dropping duplicates

In [None]:
texts = df_facebook['ad_text_modified']

# Handle newlines. Ignore paragraphs for now

#texts = [text.replace('\\n\\n', ' PARAGRAPH') for text in texts] #  indicate paragraphs
texts = [text.replace('-\\n','') for text in texts] # dashes followed by newline characters are just one word each
texts = [text.replace('\\n',' ') for text in texts] # all other newline characters should be replcaed by spaces
#texts = [text.replace('PARAGRAPH', '\\n\\n') for text in texts] # replace paragraphs with newlines
texts = [re.sub(r'\s+', ' ', text) for text in texts] # replace all multiple whitespace with just one whitespace
df_facebook['ad_text_modified_2'] = texts

In [None]:
len(df_facebook)

13986

In [None]:
df_facebook['ad_text_modified_2'].head()

Unnamed: 0,ad_text_modified_2
0,CCS projects could help unlock… ✓ Job Creation...
1,States are already bringing in new jobs and at...
2,Carbon capture and storage could help create t...
3,States are already bringing in new jobs and at...
4,CCS projects could help unlock… ✓ Job Creation...


In [None]:
# Drop duplicate texts. Keep last row, as this is the earliest ad start date
df_facebook = df_facebook.drop_duplicates(keep = 'last', subset='ad_text_modified_2').reset_index(drop=True)

In [None]:
len(df_facebook)

509

In [None]:
df_facebook['ad_text_modified_2'].head()

Unnamed: 0,ad_text_modified_2
0,A low carbon fuel standard could help reduce t...
1,"With a low carbon fuel standard, [STATE] can r..."
2,Click below to hear EPA Administrator Lee Zeld...
3,Check out Khan Academy's free STEM education r...
4,Parents! You won’t want to miss this. Help you...


### Tokenize and Lemmatize

In [None]:
!pip install spacy
import spacy
nlp = spacy.load("en_core_web_sm")



In [None]:
def tokenize_rule_based(text):
  # Return a list of lemmas as processed by SpaCy
  doc = nlp(text)
  # grab result of lemmatization for each token, as long as token is not stop token or punctuation or 1 char or a number
  # lowercase too
  tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and not (len(token) == 1) and not token.like_num]
  return tokens

In [None]:
df_facebook['spacy'] = df_facebook['ad_text_modified_2'].apply(tokenize_rule_based) # tokenize and lemmatize using spacy

### Combine bi-grams that occur often

In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
from gensim.models.phrases import Phrases, Phraser

In [None]:
# combine 2-word phrases (e.g., "climate change"). They must appear at least 10 times. Use default threshold from gensim
bigram_model = Phrases(df_facebook['spacy'], min_count=10, threshold=10)
bigram_phraser = Phraser(bigram_model)
df_facebook['final_tokens'] = [bigram_phraser[tokens] for tokens in df_facebook['spacy']]
# See example of climate change being put into one phrsae below:
print(df_facebook['final_tokens'].iloc[20])
print(df_facebook['spacy'].iloc[20])

['state', 'economic', 'competitiveness', 'line', 'action']
['state', 'economic', 'competitiveness', 'line', 'action']


### Save preprocessed text to file

In [None]:
file_path = os.path.join(data_path, 'facebook_ads_modified_preprocessed.tsv')
df_facebook.to_csv(file_path, sep="\t", index=False)