# **BERT Experimentation Phase 1**
Note: all of this has to be transferred into a Jupyter notebook later, but Colab is easier for testing and experimentation

In [None]:
# Mount Google Drive so files under /content/drive are reachable from Colab.
# The google.colab package only exists inside Colab; when this notebook is run
# elsewhere (e.g. JupyterLab) the mount is skipped instead of stopping a
# Restart & Run All with an ImportError.

try:
    from google.colab import drive
except ImportError:
    print("google.colab not available - skipping Google Drive mount")
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl.metadata (86 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m81.9/86.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.7/86.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting boto3 (from pytorch-pretrained-bert)
  Downloading boto3-1.39.7-py3-none-any.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=0.4.1->pytorch-pretrained-bert)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=0.4.1->pytorch-pretrained-bert)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupt

# Data Preprocessing

In [None]:
# Data Imports

import pandas as pd
import numpy as np
import time
import datetime
import random

In [None]:
# NLTK Setup

import re

import nltk
from nltk.corpus import stopwords   # natural language toolkit
from nltk.tokenize import word_tokenize

# Ensure the NLTK resources used below are downloaded (only needs to run once;
# later runs just report that the packages are up to date).
# - 'stopwords' backs stopwords.words('english')
# - 'punkt' / 'punkt_tab' back word_tokenize: recent NLTK releases load
#   'punkt_tab', older ones load 'punkt', so both are requested.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Input data: the file produced by the paragraph-preprocessing-for-NLP step in
# the WaPo Analysis folder.
# For this rough notebook the data is currently stored in drive/research/"file".
#
# Only the "paragraph" part of the spreadsheet (i.e. the article content) is of
# interest here.
# Note: 2_articles_2024.csv currently doesn't exist - generate my own or use
# 1_articles...?

# This path applies to the JupyterLab version, not the Google Colab version.
file = '../scratch/2_articles_2024.csv'

In [None]:
# Read the article spreadsheet at the path held in `file`, then print its
# column labels as a quick check of what was loaded.
df = pd.read_csv(file)
print(df.columns)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# set of English stop words for filtering

stop_words = set(stopwords.words('english'))

def clean_text(text):
    """
    Normalise one piece of text for downstream NLP.

    Clean-up operations, in order:
    - Missing values (None / NaN) become an empty string
    - Non-string values are converted with str()
    - Converts to lowercase
    - Replaces every character that is not a-z or whitespace (underscores,
      punctuation, numbers) with a space
    - Tokenizes with NLTK's word_tokenize
    - Removes tokens found in `stop_words`

    Parameters
    ----------
    text : str or scalar
        The raw text; a missing value is accepted.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ("" for missing input).
    """
    if pd.isna(text):
        return ""

    # Coerce first: a non-string cell (e.g. a number) has no .lower()
    text = str(text).lower()
    # Substitute a space instead of deleting the character. Deleting glues the
    # neighbouring pieces together ("well-known" -> "wellknown") and turns
    # contractions such as "don't" into "dont", which is not in the stop-word
    # set and would therefore survive the filter below. The same substitution
    # also covers underscores, so no separate replace step is needed.
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Tokenize
    words = word_tokenize(text)
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    # Re-join words into a string
    return " ".join(words)


In [None]:
# Sanity check: run clean_text over every entry of the "paragraphs" column and
# store the result next to the original in "paragraphs_cleaned".

df["paragraphs_cleaned"] = df["paragraphs"].map(clean_text)

### **Time Period Work**

Split the corpus into two custom time periods: the first 30% and the last 30% (1977–1991 and 2010–2024 in the code below).
* The years in between are left as a "black box" for contextual analysis.

In [None]:
# Derive the publication year for each row
# (assumes a 'publish_date' column that pandas can parse as dates)
df['year'] = df['publish_date'].pipe(pd.to_datetime).dt.year

# Early and late time periods; Series.between includes both end years
period1 = df.loc[df['year'].between(1977, 1991)]
period2 = df.loc[df['year'].between(2010, 2024)]

# Write each split to its own CSV for saving/inspection
period1.to_csv("corpus_1977_1991.csv", index=False)
period2.to_csv("corpus_2010_2024.csv", index=False)

### **Start BERT Experimentation**

Fine-tune BERT separately on the first and the last time period produced by the split.

In [None]:
# BERT Imports

import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

import matplotlib.pyplot as plt
% matplotlib inline

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Checkpoint name shared by the masked-LM model and its tokenizer
model_name = "bert-base-uncased"
model = BertForMaskedLM.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# tokenize and prepare your data for each period (replace with your corpus)
def prepare_dataset(texts):
    """
    Tokenize a list of texts into a `datasets.Dataset` for masked-LM training.

    Parameters
    ----------
    texts : list of str
        The paragraphs of one time period. Entries that are not strings, or
        that are empty / whitespace-only (clean_text returns "" for missing
        paragraphs), are dropped because they carry no tokens to mask.

    Returns
    -------
    datasets.Dataset
        One row per kept text, holding the tokenizer's output columns as
        plain Python lists, truncated to at most 128 tokens.

    Notes
    -----
    Sequences are deliberately left unpadded and are not converted to
    tensors here: padding the whole corpus to one common length up front
    costs memory and compute on every short paragraph, whereas the
    DataCollatorForLanguageModeling used with the Trainer pads each batch
    to its own longest sequence.
    """
    from datasets import Dataset

    kept_texts = [t for t in texts if isinstance(t, str) and t.strip()]
    encodings = tokenizer(kept_texts, truncation=True, max_length=128)
    # Hand Dataset.from_dict a plain dict of lists rather than the
    # tokenizer's BatchEncoding wrapper.
    return Dataset.from_dict(dict(encodings))

# Collator for masked-language-model training; mlm_probability is the
# probability with which tokens are masked.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# One tokenized dataset per time period, built from the cleaned paragraphs
train_dataset_p1 = prepare_dataset(period1["paragraphs_cleaned"].to_list())
train_dataset_p2 = prepare_dataset(period2["paragraphs_cleaned"].to_list())

# Training configuration for the first period
training_args_p1 = TrainingArguments(
    output_dir="./bert_finetune_period1",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    save_steps=10_000,
)

trainer_p1 = Trainer(
    model=model,
    args=training_args_p1,
    data_collator=data_collator,
    train_dataset=train_dataset_p1,
)
# Uncomment the two lines below to run the fine-tuning and save the result:
# trainer_p1.train()
# trainer_p1.save_model()

# repeat the process for last-split period with a fresh model

In [None]:
# Cosine Similarity
# - function for implementation later

def cosine_sim(vec1, vec2):
    """
    Cosine similarity of two vectors: dot(vec1, vec2) / (|vec1| * |vec2|).

    Parameters
    ----------
    vec1, vec2 : array-like
        Vectors of equal length (anything np.dot / np.linalg.norm accept).

    Returns
    -------
    float
        The similarity. When either vector has zero norm the ratio is
        undefined (0/0); 0.0 is returned in that case instead of nan and a
        NumPy RuntimeWarning.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

# sim_score = cosine_sim(gender_axis_1977_1991, gender_axis_2010_2024)
# print(f"Similarity between gender axes: {sim_score:.4f}")