### Installs

In [1]:
!pip install pytesseract
!pip install ollama
!apt install poppler-utils
!pip install pdf2image
!pip install colab-xterm
%load_ext colabxterm
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Collecting ollama
  Downloading ollama-0.4.8-py3-none-any.whl.metadata (4.7 kB)
Downloading ollama-0.4.8-py3-none-any.whl (13 kB)
Installing collected packages: ollama
Successfully installed ollama-0.4.8
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 34 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.7 [186 kB]
Fetched 186 kB in 1s (314 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126101 files and dire

### Running PyTesseract on a single pdf
##### This takes about 50 seconds on colab, 15-20 on SCC

In [4]:
pages = convert_from_path("pdf/2014-02-11.pdf") #example pdf

# Access first page
first_page = pages[0]

text = pytesseract.image_to_string(first_page)
print(text)

CMYK

 

“All the News
That’s Fit to Print”

 

 

 

Nxxx,2014-02-11,A,001,Bs-BK,E2

Che New York Cimes

Late Edition

Today, quite cold despite sunshine,
high 26. Tonight, mostly clear, cold,
low 14. Tomorrow, sunshine, then
some clouds, remaining cold, high
27. Weather map is on Page Al7.

 

VOL. CLXIII... No. 56,409

© 2014 The New York Times

NEW YORK, TUESDAY, FEBRUARY 11, 2014

$2.50

 

 

CHANG W. LEE/THE NEW YORK TIMES

EB0000 COLLISION COURSE Competitors were sent flying in the consolation finals of the men’s 1,500-meter short-track
==) i!__=ameaa speedskating. In skiing, the American Julia Mancuso won a bronze in her third Olympics. Pages B8-12.

 

U.S. Debates
Drone Strike
On American

By MARK MAZZETTI
and ERIC SCHMITT

WASHINGTON — The Obama
administration is debating
whether to authorize a lethal
strike against an American citi-
zen living in Pakistan who some
believe is actively plotting terror-
ist attacks, according to current
and former government officials.

It is

### Using xterm for Ollama
Credit to Abonia Sojasingarayar's tutorial here: https://medium.com/@abonia/running-ollama-in-google-colab-free-tier-545609258453
- Launch xterm with the codeblock below
- run `curl https://ollama.ai/install.sh | sh` to install ollama
- run `ollama serve &` to start the ollama server but keep it so you can write in the terminal
- run `ollama pull nomic-embed-text` to download the nomic embeddings model that we used
- Continue to later codeblocks for embedding

If at any point ollama stops working just restart `xterm` and run `ollama serve` (you don't have to redownload the model)


In [5]:
%xterm

Launching Xterm...

<IPython.core.display.Javascript object>

### Embeddings through Ollama

In [6]:
import ollama
text = "testing testing"
ollama.embeddings(model='nomic-embed-text', prompt=text)

EmbeddingsResponse(embedding=[0.9172549843788147, 0.9729501008987427, -4.494909763336182, -0.5133851766586304, 2.0696353912353516, 0.026846468448638916, 0.8263369798660278, -0.28975552320480347, 0.7219882011413574, -0.7230998277664185, 0.33635812997817993, 1.3690578937530518, 0.37989795207977295, 0.493349552154541, -0.7268327474594116, -1.1191602945327759, 1.443676233291626, -1.651324987411499, -0.49247074127197266, -0.3557901382446289, 0.7960456609725952, -0.974124550819397, -1.3985193967819214, -0.12787799537181854, 1.9273086786270142, -0.8790619373321533, -1.3871982097625732, 0.4138542711734772, -0.5494940280914307, -0.3728175759315491, 0.6897900700569153, -0.7735300660133362, 0.9262555837631226, -1.435060739517212, -0.28328144550323486, -1.1309964656829834, 0.31350934505462646, 1.2099120616912842, 0.38851046562194824, 0.08452555537223816, -0.28260886669158936, 0.8316298127174377, -0.1488131731748581, -0.6782640218734741, 1.3761224746704102, 0.16432183980941772, 0.4682244062423706, 

In [9]:
from datetime import datetime, timedelta
import os
import numpy as np
import csv
import json


start_date = datetime.strptime("2014-02-11", "%Y-%m-%d")
end_date = datetime.strptime("2014-02-11", "%Y-%m-%d")
current_date = start_date

output_path = './output.csv'

# Make sure header is written once if file doesn't exist
if not os.path.exists(output_path):
    with open(output_path, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Date', 'Text', 'Embedding'])

while current_date <= end_date:
    filename = f"{current_date.strftime('%Y-%m-%d')}.pdf"
    filepath = "./pdf/" + filename
    print(f"Trying {filepath}")

    try:
        pages = convert_from_path(filepath)
        first_page = pages[0]
        text = pytesseract.image_to_string(first_page)
        print(f"OCR result (preview): {text[:10]!r}")

        try:
            embedding = ollama.embeddings(model='nomic-embed-text', prompt=text)
            print(f"Got embedding for {filename}")
        except Exception as embed_err:
            print(f"Embedding failed for {filename}: {embed_err}")
            embedding = {"embedding": []}

        # Write result to CSV immediately
        with open(output_path, mode='a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([current_date.strftime('%Y-%m-%d'), text, json.dumps(embedding['embedding'])])
            f.flush()
            os.fsync(f.fileno())
            print(f"Wrote row for {filename}")

    except Exception as e:
        print(f"Failed to process {filename}: {e}")

    try:
        # os.remove(filepath)
        print(f"Finished {filename}")
    except Exception as e:
        print(f"Could not delete {filename}: {e}")

    current_date += timedelta(days=1)

print("Done. All processed entries saved to output.csv")


Trying ./pdf/2014-02-11.pdf
OCR result (preview): 'CMYK\n\n \n\n“'
Got embedding for 2014-02-11.pdf
Wrote row for 2014-02-11.pdf
Finished 2014-02-11.pdf
Done. All processed entries saved to output.csv


### Splitting By Articles
- Starts with `justtext.csv` which was acquired by running the above loop on all dates in our dataset of pdfs (on SCC since it's way faster there and does not time out like colab does)


In [7]:
import re
import pandas as pd
embeddings = pd.read_csv("justtext.csv")
texts = embeddings['Text']
dates = embeddings['Date']

split_articles = []

keywords = [
    "stock", "equities", "shares", "markets", "trading", "IPO", "stock exchange",
    "Wall Street", "S&P 500", "Dow Jones", "NASDAQ", "rally", "selloff", "recession",
    "inflation", "deflation", "interest rates", "economy", "GDP", "consumer spending",
    "monetary policy", "fiscal policy", "unemployment", "jobless claims", "labor market",
    "economic", "slowdown", "stimulus", "federal reserve", "central bank",
    "bonds", "treasuries", "yields", "fixed income", "corporate bonds", "municipal bonds",
    "bond market", "debt issuance", "credit ratings", "bond auction", "treasury yields",
    "maturity", "coupon", "budget"
]

# regex!
pattern = r'\b(' + '|'.join(re.escape(keyword) for keyword in keywords) + r')\b'
compiled_pattern = re.compile(pattern, flags=re.IGNORECASE)

# split by bylines but not ". By" in the case of sentences that start with "By". This mostly filters out improper "By"s
byline_pattern = re.compile(r'\n\s*By\s+', flags=re.IGNORECASE)

collected_data = []

for t, d in zip(texts, dates):
    t = t.replace('\r', '\n')

    # Split only at real "By" starting a new line
    parts = byline_pattern.split(t)

    for part in parts:
        matches = compiled_pattern.findall(part)
        if matches:
            unique_matches = list(set([m.lower() for m in matches]))  # lowercased + no duplicates
            print(f"\nMatched keywords: {unique_matches}")
            # print(f"Article snippet:\n{part[:500]}")  # Print first 500 chars
            collected_data.append({
                'Date': d,
                'Article': part.strip(),
                'MatchedKeywords': ', '.join(unique_matches)
            })

split_df = pd.DataFrame(collected_data)
split_df.to_csv('split_articles_with_keywords.csv', index=False)

print(f"\nTotal split articles: {len(split_df)}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Matched keywords: ['rally']

Matched keywords: ['stock', 'economy']

Matched keywords: ['budget']

Matched keywords: ['budget']

Matched keywords: ['stock']

Matched keywords: ['stock']

Matched keywords: ['economic']

Matched keywords: ['economic']

Matched keywords: ['economy']

Matched keywords: ['rally']

Matched keywords: ['economy']

Matched keywords: ['economic']

Matched keywords: ['yields']

Matched keywords: ['trading', 'stock', 's&p 500', 'markets', 'economy']

Matched keywords: ['economy']

Matched keywords: ['unemployment', 'economic', 'labor market', 'budget', 'economy']

Matched keywords: ['wall street']

Matched keywords: ['economic', 'unemployment']

Matched keywords: ['economic', 'markets', 'stock', 'economy']

Matched keywords: ['stock', 'economy']

Matched keywords: ['economic', 'economy', 'unemployment']

Matched keywords: ['economic']

Matched keywords: ['markets']

Matched keywords: ['inflation']



### Embedding just the articles

In [9]:
import time

articles_df = pd.read_csv('split_articles_with_keywords.csv')

articles = articles_df['Article'].tolist()

embeddings = []

# Loop through articles one by one
for idx, text in enumerate(articles[:10], start=1): # Because Colab-based Ollama is way slower than SCC Ollama, we're only doing the first 10 here
    try:
        # Call Ollama to get the embedding
        response = ollama.embeddings(model='nomic-embed-text', prompt=text)
        embedding = response['embedding']
    except Exception as e:
        print(f"Error at article {idx}: {e}")
        embedding = None

    embeddings.append(embedding)

    if idx % 10 == 0 or idx == len(articles):
        print(f"Progress: {idx}/{len(articles)} articles processed")

    time.sleep(1) # without this ollama often crashes (at least on colab) - but this is too slow to be reasonable to run on Colab
    # On SCC this accepts a lower wait time and runs in a reasonable amount of time
# articles_df['Embedding'] = embeddings
print(embeddings)
# articles_df.to_csv('articles_with_embeddings.csv', index=False)

print("Done embedding and saving!")


Progress: 10/6296 articles processed
[[0.6109590530395508, 2.7261962890625, -3.5783727169036865, 0.3620468080043793, 0.38192927837371826, -0.3018578886985779, -0.6313471794128418, -0.13211768865585327, 0.41227349638938904, 0.7671050429344177, 0.1041581854224205, 0.7203894257545471, 1.2399386167526245, -0.0360327810049057, 1.4024386405944824, -1.5627460479736328, -0.9668004512786865, -1.0229074954986572, 0.5336417555809021, 0.11853691190481186, -0.8009101152420044, -1.7897789478302002, -0.5183221697807312, -0.6534875631332397, -0.0917247086763382, -0.1049908846616745, -0.7289528846740723, -0.10760979354381561, -0.2729363441467285, 1.0126070976257324, 0.8703708052635193, -0.187373548746109, -0.34254005551338196, -1.2508214712142944, -1.3620338439941406, -0.3812137842178345, 0.9597568511962891, -0.28644952178001404, -0.8706586360931396, 0.024938050657510757, 0.4256399869918823, 0.14332038164138794, -1.3313963413238525, -1.1834986209869385, 1.4027774333953857, -1.439467430114746, -0.551709

## Feature Engineering
- First, we ran MNLI (multi-genre natural language inference) through the RoBERTa model with pytorch to get agreement/contradiction with test embeddings
- Then we compute cosine similarity of the embeddings with test embeddings

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

articles_df = pd.read_csv('split_articles_with_keywords.csv')
articles = articles_df['Article'].head(10).tolist() # this takes a long time for full data so stopping at 10 as well

#MNLI
model_name = "roberta-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

test_sentences = [
    "The stock market went up yesterday",
    "The stock market went down yesterday",
    "The bond market went up yesterday",
    "The bond market went down yesterday",
    "There was a bad jobs report yesterday",
    "There was a good jobs report yesterday",
    "There was a bad inflation report yesterday",
    "There was a good inflation report yesterday",
]

# MNLI Scoring
def get_nli_scores(premise, hypothesis):
    try:
        inputs = tokenizer(premise, hypothesis, return_tensors="pt", truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        probabilities = F.softmax(logits, dim=-1).squeeze()
        probs_dict = {model.config.id2label[i].upper(): prob.item() for i, prob in enumerate(probabilities)}
        return {
            "Entailment Prob": probs_dict.get('ENTAILMENT', 0.0),
            "Neutral Prob": probs_dict.get('NEUTRAL', 0.0),
            "Contradiction Prob": probs_dict.get('CONTRADICTION', 0.0),
            "Error": None
        }
    except Exception as e:
        return {
            "Entailment Prob": 0.0,
            "Neutral Prob": 0.0,
            "Contradiction Prob": 0.0,
            "Error": str(e)
        }

all_results = []

for idx, article in enumerate(articles, start=1):
    article_results = {"Article_Index": idx, "Article_Text": article}

    for hypothesis_idx, hypothesis in enumerate(test_sentences, start=1):
        nli_scores = get_nli_scores(article, hypothesis)
        article_results[f"Hypothesis_{hypothesis_idx}_Sentence"] = hypothesis
        article_results[f"Hypothesis_{hypothesis_idx}_Entailment"] = nli_scores["Entailment Prob"]
        article_results[f"Hypothesis_{hypothesis_idx}_Neutral"] = nli_scores["Neutral Prob"]
        article_results[f"Hypothesis_{hypothesis_idx}_Contradiction"] = nli_scores["Contradiction Prob"]
        article_results[f"Hypothesis_{hypothesis_idx}_Error"] = nli_scores["Error"]

    all_results.append(article_results)

    if idx % 10 == 0 or idx == len(articles):
        print(f"Processed {idx}/{len(articles)} articles")

full_nli_results_df = pd.DataFrame(all_results)
full_nli_results_df.to_csv('nli_analysis_all_articles.csv', index=False)

print("All articles processed and results saved")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processed 10/10 articles
All articles processed and results saved


In [44]:
import ollama
import time
import pandas as pd
from numpy import dot
from numpy.linalg import norm

def cos_sim(a, b):
    return dot(a, b) / (norm(a) * norm(b))

articles_df = pd.read_csv('split_articles_with_keywords.csv')

tests = [
    "The stock market went up yesterday",
    "The stock market went down yesterday",
    "The bond market went up yesterday",
    "The bond market went down yesterday",
    "There was a bad jobs report yesterday",
    "There was a good jobs report yesterday",
    "There was a bad inflation report yesterday",
    "There was a good inflation report yesterday",
]

test_embeddings = []
for test in tests:
    emb = ollama.embeddings(model='nomic-embed-text', prompt=test)['embedding']
    test_embeddings.append(emb)
    time.sleep(0.1)

results_cosine = []

for article_idx, article_row in articles_df.head(10).iterrows(): # Again heading it to 10 so that colab works, this is less of an issue on SCC
    article_text = article_row['Article']

    # Get embedding for the article
    try:
        article_embedding = ollama.embeddings(model='nomic-embed-text', prompt=article_text)['embedding']
    except Exception as e:
        print(f"Error embedding article {article_idx}: {e}")
        continue

    time.sleep(1)

    # Calculate cosine similarities to each test embedding
    cosine_sims = {}
    for test_idx, test_embedding in enumerate(test_embeddings, start=1):
        similarity = cos_sim(article_embedding, test_embedding)
        cosine_sims[f'cos_sim_test_{test_idx}'] = similarity

    cosine_sims['Article_Index'] = article_idx
    results_cosine.append(cosine_sims)

    if article_idx % 10 == 0:
        print(f"Processed {article_idx}/{len(articles_df)} articles...")
cosine_features_df = pd.DataFrame(results_cosine)
print(cosine_features_df)
cosine_features_df.to_csv('cosine_features_per_article.csv', index=False)



Processed 0/6296 articles...
   cos_sim_test_1  cos_sim_test_2  cos_sim_test_3  cos_sim_test_4  \
0        0.441808        0.411060        0.464115        0.439113   
1        0.371843        0.391412        0.380566        0.391826   
2        0.660231        0.624969        0.607755        0.577383   
3        0.455799        0.435867        0.440879        0.415366   
4        0.394476        0.400031        0.365845        0.364359   
5        0.354516        0.295913        0.345880        0.282558   
6        0.455821        0.400751        0.443047        0.376814   
7        0.321303        0.310629        0.270658        0.256679   
8        0.437241        0.400421        0.427991        0.387104   
9        0.483071        0.495248        0.453796        0.458033   

   cos_sim_test_5  cos_sim_test_6  cos_sim_test_7  cos_sim_test_8  \
0        0.393521        0.380788        0.455676        0.450265   
1        0.451870        0.429986        0.422493        0.416296   
2   