# Project 3

## Load Necessary Libraries

In [1]:
# One-time setup: uncomment the next line to install/upgrade the PDF dependency.
# Prefer `%pip` over `!pip` so the package lands in the running kernel's environment.
#%pip install --upgrade PyPDF2

In [2]:
import re
from pathlib import Path

import pandas as pd
import PyPDF2
import spacy
from spacy import displacy

  hasattr(torch, "has_mps")
  and torch.has_mps  # type: ignore[attr-defined]


## 1. Extract all texts from the given pdf file.

In [3]:
# Location of the source PDF. Built from the current user's home directory so the
# notebook does not embed one machine's absolute path (or a username).
# Kept as a plain string so anything that treats `mypdf` as a str keeps working.
mypdf = str(Path.home() / "Downloads" / "tax-efficient-withdrawal-strategies.pdf")

# Open in binary mode. The handle is deliberately left open here: the PdfReader
# built in the next cell is constructed on top of this file object, and the page
# text is extracted from it afterwards. Call pdfFile.close() once extraction is done.
pdfFile = open(mypdf, 'rb')

In [4]:
# Wrap the already-open binary file handle in a PyPDF2 reader.
pdfReader = PyPDF2.PdfReader(stream=pdfFile)

In [5]:
# Concatenate the extracted text of every page, terminating each page with a newline.
text = "".join(page.extract_text() + "\n" for page in pdfReader.pages)

# Preview the first 2000 characters as a sanity check.
print(text[:2000])

1
T. ROWE  PRICE INSIGHTS
ON RETIREMENT
KEY INSIGHTS
	■There are alternatives to the conventional strategy of drawing on a taxable 
account first, followed by tax-deferred, and then Roth accounts. 
	■Many people can take advantage of income in a low tax bracket or tax-free 
capital gains.
	■If planning to leave an estate to heirs, consider which assets will ultimately 
maximize the after-tax value. How to Get More Out of 
Your Retirement Account 
Withdrawals 
These approaches can extend the life of your portfolio 
and preserve assets for heirs.
Many people will rely largely 
on Social Security benefits 
and tax-deferred accounts 
such as individual retirement accounts 
(IRAs) and 401(k) plans to support 
their lifestyle in retirement. However, 
a sizable number of retirees will also 
enter retirement with assets in taxable 
accounts (such as brokerage accounts) 
and Roth accounts. Deciding how 
to use that combination of accounts 
to fund spending is a decision 
likely driven by tax co

## 2. Extract all the tokens from the texts. Extract all lemmas from the texts.

In [6]:
# Load the small English pipeline and run it over the extracted PDF text.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Surface form and lemma of every token, kept as two parallel lists.
tokens = [tok.text for tok in doc]
lemmas = [tok.lemma_ for tok in doc]

# One row per token, pairing each surface form with its lemma.
df_tokens = pd.DataFrame(list(zip(tokens, lemmas)), columns=["Token", "Lemma"])
df_tokens.head(10)

Unnamed: 0,Token,Lemma
0,1,1
1,\n,\n
2,T.,T.
3,ROWE,ROWE
4,,
5,PRICE,PRICE
6,INSIGHTS,INSIGHTS
7,\n,\n
8,ON,ON
9,RETIREMENT,RETIREMENT


## 3. Remove all the default stop words in SpaCy from the texts.

In [7]:
# Drop every token that spaCy flags as a (default) stop word.
filtered_tokens = [tok.text for tok in filter(lambda t: not t.is_stop, doc)]

# Single-column frame of the surviving tokens, in document order.
df_filtered_tokens = pd.DataFrame(filtered_tokens, columns=["Filtered Tokens"])
df_filtered_tokens.head(10)

Unnamed: 0,Filtered Tokens
0,1
1,\n
2,T.
3,ROWE
4,
5,PRICE
6,INSIGHTS
7,\n
8,RETIREMENT
9,\n


## 4. Customize the stop words in SpaCy by:

### + Adding "tax" and "account" to the stop words.
### + Removing "full" from the default stop words.

In [8]:
# Customize stop words.
custom_stop_words = {"tax", "account"}  # words to add to the stop list
stop_words_to_remove = {"full"}  # words to drop from the default stop list


def _set_stop_flag(word, flag):
    """Set the vocab's ``is_stop`` flag for the common spellings of ``word``.

    The flag is stored per exact spelling, and the earlier ``nlp(text)`` call has
    already cached it for every spelling that occurs in the document. Updating
    only the lower-case entry would leave e.g. "Tax" or "TAX" unchanged, so the
    lower-case, Capitalized and UPPER-CASE variants are all updated.
    """
    for variant in {word.lower(), word.capitalize(), word.upper()}:
        nlp.vocab[variant].is_stop = flag


for word in custom_stop_words:
    nlp.Defaults.stop_words.add(word)
    _set_stop_flag(word, True)
for word in stop_words_to_remove:
    # discard() instead of remove(): re-running this cell must not raise KeyError
    # once the word has already been taken out of the set.
    nlp.Defaults.stop_words.discard(word)
    _set_stop_flag(word, False)

# Reprocess the text after customizing stop words, then filter with the new flags.
doc = nlp(text)
filtered_tokens_custom = [token.text for token in doc if not token.is_stop]
df_filtered_tokens_custom = pd.DataFrame({"Filtered Tokens (Custom)": filtered_tokens_custom})
df_filtered_tokens_custom.head(10)

Unnamed: 0,Filtered Tokens (Custom)
0,1
1,\n
2,T.
3,ROWE
4,
5,PRICE
6,INSIGHTS
7,\n
8,RETIREMENT
9,\n


## 5. Perform the part of speech tagging for the texts.

In [9]:
# Part-of-speech tagging: one row per token with its coarse-grained POS label.
df_pos = pd.DataFrame(
    {
        "Token": [tok.text for tok in doc],
        "POS": [tok.pos_ for tok in doc],
    }
)
df_pos.head(10)

Unnamed: 0,Token,POS
0,1,NUM
1,\n,SPACE
2,T.,PROPN
3,ROWE,PROPN
4,,SPACE
5,PRICE,PROPN
6,INSIGHTS,PROPN
7,\n,SPACE
8,ON,PROPN
9,RETIREMENT,PROPN


## 6. Visualize the dependency parser of the texts.

In [10]:
# Dependency parsing visualization.
# Render one parse per sentence instead of the whole document at once: a single
# dependency graph spanning every page of the PDF is far too wide to read, while
# a list of sentence spans still covers the full text.
displacy.render(list(doc.sents), style="dep", jupyter=True)

## 7. Perform the named entities recognition for the texts.

In [11]:
# Named entity recognition: tabulate every entity span with its predicted label.
df_entities = pd.DataFrame(
    {
        "Entity": [span.text for span in doc.ents],
        "Label": [span.label_ for span in doc.ents],
    }
)
df_entities.head(10)

Unnamed: 0,Entity,Label
0,1,CARDINAL
1,T. ROWE,ORG
2,first,ORDINAL
3,Roth,PERSON
4,Social Security,ORG
5,Roth,PERSON
6,1,CARDINAL
7,first,ORDINAL
8,Roth,PERSON
9,first,ORDINAL


In [12]:
# Highlight all recognized entities inline in the document text.
displacy.render(doc, jupyter=True, style="ent")

## 8. Visualize the MONEY, QUANTITY and CARDINAL in the texts.

In [13]:
# Keep only the rows labelled MONEY, QUANTITY or CARDINAL; the original row index is preserved.
is_numeric_label = df_entities["Label"].isin(["MONEY", "QUANTITY", "CARDINAL"])
df_filtered_entities = df_entities.loc[is_numeric_label]
df_filtered_entities.head(10)

Unnamed: 0,Entity,Label
0,1,CARDINAL
6,1,CARDINAL
10,three,CARDINAL
14,1,CARDINAL
15,6,CARDINAL
17,11,CARDINAL
18,16,CARDINAL
19,2,CARDINAL
21,two,CARDINAL
23,three,CARDINAL


In [14]:
# Restrict the entity visualization to MONEY, QUANTITY and CARDINAL,
# giving each of the three labels its own highlight colour.
options = {
    "ents": ["MONEY", "QUANTITY", "CARDINAL"],
    "colors": {
        "MONEY": "#FFD700",
        "QUANTITY": "#FFA07A",
        "CARDINAL": "#87CEEB",
    },
}
displacy.render(doc, style="ent", options=options, jupyter=True)