# Converting a pdf to text with pdftotext

Source: https://github.com/jalan/pdftotext

`pdftotext` needs to be installed first. Run the `install-pdftotext.sh` script in the parent directory to install it.

In [None]:
import pdftotext

Read the file and convert to text.

In [None]:
# Open the sample contract in binary mode and hand the file object to
# pdftotext, which exposes the document as a sequence of page strings.
with open('../data/Exhibit-A-SAMPLE-CONTRACT.pdf', 'rb') as pdf_file:
    pdf = pdftotext.PDF(pdf_file)

In [None]:
type(pdf)

A `pdftotext.PDF` object works like a list of strings, each of which corresponds to a page of the document.

Number of pages.

In [None]:
len(pdf)

Print the first page.

In [None]:
print(pdf[0])

## Text vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Term-frequency vectorizer (IDF reweighting is switched off).
tf_vectorizer = TfidfVectorizer(
    use_idf=False,         # no inverse-document-frequency weighting
    ngram_range=(1, 1),    # single words only
    stop_words='english',  # drop common English stop words
    min_df=2,              # ignore terms found in fewer than 2 documents
    max_df=0.95,           # ignore terms found in more than 95% of documents
    max_features=5000,     # cap the vocabulary size
)

# Iterating the PDF yields one string per page, so each page is one document.
tf_vectors = tf_vectorizer.fit_transform(pdf)

In [None]:
tf_vectors.shape

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only when running an older release.
(
    tf_vectorizer.get_feature_names_out()
    if hasattr(tf_vectorizer, 'get_feature_names_out')
    else tf_vectorizer.get_feature_names()
)

## Dimensionality reduction and plotting

In [None]:
from sklearn.decomposition import TruncatedSVD
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go

init_notebook_mode(connected=True)

In [None]:
# TruncatedSVD uses a randomized solver by default, so seed it to make the
# 2-D projection reproducible (same seed as the LDA model in this notebook).
tsvd = TruncatedSVD(n_components=2, random_state=42)

In [None]:
X_red = tsvd.fit_transform(tf_vectors)

In [None]:
X_red.shape

In [None]:
# Scatter plot of the reduced vectors: first component on x, second on y,
# one marker per row of X_red.
trace = go.Scatter(mode='markers', x=X_red[:, 0], y=X_red[:, 1])
data = [trace]

fig = go.Figure(data=data)
iplot(fig)

## LDA for topic analysis

In [None]:
# Five-topic LDA model, seeded so repeated runs give the same topics.
lda = LatentDirichletAllocation(
    n_components=5,
    learning_method='batch',
    max_iter=20,
    random_state=42,
)

# Fit on the term-frequency matrix and keep the transformed output.
lda_vectors = lda.fit_transform(tf_vectors)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, iterated row by row; each row
            must support ``argsort()`` (e.g. a NumPy array of term weights).
        feature_names: Indexable that maps a column index to its term.
        n_top_words: Number of terms to show per topic.

    One ``Topic #<idx>: <words>`` line is printed per topic, with the words
    ordered from highest to lowest weight, followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending: reverse it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_words = " ".join(feature_names[i] for i in ranked)
        print("Topic #{}: {}".format(topic_idx, top_words))
    print()

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() whenever the installed version provides it.
feature_names = (
    tf_vectorizer.get_feature_names_out()
    if hasattr(tf_vectorizer, 'get_feature_names_out')
    else tf_vectorizer.get_feature_names()
)
print_top_words(lda, feature_names, 3)

## Word embedding with `GloVe` from `spacy`

In [None]:
import sys
sys.path.insert(0, '../src/')
import pdf_lda
import os
import spacy
import numpy as np
from umap import UMAP
import pickle

Read one of the pdf files in the data folder.

In [None]:
DATA_DIR = '../data/'

In [None]:
os.listdir(DATA_DIR)

In [None]:
# os.listdir() returns entries in arbitrary order, so sort before indexing to
# make the selected file reproducible across machines and runs.
filename = sorted(os.listdir(DATA_DIR))[-2]

# splitext + lower() also accepts upper-case extensions such as ".PDF" and
# does not mistake an extension-less file named "pdf" for a PDF.
if os.path.splitext(filename)[1].lower() == '.pdf':
    with open(os.path.join(DATA_DIR, filename), 'rb') as f:
        pdf = pdftotext.PDF(f)
else:
    # NOTE: in this branch `pdf` keeps whatever an earlier cell assigned to it.
    print("Please use a pdf file")

In [None]:
# Merge all pages into one string; no separator is inserted between pages.
text = ''.join(pdf)

In [None]:
nlp = spacy.load('en_core_web_lg') 

In [None]:
nlp(text)

In [None]:
# Stack one embedding row per token of the document. token.vector reads the
# vector from the already-parsed document, so the pipeline runs once on
# `text` instead of once more for every single token. reshape(1, -1) avoids
# hard-coding the embedding width.
glove_vectors = np.concatenate(
    [token.vector.reshape(1, -1) for token in nlp(text)]
)

In [None]:
doc = nlp(text)

In [None]:
# Part-of-speech tags regarded as noise.
# NOTE: the visible cells filter with pos_to_keep and never read this list.
pos_to_ignore = [
    'ADP', 'AUX', 'CONJ', 'CCONJ', 'DET',
    'INTJ', 'NUM', 'PART', 'PRON', 'PROPN',
    'PUNCT', 'SCONJ', 'SYM', 'SPACE', 'X',
]

# Only tokens whose pos_ is listed here are kept.
pos_to_keep = ['NOUN']

# Token texts that are dropped even when their POS tag is kept.
text_to_ignore = ['-', '_', '”', '–']

In [None]:
# Keep only the tokens whose part of speech is in pos_to_keep.
tokens = {token for token in doc if token.pos_ in pos_to_keep}

# Distinct lemmas of the kept tokens, minus the ignored token texts. Sorting
# the set gives the word list a stable order; iterating a set of strings
# directly can change order between interpreter sessions (hash randomization).
words = sorted(
    {token.lemma_ for token in tokens if token.text not in text_to_ignore}
)

words

In [None]:
# Embed every lemma: run it through the spaCy pipeline, take the resulting
# document vector as a (1, 300) row, and stack the rows into one matrix.
glove_vectors = np.concatenate(
    [nlp(lemma).vector.reshape(1, 300) for lemma in words],
    axis=0,
)
glove_vectors.shape

In [None]:
umapper = UMAP(n_neighbors=25)

In [None]:
umap_vectors = umapper.fit_transform(glove_vectors)

In [None]:
umap_vectors.shape

In [None]:
# with open('../data/long_contract_2d_vectors.pkl', 'wb') as f:
#     pickle.dump(umap_vectors, f)

# with open('../data/long_contract_words.pkl', 'wb') as f:
#     pickle.dump(words, f)

In [None]:
# Scatter the 2-D UMAP coordinates; hovering a marker shows its word.
trace = go.Scatter(
    x=umap_vectors[:, 0],
    y=umap_vectors[:, 1],
    text=words,
    hoverinfo='text',
    mode='markers',
)

# go.Margin is a deprecated alias (superseded by go.layout.Margin). A plain
# dict is accepted for the margin property and avoids the deprecated class.
layout = go.Layout(
    margin=dict(t=25, l=20),
    hovermode='closest',
)

fig = go.Figure(data=[trace], layout=layout)

iplot(fig)