# Converting a PDF to text with pdftotext

Source: https://github.com/jalan/pdftotext

`pdftotext` needs to be installed first. Run the `install-pdftotext.sh` script in the parent directory to install it.

In [27]:
import pdftotext

Read the file and convert to text.

In [28]:
# Open the sample contract in binary mode ('rb') and hand the file object to
# pdftotext.PDF; the resulting object is kept in `pdf` for the cells below.
with open('../data/Exhibit-A-SAMPLE-CONTRACT.pdf', 'rb') as f:
    pdf = pdftotext.PDF(f)

In [29]:
# Check which class the loader returned.
type(pdf)

pdftotext.PDF

A `pdftotext.PDF` object works like a list of strings, each of which corresponds to a page of the document.

Number of pages.

In [30]:
# `pdf` holds one string per page, so its length is the page count.
len(pdf)

11

Print one page.

In [32]:
# Show the first page (index 0); print() renders its line breaks and spacing
# rather than the escaped string repr.
print(pdf[0])

                                 Exhibit A – Sample Contract
SAMPLE CONTRACT
     OFFICE OF HAWAIIAN AFFAIRS
      CONTRACT NUMBER ________
        REQUEST FOR PROPOSALS
    PURCHASE OF GOODS AND SERVICES
          CONTRACT BETWEEN
      OFFICE OF HAWAIIAN AFFAIRS
                 AND
         XXXXXXXXXX



## Text vectorization

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [67]:
# Term-frequency vectoriser: each page of `pdf` becomes one row of `tf_vectors`.
# use_idf=False switches off the IDF re-weighting, hence the `tf_` prefix.
tf_vectorizer = TfidfVectorizer(
    # tokenisation
    ngram_range=(1, 1),
    stop_words='english',
    # vocabulary pruning
    min_df=2,
    max_df=0.95,
    max_features=5000,
    # weighting
    use_idf=False,
)
tf_vectors = tf_vectorizer.fit_transform(pdf)

In [68]:
# (rows, columns) of the vectorised pages: one row per page of `pdf`.
tf_vectors.shape

(11, 121)

In [84]:
# List the vocabulary learned by the vectoriser fitted above. The notebook only
# defines `tf_vectorizer`; referring to any other name fails on a fresh kernel.
tf_vectorizer.get_feature_names()

['09',
 '10',
 '100',
 '12',
 '166',
 '200',
 '2016',
 '201__',
 '20___',
 '30',
 '50',
 '560',
 '712',
 '84',
 '96',
 '96817',
 '99',
 '______',
 '________',
 '_________',
 '___________',
 '____________',
 '_______________',
 '___________________',
 '_____________________',
 '______________________',
 '______________________________',
 '_________________________________',
 '______________________________________',
 '_______________________________________',
 '________________________________________',
 '__________________________________________',
 '_________circuit',
 '______day',
 'accordance',
 'according',
 'acknowledgement',
 'acknowledges',
 'acknowledgment',
 'act',
 'acting',
 'actual',
 'additionally',
 'address',
 'administrative',
 'advantageous',
 'affairs',
 'agree',
 'agreed',
 'agreement',
 'agrees',
 'amended',
 'appeared',
 'applicable',
 'apply',
 'appointed',
 'approval',
 'approved',
 'assist',
 'assisted',
 'attached',
 'attachment',
 'attachments',
 'authority',


## Dimensionality reduction and plotting

In [80]:
from sklearn.decomposition import TruncatedSVD
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go

# Enable plotly's notebook mode before the iplot() call further down.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead
# of embedding it in the notebook — confirm against the plotly docs.
init_notebook_mode(connected=True)

In [77]:
# Reduce the term vectors to 2 components so each page can be drawn as a point.
# random_state is pinned (same seed as the LDA cell) because the default
# randomized solver is otherwise free to vary between runs.
tsvd = TruncatedSVD(n_components=2, random_state=42)

In [78]:
# Fit the SVD on the page/term matrix and keep the reduced coordinates.
X_red = tsvd.fit_transform(tf_vectors)

In [79]:
# Sanity check: same number of rows as `tf_vectors`, 2 columns (n_components).
X_red.shape

(11, 2)

In [83]:
# Scatter plot of the pages in the 2-D SVD space: one marker per row of X_red.
# Rows follow the page order of `pdf`, so row i is labelled "Page i+1".
page_labels = ['Page {}'.format(i + 1) for i in range(X_red.shape[0])]

trace = go.Scatter(
    x=X_red[:, 0],
    y=X_red[:, 1],
    mode='markers',
    text=page_labels,  # shown on hover so a point can be traced back to its page
)

data = [trace]

# Title and axis labels so the figure can be read on its own.
layout = go.Layout(
    title='PDF pages projected onto the first two SVD components',
    xaxis=dict(title='SVD component 1'),
    yaxis=dict(title='SVD component 2'),
)

fig = go.Figure(data=data, layout=layout)

iplot(fig)

## LDA for topic analysis

In [69]:
# Fit a 5-component LDA model on the term vectors. The seed is fixed so the
# fitted topics are repeatable; `lda_vectors` keeps the transformed pages.
lda = LatentDirichletAllocation(
    n_components=5,
    learning_method='batch',
    max_iter=20,
    random_state=42,
)

lda_vectors = lda.fit_transform(tf_vectors)

In [70]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names of every topic in a model.

    Args:
        model: Object exposing ``components_``, an iterable with one weight
            array per topic (each array must support ``argsort()``).
        feature_names: Sequence mapping a weight's position to its name.
        n_top_words: How many names to print per topic, heaviest first.

    Prints one ``Topic #<index>: <names>`` line per topic, followed by a
    blank line. Returns nothing.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending: flip it, then keep the leading entries.
        heaviest_first = weights.argsort()[::-1]
        chosen = heaviest_first[:n_top_words]
        names = (feature_names[position] for position in chosen)
        print("Topic #{}: {}".format(topic_idx, " ".join(names)))
    print()

In [71]:
# Show the 3 highest-weighted terms of each fitted LDA topic.
print_top_words(lda, tf_vectorizer.get_feature_names(), 3)

Topic #0: hawaiian affairs office
Topic #1: contractor oha shall
Topic #2: ________ goods proposals
Topic #3: ________ goods proposals
Topic #4: ________ goods proposals

