In [1]:
# python libraries to import
import pandas as pd
import numpy as np
import re

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer 

In [2]:
# local functions must sit in same directory as this file
import scrapePDF
import usefulNLP

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Denise\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Denise\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# global settings
# xDir_Src is the folder later cells read the CSV and PDF inputs from.
# xDir_Dest is not referenced by any cell visible in this notebook section.
# NOTE(review): both are absolute paths on one user's machine; anyone else
# running the notebook has to edit them first.
xDir_Src = "C:/Users/Denise/Documents/DataScience/ASX300/data/raw_ASXIndex"
xDir_Dest = "C:/Users/Denise/Documents/DataScience/ASX300/data/preprocess"
# The data-loading cell iterates range(xYearStart, xYearEnd), so the file for
# xYearEnd itself is not read.
xYearStart = 2015
xYearEnd = 2021

In [None]:
# read data files
lstASXIndex = pd.read_csv(xDir_Src + "/ASXIndex.csv", encoding='utf-8')

# Read one announcement-title CSV per year, collect the frames in a list and
# concatenate them in a single call. Concatenating inside the loop re-copies
# every previously loaded row on each iteration.
# NOTE(review): range() excludes xYearEnd, so ASXAnnTitle_<xYearEnd>.csv is
# never loaded - confirm that is intended.
lstYearFrames = []
for xYear in range(xYearStart, xYearEnd):
    xASXAnnTitle = pd.read_csv(xDir_Src + "/ASXAnnTitle_" + str(xYear) + ".csv", encoding='utf-8')
    lstYearFrames.append(xASXAnnTitle)
# pd.concat() raises on an empty list, so fall back to an empty frame when the
# year range selects no files.
lstASXAnnTitle = pd.concat(lstYearFrames, axis=0) if lstYearFrames else pd.DataFrame([])
# Derive the calendar year of each announcement from its 'Date' column.
lstASXAnnTitle['Year'] = pd.to_datetime(lstASXAnnTitle['Date']).dt.year

lstASXForms = pd.read_csv(xDir_Src + "/ASXForms.csv", encoding='utf-8')

# get unique lists of end dates & codes
lstDate = sorted(lstASXIndex['Date'].unique().tolist())
lstCode = sorted(lstASXIndex['Code'].unique().tolist())

# counts of announcement titles by date and code
# lstASXAnnTitle['Date'].value_counts()
# lstASXAnnTitle['Code'].value_counts()

In [None]:
# scraping directly from website
# Two ASX announcement PDFs, selected by the idsId query parameter.
url1 = 'https://www.asx.com.au/asx/statistics/displayAnnouncement.do?display=pdf&idsId=02218926'
url2 = 'https://www.asx.com.au/asx/statistics/displayAnnouncement.do?display=pdf&idsId=02217220'

# scraping using pypdf
# The value returned by download_ASX_pdf() is handed straight to the scraper;
# presumably it is a path to a downloaded temporary file - TODO confirm.
tempfile1 = scrapePDF.download_ASX_pdf(url1)
# NOTE(review): the multi-document cell unpacks scrapePDF_pypdf2() into three
# values (page count, document info, text), while here the whole return value
# is bound to text1. Confirm which shape is correct before treating text1 as
# a string.
text1 = scrapePDF.scrapePDF_pypdf2(tempfile1)

# scraping using pdfminer
tempfile2 = scrapePDF.download_ASX_pdf(url2)
text2 = scrapePDF.scrapePDF_pdfminer3(tempfile2)

In [None]:
# scraping pre-saved files
# Offline alternative to downloading: reads PDFs already stored in xDir_Src.
# Running this cell overwrites text1 and text2 set by the website-scraping cell.
# scraping using pypdf
xfilename1 = xDir_Src + "/text1.pdf"
# NOTE(review): the multi-document cell unpacks scrapePDF_pypdf2() into three
# values, while here the whole return value is bound to text1 - confirm the
# actual return shape.
text1 = scrapePDF.scrapePDF_pypdf2(xfilename1)

# scraping using pdfminer
xfilename2 = xDir_Src + "/text2.pdf"
text2 = scrapePDF.scrapePDF_pdfminer3(xfilename2)

In [None]:
# scraping and joining multiple documents
# Builds `text`, a list with one string per PDF (text1.pdf .. text5.pdf in
# xDir_Src); it is the corpus handed to the TF-IDF vectoriser.
text = []
for i in range(5) :
    xfilename = xDir_Src + "/text" + str(i+1) + ".pdf"
    # pdfminer-based alternative: xtext = scrapePDF.scrapePDF_pdfminer3(xfilename)
    # Scrape the file built for THIS iteration. Passing xfilename2 (left over
    # from the pre-saved-files cell) would read the same PDF five times.
    totalpages2,documentInfo2,xtext = scrapePDF.scrapePDF_pypdf2(xfilename)
    # Every double space becomes a line break (same result as splitting on
    # '  ' and re-joining with '\n').
    xtext = xtext.replace('  ', '\n')
    text.append(xtext)
text

In [None]:
# TF-IDF over the scraped documents in `text`.
# Per the scikit-learn API: a float min_df/max_df is a proportion of documents,
# so min_df=0.80 keeps only n-grams present in at least 80% of the documents
# and max_df=1.00 applies no upper cut-off; ngram_range=(5,10) extracts word
# sequences of 5 to 10 tokens; at most 1000 features are retained.
tf = TfidfVectorizer(max_df=1.00, max_features=1000,
                     min_df=0.80, norm='l2', stop_words='english',
                     use_idf=True, tokenizer=usefulNLP.tokenize_only,ngram_range=(5,10))
tfidf_matrix = tf.fit_transform(text)
print(tfidf_matrix.shape)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installs.
# .tolist() keeps `terms` a plain list of Python strings, as before.
if hasattr(tf, "get_feature_names_out"):
    terms = tf.get_feature_names_out().tolist()
else:
    terms = tf.get_feature_names()
print(terms)

In [None]:
# getting document info metadata
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

# xfilename1 is set by the "scraping pre-saved files" cell.
# The context manager closes the file handle even if parsing fails. The
# metadata is printed inside the block, while the underlying file is still
# open for the parser.
with open(xfilename1, 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)

    print(doc.info)  # The "Info" metadata

In [None]:
def getpdfcontent(pdf_content):
    """Split extracted PDF text into a list of paragraph strings.

    Normalisation applied before splitting:
      * commas are removed;
      * every double space is treated as a paragraph break (it is turned into
        a blank line via a temporary "<newline>" marker, so a literal
        "<newline>" in the input is treated the same way);
      * a newline directly next to a space is collapsed into a single space.

    The text is then read line by line. Consecutive non-blank lines are
    stripped and concatenated with no separator to form one paragraph; a
    blank line closes the current paragraph.

    Parameters
    ----------
    pdf_content : str
        Raw text extracted from a PDF.

    Returns
    -------
    list of str
        The paragraphs in document order, including the final one when the
        text does not end with a blank line.
    """
    text = pdf_content
    text = text.replace(',','')
    text = text.replace('  ',' <newline>')
    text = text.replace('\n ',' ')
    text = text.replace(' \n',' ')
    text = text.replace("<newline>", "\n\n")

    # extract paragraphs
    paragraphs = []
    current = ""
    for line in text.splitlines():
        stripped = line.strip()
        if stripped:
            current += stripped
            continue
        # blank line: close the paragraph collected so far, if any
        if current:
            paragraphs.append(current)
            current = ""
    # Flush the trailing paragraph; without this the last paragraph is lost
    # whenever the text does not end with a blank line.
    if current:
        paragraphs.append(current)
    return paragraphs

# Split the scraped text into paragraphs and display the resulting list.
# getpdfcontent() calls str.replace on its argument, so text1 must be a plain
# string here.
# NOTE(review): the multi-document cell unpacks scrapePDF_pypdf2() as a
# 3-tuple; if that is its real return shape, text1 needs unpacking first.
converted = getpdfcontent(text1)
converted

In [None]:
def clean_pdf(pdf_content):
    """Re-insert word boundaries and tidy punctuation in extracted PDF text.

    Regex passes, applied in this order, each adding a space after the
    matched character:
      1. a digit that is followed by an uppercase letter;
      2. a lowercase letter followed by an uppercase letter, or an uppercase
         letter followed by an uppercase-then-lowercase pair;
      3. a lowercase letter followed by a digit.

    Literal replacements then follow: a space is added after ')' and '?',
    a space is added before '(' and '-', and commas are removed.

    Parameters
    ----------
    pdf_content : str
        Raw text extracted from a PDF.

    Returns
    -------
    str
        The cleaned text.
    """
    # each pattern captures the character that must be followed by a space
    boundary_patterns = (
        r'([0-9](?=[A-Z])|[0-9](?=[A-Z][a-z]))',
        r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))',
        r'([a-z](?=[0-9]))',
    )
    # (old, new) literal substitutions; order matters and mirrors the passes above
    literal_swaps = (
        (')', ') '),
        ('(', ' ('),
        (',', ''),
        ('?', '? '),
        ('-', ' -'),
    )

    cleaned = pdf_content
    for pattern in boundary_patterns:
        cleaned = re.sub(pattern, r'\1 ', cleaned)
    for old, new in literal_swaps:
        cleaned = cleaned.replace(old, new)
    return cleaned