In [1]:
# python libraries to import
import pandas as pd
import numpy as np
import nltk
import re
import _pickle as cPickle

from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.cluster import KMeans

In [2]:
# local functions must sit in same directory as this file
import usefulNLP

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Denise\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Denise\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# global settings
xDir_Src = "C:/Users/Denise/Documents/DataScience/ASX300/data/raw_ASXIndex"
xDir_Dest = "C:/Users/Denise/Documents/DataScience/ASX300/data/preprocess"
xYearStart = 2015
xYearEnd = 2021

lstStopwords = set(stopwords.words('english'))

In [4]:
# read data files to get code list
lstASXIndex = pd.read_csv(xDir_Src + "/ASXIndex.csv", encoding='utf-8')

# get unique lists of end dates & codes
lstDate = sorted(lstASXIndex['Date'].unique().tolist())
lstCode = sorted(lstASXIndex['Code'].unique().tolist())

# preview asx index
lstASXIndex.head()

Unnamed: 0,Code,Company,Sector,Market Cap,Weight(%),Date
0,A2M,The A2 Milk Company Limited NZ,Consumer Staples,1460370000,0.09,2017-01-01
1,AAC,Australian Agricultural Company Limited,Consumer Staples,947014000,0.06,2017-01-01
2,AAD,Ardent Leisure Group Stapled,Consumer Discretionary,1097680000,0.07,2017-01-01
3,ABC,Adelaide Brighton Limited,Materials,3527620000,0.21,2017-01-01
4,ABP,Abacus Property Group Stapled,Real Estate,1728420000,0.1,2017-01-01


Train model based on announcement titles

In [5]:
# Read data
lstASXAnnTitle = pd.DataFrame([])
for xYear in range(xYearStart,xYearEnd):
    xASXAnnTitle = pd.read_csv(xDir_Src + "/ASXAnnTitle_" + str(xYear) + ".csv", encoding='utf-8')
    lstASXAnnTitle = pd.concat([lstASXAnnTitle, xASXAnnTitle], axis=0)
lstASXAnnTitle['Year'] = pd.to_datetime(lstASXAnnTitle['Date']).dt.year

# counts of announcement titles by date and code
# lstASXAnnTitle['Date'].value_counts()
# lstASXAnnTitle['Code'].value_counts()

# preview announcement title
print(lstASXAnnTitle.shape)
lstASXAnnTitle.head()

(137439, 6)


Unnamed: 0,Date,Time,Title,Link,Code,Year
0,31/12/2015,10:33 AM,Ceasing to be a substantial holder from CGF,/asx/statistics/displayAnnouncement.do?display...,A2M,2015
1,30/12/2015,5:02 PM,Ceasing to be a substantial holder - Harbour AM,/asx/statistics/displayAnnouncement.do?display...,A2M,2015
2,30/12/2015,5:01 PM,Becoming a substantial holder from CGF,/asx/statistics/displayAnnouncement.do?display...,A2M,2015
3,30/12/2015,4:58 PM,Ceasing to be a substantial holder from CGF - ...,/asx/statistics/displayAnnouncement.do?display...,A2M,2015
4,29/12/2015,8:28 AM,Ceasing to be a substantial holder from CGF,/asx/statistics/displayAnnouncement.do?display...,A2M,2015


In [6]:
lstExclude = list(lstCode) + list(lstStopwords)

texts = lstASXAnnTitle['Title'].tolist()
texts = usefulNLP.preprocess(texts,lstExclude)

In [7]:
# http://brandonrose.org/clustering
# use extend so it's a big flat list of vocab
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in texts:
    allwords_stemmed = usefulNLP.tokenize_and_stem(i) #for each item tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = usefulNLP.tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
#vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_tokenized)
print(vocab_frame.shape)

# counts of most common stems and tokens
# Counter(totalvocab_tokenized).most_common(50)
print(Counter(totalvocab_stemmed).most_common(50))

(549844, 1)
[('substanti', 27800), ('appendix', 25102), ('chang', 23517), ('notic', 23036), ('holder', 15421), ('director', 14331), ('interest', 12928), ('hold', 12604), ('result', 9396), ('3b', 9257), ('share', 8737), ('present', 7754), ('becom', 7611), ('ceas', 7465), ('report', 7282), ('buyback', 7202), ('updat', 6900), ('3e', 6736), ('daili', 6045), ('dividend', 5886), ('year', 5736), ('distribut', 5648), ('annual', 5141), ('meet', 4309), ('half', 3672), ('3y', 3461), ('announc', 3431), ('quarter', 3376), ('sharehold', 3147), ('investor', 2953), ('secur', 2924), ('form', 2874), ('general', 2852), ('agm', 2711), ('financi', 2472), ('statement', 2406), ('group', 2287), ('appoint', 2194), ('releas', 2163), ('corpor', 1946), ('trade', 1911), ('final', 1896), ('address', 1848), ('offer', 1779), ('full', 1770), ('4g', 1622), ('issu', 1603), ('complet', 1587), ('confer', 1581), ('initi', 1546)]


In [8]:
# using vectorizer to do tfidf and normalise the results - more features better for classification
tf = TfidfVectorizer(max_df=0.250, max_features=200,
                     min_df=0.025, norm='l2', # stop_words='english',
                     use_idf=True, tokenizer=usefulNLP.tokenize_and_stem,ngram_range=(1,5))
tfidf_matrix = tf.fit_transform(texts)
print(tfidf_matrix.shape)

terms = tf.get_feature_names()
print(terms)

(137439, 57)
['3b', '3e', '3y', 'annual', 'appendix', 'appendix 3b', 'appendix 3e', 'becom', 'becom substanti', 'becom substanti holder', 'buyback', 'buyback notic', 'buyback notic appendix', 'buyback notic appendix 3e', 'ceas', 'ceas substanti', 'ceas substanti holder', 'chang', 'chang director', 'chang director interest', 'chang director interest notic', 'chang substanti', 'chang substanti hold', 'daili', 'daili share', 'daili share buyback', 'daili share buyback notic', 'daili share buyback notic appendix', 'director', 'director interest', 'director interest notic', 'distribut', 'dividend', 'dividend distribut', 'half', 'half year', 'hold', 'holder', 'interest', 'interest notic', 'meet', 'notic', 'notic appendix', 'notic appendix 3e', 'present', 'report', 'result', 'share', 'share buyback', 'share buyback notic', 'share buyback notic appendix', 'share buyback notic appendix 3e', 'substanti', 'substanti hold', 'substanti holder', 'updat', 'year']


In [9]:
# k means clustering
num_clusters = 10
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

ASX300 = { 'title': texts , 'cluster': clusters}
frame = pd.DataFrame(ASX300, index = [clusters] , columns = ['cluster', 'title'])

# print results
for i in range(num_clusters) :
    ntexts = frame[frame['cluster']==i]['cluster'].shape[0]
    print(ntexts)
    tokens = word_tokenize('\n'.join(frame[frame['cluster']==i]['title'].tolist()))
    print([k for k,v in dict(Counter(tokens).most_common()).items() if (v > ntexts * 0.75)])
    print(frame[frame['cluster']==i]['title'][:10].tolist()) # first 20 titles for this cluster
    print() # add whitespace

7443
['ceasing', 'substantial', 'holder']
['ceasing substantial holder', 'ceasing substantial holder harbour am', 'ceasing substantial holder rerelease', 'ceasing substantial holder', 'ceasing substantial holder', 'ceasing substantial holder challenger', 'ceasing substantial holder greencape', 'ceasing substantial holder harbour am', 'ceasing substantial holder', 'ceasing substantial holder arrovest']

54606
[]
['notice pursuant nzx lr 7.12.1', 'completion share purchase plan', 'ssh notice milford asset management limited', 'sells remaining a2mc shareholding', 'share purchase plan booklet', 'share purchase plan chairman letter', 'notice offer financial products', 'opening share purchase plan', 'ssh notice arrovest pty limited', 'ssh notice freedom foods group limited']

11919
['interest', 'director', 'notice', 'change']
['change director interest notice g babidge', 'change director interest notice d hearn', 'change director interest notice j hoare', 'change director interest notice m m

In [10]:
# cluster distances from centre - make summary of title clusters
# https://stackoverflow.com/questions/54240144/distance-between-nodes-and-the-centroid-in-a-kmeans-cluster
# identify samples furthest from the centers - from each clusters own centre?
xdist = km.transform(tfidf_matrix)
frame['distance'] = xdist.min(axis=1)

title_summary = {'count' : frame['cluster'].value_counts(),
                 'percent' : frame['cluster'].value_counts()/frame.shape[0],
                 'dist_mean' : frame.groupby(['cluster'])['distance'].mean(), 
                 'dist_min' : frame.groupby(['cluster'])['distance'].min(),
                 'dist_max' : frame.groupby(['cluster'])['distance'].max(),
                 'dist_sd' : frame.groupby(['cluster'])['distance'].std(),
                 'name' : 'TBA',}
title_summary = pd.DataFrame(title_summary) # , columns = ['cluster', 'title'])
title_summary['cluster'] = title_summary.index

for i in range(num_clusters) :
    ntexts = frame[frame['cluster']==i]['cluster'].shape[0]
    tokens = word_tokenize('\n'.join(frame[frame['cluster']==i]['title'].tolist()))
    xname = [k for k,v in dict(Counter(tokens).most_common()).items() if (v > ntexts * 0.75)] # in 75% of texts in cluster
    title_summary.loc[i,'name'] = ' '.join(xname)

title_summary = title_summary.sort_values('dist_mean')
title_summary = title_summary.reset_index(drop=True)
title_summary

Unnamed: 0,count,percent,dist_mean,dist_min,dist_max,dist_sd,name,cluster
0,7443,0.054155,0.015916,0.00704,0.980287,0.075816,ceasing substantial holder,0
1,12307,0.089545,0.017189,0.006702,0.970096,0.091136,substantial change holding,4
2,7684,0.055908,0.033543,0.014945,0.946261,0.125896,substantial holder becoming,3
3,9229,0.06715,0.055065,0.021117,0.857277,0.134837,appendix 3b,6
4,6145,0.044711,0.103297,0.046223,0.972588,0.195127,notice appendix share buyback daily 3e,8
5,11919,0.086722,0.271447,0.148504,0.973253,0.203123,interest director notice change,2
6,6759,0.049178,0.416507,0.252258,0.969081,0.237301,update,7
7,54606,0.397311,0.525476,0.165517,1.013328,0.392423,,1
8,7121,0.051812,0.540566,0.402853,0.977326,0.161013,appendix,5
9,14226,0.103508,0.737101,0.41202,0.971142,0.129235,,9


In [11]:
# low,medium and high distance cluster - check the individual distances
# clusters that have higher average cluster distances can split to smaller subgroups when we look at document content
# testing shows that more features shows greater variation in distances within cluster which is good
print(frame[frame['cluster']==3].head())
print(frame[frame['cluster']==6].head())
print(frame[frame['cluster']==1].head())

   cluster                                   title  distance
3        3             becoming substantial holder  0.014945
3        3  becoming substantial holder challenger  0.014945
3        3  becoming substantial holder harbour am  0.014945
3        3             becoming substantial holder  0.014945
3        3             becoming substantial holder  0.014945
   cluster                            title  distance
6        6  appendix 3b share purchase plan  0.544759
6        6                      appendix 3b  0.021117
6        6                      appendix 3b  0.021117
6        6                      appendix 3b  0.021117
6        6                      appendix 3b  0.021117
   cluster                                        title  distance
1        1                notice pursuant nzx lr 7.12.1  0.953746
1        1               completion share purchase plan  0.977633
1        1  ssh notice milford asset management limited  0.953746
1        1            sells remaining a2mc sha

In [12]:
lstASXAnnTitle['Distance'] = xdist.min(axis=1)
lstASXAnnTitle['Cluster'] = clusters
lstASXAnnTitle.head()

Unnamed: 0,Date,Time,Title,Link,Code,Year,Distance,Cluster
0,31/12/2015,10:33 AM,Ceasing to be a substantial holder from CGF,/asx/statistics/displayAnnouncement.do?display...,A2M,2015,0.00704,0
1,30/12/2015,5:02 PM,Ceasing to be a substantial holder - Harbour AM,/asx/statistics/displayAnnouncement.do?display...,A2M,2015,0.00704,0
2,30/12/2015,5:01 PM,Becoming a substantial holder from CGF,/asx/statistics/displayAnnouncement.do?display...,A2M,2015,0.014945,3
3,30/12/2015,4:58 PM,Ceasing to be a substantial holder from CGF - ...,/asx/statistics/displayAnnouncement.do?display...,A2M,2015,0.00704,0
4,29/12/2015,8:28 AM,Ceasing to be a substantial holder from CGF,/asx/statistics/displayAnnouncement.do?display...,A2M,2015,0.00704,0


In [13]:
# save vectoriser and kmeans
xfilename = xDir_Dest + "/Title_tfidf.pickle"
cPickle.dump(tf, open(xfilename, "wb"))
#tf = cPickle.load(open(xfilename, "rb")) # to reload tfidf

xfilename = xDir_Dest + "/Title_kmeans.pickle"
cPickle.dump(km, open(xfilename, "wb"))
#km = cPickle.load(open(xfilename, "rb")) # to reload km

Apply model to Form Titles

In [14]:
# read data 
lstASXForms = pd.read_csv(xDir_Src + "/ASXForms.csv", encoding='utf-8')

# preview announcement title
print(lstASXForms.shape)
lstASXForms.head()

(32, 2)


Unnamed: 0,Title,Link
0,Appendix 1A - Application for Admission to the...,https://www.asxonline.com/static/companies/fil...
1,Appendix 1B - Application for Admission to the...,https://www.asxonline.com/static/companies/fil...
2,Appendix 1C - Application for Admission to the...,https://www.asxonline.com/static/companies/fil...
3,Appendix 2A - Application for quotation of +se...,https://www.asxonline.com/static/companies/fil...
4,Appendix 3A.1 - Notification of +dividend / di...,https://www.asxonline.com/static/companies/fil...


In [15]:
lstExclude = list(lstCode) + list(lstStopwords)

texts = lstASXForms['Title'].tolist()
texts = usefulNLP.preprocess(texts,lstExclude)

In [17]:
# load vectoriser and kmeans
xfilename = xDir_Dest + "/Title_tfidf.pickle"
# cPickle.dump(tf, open(xfilename, "wb")) # to save
tf = cPickle.load(open(xfilename, "rb")) # to reload tfidf

xfilename = xDir_Dest + "/Title_kmeans.pickle"
#cPickle.dump(km, open(xfilename, "wb")) # to save
km = cPickle.load(open(xfilename, "rb")) # to reload km

In [18]:
# fit form titles
tfidf_matrix = tf.transform(texts)
print(tfidf_matrix.shape)

terms = tf.get_feature_names()
print(terms)

(32, 57)
['3b', '3e', '3y', 'annual', 'appendix', 'appendix 3b', 'appendix 3e', 'becom', 'becom substanti', 'becom substanti holder', 'buyback', 'buyback notic', 'buyback notic appendix', 'buyback notic appendix 3e', 'ceas', 'ceas substanti', 'ceas substanti holder', 'chang', 'chang director', 'chang director interest', 'chang director interest notic', 'chang substanti', 'chang substanti hold', 'daili', 'daili share', 'daili share buyback', 'daili share buyback notic', 'daili share buyback notic appendix', 'director', 'director interest', 'director interest notic', 'distribut', 'dividend', 'dividend distribut', 'half', 'half year', 'hold', 'holder', 'interest', 'interest notic', 'meet', 'notic', 'notic appendix', 'notic appendix 3e', 'present', 'report', 'result', 'share', 'share buyback', 'share buyback notic', 'share buyback notic appendix', 'share buyback notic appendix 3e', 'substanti', 'substanti hold', 'substanti holder', 'updat', 'year']


In [25]:
km.transform(tfidf_matrix)
clusters = km.predict(tfidf_matrix)

ASX300 = { 'title': texts , 'cluster': clusters}
frame = pd.DataFrame(ASX300, index = [clusters] , columns = ['cluster', 'title'])

# print results
for i in range(num_clusters) :
    ntexts = frame[frame['cluster']==i]['cluster'].shape[0]
    print(ntexts)
    tokens = word_tokenize('\n'.join(frame[frame['cluster']==i]['title'].tolist()))
    print([k for k,v in dict(Counter(tokens).most_common()).items() if (v > ntexts * 0.75)])
    print(frame[frame['cluster']==i]['title'][:10].tolist()) # first 20 titles for this cluster
    print() # add whitespace

1
['notice', 'ceasing', 'substantial', 'holder']
['notice ceasing substantial holder']

7
[]
['appendix 3a.2 notification interest payment interest rate change', 'notice change interests substantial holder', 'compulsory acquisition following takeover bid', 'right buy remaining holder securities following takeover bid', 'right buy holders convertible securities following takeover bid', 'notice compulsory acquisition', 'right buy holders convertible securities 100% holder']

3
['appendix', 'director', 'interest', 'notice']
['appendix 3x initial director interest notice', 'appendix 3y change director interest notice', 'appendix 3z final director interest notice']

1
['notice', 'initial', 'substantial', 'holder']
['notice initial substantial holder']

0
[]
[]

16
['appendix']
['appendix 1a application admission official list (asx listing)', 'appendix 1b application admission official list (asx debt listing)', 'appendix 1c application admission official list (asx foreign exempt listing)', '

In [27]:
xdist = km.transform(tfidf_matrix)
frame['distance'] = xdist.min(axis=1)

form_summary = {'count' : frame['cluster'].value_counts(),
                 'percent' : frame['cluster'].value_counts()/frame.shape[0],
                 'dist_mean' : frame.groupby(['cluster'])['distance'].mean(), 
                 'dist_min' : frame.groupby(['cluster'])['distance'].min(),
                 'dist_max' : frame.groupby(['cluster'])['distance'].max(),
                 'dist_sd' : frame.groupby(['cluster'])['distance'].std(),
                 'name' : 'TBA',}

form_summary = pd.DataFrame(form_summary) # , columns = ['cluster', 'title'])
form_summary['cluster'] = form_summary.index

for i in range(num_clusters) :
    ntexts = frame[frame['cluster']==i]['cluster'].shape[0]
    tokens = word_tokenize('\n'.join(frame[frame['cluster']==i]['title'].tolist()))
    xname = [k for k,v in dict(Counter(tokens).most_common()).items() if (v > ntexts * 0.75)] # in 75% of texts in cluster
    form_summary.loc[i,'name'] = ' '.join(xname)

form_summary = form_summary.sort_values('dist_mean')
form_summary = form_summary.reset_index(drop=True)
form_summary

Unnamed: 0,count,percent,dist_mean,dist_min,dist_max,dist_sd,name,cluster
0,2.0,0.0625,0.021117,0.021117,0.021117,0.0,appendix 3b issue securities,6.0
1,1.0,0.03125,0.311139,0.311139,0.311139,,notice ceasing substantial holder,0.0
2,3.0,0.09375,0.554912,0.438646,0.613044,0.100689,appendix director interest notice,2.0
3,16.0,0.5,0.588365,0.402853,0.949468,0.248613,appendix,5.0
4,2.0,0.0625,0.766471,0.61546,0.917482,0.213561,appendix share buyback notice,8.0
5,7.0,0.21875,0.872299,0.165517,1.009338,0.312413,,1.0
6,1.0,0.03125,0.946261,0.946261,0.946261,,notice initial substantial holder,3.0
7,,,,,,,,
8,,,,,,,,
9,,,,,,,,


In [29]:
# check results
print(frame[frame['cluster']==3].head())
print(frame[frame['cluster']==6].head())
print(frame[frame['cluster']==1].head())

   cluster                              title  distance
3        3  notice initial substantial holder  0.946261
   cluster                                              title  distance
6        6  appendix 3b new issue announcement, applicatio...  0.021117
6        6              appendix 3b proposed issue securities  0.021117
   cluster                                              title  distance
1        1  appendix 3a.2 notification interest payment in...  0.988102
1        1         notice change interests substantial holder  0.970713
1        1      compulsory acquisition following takeover bid  0.165517
1        1  right buy remaining holder securities followin...  1.009338
1        1  right buy holders convertible securities follo...  1.009338


In [30]:
lstASXForms['Distance'] = xdist.min(axis=1)
lstASXForms['Cluster'] = clusters
lstASXForms.head()

Unnamed: 0,Title,Link,Distance,Cluster
0,Appendix 1A - Application for Admission to the...,https://www.asxonline.com/static/companies/fil...,0.402853,5
1,Appendix 1B - Application for Admission to the...,https://www.asxonline.com/static/companies/fil...,0.402853,5
2,Appendix 1C - Application for Admission to the...,https://www.asxonline.com/static/companies/fil...,0.402853,5
3,Appendix 2A - Application for quotation of +se...,https://www.asxonline.com/static/companies/fil...,0.402853,5
4,Appendix 3A.1 - Notification of +dividend / di...,https://www.asxonline.com/static/companies/fil...,0.926416,5


In [None]:
# if number of titles gets too large to run above all at once then sample per year and transform over entire df
# now that we know cluster similarity we can calculate document similarity within each cluster
# use around 30 to 40 as threshold for cluster distance? we want documents to be 90% similar?

In [None]:
# https://www.asx.com.au/regulation/rules/asx-listing-rules.htm
# https://www.asxonline.com/companies/html/ASICForms.html
# see appendices for forms to anchor